# Diabetes - Random Forest

## Preparation

In [17]:
from os.path import join
import pandas as pd
import numpy as np
import joblib
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from collections import Counter
from sklearn.ensemble import RandomForestClassifier

input_dir = join('..', 'data', 'raw')
output_dir = join('..', 'data', 'result')
log_dir = join('..', 'log')

diabetes = pd.read_csv(join(input_dir, 'diabetes_prediction_dataset.csv'))
diabetes.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [18]:
df = diabetes.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


## Feature information:
- gender: Female or male
- age: patient's age
- hypertension: high blood pressure disease (0: No - 1: Yes)
- heart_disease: (0: No - 1: Yes)
- smoking_history: There are multiple labels for this feature, mainly "No info" and "never"
- bmi: body mass index - higher bmi tends to have higher risk of diabetes
- HbA1c_level: person's average blood sugar level over the past 2-3 months (higher => more chance of developing diabetes)
- blood_glucose_level: the amount of glucose in the bloodstream at a given time
- diabetes: (0: No - 1: Having diabetes)

The first thing we usually do when it comes to training new model is to explore and analyze the dataset. Luckily, there is a notebook created by pannmie that did a great job on EDA analysis, we could see more from here: https://www.kaggle.com/code/tumpanjawat/diabetes-eda-random-forest-hp/notebook.

## Preprocessing

From pannmie's EDA Analysis, we can see that:
- Too many duplicate rows of data (3854 rows)
- No missing values
- Right-skewed feature (BMI)
- Imbalance dataset (hypertension, heart_disease, diabetes)
- No encoding for categorical features 

We need to solve these challenges in order to improve the model's accuracy.
Also, from the notebook we can see that the authour had reengineered `smoking_history` features from 6 unique values into only 3, which is what I want to apply as well.

In [19]:
df = df.drop_duplicates()

df['bmi'] = np.log(df['bmi'])

df['gender'] = df['gender'].str.lower()
df['gender'] = df['gender'].map({'female': 0, 'male': 1, 'other': 2}).astype(int)

def recategorize_smoking(smoking_status):
    if smoking_status in ['never', 'No Info']:
        return 'non-smoker'
    elif smoking_status == 'current':
        return 'current'
    elif smoking_status in ['ever', 'former', 'not current']:
        return 'past_smoker'

df['smoking_history'] = df['smoking_history'].apply(recategorize_smoking)
df['smoking_history'] = df['smoking_history'].map({'non-smoker': 0, 'current': 1, 'past_smoker': 2})
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,80.0,0,1,0,3.226447,6.6,140,0
1,0,54.0,0,0,0,3.307619,6.6,80,0
2,1,28.0,0,0,0,3.307619,5.7,158,0
3,0,36.0,0,0,1,3.15487,5.0,155,0
4,1,76.0,1,1,1,3.002708,4.8,155,0


In [20]:
print(df.isnull().sum())
print(df.isna().sum())

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64
gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


## SMOTE

In [21]:
X = df.drop(['diabetes'], axis=1)
y = df['diabetes']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
print('Class distribution after SMOTE:', Counter(y_train_res))

Class distribution after SMOTE: Counter({0: 61365, 1: 61365})


## Modelling

### Find best depth for random forest

In [22]:
cv = KFold(n_splits=10)
accuracies = list()
max_attributes = len(list(X.columns))
depth_range = range(1, max_attributes + 1)

for depth in depth_range:
    fold_accuracies = []
    forest_model = RandomForestClassifier(max_depth=depth, random_state=42)
    for train_fold, valid_fold in cv.split(X_train_res):
        X_train_cv, X_valid_cv = X_train_res.iloc[train_fold], X_train_res.iloc[valid_fold]
        y_train_cv, y_valid_cv = y_train_res.iloc[train_fold], y_train_res.iloc[valid_fold]
        
        model = forest_model.fit(X_train_cv, y_train_cv)
        valid_acc = model.score(X_valid_cv, y_valid_cv)
        fold_accuracies.append(valid_acc)
    avg = sum(fold_accuracies)/len(fold_accuracies)
    accuracies.append(avg)
    
df = pd.DataFrame({"Max Depth": depth_range, "Average Accuracy": accuracies})
df = df[["Max Depth", "Average Accuracy"]]
print(df.to_string(index=False))

 Max Depth  Average Accuracy
         1          0.675035
         2          0.842394
         3          0.873943
         4          0.885806
         5          0.887525
         6          0.890328
         7          0.894370
         8          0.899780


### Testing

In [23]:
random_forest = RandomForestClassifier(max_depth=8, random_state=42)
random_forest.fit(X_train_res, y_train_res)
y_pred = random_forest.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy with max_depth=8: {test_accuracy:.4f}')

Test Accuracy with max_depth=8: 0.9082


In [24]:
test = pd.concat([X_test, y_test], axis=1)
test.to_csv(join(output_dir, 'test_set_randomForest.csv'), index=False)

submission = pd.DataFrame({
        "diabetes": y_pred
    })
submission.to_csv(join(output_dir, 'submission_randomForest.csv'), index=False)

joblib.dump(random_forest, join(log_dir, 'random_forest_model.pkl'))

['..\\log\\random_forest_model.pkl']

# The end