In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt 
import matplotlib.pyplot as plt
import scipy.stats as stats

from data_processing import *

%load_ext autoreload
%autoreload 2


In [2]:
X, y = get_training_data()
X_testing = get_testing_data()
#Unnamed: 0 is the Id of the row, we can drop it
X = X.drop('Unnamed: 0', axis=1 )

x_testing_id = X_testing['Unnamed: 0']
X_testing = X_testing.drop('Unnamed: 0', axis=1 )
# Unnamed: 0 is the Id of the row, we can drop it
y = y.drop(columns=['Unnamed: 0'])  # If the first column is labeled as 'Unnamed: 0'


### Data exploration

In [None]:
print('Number of elements :', len(X))
print('Number of features :', len(X.columns))

In [None]:
print('len(y):', len(y))

In [None]:
X.info()

In [None]:
X.isnull().sum()

In [None]:
unique_values = {}
for col in X.columns:
    unique_values[col] = X[col].value_counts().shape[0]

pd.DataFrame(unique_values, index=['unique value count']).transpose()

In [None]:
plt.figure(figsize = (15,25))
for i,col in enumerate(['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age','Education', 'Income']):
    plt.subplot(4,2,i+1)
    plt.title('Distribution of '+col)
    plt.boxplot(x = col, data = X, vert=False)
plt.show()

In [None]:
cols = ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 
        'PhysActivity', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk']

def create_plot_pivot(data, x_column):
    """Create a pivot table for satisfaction versus another rating for easy plotting."""
    # Merge `X` and `y` to ensure access to 'Diabetes_binary' column for grouping
    data_with_target = data.copy()
    data_with_target['Diabetes_binary'] = y['Diabetes_binary']
    data_with_target['Diabetes_binary'] = data_with_target['Diabetes_binary'].replace({0: 'No Diabetes', 1: 'Diabetes'})
    
    _df_plot = data_with_target.groupby([x_column, 'Diabetes_binary']).size() \
                   .reset_index().pivot(columns='Diabetes_binary', index=x_column, values=0)
    return _df_plot

fig, ax = plt.subplots(3, 4, figsize=(20, 20))
axe = ax.ravel()
c = len(cols)
plt.suptitle('Diabetes Distribution by Features', fontsize=20)

# Custom colors for the plot
custom_colors = {'No Diabetes': 'green', 'Diabetes': 'red'}

# Plotting each column in cols
for i in range(c):
    plot_data = create_plot_pivot(X, cols[i])
    plot_data.plot(kind='bar', stacked=True, ax=axe[i], color=[custom_colors[val] for val in plot_data.columns])
    axe[i].set_xlabel(cols[i])
    
    # Adding percentage labels for the "Diabetes" (red) part of each bar
    for j, (index, row) in enumerate(plot_data.iterrows()):
        total = row['No Diabetes'] + row['Diabetes']  # Sum of "No Diabetes" and "Diabetes" counts
        if total > 0:
            diabetes_percentage = (row['Diabetes'] / total) * 100  # Calculate percentage for Diabetes
            axe[i].text(j, row['No Diabetes'] + row['Diabetes'] / 2, 
                        f"{diabetes_percentage:.1f}%", color="red", ha="center", va="top")

plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to fit the title
plt.show()


#### Gaussian distribution

In [None]:

def normal_probability_plot(data, col):
    """
    Generates a normal probability plot for the given data.
    
    Parameters:
        data (array-like): The data for which to generate the plot.
        
    Returns:
        None
    """
    # Generate a probability plot
    stats.probplot(data, dist="norm", plot=plt)
    plt.title("Normal Probability Plot for "+ col)
    plt.xlabel("Theoretical Quantiles")
    plt.ylabel("Sample Quantiles")
    plt.show()

In [None]:
# for col in X.columns:
#     normal_probability_plot(X[col], col)

In [None]:
#Correlation matrix
corr = X.corr()
plt.figure(figsize=(30, 15))
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

### Process the data

In [3]:

# Look for multicollinearity between the features and remove the features that are unnecessary
# we already know the age so age_group is not needed
columnsToDrop = ['Age_Group', 'MentHlth', 'HvyAlcoholConsump', 'NoDocbcCost', 'Smoker', 'Fruits', 'Mental_Health_Risk', 'Education', 'Income', 'Heart_Disease_Risk']
X = X.drop(columns=columnsToDrop)
X = feature_encoding(X)

X_testing = X_testing.drop(columns=columnsToDrop)
X_testing = feature_encoding(X_testing)

X_testing.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Stroke,HeartDiseaseorAttack,PhysActivity,Veggies,AnyHealthcare,GenHlth,PhysHlth,DiffWalk,Sex,Age,BMI_Category,Healthy_Diet,Education_Level,Income_Group
0,0,0,1,20.807398,0,0,0,1,1,3,7.18234,0,0,7.018934,0,1,0,1
1,1,1,1,27.69003,0,0,1,1,1,3,0.576965,0,0,13.088206,2,1,2,2
2,0,0,1,24.350989,0,0,1,1,1,1,-0.187868,0,1,0.916045,0,1,0,0
3,0,0,1,27.100812,0,0,1,1,1,2,-0.088216,0,1,2.268324,2,0,0,0
4,0,1,1,30.90091,0,0,0,1,1,4,26.942559,1,0,8.028891,1,1,0,1


### Split validation set and train set

In [4]:
from training import *

In [5]:
X_train, X_validation, y_train, y_validation =  data_splits(X, y) #   split data
X_train_scaled, X_validation_scaled, X_testing_Scaled = normalize_features(X_train, X_validation, X_testing) #   normalize data


### Training

In [6]:
import numpy as np
from sklearn.utils import gen_batches
def predict_in_batches(cls, X, batch_size=100):
    """
    Make predictions for the input data in batches.
    
    Parameters:
        cls (object): The trained classifier model.
        X (array-like): The input data to make predictions for.
        batch_size (int): The size of each batch.
        
    Returns:
        array-like: The predictions for the input data.
    """
    predictions = []
    
    # Generate batches and make predictions for each batch
    for batch in gen_batches(len(X), batch_size):
        batch_predictions = cls.predict(X[batch])
        predictions.append(batch_predictions)
    
    # Concatenate all batch predictions into a single array
    predictions = np.concatenate(predictions, axis=0)
    return predictions

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

def perform_grid_search(model, X_train_scaled, y_train, params):
    print("Performing grid search for ", model)
    # Define the cross-validation strategy
    strat_kfold = StratifiedKFold(n_splits=10) # TODO

    # Grid search for the model 
    #"f1_macro": Calculates F1-score per class and takes the average, treating all classes equally.
    #"f1_weighted": Calculates F1-score per class and takes a weighted average, considering class imbalance.
    grid_search = GridSearchCV(model, params, scoring='f1', cv=strat_kfold, n_jobs=10) # n_jobs=10 uses 10 parallel processes. Speeds up the process 


    if isinstance(model, RandomForestClassifier) or isinstance(model, SVC):
        grid_search.fit(X_train_scaled, y_train.values.ravel())
    else:
        grid_search.fit(X_train, y_train)
    best_param = grid_search.best_params_
    best_score = grid_search.best_score_ 
    print("Best parameters are:", best_param)
    print("Best score is:", best_score)

    # Return the fitted grid search objects
    # return grid_search, best_param, best_score

In [20]:
def model_trainning(model, X_train_scaled, y_train, X_validation_scaled, y_validation):
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_validation_scaled)
    validation_f1 = f1_score(y_validation, y_pred)
    print("Validation f1 score:", validation_f1)

In [22]:
def classification_summary(y_train, y_train_pred, y_validation, y_validation_pred):
    train_f1 = f1_score(y_train, y_train_pred) 
    validation_f1 = f1_score(y_validation, y_validation_pred) 
    print("Train f1 score:", train_f1)

    print("Validation f1 score:", validation_f1)
    print("\nTraining Set Classification Report:")

    print(classification_report(y_train, y_train_pred))

    print("Training Set Confusion Matrix:")

    print(confusion_matrix(y_train, y_train_pred))

    print("\nValidation Set Classification Report:")

    print(classification_report(y_validation, y_validation_pred))

    print("Validation Set Confusion Matrix:")

    print(confusion_matrix(y_validation, y_validation_pred))

In [None]:
def kaggle_submission(model, X_testing_scaled, x_testing_id):
    y_pred = model.predict(X_testing_scaled)
    y_pred = pd.DataFrame({
        'index': x_testing_id , 
        'Diabetes_binary': y_pred,
    })

    # Save the predictions to a CSV file
    y_pred.to_csv('./y_pred.csv', index=False)

### Random Forest

In [None]:
############### TESTING ####################
from sklearn.ensemble import RandomForestClassifier


cls_randomforest = RandomForestClassifier(class_weight='balanced', 
                                          random_state=42, 
                                          max_depth=20, 
                                          max_leaf_nodes=200, 
                                          min_samples_leaf=20)

param_grid_random_forest = {
    'n_estimators': [10, 200, 300, 400, 500],
    'max_depth': [20, 30],
    'bootstrap': [True, False],
}
perform_grid_search(cls_randomforest, X_train_scaled, y_train, params= param_grid_random_forest)




Performing grid search for  RandomForestClassifier(class_weight='balanced', max_depth=20,
                       max_leaf_nodes=200, min_samples_leaf=20,
                       random_state=42)
Best parameters are: {'n_estimators': 500}
Best score is: 0.43756226703549733


(GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
              estimator=RandomForestClassifier(class_weight='balanced',
                                               max_depth=20, max_leaf_nodes=200,
                                               min_samples_leaf=20,
                                               random_state=42),
              n_jobs=10, param_grid={'n_estimators': [100, 200, 300, 400, 500]},
              scoring='f1'),
 {'n_estimators': 500},
 np.float64(0.43756226703549733))

In [None]:
model_trainning(cls_randomforest, X_train_scaled, y_train, X_validation_scaled, y_validation)

In [9]:
y_train_pred = predict_in_batches(cls_randomforest, X_train_scaled)
y_validation_pred = predict_in_batches(cls_randomforest, X_validation_scaled)

In [None]:
classification_summary(y_train, y_train_pred, y_validation, y_validation_pred)

### Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression

In [None]:
cls_logistic = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000, dual=False, C=5)
param_grid_logistic = {
    'max_iter':[1000, 2000, 3000],
    'tol': [1e-3, 1e-4, 1e-5],
    'C': [0.5, 1, 10],
}
perform_grid_search(cls_logistic, X_train_scaled, y_train, param_grid_logistic)

Performing grid search for  LogisticRegression(C=5, class_weight='balanced', max_iter=1000, random_state=42)


  y = column_or_1d(y, warn=True)


Best parameters are: {'C': 1, 'max_iter': 1000, 'tol': 0.001}
Best score is: 0.4377676476845417


(GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=None, shuffle=False),
              estimator=LogisticRegression(C=5, class_weight='balanced',
                                           max_iter=1000, random_state=42),
              n_jobs=10,
              param_grid={'C': [1, 10], 'max_iter': [1000, 2000, 3000],
                          'tol': [0.001, 0.0001, 1e-05]},
              scoring='f1'),
 {'C': 1, 'max_iter': 1000, 'tol': 0.001},
 np.float64(0.4377676476845417))

In [28]:
### Use best parameters to train the model
cls_logistic = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000, dual=False, C=1, tol=1e-3)
model_trainning(cls_logistic, X_train_scaled, y_train, X_validation_scaled, y_validation)

Validation f1 score: 0.4367933177141693


  y = column_or_1d(y, warn=True)


In [15]:
model_trainning(cls_logistic, X_train_scaled, y_train, X_validation_scaled, y_validation)

Validation f1 score: 0.4366340668296659


  y = column_or_1d(y, warn=True)


### XGBClassifier

In [31]:
from xgboost import XGBClassifier

In [None]:
# Initialize the XGBoost classifier
# booster [default= gbtree ]
# Which booster to use. Can be gbtree, gblinear or dart; gbtree and dart use tree based models while gblinear uses linear functions.
xgb_model = XGBClassifier(random_state=42, objective='binary:logistic', eval_metric='logloss')

# Define the parameter grid
param_grid = {
    'booster': ['gbtree', 'gblinear', 'dart'],
    'eval_metric': ['logloss', 'error'],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
perform_grid_search(xgb_model, X_train_scaled, y_train, param_grid)

Performing grid search for  XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=None, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=None,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)


In [None]:
# Define a dictionary of model name and their trained model
trained_models = {
        'Decision Tree': cls_decision_tree,
        'Random Forest': cls_randomforest,
        'SVM': cls_svm }

In [None]:
# predict labels and calculate accuracy and F1score
y_train_pred_dict, y_test_pred_dict, evaluation_results = eval_model(trained_models, X_train_scaled, X_validation_scaled, y_train, y_validation)

# classification report and calculate confusion matrix
report_model(y_train, y_validation, y_train_pred_dict, y_test_pred_dict, trained_models)