<div align = "center" style = "font-family:consolas;"> <h1> Alzheimer's Dataset Model Analysis </h1> </div>

<div align = "center"> <p style = "font-family: consolas"> This notebook trains an efficient classification model to classify patients, using the training data set acquired during the previous step.</p> </div>

<ul><li><p style = "font-family: consolas"> Importing important libraries</p></li></ul>

In [34]:
# Standard library
from pathlib import Path

# Data manipulation
import pandas as pd
import numpy as np
import joblib

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import QuantileTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline

# Model selection
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier

#Model training
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

<ul><li><p style = "font-family: consolas"> Loading Data</p></li></ul>

In [35]:
# Label column shared by the train and test CSV files.
target_column = 'Alzheimer’s Diagnosis'

# Training split: everything except the label goes into X, the label into y.
train_data = pd.read_csv('data/train_data.csv')
X = train_data.drop(columns=[target_column])
y = train_data[target_column]

# Test split, separated into features and label the same way.
test_data = pd.read_csv('data/test_data.csv')
X_test = test_data.drop(columns=[target_column])
y_test = test_data[target_column]

In [36]:
# Print a summary of the training features (column names, non-null counts, dtypes)
# to see which columns are numeric and which hold strings.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59426 entries, 0 to 59425
Data columns (total 24 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Country                               59426 non-null  object 
 1   Age                                   59426 non-null  int64  
 2   Gender                                59426 non-null  int64  
 3   Education Level                       59426 non-null  int64  
 4   BMI                                   59426 non-null  float64
 5   Physical Activity Level               59426 non-null  object 
 6   Smoking Status                        59426 non-null  object 
 7   Alcohol Consumption                   59426 non-null  object 
 8   Diabetes                              59426 non-null  int64  
 9   Hypertension                          59426 non-null  int64  
 10  Cholesterol Level                     59426 non-null  int64  
 11  Family History 

<ul><li><p style = "font-family: consolas">Building Pipeline</p></li></ul>

In [37]:
# Columns that are one-hot encoded by the preprocessor.
categorical_features = [
    'Country', 'Physical Activity Level', 'Smoking Status',
    'Alcohol Consumption', 'Depression Level', 'Sleep Quality',
    'Dietary Habits', 'Air Pollution Exposure', 'Employment Status',
    'Marital Status', 'Social Engagement Level', 'Stress Levels',
    'Income Level',
]

# Fail here, with a readable message, rather than deep inside the
# cross-validated fit if a listed column is absent from the training frame.
missing_features = [col for col in categorical_features if col not in X.columns]
assert not missing_features, f"Columns missing from X: {missing_features}"

# Every remaining column is routed through the numeric branch (original column order kept).
numerical_features = [x for x in X.columns if x not in categorical_features]

# Building a preprocessor
# random_state is fixed because QuantileTransformer estimates its quantiles from
# a random subsample when the data has more rows than its `subsample` setting,
# which would otherwise make the transformed features differ between runs.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', QuantileTransformer(random_state=42), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
    ])

# Building a pipeline
# The classifier step is seeded as well so the pipeline is reproducible when
# it is fitted directly with this default estimator.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', RandomForestClassifier(random_state=42))
                ])

<ul><li><p style = "font-family: consolas">Train using GridSearchCV</p></li></ul>

In [38]:
# Hyper-parameter grid for the 'classifier' step of the pipeline.
# The 'classifier' entry swaps the pipeline's default estimator for LightGBM;
# the 'classifier__*' entries are forwarded to that estimator.
#
# NOTE(review): device='gpu' is combined with n_jobs=-1 below, so several
# worker processes may share GPU 0 at the same time - confirm this is intended
# and that the machine running the notebook has a GPU-enabled LightGBM build.
parameters = [
    {
        'classifier': [LGBMClassifier(boosting_type='gbdt', device='gpu', gpu_device_id=0, random_state=42, verbosity=-1)],
        'classifier__n_estimators': [200, 500],
        'classifier__learning_rate': np.logspace(-2, 0, 3),  # [0.01, 0.1, 1.0]
        'classifier__max_depth': [-1, 10, 20],  # -1 means no depth limit
        'classifier__num_leaves': [31, 50, 100],  # Higher values capture more patterns
        'classifier__subsample': [0.8, 1.0],  # Row subsampling, helps with generalization
        # Row subsampling is only performed when subsample_freq > 0; with the
        # default of 0 the two `subsample` values would train identical models.
        'classifier__subsample_freq': [1],
        'classifier__colsample_bytree': [0.8, 1.0],  # Reduces overfitting
        # No 'classifier__eval_metric' entry: eval_metric is an argument of
        # LGBMClassifier.fit rather than an estimator hyper-parameter, and
        # 'recall' is not a LightGBM metric. Candidates are already ranked on
        # recall through `scoring` below.
    }
]

# 5-fold cross-validated exhaustive search over the grid, selecting on recall.
model = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1, verbose=3, scoring='recall')
model.fit(X, y)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits




<ul><li><p style = "font-family: consolas">Saving The Model</p></li></ul>

In [39]:
# Save the best model from GridSearchCV
joblib.dump(model.best_estimator_, "models/model.pkl")
print("Model saved successfully!")

Model saved successfully!
