<div align = "center" style = "font-family:consolas;"> <h1> Alzheimer's Dataset Model Analysis </h1> </div>

<div align = "center"> <p style = "font-family: consolas"> This notebook trains an efficient classification model to classify patients, using the training data set acquired during the previous step.</p> </div>

<ul><li><p style = "font-family: consolas"> Importing important libraries</p></li></ul>

In [13]:
# Data manipulation
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import QuantileTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline

# Model selection
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

#Model training
from sklearn.model_selection import GridSearchCV

<ul><li><p style = "font-family: consolas"> Loading Data</p></li></ul>

In [14]:
# Name of the label column; everything else in the CSV is used as a feature.
TARGET_COLUMN = 'Alzheimer’s Diagnosis'

# Read the training CSV, then split it into the feature frame X and the label series y.
train_data = pd.read_csv('data/train_data.csv')

X = train_data.drop(columns=[TARGET_COLUMN])
y = train_data[TARGET_COLUMN]

In [None]:
# Print the feature frame's column names, non-null counts and dtypes so the
# categorical/numerical split made in the next step can be checked by eye.
X.info()

<ul><li><p style = "font-family: consolas">Building Pipeline</p></li></ul>

In [16]:
# Columns that are one-hot encoded by the preprocessor below.
categorical_features = [
    'Country',
    'Physical Activity Level',
    'Smoking Status',
    'Alcohol Consumption',
    'Depression Level',
    'Sleep Quality',
    'Dietary Habits',
    'Air Pollution Exposure',
    'Employment Status',
    'Marital Status',
    'Social Engagement Level',
    'Stress Levels',
    'Income Level',
]

# Every other column of X is routed to the numeric transformer.
# NOTE(review): this assumes no string-valued column is left outside
# `categorical_features` -- confirm against the X.info() output.
numerical_features = [
    column for column in X.columns if column not in categorical_features
]

# Preprocessor: quantile-transform the numeric columns and one-hot encode the
# categorical ones (categories unseen during fit are ignored at transform time,
# and the encoder returns a dense array).
numeric_transformer = QuantileTransformer()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

# Pipeline: preprocessing followed by a classifier. The random forest set here
# is the default occupant of the 'classifier' step; the search grid defined
# later in the notebook substitutes other estimators into this step.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier()),
    ]
)

<ul><li><p style = "font-family: consolas">Train using GridSearchCV</p></li></ul>

In [17]:
# Search space for GridSearchCV: a list of independent grids, one (or more) per
# estimator family. Each grid replaces the pipeline's 'classifier' step with the
# given estimator and then varies that estimator's hyper-parameters.

# Fixed seed so that the stochastic estimators give the same result on re-run.
RANDOM_STATE = 42

# Inverse regularisation strengths shared by the linear models and the SVM:
# 1e-3, 1e-2, ..., 1e2.
c_grid = np.logspace(-3, 2, 6)

parameters = [
    # --- Logistic regression -------------------------------------------------
    # Penalty and solver cannot be crossed freely: each solver supports only
    # some penalties, and invalid pairs make the fit raise. The grid is
    # therefore split into one sub-grid per penalty with its valid solvers.
    {
        'classifier' : [LogisticRegression(random_state=RANDOM_STATE)],
        'classifier__penalty' : ['l2'],
        'classifier__C' : c_grid,
        'classifier__solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    },
    {
        'classifier' : [LogisticRegression(random_state=RANDOM_STATE)],
        'classifier__penalty' : ['l1'],
        'classifier__C' : c_grid,
        'classifier__solver' : ['liblinear', 'saga']
    },
    {
        # Elastic-net is only implemented by 'saga' and requires l1_ratio.
        # NOTE(review): the l1_ratio values are a new choice (the original grid
        # did not set one, so these fits could not run) -- adjust as needed.
        'classifier' : [LogisticRegression(random_state=RANDOM_STATE)],
        'classifier__penalty' : ['elasticnet'],
        'classifier__C' : c_grid,
        'classifier__solver' : ['saga'],
        'classifier__l1_ratio' : [0.25, 0.5, 0.75]
    },
    {
        # Unpenalised model: spelled None (the string 'none' is not accepted by
        # recent scikit-learn releases). C has no effect here, so it is not searched.
        'classifier' : [LogisticRegression(random_state=RANDOM_STATE)],
        'classifier__penalty' : [None],
        'classifier__solver' : ['newton-cg', 'lbfgs', 'sag', 'saga']
    },
    # --- Decision tree -------------------------------------------------------
    {
        'classifier' : [DecisionTreeClassifier(random_state=RANDOM_STATE)],
        'classifier__criterion' : ['gini', 'entropy'],
        'classifier__splitter' : ['best', 'random'],
        'classifier__max_depth' : [10, 20, 30],
        'classifier__min_samples_split' : [2, 5, 10],
        'classifier__min_samples_leaf' : [1, 2, 4]
    },
    # --- Random forest -------------------------------------------------------
    {
        'classifier' : [RandomForestClassifier(random_state=RANDOM_STATE)],
        'classifier__n_estimators' : [100, 200, 300],
        'classifier__criterion' : ['gini', 'entropy'],
        'classifier__max_depth' : [10, 20, 30],
        'classifier__min_samples_split' : [2, 5, 10],
        'classifier__min_samples_leaf' : [1, 2, 4]
    },
    # --- Gradient boosting ---------------------------------------------------
    # Largest grid in the search: 3 * 4 * 3 * 3 * 3 * 3 = 972 candidates.
    {
        'classifier' : [GradientBoostingClassifier(random_state=RANDOM_STATE)],
        'classifier__n_estimators' : [100, 200, 300],
        'classifier__learning_rate' : np.logspace(-3, 0, 4),
        'classifier__max_depth' : [3, 4, 5],
        'classifier__min_samples_split' : [2, 5, 10],
        'classifier__min_samples_leaf' : [1, 2, 4],
        'classifier__subsample' : [0.8, 0.9, 1.0]
    },
    # --- Support vector machine ----------------------------------------------
    # 'degree' is only used by the 'poly' kernel and 'gamma' is not used by the
    # 'linear' kernel, so crossing them with every kernel refits identical
    # models. One sub-grid per kernel group keeps the same set of distinct models.
    {
        'classifier' : [SVC()],
        'classifier__C' : c_grid,
        'classifier__kernel' : ['linear']
    },
    {
        'classifier' : [SVC()],
        'classifier__C' : c_grid,
        'classifier__kernel' : ['rbf', 'sigmoid'],
        'classifier__gamma' : ['scale', 'auto']
    },
    {
        'classifier' : [SVC()],
        'classifier__C' : c_grid,
        'classifier__kernel' : ['poly'],
        'classifier__degree' : [2, 3, 4],
        'classifier__gamma' : ['scale', 'auto']
    },
    # --- k-nearest neighbours ------------------------------------------------
    # 'minkowski' is omitted: with the default p=2 it is the Euclidean distance,
    # which is already listed.
    {
        'classifier' : [KNeighborsClassifier()],
        'classifier__n_neighbors' : [3, 5, 7, 9, 11, 13],
        'classifier__weights' : ['uniform', 'distance'],
        'classifier__algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'classifier__metric' : ['euclidean', 'manhattan', 'chebyshev']
    },
    # --- Gaussian naive Bayes ------------------------------------------------
    {
        'classifier' : [GaussianNB()],
        'classifier__var_smoothing' : np.logspace(-9, -6, 4)
    }
]

# Exhaustive search over every grid in `parameters`, scoring each candidate with
# 9-fold cross-validation. n_jobs=-1 uses all available cores and verbose=3
# prints detailed progress while fitting.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=9,
    n_jobs=-1,
    verbose=3,
)
model.fit(X, y)