In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [24]:
dementia_data = pd.read_csv("dementia_dataset.csv")

In [25]:
dementia_data.head()

Unnamed: 0,Gender,Age,EDUC,Socioeconomic Status,Mini-Mental State Examination,Clinical Dementia Rating,Estimated Total Intracranial Volume,Normalized Whole Brain Volume,Atlas Scaling Factor,Target
0,1,87,14,2.0,27.0,0.0,1987,0.696,0.883,Nondemented
1,1,88,14,2.0,30.0,0.0,2004,0.681,0.876,Nondemented
2,1,75,12,,23.0,0.5,1678,0.736,1.046,Demented
3,1,76,12,,28.0,0.5,1738,0.713,1.01,Demented
4,1,80,12,,22.0,0.5,1698,0.701,1.034,Demented


In [26]:
dementia_data.describe()

Unnamed: 0,Gender,Age,EDUC,Socioeconomic Status,Mini-Mental State Examination,Clinical Dementia Rating,Estimated Total Intracranial Volume,Normalized Whole Brain Volume,Atlas Scaling Factor
count,373.0,373.0,373.0,354.0,371.0,373.0,373.0,373.0,373.0
mean,0.428954,77.013405,14.597855,2.460452,27.342318,0.290885,1488.128686,0.729568,1.195461
std,0.495592,7.640957,2.876339,1.134005,3.683244,0.374557,176.139286,0.037135,0.138092
min,0.0,60.0,6.0,1.0,4.0,0.0,1106.0,0.644,0.876
25%,0.0,71.0,12.0,2.0,27.0,0.0,1357.0,0.7,1.099
50%,0.0,77.0,15.0,2.0,29.0,0.0,1470.0,0.729,1.194
75%,1.0,82.0,16.0,3.0,30.0,0.5,1597.0,0.756,1.293
max,1.0,98.0,23.0,5.0,30.0,2.0,2004.0,0.837,1.587


In [27]:
dementia_data.isnull().sum()

Gender                                  0
Age                                     0
EDUC                                    0
Socioeconomic Status                   19
Mini-Mental State Examination           2
Clinical Dementia Rating                0
Estimated Total Intracranial Volume     0
Normalized Whole Brain Volume           0
Atlas Scaling Factor                    0
Target                                  0
dtype: int64

In [28]:
dementia_data.dropna(inplace=True)

In [29]:
dementia_data.isnull().sum()

Gender                                 0
Age                                    0
EDUC                                   0
Socioeconomic Status                   0
Mini-Mental State Examination          0
Clinical Dementia Rating               0
Estimated Total Intracranial Volume    0
Normalized Whole Brain Volume          0
Atlas Scaling Factor                   0
Target                                 0
dtype: int64

In [30]:
dementia_mapping = {
    'Nondemented': 0,
    'Demented': 1,
}

In [31]:
dementia_data['Target'] = dementia_data['Target'].map(dementia_mapping)

In [32]:
dementia_data['Target'].fillna(-1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dementia_data['Target'].fillna(-1, inplace=True)


In [33]:
dementia_data.head()

Unnamed: 0,Gender,Age,EDUC,Socioeconomic Status,Mini-Mental State Examination,Clinical Dementia Rating,Estimated Total Intracranial Volume,Normalized Whole Brain Volume,Atlas Scaling Factor,Target
0,1,87,14,2.0,27.0,0.0,1987,0.696,0.883,0.0
1,1,88,14,2.0,30.0,0.0,2004,0.681,0.876,0.0
5,0,88,18,3.0,28.0,0.0,1215,0.71,1.444,0.0
6,0,90,18,3.0,27.0,0.0,1200,0.718,1.462,0.0
7,1,80,12,4.0,28.0,0.0,1689,0.712,1.039,0.0


In [34]:
dementia_data['Target'].value_counts()

Target
 0.0    190
 1.0    127
-1.0     37
Name: count, dtype: int64

In [35]:
X = dementia_data.drop(columns='Target', axis=1)
Y = dementia_data['Target']

In [36]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [37]:
print(X.shape, X_train.shape, X_test.shape)

(354, 9) (283, 9) (71, 9)


In [38]:
model = LogisticRegression()

In [39]:
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [40]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [41]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.7420494699646644


In [42]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [43]:
print('Accuracy on Test data : ', test_data_accuracy)

Accuracy on Test data :  0.7323943661971831


In [44]:
#saving the model

In [45]:
import pickle

In [46]:
filename = 'dementia_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [47]:
loaded_model = pickle.load(open('dementia_model.sav', 'rb'))

In [48]:
for column in X.columns:
  print(column) 

Gender
Age
EDUC
Socioeconomic Status
Mini-Mental State Examination
Clinical Dementia Rating
Estimated Total Intracranial Volume
Normalized Whole Brain Volume
Atlas Scaling Factor
