Link of deployed app: https://med7diagnosis.herokuapp.com/

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('datasets/liver_disease.csv')
df.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583 entries, 0 to 582
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         583 non-null    int64  
 1   Gender                      583 non-null    object 
 2   Total_Bilirubin             583 non-null    float64
 3   Direct_Bilirubin            583 non-null    float64
 4   Alkaline_Phosphotase        583 non-null    int64  
 5   Alamine_Aminotransferase    583 non-null    int64  
 6   Aspartate_Aminotransferase  583 non-null    int64  
 7   Total_Protiens              583 non-null    float64
 8   Albumin                     583 non-null    float64
 9   Albumin_and_Globulin_Ratio  579 non-null    float64
 10  Dataset                     583 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.2+ KB


In [4]:
df.describe()

Unnamed: 0,Age,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
count,583.0,583.0,583.0,583.0,583.0,583.0,583.0,583.0,579.0,583.0
mean,44.746141,3.298799,1.486106,290.576329,80.713551,109.910806,6.48319,3.141852,0.947064,1.286449
std,16.189833,6.209522,2.808498,242.937989,182.620356,288.918529,1.085451,0.795519,0.319592,0.45249
min,4.0,0.4,0.1,63.0,10.0,10.0,2.7,0.9,0.3,1.0
25%,33.0,0.8,0.2,175.5,23.0,25.0,5.8,2.6,0.7,1.0
50%,45.0,1.0,0.3,208.0,35.0,42.0,6.6,3.1,0.93,1.0
75%,58.0,2.6,1.3,298.0,60.5,87.0,7.2,3.8,1.1,2.0
max,90.0,75.0,19.7,2110.0,2000.0,4929.0,9.6,5.5,2.8,2.0


In [5]:
df.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [6]:
df.Gender=df.Gender.map({'Female':1,'Male':0})
df['Dataset']=df['Dataset'].map({1:0,2:1})
df.Dataset.value_counts()

0    416
1    167
Name: Dataset, dtype: int64

In [7]:
from fancyimpute import KNN

df2 = KNN(k=5).fit_transform(df)
df = pd.DataFrame(df2, columns=df.columns)
df.head()

Imputing row 1/583 with 0 missing, elapsed time: 0.043
Imputing row 101/583 with 0 missing, elapsed time: 0.043
Imputing row 201/583 with 0 missing, elapsed time: 0.043
Imputing row 301/583 with 0 missing, elapsed time: 0.044
Imputing row 401/583 with 0 missing, elapsed time: 0.044
Imputing row 501/583 with 0 missing, elapsed time: 0.044


Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65.0,1.0,0.7,0.1,187.0,16.0,18.0,6.8,3.3,0.9,0.0
1,62.0,0.0,10.9,5.5,699.0,64.0,100.0,7.5,3.2,0.74,0.0
2,62.0,0.0,7.3,4.1,490.0,60.0,68.0,7.0,3.3,0.89,0.0
3,58.0,0.0,1.0,0.4,182.0,14.0,20.0,6.8,3.4,1.0,0.0
4,72.0,0.0,3.9,2.0,195.0,27.0,59.0,7.3,2.4,0.4,0.0


In [8]:
df.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

np.random.shuffle(df.values)
X = df.drop(columns=['Dataset'])
y = df['Dataset']

sm=SMOTE()
sc=StandardScaler()
X=sc.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

In [10]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
lr = LogisticRegression()
rc = RidgeClassifier()
knn = KNeighborsClassifier()
rfc = RandomForestClassifier()
nbc = GaussianNB()
svc = SVC()

X_train, y_train=sm.fit_sample(X_train,y_train)
cv_lr=cross_validate(lr,X_train,y_train,cv=10)
cv_rc=cross_validate(rc,X_train,y_train,cv=10)
cv_knn=cross_validate(knn,X_train,y_train,cv=10)
cv_rfc=cross_validate(rfc,X_train,y_train,cv=10)
cv_nbc=cross_validate(nbc,X_train,y_train,cv=10)
cv_svc=cross_validate(svc,X_train,y_train,cv=10)

In [11]:
print('Logistic Regression CV:', cv_lr['test_score'].mean())
print('Ridge Classifier:', cv_rc['test_score'].mean())
print('KNN Classifier:', cv_knn['test_score'].mean())
print('Random Forest Classifier:', cv_rfc['test_score'].mean())
print('Naive Bayes Classifier:', cv_nbc['test_score'].mean())
print('Support Vector Classifier:', cv_svc['test_score'].mean())

Logistic Regression CV: 0.7172072072072073
Ridge Classifier: 0.7319099099099099
KNN Classifier: 0.7347387387387389
Random Forest Classifier: 0.8058018018018018
Naive Bayes Classifier: 0.7104324324324324
Support Vector Classifier: 0.7306126126126126


In [14]:
from sklearn.model_selection import GridSearchCV

parameters = {
    'n_estimators': [10, 50, 100, 200, 300, 400, 500],
    'criterion': ['gini', 'entropy'],
    'max_depth':[3, 4, 5, 6, 7, 8, 9, 10]
}

model = GridSearchCV(RandomForestClassifier(), parameters, n_jobs=-1, verbose=2)
cv_model=cross_validate(model,X_train,y_train,cv=10)

Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 186 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 389 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   37.8s finished


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   38.9s finished


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   39.4s finished


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   38.8s finished


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   39.3s finished


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   40.0s finished


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   41.4s finished


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:   11.6s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   40.9s finished


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   42.9s finished


Fitting 5 folds for each of 112 candidates, totalling 560 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 174 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   27.4s
[Parallel(n_jobs=-1)]: Done 560 out of 560 | elapsed:   45.3s finished


In [15]:
print('Best Model CV:', cv_model['test_score'].mean())

Best Model CV: 0.7883063063063063


In [16]:
final_model = RandomForestClassifier().fit(X, y)

import pickle
pickle.dump(final_model, open('liver_model.pkl', 'wb'))