# Problem Statement
# Patients with Liver disease have been continuously increasing because of excessive consumption of alcohol, inhale of harmful gases, intake of contaminated food, pickles and drugs. This dataset was used to evaluate prediction algorithms in an effort to reduce burden on doctors.

# Details
# This data set contains 583 liver patient records and 167 non liver patient records collected from North East of Andhra Pradesh, India. The "Dataset" column is a class label used to divide groups into liver patient (liver disease) or not (no disease). This data set contains 441 male patient records and 142 female patient records. Any patient whose age exceeded 89 is listed as being of age "90".

# Columns:
# • Age of the patient • Gender of the patient • Total Bilirubin • Direct Bilirubin • Alkaline Phosphotase • Alamine Aminotransferase • Aspartate Aminotransferase • Total Protiens • Albumin • Albumin and Globulin Ratio • Dataset: field used to split the data into two sets (patient with liver disease, or no disease)

# Objective:
# # The primary goal is to visualize the data and establish trends or important characteristics, if any. The next important objective is to create a pycaret model that can predict the class of the patient by the virtue of its input.

In [1]:
import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# pycaret library

In [2]:
# !pip install pycaret

In [3]:
import pycaret

In [4]:
from pycaret.datasets import get_data
dataset = pd.read_csv('indian_liver_patient.csv')
dataset.head()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
0,65,Female,0.7,0.1,187,16,18,6.8,3.3,0.9,1
1,62,Male,10.9,5.5,699,64,100,7.5,3.2,0.74,1
2,62,Male,7.3,4.1,490,60,68,7.0,3.3,0.89,1
3,58,Male,1.0,0.4,182,14,20,6.8,3.4,1.0,1
4,72,Male,3.9,2.0,195,27,59,7.3,2.4,0.4,1


In [5]:
dataset.shape

(583, 11)

In [6]:
dataset.columns

Index(['Age', 'Gender', 'Total_Bilirubin', 'Direct_Bilirubin',
       'Alkaline_Phosphotase', 'Alamine_Aminotransferase',
       'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin',
       'Albumin_and_Globulin_Ratio', 'Dataset'],
      dtype='object')

In [7]:
dataset.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    4
Dataset                       0
dtype: int64

In [8]:
# splitting the data into training and test
# training data
data = dataset.sample(frac=0.90, random_state=1).reset_index(drop=True)
# test data
data_unseen = dataset.drop(data.index).reset_index(drop=True)

In [9]:
data.tail()

Unnamed: 0,Age,Gender,Total_Bilirubin,Direct_Bilirubin,Alkaline_Phosphotase,Alamine_Aminotransferase,Aspartate_Aminotransferase,Total_Protiens,Albumin,Albumin_and_Globulin_Ratio,Dataset
520,45,Male,1.7,0.8,315,12,38,6.3,2.1,0.5,1
521,50,Male,4.2,2.3,450,69,50,7.0,3.0,0.7,1
522,50,Female,1.0,0.5,239,16,39,7.5,3.7,0.9,1
523,55,Male,75.0,3.6,332,40,66,6.2,2.5,0.6,1
524,45,Female,0.7,0.2,153,41,42,4.5,2.2,0.9,2


In [10]:
data['Albumin_and_Globulin_Ratio'] = data['Albumin_and_Globulin_Ratio'].fillna(data['Albumin_and_Globulin_Ratio'].median())

In [11]:
data.isnull().sum()

Age                           0
Gender                        0
Total_Bilirubin               0
Direct_Bilirubin              0
Alkaline_Phosphotase          0
Alamine_Aminotransferase      0
Aspartate_Aminotransferase    0
Total_Protiens                0
Albumin                       0
Albumin_and_Globulin_Ratio    0
Dataset                       0
dtype: int64

In [12]:
data['Gender'] = data['Gender'].astype('category')
data['Gender'] = data['Gender'].cat.codes

In [13]:
x = data.iloc[:,0:-1]
y = data.iloc[:,-1]
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.9, random_state=1)


In [14]:
print("Data for Modeling :" + str(data.shape))
print("###########"*10)
print("Unseen Data for Prediction :" + str(data_unseen.shape))

Data for Modeling :(525, 11)
##############################################################################################################
Unseen Data for Prediction :(58, 11)


In [15]:
# !pip install numba==0.53

In [16]:
from pycaret.classification import *

In [21]:
exp_clf101 = setup(data=data, target='Dataset' , session_id=123)

IntProgress(value=0, description='Processing: ', max=3)

ValueError: Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True.

In [None]:
compare_models()

In [None]:
ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=123, verbose=0,
                     warm_start=False)

In [None]:
lr = create_model('lr')

In [None]:
et = create_model('et')

In [None]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=123, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)



In [None]:
x_train.head()

In [None]:
lr_model.fit(x_train, y_train)

In [None]:
y_pred_train = lr_model.predict(x_train)
y_pred_test = lr_model.predict(x_test)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
print("Training Accuracy : ", accuracy_score(y_train, y_pred_train))
print("#########"*10)
print("Test Accuracy : ", accuracy_score(y_test, y_pred_test))

In [None]:
from sklearn.model_selection import cross_val_score
training_accuracy = cross_val_score(lr_model, x_train, y_train, cv=10)
test_accuracy = cross_val_score(lr_model, x_test, y_test, cv=10)
print(training_accuracy.mean())
print("############"*10)
print(test_accuracy.mean())

In [None]:
plot_model(lr, plot='auc')

In [None]:
tuned_lr = tune_model(lr)

In [None]:
tuned_et = tune_model(et)

In [None]:
plot_model(et, plot='auc')

In [None]:
# which feature is more significant
plot_model(tuned_et, plot='feature')

In [None]:
plot_model(tuned_lr, plot='feature')

In [None]:
plot_model(tuned_lr, plot='confusion_matrix')

In [None]:
# Evaluate the model
evaluate_model(tuned_lr)

In [None]:
# Test data - unseen data
unseen_prediction = predict_model(tuned_lr, data=data_unseen)
unseen_prediction.head()

In [None]:
save_model(tuned_lr, 'tuned_lr_model')