In [2]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import random



In [None]:
df = pd.read_csv('cirrhosis.csv')
df

In [None]:
df.nunique()

In [None]:
df.describe()

In [71]:
#look for missing values 
df.isna().any()

ID               False
N_Days           False
Status           False
Drug              True
Age              False
Sex              False
Ascites           True
Hepatomegaly      True
Spiders           True
Edema            False
Bilirubin        False
Cholesterol       True
Albumin          False
Copper            True
Alk_Phos          True
SGOT              True
Tryglicerides     True
Platelets         True
Prothrombin       True
Stage             True
dtype: bool

In [72]:
#look for missing values number
df.isna().sum()

ID                 0
N_Days             0
Status             0
Drug             106
Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64

In [73]:
#columns with numerical values 
num_cols = df._get_numeric_data().columns
num_cols
for col in num_cols:
    if df[col].nunique() <= 4:
        num_cols = num_cols.drop(col)

num_cols


Index(['ID', 'N_Days', 'Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper',
       'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin'],
      dtype='object')

In [74]:
def find_outliers_IQR(df):

   q1=df.quantile(0.25)

   q3=df.quantile(0.75)

   IQR=q3-q1

   outliers = df[((df<(q1-1.5*IQR)) | (df>(q3+1.5*IQR)))]

   return outliers
for col in num_cols:
    outliers = find_outliers_IQR(df[col])
    if len(outliers) >0:
        print(col)
        
        print( 'number of outliers:'+ str(len(outliers)))

        print('max outlier value: '+ str(outliers.max()))

        print('min outlier value: '+ str(outliers.min()))
       
        print('##############')

Bilirubin
number of outliers:46
max outlier value: 28.0
min outlier value: 7.3
##############
Cholesterol
number of outliers:20
max outlier value: 1775.0
min outlier value: 636.0
##############
Albumin
number of outliers:9
max outlier value: 4.64
min outlier value: 1.96
##############
Copper
number of outliers:17
max outlier value: 588.0
min outlier value: 247.0
##############
Alk_Phos
number of outliers:35
max outlier value: 13862.4
min outlier value: 3681.0
##############
SGOT
number of outliers:7
max outlier value: 457.25
min outlier value: 272.8
##############
Tryglicerides
number of outliers:10
max outlier value: 598.0
min outlier value: 260.0
##############
Platelets
number of outliers:6
max outlier value: 721.0
min outlier value: 514.0
##############
Prothrombin
number of outliers:18
max outlier value: 18.0
min outlier value: 12.9
##############


In [75]:
#replace outliers with median value 
outliers_cols = []
for col in num_cols:
    outliers = find_outliers_IQR(df[col])
    if len(outliers) >0:
        outliers_cols.append(col)
for col in outliers_cols:
    if col == 'Prothrombin':
        median = df[col].quantile(0.50)
        high = df[col].quantile(0.25)
        low = df[col].quantile(0.75)
        df[col] = np.where(df[col] > high, median, df[col])
#         df[col] = np.where(df[col] < low, median, df[col])
df.describe()

    

Unnamed: 0,ID,N_Days,Age,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
count,418.0,418.0,418.0,418.0,284.0,418.0,310.0,312.0,312.0,282.0,407.0,416.0,412.0
mean,209.5,1917.782297,18533.351675,3.220813,369.510563,3.49744,97.648387,1982.655769,122.556346,124.702128,257.02457,10.376442,3.024272
std,120.810458,1104.672992,3815.845055,4.407506,231.944545,0.424972,85.61392,2140.388824,56.699525,65.148639,98.325585,0.385737,0.882042
min,1.0,41.0,9598.0,0.3,120.0,1.96,4.0,289.0,26.35,33.0,62.0,9.0,1.0
25%,105.25,1092.75,15644.5,0.8,249.5,3.2425,41.25,871.5,80.6,84.25,188.5,10.0,2.0
50%,209.5,1730.0,18628.0,1.4,309.5,3.53,73.0,1259.0,114.7,108.0,251.0,10.6,3.0
75%,313.75,2613.5,21272.5,3.4,400.0,3.77,123.0,1980.0,151.9,151.0,318.0,10.6,4.0
max,418.0,4795.0,28650.0,28.0,1775.0,4.64,588.0,13862.4,457.25,598.0,721.0,10.6,4.0


In [1]:
#columns with numerical values missing 
contentious_cols_nan = []
for col in num_cols:
    if df[col].isnull().values.any():
        contentious_cols_nan.append(col)
contentious_cols_nan

NameError: name 'num_cols' is not defined

In [None]:
#looking into categorial data 
categorial_data=df.loc[:,df.nunique() <= 4]
cat_cols = [col for col in df.columns if col in categorial_data]
cat_cols



In [None]:
df[cat_cols].isna().any()

In [None]:
#columns with categorial values missing 
cat_cols_nan = []
for col in cat_cols:
    if df[col].isnull().values.any():
        cat_cols_nan.append(col)
cat_cols_nan

In [None]:
for col in cat_cols_nan:
    print(df[col].unique())

In [None]:
for col in cat_cols_nan:
    print(df[col].value_counts(dropna = False, normalize = True))

In [None]:
# fill missing valuesin categorial columns while keeping ratio
for col in cat_cols_nan:
    nans = df[col].isna()

    length = sum(nans)
    val_list = df[col].dropna().unique() #all categories
    num_val = [] #count of each category while keeping the same order as in val_list 
    for val_1 in val_list:
        count = (df[col] == val_1).sum()
        num_val.append(count)
    num_val_ratio= [] #ration of each category while keeping the same order as in val_list
    for val_2 in num_val:
        ratio = val_2/sum(num_val)
        num_val_ratio.append(ratio)

    replacement = random.choices(val_list, weights=num_val_ratio, k=length)
    df.loc[nans,col] = replacement

df

In [None]:
# change categorial columns to numbers 
encoder = preprocessing.LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])
df

In [None]:
#KNN to add values to empty spots in coiumns with contentious values 
imputer = KNNImputer(n_neighbors=5)
df_imputation = pd.DataFrame(imputer.fit_transform(df),columns = df.columns)

df_imputation

In [None]:
df_imputation.isna().any()

In [None]:
ax = sns.countplot(data = df_imputation, x = 'Stage',palette='rainbow')


In [None]:
# Split the data for train and test 
x, y = df_imputation.drop(['ID', 'Stage'], axis=1), df_imputation['Stage']
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size = 0.8)
print(x_train.shape, x_test.shape)

In [None]:
from sklearn.svm import SVC
classifier_svc = SVC(kernel = 'linear', random_state = 0)
classifier_svc.fit(x_train, y_train)
y_pred_svc = classifier_svc.predict(x_test)
from sklearn.metrics import accuracy_score
print("SVM - " , accuracy_score(y_test, y_pred_svc))



In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

rf = RandomForestClassifier(criterion = 'entropy', max_depth = 20, n_estimators = 100)
rf.fit(x_train, y_train)
probs = rf.predict_proba(x_test)
score = roc_auc_score(y_test, probs, multi_class = 'ovr')
score

In [None]:
# XGB
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import confusion_matrix


xgbc = XGBClassifier()
xgbc.fit(x_train, y_train)

# - cross validataion
scores = cross_val_score(xgbc, x_train, y_train, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())

kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(xgbc, x_train, y_train, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())

ypred = xgbc.predict(x_test)
cm = confusion_matrix(y_test,ypred)
print(cm)



In [None]:
from sklearn.model_selection import GridSearchCV

xgb = XGBClassifier(use_label_encoder = False, eval_metric = 'mlogloss')
param_grid_xgb = [{'eta' : [0.005, 0.05, 0.1, 0.3, 0.5], 'max_depth' : [2, 4, 6, 8, 10], 'lambda': [0.25, 0.5, 1, 1.5, 2]}]
grid_search_xgb = GridSearchCV(xgb, param_grid_xgb, cv = 5, return_train_score = True)
grid_search_xgb.fit(x_train, y_train)
grid_search_xgb.best_params_, grid_search_xgb.best_score_

In [None]:
# Decsion Tree

from sklearn.tree import DecisionTreeClassifier
classifier_dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier_dt.fit(x_train, y_train)
y_pred_dt = classifier_dt.predict(x_test)
print("Decision Tree - ", accuracy_score(y_test, y_pred_dt))


In [None]:
# Neural Network
import tensorflow as tf

# set seed
tf.random.set_seed(42)

# model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(4, activation="relu"),
    tf.keras.layers.Dense(100, activation="relu"),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(4)
])

# compile
model.compile(loss="binary_crossentropy",
                optimizer=tf.keras.optimizers.Adam(lr=0.01),
                metrics="accuracy")

# fit model
model.fit(X_train, y_train, epochs=500, verbose=0)