<a href="https://colab.research.google.com/github/LBncl/AmazonMLInterviewQuestion/blob/main/NAFLD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NAFLD PROJECT

## Imports

In [1]:
# Imports
import missingno as msno
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Data Cleaning

In [2]:
# Read in data
df = pd.read_excel('master_with_nordic_and_multiBM.xlsx')

# Check if dataFrame is empty
if df.empty:
    print('DataFrame is empty!')

In [5]:
# Subset data frame
main_df = df.filter(items=['CPH_EV_AGE_CALC', 'TBL.PATIENT.INFO..PI_BL_GENDER', 'CPH_EV_CI_BMI_CALC', 
                         'TBL.ALL.EVENTS..AE_SF_ALCO_XS', 'insulin_resistance', 'hypertensive', 'waist_to_hip_ratio',
                         'idf_metabolic_syndrome', 'eGFR', 'dyslipidaemia', 'fibroscan_stiffness_reliable',
                         'TBL.ALL.EVENTS..AE_BR_ALT_iuL',
                         'TBL.ALL.EVENTS..AE_BR_AST_iuL', 'TBL.ALL.EVENTS..AE_BR_GGT_iuL',
                         'TBL.ALL.EVENTS..AE_BR_FERR_ugL',
                         'TBL.ALL.EVENTS..AE_BR_PLT_109L', 'TBL.ALL.EVENTS..AE_BR_CREAT_umolL_CALC',
                         'TBL.ALL.EVENTS..AE_BR_STG_mmolL_CALC',
                         'TBL.ALL.EVENTS..AE_BR_ALBU_gL_CALC', 'TBL.ALL.EVENTS..AE_BR_BILI_umolL_CALC',
                         'TBL.ALL.EVENTS..AE_BR_IGA',
                         'TBL.ALL.EVENTS..AE_CD_OSA', 'LIT_NB_CK18_M30', 'LIT_NB_CK18_M65', 'LIT_NB_PRO_C3', 'LIT_NB_PRO_C6',
                          'LIT_NB_ELF', 'FIB4', 'NFS', 'APRI', 'ADAPT', 'FIBC3', 'ABC3D', 'BARD', 'AST_ALT_Ratio', 'response_1', 'response_2', 'response_3a', 'response_3b', 
                             'response_4', 'response_5', 'response_6a', 'response_6b', 'response_7'])

In [None]:
msno.bar(main_df)

In [7]:
from sklearn import preprocessing

# Convert required variables to boolean and normlaise the remaining variables
features = main_df.iloc[:, 0:35]
features.iloc[:, [1,3,4,5,7,9,21]] = features.iloc[:, [1,3,4,5,7,9,21]].astype(bool)
cols_normalise = features.select_dtypes(include=[np.float64])
features[cols_normalise.columns] = preprocessing.scale(cols_normalise)

In [22]:
# Filter to only select basline event types
features['CPH_EV_EVENT_TYPE'] = df['CPH_EV_EVENT_TYPE']
features = features.loc[df['CPH_EV_EVENT_TYPE'] == 'Baseline']
features = features.iloc[: , :-1]

Unnamed: 0,CPH_EV_AGE_CALC,TBL.PATIENT.INFO..PI_BL_GENDER,CPH_EV_CI_BMI_CALC,TBL.ALL.EVENTS..AE_SF_ALCO_XS,insulin_resistance,hypertensive,waist_to_hip_ratio,idf_metabolic_syndrome,eGFR,dyslipidaemia,...,LIT_NB_PRO_C6,LIT_NB_ELF,FIB4,NFS,APRI,ADAPT,FIBC3,ABC3D,BARD,AST_ALT_Ratio
0,-0.417474,True,1.188363,False,True,False,,False,1.625120,False,...,,,-0.941617,1.097709,-0.575725,,,0.326362,1.470121,0.321123
3,-0.417474,False,-1.308425,False,True,True,-0.492271,False,,True,...,,,1.987935,-0.320448,3.422324,,,0.326362,-0.898544,-0.601372
6,-0.570058,True,0.243116,True,False,False,,False,-0.543713,False,...,,,-0.908024,-0.596894,-0.749636,,,-0.986470,-0.898544,-0.290186
7,-0.570058,False,1.154946,True,False,False,,True,0.297185,True,...,,,-0.741440,-0.078982,-0.617846,,,-0.986470,-0.898544,-0.321963
8,-0.493766,True,0.090349,True,False,True,,True,0.238236,True,...,,,-1.067812,-0.648087,-0.746962,,,-0.986470,0.680566,1.104844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14231,0.421741,True,-0.253377,False,True,True,,False,,True,...,,,,0.670164,,,,0.982778,,
14232,-1.027812,True,-0.219959,False,False,False,-0.480382,False,,True,...,,,,-1.073382,,,,,,
14233,0.498034,True,-0.395005,False,False,False,,True,4.608980,True,...,,,-0.277856,-0.461258,0.033026,,,-0.330054,0.680566,0.774639
14234,0.498034,True,0.020331,False,True,True,,True,,True,...,,,,0.861205,,,,,,


## Data Imputation Strategies

In [13]:
# Imports for imputation functions
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression

In [14]:
# Single imputation with mean to replace not a number (NaNs)
def nan2mean(fdf):
    cols = list(fdf.columns)
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    fdf=imp.fit_transform(fdf)
    fdf = pd.DataFrame(fdf, columns=cols)
    return fdf

In [15]:
# Single imputation with median to replace not a number (NaNs)
def nan2median(fdf):
    cols = list(fdf.columns)
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    fdf=imp.fit_transform(fdf)
    fdf = pd.DataFrame(fdf, columns=cols)
    return fdf

In [16]:
# Single imputation with most_frequent to replace not a number (NaNs)
def nan2most_frequent(fdf):
    cols = list(fdf.columns)
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    fdf=imp.fit_transform(fdf)
    fdf = pd.DataFrame(fdf, columns=cols)
    return fdf

In [36]:
# Multiple Imputation by chained equation linear regression
def nan2mice(fdf):
    cols = list(fdf.columns)
    lr = LinearRegression()
    imp = IterativeImputer(estimator=lr,missing_values=np.nan, max_iter=50, imputation_order='roman',random_state=0)
    fdf=imp.fit_transform(fdf)
    fdf = pd.DataFrame(fdf, columns=cols)
    return fdf

In [18]:
# Imputation by K neasrest neighbours
def nan2knn(fdf):
    cols = list(fdf.columns)
    imp = KNNImputer(n_neighbors=2, weights="distance")
    fdf=imp.fit_transform(fdf)
    fdf = pd.DataFrame(fdf, columns=cols)
    return fdf

In [19]:
# function to compare the denisty of raw vs imputed data
def compare_denisty(column_name, fdf):
    d = {'raw': preprocessing.scale(df[column_name]), 'imputed': fdf[column_name]}
    fdata = pd.DataFrame(data=d)
    # calling density() to make multiple density plot 
    fdata.plot.density(figsize = (7, 7),linewidth = 2)
    plt.xlabel(column_name)
    return fdata

## Model evaluation

### Logistic Regression

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics

# Logistic Regression model
def model_logREG(fdf, response):
  # Pull the correct resposne variable from the dataframe
  y = main_df[response]
  # Remove all rows with missing observations
  X = fdf
  X['y'] = y
  X = fdf.dropna()
  # Split data into test and trian
  X_train, X_test, y_train, y_test = train_test_split(X.iloc[:, 0:35], X.iloc[: , -1], test_size=0.20, random_state=0)
  # Implement the model
  logreg = LogisticRegression()
  logreg.fit(X_train, y_train)
  y_pred = logreg.predict(X_test)

  #use model to predict probability that given y value is 1
  y_pred_proba = logreg.predict_proba(X_test)[::,1]

  #calculate AUC of model
  auc = metrics.roc_auc_score(y_test, y_pred_proba)

  #print AUC score
  print(auc)

  # Calculate F-score
  report = classification_report(y_test, y_pred)
  print(report)


  return y_pred

### SVM

In [63]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# SVM model
def model_SVM(fdf, response):
  # Pull the correct resposne variable from the dataframe
  y = main_df[response]
  # Remove all rows with missing observations
  X = fdf
  X['y'] = y
  X = fdf.dropna()
  # Split data into test and trian
  X_train, X_test, y_train, y_test = train_test_split(X.iloc[:, 0:35], X.iloc[: , -1],test_size=0.20, random_state=0)
  # Implement the model
  clf = svm.SVC(kernel='linear')
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)

  #use model to predict probability that given y value is 1
  y_pred_proba = clf.predict_proba(X_test)[::,1]

  #calculate AUC of model
  auc = metrics.roc_auc_score(y_test, y_pred_proba)

  #print AUC score
  print(auc)

  # Calculate F-score
  report = classification_report(y_test, y_pred)
  print(report)

  return y_pred

### XGBoost 

In [62]:
import xgboost as xgb

def model_XGB(fdf, response):
  # Pull the correct resposne variable from the dataframe
  y = main_df[response]
  # Remove all rows with missing observations
  X = fdf
  X['y'] = y
  X = fdf.dropna()
  # Split data into test and trian
  X_train, X_test, y_train, y_test = train_test_split(X.iloc[:, 0:35], X.iloc[: , -1], test_size=0.20, random_state=0)
  # Implement the model
  xgb_rgr = xgb.XGBRegressor()
  xgb_rgr.fit(X_train, y_train)
  y_pred = xgb_rgr.predict(X_test)

  #use model to predict probability that given y value is 1
  y_pred_proba = xgb_rgr.predict_proba(X_test)[::,1]

  #calculate AUC of model
  auc = metrics.roc_auc_score(y_test, y_pred_proba)

  #print AUC score
  print(auc)

  # Calculate F-score
  report = classification_report(y_test, y_pred)
  print(report)

  return y_pred


### Random Forest

In [61]:
from sklearn.ensemble import RandomForestRegressor

def model_RF(fdf, response):
  # Pull the correct resposne variable from the dataframe
  y = main_df[response]
  # Remove all rows with missing observations
  X = fdf
  X['y'] = y
  X = fdf.dropna()
  # Split data into test and trian
  X_train, X_test, y_train, y_test = train_test_split(X.iloc[:, 0:35], X.iloc[: , -1], test_size=0.20, random_state=0)
  # Implement the model
  regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
  regressor.fit(X_train, y_train)
  y_pred = regressor.predict(X_test)

  #use model to predict probability that given y value is 1
  y_pred_proba = regressor.predict_proba(X_test)[::,1]

  #calculate AUC of model
  auc = metrics.roc_auc_score(y_test, y_pred_proba)

  #print AUC score
  print(auc)

  # Calculate F-score
  report = classification_report(y_test, y_pred)
  print(report)

  return y_pred

# Pipeline

## Imputation and Target Variable Selection

In [37]:
# Create new imputed datasets
featuresKNN = nan2knn(features)
featuresMean = nan2mean(features)
featuresMedian = nan2median(features)
featuresMICE = nan2mice(features)
featuresMostFrq = nan2most_frequent(features)

# Create list of all imputation strategies
imputed_features = [featuresKNN,featuresMean,featuresMedian,featuresMICE,featuresMostFrq]



## Compare Performance for All Models

In [57]:
def compare_model(arg):
  # Select target responses
  response = arg

  # Apply model to all imputed data frames
  for i in range(5):
    x = imputed_features[i]
    model_logREG(x, response)

In [76]:
x = model_logREG(featuresMean,'response_2')

0.543846194531126
              precision    recall  f1-score   support

         0.0       0.49      0.03      0.06       511
         1.0       0.57      0.97      0.72       666

    accuracy                           0.56      1177
   macro avg       0.53      0.50      0.39      1177
weighted avg       0.53      0.56      0.43      1177

