<a href="https://colab.research.google.com/github/LBncl/AmazonMLInterviewQuestion/blob/main/NAFLD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NAFLD PROJECT

## Imports

In [88]:
# Imports
import missingno as msno
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Data Cleaning

In [155]:
# Read in data
df = pd.read_excel('master_with_nordic_and_multiBM.xlsx')

# Check if dataFrame is empty
if df.empty:
    print('DataFrame is empty!')

In [156]:
# Subset data frame
main_df = df.filter(items=['CPH_EV_AGE_CALC', 'TBL.PATIENT.INFO..PI_BL_GENDER', 'CPH_EV_CI_BMI_CALC', 
                         'TBL.ALL.EVENTS..AE_SF_ALCO_XS', 'insulin_resistance', 'hypertensive', 'waist_to_hip_ratio',
                         'idf_metabolic_syndrome', 'eGFR', 'dyslipidaemia', 'fibroscan_stiffness_reliable',
                         'TBL.ALL.EVENTS..AE_BR_ALT_iuL',
                         'TBL.ALL.EVENTS..AE_BR_AST_iuL', 'TBL.ALL.EVENTS..AE_BR_GGT_iuL',
                         'TBL.ALL.EVENTS..AE_BR_FERR_ugL',
                         'TBL.ALL.EVENTS..AE_BR_PLT_109L', 'TBL.ALL.EVENTS..AE_BR_CREAT_umolL_CALC',
                         'TBL.ALL.EVENTS..AE_BR_STG_mmolL_CALC',
                         'TBL.ALL.EVENTS..AE_BR_ALBU_gL_CALC', 'TBL.ALL.EVENTS..AE_BR_BILI_umolL_CALC',
                         'TBL.ALL.EVENTS..AE_BR_IGA',
                         'TBL.ALL.EVENTS..AE_CD_OSA', 'LIT_NB_CK18_M30', 'LIT_NB_CK18_M65', 'LIT_NB_PRO_C3', 'LIT_NB_PRO_C6',
                          'LIT_NB_ELF', 'FIB4', 'NFS', 'APRI', 'ADAPT', 'FIBC3', 'ABC3D', 'BARD', 'AST_ALT_Ratio', 'response_1', 'response_2', 'response_3a', 'response_3b', 
                             'response_4', 'response_5', 'response_6a', 'response_6b', 'response_7'])

In [None]:
main_df.describe()

In [None]:
msno.bar(main_df)

In [157]:
from sklearn import preprocessing

# Convert required variables to boolean and normlaise the remaining variables
features = main_df.iloc[:, 0:35]

# Loop to check for boolean columns
for column in features:
  if features[column].max() == 1:
    features[column] = features[column].astype(bool)

#cols_normalise = features.select_dtypes(include=[np.float64])
#features[cols_normalise.columns] = preprocessing.scale(cols_normalise)

In [None]:
features.dtypes

In [158]:
# Filter to only select basline event types
features['CPH_EV_EVENT_TYPE'] = df['CPH_EV_EVENT_TYPE']
is_Baseline = features['CPH_EV_EVENT_TYPE'] == 'Baseline'
features_baseline = features[is_Baseline]
features_baseline = features_baseline.iloc[: , :-1]
features_baseline = features_baseline.reset_index()
features_baseline = features_baseline.iloc[: , 1:]

In [None]:
features_baseline

## Data Imputation Strategies

In [7]:
# Imports for imputation functions
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression

In [8]:
# Single imputation with mean to replace not a number (NaNs)
def nan2mean(fdf):
    cols = list(fdf.columns)
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    fdf=imp.fit_transform(fdf)
    fdf = pd.DataFrame(fdf, columns=cols)
    return fdf

In [9]:
# Single imputation with median to replace not a number (NaNs)
def nan2median(fdf):
    cols = list(fdf.columns)
    imp = SimpleImputer(missing_values=np.nan, strategy='median')
    fdf=imp.fit_transform(fdf)
    fdf = pd.DataFrame(fdf, columns=cols)
    return fdf

In [10]:
# Single imputation with most_frequent to replace not a number (NaNs)
def nan2most_frequent(fdf):
    cols = list(fdf.columns)
    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    fdf=imp.fit_transform(fdf)
    fdf = pd.DataFrame(fdf, columns=cols)
    return fdf

In [11]:
# Multiple Imputation by chained equation linear regression
def nan2mice(fdf):
    cols = list(fdf.columns)
    lr = LinearRegression()
    imp = IterativeImputer(estimator=lr,missing_values=np.nan, max_iter=50, imputation_order='roman',random_state=0)
    fdf=imp.fit_transform(fdf)
    fdf = pd.DataFrame(fdf, columns=cols)
    return fdf

In [12]:
# Imputation by K neasrest neighbours
def nan2knn(fdf):
    cols = list(fdf.columns)
    imp = KNNImputer(n_neighbors=2, weights="distance")
    fdf=imp.fit_transform(fdf)
    fdf = pd.DataFrame(fdf, columns=cols)
    return fdf

In [13]:
# function to compare the denisty of raw vs imputed data
def compare_denisty(column_name, fdf):
  d = {'raw': preprocessing.scale(features_baseline[column_name]), 'imputed': fdf[column_name]}
  fdata = pd.DataFrame(data=d)

  # calling density() to make multiple density plot 
  fig, axes = plt.subplots(1, 2)
  fig.suptitle('Raw vs Imputed Desity Plots')
  fdata['raw'].plot.density(ax=axes[0], figsize = (15, 7))
  fdata['imputed'].plot.density(ax=axes[1], figsize = (15, 7), color='red')

  axes[0].set_title('Raw Data')
  axes[1].set_title('Imputed Data')

  axes[0].set_xlabel(column_name)
  axes[1].set_xlabel(column_name)


  return fdata

## Model evaluation

In [159]:
# Create new imputed datasets
featuresKNN = nan2knn(features_baseline)
featuresMean = nan2mean(features_baseline)
featuresMedian = nan2median(features_baseline)
featuresMICE = nan2mice(features_baseline)
featuresMostFrq = nan2most_frequent(features_baseline)
featuresMostFrq = featuresMostFrq.apply(pd.to_numeric)

# Create list of all imputation strategies
imputed_features = [featuresKNN,featuresMean,featuresMedian,featuresMICE,featuresMostFrq]
imputed_features_names = ['featuresKNN','featuresMean','featuresMedian','featuresMICE','featuresMostFrq']



In [160]:
# Pull response from data frame with only baseline observations
y = main_df["response_1"]
response = pd.DataFrame({'y':y, 'Baseline':features['CPH_EV_EVENT_TYPE']})
is_Baseline = response['Baseline'] == 'Baseline'
response = response[is_Baseline]
response = response['y']

# Add response variable to all imputed datasets and remove observations with NaN responses
for df in imputed_features:
  df['response'] = response
  df = df.dropna(inplace=True)

In [161]:
from sklearn.model_selection import train_test_split

# function to generate test/train split
def train_test (fdf):
  # Split data into test and trian
  X_train, X_test, y_train, y_test = train_test_split(fdf.iloc[:, 0:35], fdf.iloc[: , -1], test_size=0.30, random_state=0)
  return X_train, X_test, y_train, y_test

### Logistic Regression

In [162]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

# Logistic Regression model
# Implement the model
logreg = LogisticRegression()

# hyperparameters, penalty values chosen to work with all solvers
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=logreg, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)

for i in range(len(imputed_features)):
  df = imputed_features[i]
  grid_result = grid_search.fit(df.iloc[:, 0:35], df.iloc[: , -1])

  # summarize results
  print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_), imputed_features_names[i])

Best: 0.606310 using {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'} featuresKNN
Best: 0.605561 using {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'} featuresMean


KeyboardInterrupt: ignored

### SVM

In [None]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# SVM model
def model_SVM(fdf, response):
  # Pull the correct resposne variable from the dataframe
  y = main_df[response]
  # Remove all rows with missing observations
  X = fdf
  X['y'] = y
  X = fdf.dropna()
  # Split data into test and trian
  X_train, X_test, y_train, y_test = train_test_split(X.iloc[:, 0:35], X.iloc[: , -1],test_size=0.20, random_state=0)
  # Implement the model
  clf = svm.SVC(kernel='linear')
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)

  #use model to predict probability that given y value is 1
  y_pred_proba = clf.predict_proba(X_test)[::,1]

  #calculate AUC of model
  auc = metrics.roc_auc_score(y_test, y_pred_proba)

  #print AUC score
  print(auc)

  # Calculate F-score
  report = classification_report(y_test, y_pred)
  print(report)

  return y_pred

### XGBoost 

In [163]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Implement the model
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

for i in range(len(imputed_features)):
  df = imputed_features[i]
  X_train, X_test, y_train, y_test = train_test(df)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  accuracy = accuracy_score(y_test, y_pred)
  print(accuracy)



0.5952529668956902
0.5983760149906309
0.599000624609619
0.5983760149906309
0.6083697688944409


### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

def model_RF(fdf, response):
  # Pull the correct resposne variable from the dataframe
  y = main_df[response]
  # Remove all rows with missing observations
  X = fdf
  X['y'] = y
  X = fdf.dropna()
  # Split data into test and trian
  X_train, X_test, y_train, y_test = train_test_split(X.iloc[:, 0:35], X.iloc[: , -1], test_size=0.20, random_state=0)
  # Implement the model
  regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
  regressor.fit(X_train, y_train)
  y_pred = regressor.predict(X_test)

  #use model to predict probability that given y value is 1
  y_pred_proba = regressor.predict_proba(X_test)[::,1]

  #calculate AUC of model
  auc = metrics.roc_auc_score(y_test, y_pred_proba)

  #print AUC score
  print(auc)

  # Calculate F-score
  report = classification_report(y_test, y_pred)
  print(report)

  return y_pred

## Model Imports

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics

# Pipeline

## Imputation and Target Variable Selection

In [None]:
compare_denisty("CPH_EV_AGE_CALC",featuresKNN)

## Compare Performance for All Models

In [None]:
def compare_model(arg):
  # Select target responses
  response = arg

  # Apply model to all imputed data frames
  for i in range(5):
    x = imputed_features[i]
    model_SVM(x, response)