# MASH Prediction By Logistic Regression
Author: Nana K. Owusu

Metabolic-dysfunction Associated Steatotic Liver Disease (MASLD) is a chronic liver disease with a mild form known as
simple steatosis (macroscopic droplets of fat deposited in liver cells, MASLD) and a severe, progressive form known as 
steatohepatitis (inflammation and cellular injury in liver tissue, MASH). As the liver continues to store fat in its functional
cells, a physiologic response leads to the deposition of fibrotic molecules in the extra-cellular matrix (fibrosis).
Inflammation can also occur as a physiologic response to the stress, as well as cellular injury, where liver cells break
apart due to excess fat deposition. In clinical practice, patients with fibrosis grade 2 along with histologic proof of
inflammation and cellular injury are said to be at risk of irreversible damage to the liver (high-risk MASH). 

Magnetic resonance elastography (MRE) is an imaging technique that allows for non-invasive assessment of mechanical properties
of biological tissue. This includes measures of elastic properties like storage modulus and shear stiffness as well as 
viscosity measures like loss modulus. These measures are used as predictors of the logistic regression models.

### Initial libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from importlib import import_module, reload
from timeit import default_timer as timer
%matplotlib inline

In [None]:
from sys import path
path.append('/Users/nowusu/GitHub_repos/ml_portfolio/MASLD_prediction/')

import ml_lib
# ml_lib = reload(ml_lib)
from ml_lib import (roc_plot, classification_metrics, LOO_testing, calc_auc, DeLong2_test,
                   final_training)

### Getting and Cleaning the Data

In [None]:
# Load the four-cohort CSV into a DataFrame and print its column/dtype overview.
csv_path = '/Users/nowusu/GitHub_repos/ml_portfolio/MASLD_prediction/NAFLD-four-cohorts_20220413.csv'
data = pd.read_csv(csv_path)
data.info()

In [None]:
#SELECTING RELEVANT COLUMNS
# Histology scores first, then PDFF and the four '-3D-60' MRE columns.
histology_cols = ['Steatosis', 'Inflammation', 'Ballooning', 'NASH(0-1)', 'Fibrosis']
imaging_cols = ['PDFF', 'LS-3D-60', 'DR-3D-60', 'SM-3D-60', 'LM-3D-60']
cols_3Dmre = histology_cols + imaging_cols

# Short column names used by the rest of the notebook.
short_names = {
    'LS-3D-60': 'SS',
    'DR-3D-60': 'DR',
    'SM-3D-60': 'SM',
    'LM-3D-60': 'LM',
    'NASH(0-1)': 'MASH_gt0',
}

# Subset the columns, drop rows with any missing value, shorten the names and
# renumber the rows from 0 (RESETTING INDEX).
data = (
    data[cols_3Dmre]
    .dropna()
    .rename(columns=short_names)
    .reset_index(drop=True)
)

#ADDING TARGET COLUMNS
# Fib_gt1: categorical flag, 1 when the fibrosis grade is greater than 1, else 0.
data['Fib_gt1'] = (data['Fibrosis'] > 1).astype('int64')

# HRM (high-risk MASH): 1 only when both the MASH flag and Fib_gt1 are 1
# (bitwise AND of the two 0/1 columns).
data['HRM'] = data['MASH_gt0'].astype('int16') & data['Fib_gt1']

#### Summary of cleaned data

In [None]:
#SUMMARY
# Print dtypes, per-column missing-value counts and the table shape, each under
# its own heading, then preview the first rows.
summary_sections = (
    ('\nDATA TYPES:\n-----------', data.dtypes),
    ('\nMISSING VALUES:\n---------------', data.isna().sum()),
    ('\nDIMENSIONS:\n---------------', data.shape),
)
for heading, report in summary_sections:
    print(heading)
    print(report)
data.head()

### Machine Learning
#### Initialization

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import LeaveOneOut
from statsmodels.stats.contingency_tables import mcnemar

In [None]:
#DEFINING GLOBAL ML PARAMETERS
# Settings shared by every LogisticRegressionCV instance built in this notebook.
Cs = np.logspace(-4, 4, num=9)   # nine candidate C values from 1e-4 to 1e4
cv = LeaveOneOut()
solver = 'liblinear'
scoring = 'accuracy'
class_weight = 'balanced'
tol = 1e-6
max_iter = int(1e6)

#DEFINING TARGETS
# HRM      - high-risk MASH: positive MASH diagnosis by biopsy together with
#            fibrosis at stage 2 or above.
# MASH_gt0 - positive diagnosis of metabolic dysfunction associated steatotic
#            liver disease via liver biopsy (currently not evaluated).
targets = ['HRM']#,'MASH_gt0']

#DEFINING MODELS
# Model name -> list of feature columns of `data` used as predictors.
models = {
    'SS': ['SS'],
    'FF': ['PDFF'],
    'SSFFDR': ['SS', 'PDFF', 'DR'],
}

#DEFINING METRICS DATAFRAME
# Identification + AUC with bounds + confusion-matrix counts ...
metric_cols = ['target', 'model', 'AUC', 'AUC_L', 'AUC_U', 'TP', 'FP', 'TN', 'FN']
# ... each rate followed by its lower/upper bound columns ...
metric_cols += [stat + suffix
                for stat in ('sens', 'spec', 'NPV', 'PPV')
                for suffix in ('', '_L', '_U')]
# ... and one McNemar (MN_p_*) and one DeLong (DL_p_*) p-value column per model.
metric_cols += [f"{prefix}_{name}"
                for prefix in ('MN_p', 'DL_p')
                for name in models]
metrics = pd.DataFrame(columns=metric_cols)

#### Bootstrapped Training, Validation, and Testing

In [None]:
#TESTING PERFORMANCE OF EACH MODEL
# For every (target, model) pair: build a fresh LogisticRegressionCV, hand it to
# LOO_testing (expected to fill the '<target>_<model>_proba' / '_pred' columns
# of `data`), then summarise those columns as one row of `metrics`.
#Initialization
alpha=0.05  # forwarded to calc_auc and classification_metrics
start = timer()

# Rows are collected here and `metrics` is rebuilt once after the loop. This
# keeps the cell idempotent: re-running it no longer appends duplicate rows,
# which would break the positional row index (k) used by the McNemar and
# DeLong cells. It also avoids concatenating onto an empty DataFrame.
metric_rows = []

#Looping Over Models and Targets
for target in targets:
    for model in models:
        #Initializing Logistic Regression Model
        clf=LogisticRegressionCV(Cs=Cs,cv=cv,scoring=scoring,solver=solver, \
                                 tol=tol,max_iter=max_iter,class_weight=class_weight)
        
        #Defining Features and Target
        X=data[models[model]]
        Y=data[target]
        
        #Defining Output Columns
        proba_col = target+'_'+model+'_proba'
        pred_col = target+'_'+model+'_pred'
        data[proba_col]=np.nan
        data[pred_col]=np.nan
        
        #Computing Test Predictions (leave-one-out, going by the helper's name)
        LOO_testing(data, target, model, X, Y, clf)
        
        #Test AUC (returned as lower bound, estimate, upper bound)
        AUC_L, AUC, AUC_U = calc_auc(data, Y, data[proba_col], alpha)
        
        #Test Classification Metrics
        cmatrix_dict = classification_metrics(data, Y, data[pred_col], alpha)
        
        #Storing Results
        new_row={'target':target,'model':model,
                 'AUC':AUC,'AUC_L':AUC_L,'AUC_U':AUC_U}
        new_row.update(cmatrix_dict)
        metric_rows.append(new_row)

# Keep the predefined column order; keys returned by classification_metrics
# that are not in metric_cols are kept after the predefined columns.
results = pd.DataFrame(metric_rows)
extra_cols = [col for col in results.columns if col not in metric_cols]
metrics = results.reindex(columns=metric_cols + extra_cols)

end = timer()

mins, secs = divmod(end - start, 60)
hrs, mins = divmod(mins, 60)
print(f'elapsed time: {hrs} hr {mins} min {round(secs, 2)} sec')

### Evaluation
#### Exact McNemar Test of Sensitivities

In [None]:
# Pairwise exact McNemar test on the positive cohort (rows where target == 1).
# Column 'MN_p_<model1>' of the row belonging to (target, model2) receives the
# p-value for model1 vs model2; a model compared with itself gets NaN.
for model1 in models:
    pvalue_col = 'MN_p_' + model1
    # Rows of `metrics` are addressed by position in (target, model) order.
    pairs = ((t, m) for t in targets for m in models)
    for row, (target, model2) in enumerate(pairs):
        positives = data[target] == 1
        pred1 = data[target + '_' + model1 + '_pred'][positives]
        pred2 = data[target + '_' + model2 + '_pred'][positives]

        # 2x2 agreement table of the two models' predictions.
        both_pos = ((pred1 == 1) & (pred2 == 1)).sum()
        only_first = ((pred1 == 1) & (pred2 == 0)).sum()
        only_second = ((pred1 == 0) & (pred2 == 1)).sum()
        both_neg = ((pred1 == 0) & (pred2 == 0)).sum()
        ctable = np.array([[both_pos, only_first],
                           [only_second, both_neg]])

        if model1 == model2:
            metrics.loc[row, pvalue_col] = np.nan
        else:
            metrics.loc[row, pvalue_col] = mcnemar(ctable, exact=True).pvalue
metrics

#### Delong Test of AUCs

In [None]:
# Pairwise DeLong comparison of the models' predicted probabilities.
# Column 'DL_p_<model1>' of the row belonging to (target, model2) receives
# element [1] of DeLong2_test's result; a model compared with itself gets NaN.
for model1 in models:
    pvalue_col = 'DL_p_' + model1
    # Rows of `metrics` are addressed by position in (target, model) order.
    pairs = ((t, m) for t in targets for m in models)
    for row, (target, model2) in enumerate(pairs):
        if model1 == model2:
            metrics.loc[row, pvalue_col] = np.nan
            continue

        scores1 = data[target + '_' + model1 + '_proba']
        scores2 = data[target + '_' + model2 + '_proba']
        is_pos = data[target] == 1
        is_neg = data[target] == 0

        # Probabilities split into positive-class and negative-class arrays.
        pos1, neg1 = np.array(scores1[is_pos]), np.array(scores1[is_neg])
        pos2, neg2 = np.array(scores2[is_pos]), np.array(scores2[is_neg])
        metrics.loc[row, pvalue_col] = DeLong2_test(pos1, neg1, pos2, neg2)[1]
metrics

#### ROC Curves

In [None]:
# Create a 10x10 figure with a single axes and hand it to ml_lib.roc_plot
# together with the HRM target and the three model names.
roc_models = ['SS', 'FF', 'SSFFDR']
hrm_fig = plt.figure(figsize=(10, 10))
hrm_fig.add_subplot()
roc_plot(hrm_fig, data, 'HRM', roc_models, 'Predicting High-Risk MASH')

### Final Model Training

In [None]:
#INITIALIZATION
clfs={target:{model:LogisticRegressionCV(Cs=Cs,cv=cv,scoring=scoring,solver=solver,tol=tol,max_iter=max_iter,
                                         class_weight=class_weight) for model in models} for target in targets}

model_keys = {'SS':['SS'],'FF':['FF'],
              'SSFFDR':['SS','FF','DR']}

final_cols=['target','model','AUC','intercept',
            'beta_SS','beta_DR','beta_FF',
            'beta_std_SS','beta_std_DR','beta_std_FF',
            'OR_std_SS','OR_std_DR','OR_std_FF']
final=pd.DataFrame(columns=final_cols)

In [None]:
#FINAL TRAINING
# cutoffs=np.linspace(1,0,1001)
# k=0
# for target in targets:
#     for model in models:
#         final.loc[k,'target']=target
#         final.loc[k,'model']=model
        
#         #Cross Validated Training
#         X=data[models[model]]
#         Y=data[target]
#         clfs[target][model].fit(X,Y)
        
#         #Checking AUC
#         Y_proba=clfs[target][model].predict_proba(X)[:,1]
        
#         #Computing AUC
#         final.loc[k,'AUC']=roc_auc_score(Y,Y_proba)
        
#         #Getting Intercept
#         final.loc[k,'intercept']=clfs[target][model].intercept_[0]
        
#         #Getting Coefficients
#         coefs=clfs[target][model].coef_[0]
#         for i,coef in enumerate(coefs):
#             final.loc[k,f'beta_{model_keys[model][i]}']=coef
#             final.loc[k,f'beta_std_{model_keys[model][i]}']=\
#             final.loc[k,f'beta_{model_keys[model][i]}']*(X[models[model][i]].std())
#             final.loc[k,f'OR_std_{model_keys[model][i]}']=\
#             np.exp(final.loc[k,f'beta_std_{model_keys[model][i]}'])
        
#         #Saving Model
# #         dump(clfs[target][model], f'3_cohorts/LRM_Fib_{target}_{model}.joblib')
# #         dump(clfs[target][model], f'4_cohorts/LRM_Fib_{target}_{model}.joblib')
# #         dump(clfs[target][model], f'nash/LRM_{target}_{model}.joblib')
        
#         #Row Iterator
#         k+=1

In [None]:
# Run ml_lib.final_training on the cleaned data, passing the `final` results
# frame, the classifier dictionary and the feature-label mapping, then display
# `final`.
# NOTE(review): the fourth argument is ['SS','FF','DR'], whereas `clfs` and
# `model_keys` are keyed by 'SS', 'FF' and 'SSFFDR'. If final_training treats
# this list as model names, 'DR' has no classifier and 'SSFFDR' is never
# trained - confirm against the helper's signature in ml_lib.
final_training(data, final, ['HRM'], ['SS','FF','DR'], model_keys, clfs)
final