<a href="https://colab.research.google.com/github/Lucy-Moctezuma/SFSU-CodeLab-Work-/blob/main/E.%20Coli%20Machine%20Learning%20Project/5_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Deep Learning**

### **1) Importing Packages needed**

In [None]:
# Data manipulation imports for ML
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Import packages for Neural Networks model
from sklearn.utils.multiclass import unique_labels
from sklearn.preprocessing import scale
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Flatten
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping

# Imports for model evaluation 
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Imports for data visualization
import matplotlib.pyplot as plt

# Imports for file management
import os
from google.colab import drive
# Mount Google Drive at /content/drive so later cells can read and write CSV files there
drive.mount('/content/drive')

### **2) Loading CSV file and creating dataframes for each antibiotic**

#### **a) Loading CSV created from previous notebook**

In [None]:
# Directory (inside the mounted Google Drive) that holds the project's CSV files
filepath = '/content/drive/My Drive/EColi_ML_CSV_files/'

# Reads the merged CSV file as a dataframe; the string "NaN" is parsed as a missing value
All_Drugs_df = pd.read_csv(filepath+"EColi_Merged_dfs.csv", na_values="NaN")
# Last expression of the cell: displays the loaded dataframe
All_Drugs_df

#### **b) Changing "R" to 0 and "S" to 1  for Deep Learning Model**

In [None]:
# Antibiotic names: the labels of columns 1 through 12 of the merged dataframe
drug_list = All_Drugs_df.columns[1:13]

# Recode the phenotype labels in place, one antibiotic column at a time:
# susceptible ("S") becomes 1.0 and resistant ("R") becomes 0.0
for drug in drug_list:
  is_susceptible = All_Drugs_df[drug] == "S"
  All_Drugs_df.loc[is_susceptible, drug] = 1.0
  is_resistant = All_Drugs_df[drug] == "R"
  All_Drugs_df.loc[is_resistant, drug] = 0.0

# Checking how the S and R classes were recoded
All_Drugs_df.head()

#### **c) Creating dataframes for each drug**

In [None]:
def makeDF(drug):
  """Build the modelling dataframe for a single antibiotic.

  Takes every column of All_Drugs_df from position 13 onward (cast to float)
  as features, places the "Isolate" column and the `drug` label column in
  front of them, and drops every row that contains a missing value.

  Parameters:
    drug: name of the antibiotic column in All_Drugs_df.

  Returns:
    A dataframe with columns "Isolate", `drug`, then all feature columns.
  """
  feature_cols = All_Drugs_df.iloc[:, 13:].astype(float)
  id_and_label = All_Drugs_df[["Isolate", drug]]
  combined = pd.concat([id_and_label, feature_cols], axis=1)
  return combined.dropna()


In [None]:
# implementing the function using the drug AMX as an example
AMX_df = makeDF("AMX")

# looking at the shape of the AMX dataframe
print("AMX dataframe shape: ", AMX_df.shape)

# looking at the first 5 rows of this dataframe
AMX_df.head()

### **3) Separating each Drug Dataframe into 4 sections : Training (features and labels) and Testing (features and labels)**

#### **a) Creating Testing and Training datasets for each antibiotic drug**

In [None]:
def Split_train_test(Drug_df,drug):
  """Split one drug's dataframe into training and testing features and labels.

  The `drug` column is used as the label; every other column is a feature.
  The split holds out 33% of the rows for testing, is stratified on the
  label, and uses a fixed random_state of 42.

  Returns:
    A dictionary with the keys 'labels_train', 'features_train',
    'labels_test' and 'features_test' (inserted in that order).
  """
  target = Drug_df[drug]
  predictors = Drug_df.drop(columns=[drug])
  X_train, X_test, y_train, y_test = train_test_split(
      predictors, target, test_size=0.33, random_state=42, stratify=target)

  return {
      'labels_train': y_train,
      'features_train': X_train,
      'labels_test': y_test,
      'features_test': X_test,
  }

In [None]:
# Implementing the function Split_train_test() for the AMX example
AMX_Train_test_dic = Split_train_test(AMX_df,"AMX")
# (not the last expression of the cell, so this value is not displayed)
AMX_Train_test_dic["features_train"].columns

# checking the shape of each dataframe or series stored in the dictionary created for drug AMX
print("AMX")
for k, df in AMX_Train_test_dic.items():
  print(k, df.shape)

# Checking how many training samples are Resistant (0.0) and how many Susceptible (1.0) to AMX
AMX_Train_test_dic["labels_train"].value_counts()

### **4) Creating different combinations of features before training**

In [None]:
# making a list of combinations of data sources we would like to test in our ML models
# (G = gene presence columns, S = population structure columns, Y = year columns)
combo_list = ['G', 'S', 'GY', 'SY', 'GS', 'GYS']

# making a function that creates different feature combinations of the predictor features
def combo_feat(features_df, drug, combo):
  """Select the feature columns for one combination and standardize them.

  Columns are assigned to a group by name:
    * year columns start with "Year",
    * population structure columns start with "cutoff",
    * gene presence columns are every remaining column except "Isolate".

  Parameters:
    features_df: dataframe of predictor columns (the "Isolate" column, if
      present, is never part of the output).
    drug: antibiotic name; accepted for call compatibility but not used.
    combo: one of 'G', 'S', 'GY', 'SY', 'GS', 'GYS'.

  Returns:
    The array produced by scale() for the selected columns, in the order
    the groups are listed for that combination.

  Raises:
    ValueError: if `combo` is not a known combination (previously the
      function silently returned None in that case).

  NOTE(review): scale() standardizes using statistics of the frame passed in,
  so calling this separately on training and testing data standardizes each
  with its own statistics.
  """
  all_cols = list(features_df.columns)

  # creating Year column filters for features_df
  year_filter = [col for col in all_cols if col.startswith("Year")]

  # creating Population structure column filters for features_df
  pop_str_filter = [col for col in all_cols if col.startswith("cutoff")]

  # creating Gene presence column filters for features_df: everything that is
  # neither a year column, a population structure column, nor the isolate id
  gene_presc_filter = [col for col in all_cols
                       if not col.startswith(("Year", "cutoff")) and col != "Isolate"]

  # column groups used by each combination, in output order
  groups_by_combo = {
      'G': [gene_presc_filter],
      'S': [pop_str_filter],
      'GY': [gene_presc_filter, year_filter],
      'SY': [pop_str_filter, year_filter],
      'GS': [gene_presc_filter, pop_str_filter],
      'GYS': [gene_presc_filter, year_filter, pop_str_filter],
  }

  if combo not in groups_by_combo:
    raise ValueError("Unknown feature combination " + repr(combo)
                     + "; expected one of " + str(list(groups_by_combo)))

  selected_cols = [col for group in groups_by_combo[combo] for col in group]
  return scale(features_df[selected_cols])

In [None]:
# Implementing the combo_feat() function on the AMX training features with the "GS" combination
AMX_GS_train_array = combo_feat(AMX_Train_test_dic['features_train'],"AMX","GS")

# Each list within the printed array represents one row (one training sample)
print(AMX_GS_train_array)
print("Number of rows: ",len(AMX_GS_train_array))

### **5) Creating Deep Learning model and training it per feature combination**

Below we can test changing the parameters to fine tune our results

In [None]:
# Parameter values (read as globals by run_DL)
# number of units in the first Dense layer
firstlayer = 200
# number of units in each additional hidden Dense layer
interlayer = 100
# dropout rate applied after every hidden Dense layer
dropout = 0.8
# total number of hidden Dense layers (run_DL adds numblayer - 1 layers after the first one)
numblayer = 2

Creating function to run Deep Learning and implementing it:

In [None]:
# creating Deep Learning model function
def run_DL(feat_train_df, lab_train, drug, combo, sum = True):
  """Build, compile and train a feed-forward network for one drug/feature combo.

  The architecture is controlled by the module-level values `firstlayer`,
  `interlayer`, `dropout` and `numblayer`: one Dense layer of `firstlayer`
  units, then `numblayer - 1` Dense layers of `interlayer` units, each
  followed by Dropout, and a final 2-unit softmax layer.

  Parameters:
    feat_train_df: 2-D array of training features (rows are samples).
    lab_train: training labels coded as 0 (resistant) and 1 (susceptible).
    drug: antibiotic name, used only in the progress message.
    combo: feature combination name, used only in the progress message.
    sum: when truthy, print the model summary before training.

  Returns:
    The trained Keras model.
  """
  print(drug +" Training combo: "+ combo)

  # Reweighting classes due to imbalanced dataset
  class_labels = np.unique(lab_train)
  reweight = compute_class_weight(class_weight='balanced', classes=class_labels, y=lab_train)
  print(reweight)
  # Key each weight by its actual class label rather than by its position, so
  # the mapping stays correct even if one class is absent from lab_train
  class_weight = {int(label): weight for label, weight in zip(class_labels, reweight)}

  # Constructing Deep Learning model
  n_features = feat_train_df.shape[1]
  DL = Sequential()
  DL.add(Dense(int(firstlayer),activation='relu', input_shape=(n_features,)))
  # only the first layer needs an input shape
  DL.add(Dropout(dropout))
  for _ in range(1,int(numblayer)):
    DL.add(Dense(int(interlayer),activation='relu'))
    DL.add(Dropout(dropout))
  DL.add(Dense(2, activation = 'softmax'))

  # Additional parameters for training (Early stopping)
  # NOTE(review): patience (50) is larger than the number of epochs (10), so
  # early stopping cannot trigger with the current settings
  early_stopping_monitor= EarlyStopping(patience=50)

  # Compiling model created
  DL.compile(optimizer= tf.keras.optimizers.Adam(learning_rate=0.0001) ,loss= 'binary_crossentropy', metrics=['accuracy'])

  # Printing the model layers and parameters
  if sum:
    DL.summary()

  # Training with the neural network created; num_classes=2 keeps the one-hot
  # targets two columns wide so they always match the 2-unit output layer
  DL.fit(feat_train_df, to_categorical(lab_train, num_classes=2), validation_split = 0.2, callbacks= [early_stopping_monitor],epochs=10, batch_size=128, class_weight=class_weight)

  return DL


In [None]:
# implementing run_DL() on the AMX training data for the "GS" feature combination
DL_AMX_GS_model = run_DL(AMX_GS_train_array, AMX_Train_test_dic['labels_train'],"AMX","GS")
# Last expression of the cell: displays the returned model object
DL_AMX_GS_model

### **6) Making predictions from Deep Learning model**

In [None]:
def predict(DL_combo_Model, features_test):
  """Predict a class index for every row of the testing features.

  Calls the model's predict() method and, for each row of the result,
  returns the index of the column holding the largest value.
  """
  class_scores = DL_combo_Model.predict(features_test)
  return np.argmax(class_scores, axis=1)

In [None]:
# Implementing the combo_feat() function on the AMX testing features with the "GS" combination
AMX_GS_test_array = combo_feat(AMX_Train_test_dic['features_test'],"AMX","GS")

# Each list within the printed array represents one row (one testing sample)
print(AMX_GS_test_array)
print("Number of rows: ",len(AMX_GS_test_array))

In [None]:
# Implementation of the predict() function using the feature combination "GS"
AMX_GS_labels_pred = predict(DL_AMX_GS_model,AMX_GS_test_array)

# print the predicted class of every test sample ("R"=0 and "S"=1)
print("Labels predicted: ")
print(AMX_GS_labels_pred)

### **7) Evaluating our model using a confusion matrix and metrics**

In [None]:
# Creating a function that evaluates our model using our actual and predicted data
def evaluate(DL_combo_model, features_test, labels_test, labels_pred, cf= True):
  """Score a trained model on the testing data.

  Parameters:
    DL_combo_model: trained Keras model.
    features_test: testing feature array passed to the model.
    labels_test: true labels coded as 0 (resistant) and 1 (susceptible).
    labels_pred: predicted class indices for the same rows.
    cf: when truthy, plot the confusion matrix.

  Returns:
    [score[1], F1 score of class R, F1 score of class S], where score is the
    result of DL_combo_model.evaluate() (score[1] is the first compiled
    metric; accuracy for models built by run_DL).
  """
  # np.asfarray was removed in NumPy 2.0; np.asarray with dtype=float is equivalent
  labels_test = np.asarray(labels_test, dtype=float)
  # num_classes=2 keeps the one-hot targets compatible with the 2-unit output
  # layer even if the test labels contain a single class
  score = DL_combo_model.evaluate(features_test, to_categorical(labels_test, num_classes=2)) # only take accuracy

  # Always report both classes (0 = R, 1 = S) so the report below has exactly
  # the eight entries its column names expect
  labels = np.array([0.0, 1.0])
  inp = precision_recall_fscore_support(labels_test, labels_pred, labels=labels, average=None) # get f1 scores
  print(inp)
  # one row: precision, recall, F-score and support, each for R then S
  report = pd.DataFrame([np.asarray(inp).ravel().tolist()],
                        columns=['PRC_R','PRC_S','RCL_R','RCL_S','FSc_R','FSc_S','Sc_R','Sc_S'])
  print(report)

  if cf:
    cm = confusion_matrix(labels_test, labels_pred, labels=labels, sample_weight=None)
    display_names = np.where(labels<1,"R","S")
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=display_names)
    disp.plot()
    plt.show()
  return [score[1], report['FSc_R'][0], report['FSc_S'][0]]

In [None]:
# implementing the evaluate() function on the AMX "GS" model; it returns [accuracy, R f1 score, S f1 score]
Model_Report = evaluate(DL_AMX_GS_model,AMX_GS_test_array, AMX_Train_test_dic['labels_test'],AMX_GS_labels_pred)
print("Results from Model for drug: AMX")
print("Using feature combination: GS")
print("Accuracy: ", Model_Report[0])
print("R_f1_score: ", Model_Report[1])
print("S_f1_score: ", Model_Report[2])

### **8) Use all functions and evaluate every drug in every feature combination!**

#### **a) Let's recall the list of drugs we have available and the combinations of features we are interested in**

In [None]:
# let's check all drugs (the antibiotic column names collected earlier)
print(drug_list)

# let's see all feature combinations we are interested in
print(combo_list)

#### **b) Create a loop that will go through all our functions using the lists above**

In [None]:
# Run every function above for each drug and feature combination, collecting
# the [accuracy, R f1 score, S f1 score] reports in a single dictionary
DL_model_metrics = {}

for drug in drug_list:
  print(drug)
  # one dataframe per drug, split into a dictionary of training and testing data
  Drug_df = makeDF(drug)
  Test_Train_dic = Split_train_test(Drug_df, drug)
  labels_train = Test_Train_dic["labels_train"]
  labels_test = Test_Train_dic["labels_test"]

  for combo in combo_list:
    # Training: build the feature array for this combination and fit a model on it
    features_train = combo_feat(Test_Train_dic["features_train"], drug, combo)
    DL_combo_model = run_DL(features_train, labels_train, drug, combo, sum = False)

    # Predicting: build the matching testing feature array and predict its labels
    features_test = combo_feat(Test_Train_dic["features_test"], drug, combo)
    labels_pred = predict(DL_combo_model, features_test)

    # Evaluating: store the report under the key "<drug>_<combo>"
    report = evaluate(DL_combo_model, features_test, labels_test, labels_pred, cf=False)
    DL_model_metrics[f"{drug}_{combo}"] = report

    print(report)

#### **c) Store the metrics report for all drugs and feature combinations as a CSV file**

In [None]:
# Turn the metrics dictionary into a dataframe: one row per "<drug>_<combo>" key,
# with the key moved out of the index into a "Drug_combo" column
DL_metrics = (
    pd.DataFrame.from_dict(
        DL_model_metrics,
        orient='index',
        columns=["Accuracy", "R_f1_score", "S_f1_score"],
    )
    .reset_index()
    .rename(columns={'index': 'Drug_combo'})
)

# Save the metric results as a CSV file next to the input data, then display them
DL_metrics.to_csv(filepath+"DL_metrics_df.csv", index= False)
DL_metrics


#### **d) Create a bar graph showing accuracy and F1 scores of all drugs when using the gene presence and population structure features (GS)**

In [None]:
# Keep only the rows whose Drug_combo label ends with "GS"
# (the gene presence + population structure feature combination)
combo_names = DL_metrics['Drug_combo']
GS_filter = list(filter(lambda drug_combo: drug_combo.endswith("GS"), combo_names))
GS_df = DL_metrics.loc[combo_names.isin(GS_filter)]


In [None]:
# Grouped bar chart of accuracy, R f1 score and S f1 score for the rows in GS_df

# Figure size and title
fig = plt.figure(figsize =(20, 8))
plt.title('Accuracy, R_f1_scores and S_f1_scores', fontsize = 12)

# One group of three bars per drug; x holds the centre position of each group
n_groups = len(GS_df["Drug_combo"])
x = np.arange(n_groups)
bar_width = 0.25
acc = list(GS_df["Accuracy"])
R_f1 = list(GS_df["R_f1_score"])
S_f1 = list(GS_df["S_f1_score"])

# Accuracy to the left of centre, R f1 score on it, S f1 score to the right
acc_bar = plt.bar(x - bar_width, height=acc, width=bar_width, color="darkgrey", edgecolor="gray")
rf1_bar = plt.bar(x, height=R_f1, width=bar_width, color="steelblue", align="center", edgecolor="gray")
sf1_bar = plt.bar(x + bar_width, height=S_f1, width=bar_width, color="lightcyan", edgecolor="gray")

# Label each group with its Drug_combo name
plt.xticks(list(range(n_groups)), GS_df["Drug_combo"], fontsize = 12)

# Legend
fig.legend(
    [acc_bar, rf1_bar, sf1_bar],
    ["Accuracy", "R_f1_score", "S_f1_score"],
    bbox_to_anchor=(0.4,-0.35, 0.04, 0.4),
    fontsize=12,
)

# Show Plot
plt.show()
