<a href="https://colab.research.google.com/github/Lucy-Moctezuma/SFSU-CodeLab-Work-/blob/main/E.%20Coli%20Machine%20Learning%20Project/3_Random_Forest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Random Forest**

### **1) Importing Packages needed**

In [None]:
# Data manipulation imports for ML
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Import packages for Random Forest model
from sklearn.ensemble  import RandomForestClassifier

# Imports for model evaluation 
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Imports for data visualization
import matplotlib.pyplot as plt

# Imports for file management
import os
from google.colab import drive
# Mount Google Drive under /content/drive; the CSV files used by this notebook
# are read from and written to a folder below that mount point
drive.mount('/content/drive')

### **2) Loading CSV file and creating dataframes for each antibiotic**

#### **a) Loading CSV created from previous notebook**

In [None]:
# Folder (on the mounted drive) that holds the project's CSV files;
# the trailing slash matters because file names are appended directly to it
filepath = '/content/drive/My Drive/EColi_ML_CSV_files/'

# Read the merged table into a dataframe, parsing the literal text "NaN" as a missing value
All_Drugs_df = pd.read_csv(f"{filepath}EColi_Merged_dfs.csv", na_values=["NaN"])
All_Drugs_df

#### **b) Creating dataframes for each drug**

In [None]:
# the column labels at positions 1 to 12 of the merged table are used as the antibiotic names
drug_list = All_Drugs_df.columns[1:13]
drug_list

# builds the table for a single antibiotic and removes incomplete rows
def makeDF(drug):
  """Return the slice of All_Drugs_df used to model one antibiotic.

  The result holds the "Isolate" column, the column named by `drug`, and
  every column of All_Drugs_df from position 13 onward. Rows with a missing
  value in any of those columns are dropped.

  drug: name of the antibiotic column to keep as the label.
  """
  id_and_label = All_Drugs_df[["Isolate", drug]]
  predictors = All_Drugs_df.iloc[:, 13:]
  return pd.concat([id_and_label, predictors], axis=1).dropna()

In [None]:
# implementing function using as example the drug CTX
CTX_df = makeDF("CTX")

# looking at the shape (rows, columns) of the CTX dataframe
print("CTX dataframe shape: ", CTX_df.shape)

# looking at the first 5 rows of this dataframe
CTX_df.head()

### **3) Separating each Drug Dataframe into 4 sections : Training (features and labels) and Testing (features and labels)**

#### **a) Creating Testing and Training datasets for each antibiotic drug**

In [None]:
# Separating each dataframe into Labels and Features
def Split_train_test(Drug_df, drug, test_size=0.33, random_state=42):
  """Split one antibiotic's dataframe into training and testing parts.

  Drug_df: dataframe holding the label column `drug` plus the predictor columns.
  drug: name of the label column.
  test_size: forwarded to train_test_split (defaults to the 0.33 used so far).
  random_state: forwarded to train_test_split (defaults to the 42 used so far).

  Returns a dictionary with the keys 'labels_train', 'features_train',
  'labels_test' and 'features_test', in that order. The feature tables keep
  every column of Drug_df except the label column.
  """
  labels = Drug_df[drug]
  features = Drug_df.drop(columns=[drug])
  features_train, features_test, labels_train, labels_test = train_test_split(
      features, labels, test_size=test_size, random_state=random_state)

  # key order is kept as-is because callers iterate over the dictionary to print shapes
  return {
      'labels_train': labels_train,
      'features_train': features_train,
      'labels_test': labels_test,
      'features_test': features_test,
  }

In [None]:
# Implementing the function Split_train_test() for the CTX example
CTX_Train_test_dic = Split_train_test(CTX_df,"CTX")
# NOTE: this bare expression is not the last statement of the cell, so it displays nothing
CTX_Train_test_dic["features_train"]

# checking the shape of each dataframe or series stored in the dictionary created for drug CTX
print("CTX")
for k, df in CTX_Train_test_dic.items():
  print(k, df.shape)

### **4) Creating different combinations of features before training**

In [None]:
# making a list of combinations of data sources we would like to test in our ML models
# (letters as interpreted by combo_feat(): G = gene presence, S = population structure, Y = year)
combo_list = ['G', 'S', 'GY', 'SY', 'GS', 'GYS'] 

# making a function that creates different feature combinations of the predictor features
def combo_feat(features_df, drug, combo):
  """Select the predictor columns that belong to one feature combination.

  features_df: feature table. Columns whose name starts with "Year" are the
    year features, columns starting with "cutoff" are the population
    structure features, and every other column except "Isolate" is treated
    as a gene presence feature.
  drug: antibiotic name. It does not influence the selection and is only
    kept so that existing calls keep working.
  combo: letters naming the groups to keep, in the order their columns should
    appear: 'G' (gene presence), 'Y' (year), 'S' (population structure),
    e.g. 'G', 'SY' or 'GYS'.

  Returns a new dataframe with the columns of the requested groups; the
  "Isolate" column is never part of the result.

  Raises ValueError if combo is empty, repeats a letter, or contains a letter
  other than G, Y and S (previously such a value silently returned None).
  """
  all_columns = list(features_df.columns)

  # creating Year column filters for features_df
  year_filter = [col for col in all_columns if col.startswith("Year")]

  # creating Population structure column filters for features_df
  pop_str_filter = [col for col in all_columns if col.startswith("cutoff")]

  # creating Gene presence column filters for features_df: whatever is left over
  # (a set keeps the membership test cheap when there are many columns)
  not_gene = set(year_filter) | set(pop_str_filter) | {"Isolate"}
  gene_presc_filter = [col for col in all_columns if col not in not_gene]

  groups = {'G': gene_presc_filter, 'Y': year_filter, 'S': pop_str_filter}

  is_valid = (
      bool(combo)
      and len(set(combo)) == len(combo)
      and all(letter in groups for letter in combo)
  )
  if not is_valid:
    raise ValueError(
        "combo must be a non-empty string of distinct letters from 'G', 'Y', 'S'; got %r" % (combo,))

  # the letters of combo decide the column order, e.g. 'GYS' -> genes, years, population structure
  selected = [col for letter in combo for col in groups[letter]]
  return features_df[selected]

In [None]:
# Implementing combo_feat() function created for training data
# ("GY" keeps the gene presence columns followed by the year columns)
CTX_GY_train_df = combo_feat(CTX_Train_test_dic['features_train'],"CTX","GY")

# looking only at the feature column names for the combination for "GY" for drug "CTX" for training data
CTX_GY_train_df.columns

### **5) Creating Random Forest model and training it per feature combination**

In [None]:
# trains one Random Forest classifier on a given set of training features
def run_RF(feat_train_df, lab_train, drug, combo):
  """Fit a RandomForestClassifier and return the fitted model.

  feat_train_df: training features.
  lab_train: training labels.
  drug, combo: only used for the progress message printed before training.

  The classifier is created with random_state=42 and otherwise default settings.
  """
  print(drug +" Training combo: "+ combo)
  forest = RandomForestClassifier(random_state=42)
  return forest.fit(feat_train_df, lab_train)

In [None]:
# implementing run_RF() for specific drug feature combination dataframe
# (the combo argument is only printed by run_RF(), so it has to name the
# combination the training features were actually built with: "GY")
RF_CTX_GY_model = run_RF(CTX_GY_train_df, CTX_Train_test_dic['labels_train'],"CTX","GY")
RF_CTX_GY_model

### **6) Making predictions from Random Forest model**

In [None]:
# thin wrapper that asks an already trained model for predictions on the testing features
def predict(RF_combo_Model, features_test):
  """Return the labels RF_combo_Model predicts for features_test."""
  return RF_combo_Model.predict(features_test)

In [None]:
# Implementing combo_feat() function created for testing data
# (built with the same "GY" combination that was used for the training features)
CTX_GY_test_df = combo_feat(CTX_Train_test_dic['features_test'],"CTX","GY")

# looking only at the feature column names for the combination for "GY" for drug "CTX" for testing data
CTX_GY_test_df.columns

In [None]:
# Implementation of the predict() function using the feature combination "GY"
CTX_GY_labels_pred = predict(RF_CTX_GY_model,CTX_GY_test_df)

# observe how many predictions were made for each category "R" and "S"
# (np.unique with return_counts=True gives the distinct labels and how often each one occurs)
print("Labels predicted: ", np.unique(CTX_GY_labels_pred, return_counts=True))

### **7) Evaluating our model using a confusion matrix and metrics**

In [None]:
# Creating a function that evaluates our model using our actual and predicted data
def evaluate(RF_combo_model, labels_test, labels_pred, cf= True):
  """Score predictions against the true labels.

  RF_combo_model: fitted model; only its classes_ attribute is read, to label
    the confusion matrix.
  labels_test: true labels.
  labels_pred: predicted labels.
  cf: when truthy, a confusion matrix is also drawn and shown.

  Returns [accuracy, f1-score of class 'R', f1-score of class 'S'].

  Raises KeyError when the classification report has no entry for 'R' or 'S'.
  """
  report = classification_report(labels_test, labels_pred, output_dict = True)

  # fail with an explanatory message instead of a bare KeyError: 'R'
  missing = [label for label in ('R', 'S') if label not in report]
  if missing:
    raise KeyError(
        "classification report has no entry for class(es) %s; "
        "they do not occur in labels_test or labels_pred" % ", ".join(missing))

  accuracy = report['accuracy']
  R_f1_score = report['R']['f1-score']# Resistant
  S_f1_score = report['S']['f1-score']# Susceptible

  if cf:
    cm = confusion_matrix(labels_test, labels_pred, labels=RF_combo_model.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=RF_combo_model.classes_)
    disp.plot()
    plt.show()
  return [accuracy,R_f1_score,S_f1_score]

In [None]:
# implementing the evaluate() function
# (with the default cf=True it also draws the confusion matrix; the returned list
# holds accuracy, the f1-score of class R and the f1-score of class S, in that order)
Model_Report = evaluate(RF_CTX_GY_model, CTX_Train_test_dic['labels_test'],CTX_GY_labels_pred)
print("Results from Model for drug: CTX")
print("Using feature combination: GY")
print("Accuracy: ", Model_Report[0])
print("R_f1_score: ", Model_Report[1])
print("S_f1_score: ", Model_Report[2])

### **8) Use all functions and evaluate every drug in every feature combination!**

#### **a) Let's recall the list of drugs we have available and the combination of features we are interested in**

In [None]:
# let's check all drugs (the antibiotic column names collected in drug_list)
print(drug_list)

# let's see all combinations we are interested in (the feature combinations in combo_list)
print(combo_list)

#### **b) Create a loop that will go through all our functions using the lists above**

In [None]:
# Run the whole pipeline for every drug and every feature combination, collecting
# each [accuracy, R f1-score, S f1-score] list under the key "<drug>_<combo>"
RF_model_metrics = {}

for drug in drug_list:
  print(drug)
  Drug_df = makeDF(drug) # one dataframe per drug
  Test_Train_dic = Split_train_test(Drug_df, drug) # training and testing parts for this drug

  # the labels are the same for every feature combination, so they are fetched once per drug
  labels_train = Test_Train_dic["labels_train"]
  labels_test = Test_Train_dic["labels_test"]

  for combo in combo_list:
    # Training: build the training features for this combination and fit a Random Forest on them
    features_train = combo_feat(Test_Train_dic["features_train"], drug, combo)
    RF_combo_model = run_RF(features_train, labels_train, drug, combo)

    # Predicting: build the matching testing features and predict their labels
    features_test = combo_feat(Test_Train_dic["features_test"], drug, combo)
    labels_pred = predict(RF_combo_model, features_test)

    # Evaluating: no confusion matrix plot here, only the metrics
    report = evaluate(RF_combo_model, labels_test, labels_pred, cf=False)
    RF_model_metrics[drug+"_"+combo] = report

    print(report)

#### **b) Store the metrics report for all drugs and features combinations as a csv file**

In [None]:
# turn the metrics dictionary into a table: one row per "<drug>_<combo>" key,
# with the key itself stored in a regular column called Drug_combo
RF_metrics = (
    pd.DataFrame.from_dict(
        RF_model_metrics,
        orient='index',
        columns=["Accuracy", "R_f1_score", "S_f1_score"],
    )
    .rename_axis('Drug_combo')
    .reset_index()
)

# write the table next to the input CSV files (without the row index), then display it
RF_metrics.to_csv(filepath+"RF_metrics_df.csv", index= False)
RF_metrics


#### **c) Create a bar graph showing the accuracy and F1 scores of all drugs when using the gene presence and year features (GY)**

In [None]:
# keep only the rows whose Drug_combo label ends with "GY"
# (labels ending in "GYS" do not match, so only the GY combination is kept)
GY_filter = list(filter(lambda label: label.endswith("GY"), RF_metrics['Drug_combo']))
GY_df = RF_metrics[RF_metrics["Drug_combo"].isin(GY_filter)]

In [None]:
# Figure Size
fig = plt.figure(figsize =(20, 8))

# Adding title
plt.title('Accuracy, R_f1_scores and S_f1_scores', fontsize = 12)

# Variables to be plotted (one group of three bars per row of GY_df)
x = np.arange(len(GY_df["Drug_combo"]))
acc = list(GY_df["Accuracy"])
R_f1 = list(GY_df["R_f1_score"])
# must read the S_f1_score column; reading R_f1_score here would make the
# "S_f1_score" bars a copy of the "R_f1_score" bars
S_f1 = list(GY_df["S_f1_score"])

# Plotting barcharts: the three bars of a group sit side by side around each x position
acc_bar=plt.bar(x-0.25, height= acc, width=0.25, color="darkgrey", edgecolor="gray")
rf1_bar=plt.bar(x, height= R_f1, width=0.25, color="cadetblue", align="center", edgecolor="gray")
sf1_bar=plt.bar(x+0.25, height= S_f1, width=0.25, color="azure", edgecolor="gray")

# label every group with its Drug_combo name
plt.xticks(x, GY_df["Drug_combo"], fontsize = 12)

#legend
fig.legend([acc_bar,rf1_bar,sf1_bar],["Accuracy", "R_f1_score", "S_f1_score"], bbox_to_anchor=(0.4,-0.35, 0.04, 0.4), fontsize=12)

# Show Plot
plt.show()