<a href="https://colab.research.google.com/github/Lucy-Moctezuma/SFSU-CodeLab-Work-/blob/main/E.%20Coli%20Machine%20Learning%20Project/2_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Logistic Regression**



### **1) Importing Packages needed**


In [None]:
# Data manipulation imports for ML
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Import packages for logistic regression model
from sklearn.linear_model import LogisticRegression
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# Imports for model evaluation 
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Imports for data visualization
import matplotlib.pyplot as plt

# Imports for file management
import os
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the CSV folder read and
# written by this notebook lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


### **2) Loading CSV file and creating dataframes for each antibiotic**


#### **a) Loading CSV created from previous notebook**


In [None]:
# Folder on the mounted drive that holds this project's CSV files
# (the trailing slash matters: file names are appended directly to it)
filepath = '/content/drive/My Drive/EColi_ML_CSV_files/'

# Read the merged table into a dataframe, treating the text "NaN" as missing
merged_csv_path = filepath + "EColi_Merged_dfs.csv"
All_Drugs_df = pd.read_csv(merged_csv_path, na_values="NaN")
All_Drugs_df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Isolate,CTZ,CTX,AMP,AMX,AMC,TZP,CXM,CET,GEN,...,cutoff_25459,cutoff_25654,cutoff_25772,cutoff_25979,cutoff_26792,cutoff_27119,cutoff_27236,cutoff_27248,cutoff_27690,cutoff_45092
0,11657_5#10,S,S,S,,S,S,S,S,S,...,0,0,0,0,0,0,0,0,0,0
1,11657_5#11,S,S,R,,R,S,S,S,S,...,0,0,0,0,0,0,0,0,0,0
2,11657_5#12,S,S,S,,S,S,S,S,S,...,0,0,0,0,0,0,0,0,0,0
3,11657_5#13,S,S,R,,R,S,S,S,S,...,0,0,0,0,0,0,0,0,0,0
4,11657_5#14,S,S,R,,S,S,S,S,S,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1931,24742_1#96,S,S,S,,,,S,S,S,...,0,0,0,0,0,0,0,0,0,0
1932,24742_1#97,S,S,S,,,,S,S,S,...,0,0,0,0,0,0,0,0,0,0
1933,24742_1#98,S,S,R,,,,S,S,S,...,0,0,0,0,0,0,0,0,0,0
1934,24742_1#99,S,S,R,,,,S,S,S,...,0,0,0,0,0,0,0,0,0,0


#### **b) Creating dataframes for each drug**

In [None]:
# Antibiotic names: the column labels at positions 1 through 12 of the merged
# table (position 0 is skipped)
drug_list = All_Drugs_df.columns[1:13]
drug_list

# creating a function that makes dataframes for each antibiotic and dropping NaN values
def makeDF(drug):
  """Build the modelling dataframe for a single antibiotic.

  Places the "Isolate" and `drug` columns of the global All_Drugs_df next to
  every column from position 13 onward, then removes any row that contains
  a missing value.

  Parameters
  ----------
  drug : str
      Name of the antibiotic column to keep as the label.

  Returns
  -------
  pandas.DataFrame
      The combined table with incomplete rows dropped.
  """
  id_and_label = All_Drugs_df[["Isolate", drug]]
  predictors = All_Drugs_df.iloc[:, 13:]
  return pd.concat([id_and_label, predictors], axis=1).dropna()

# Example: build the dataframe for the drug CTZ
CTZ_df = makeDF("CTZ")

# Report its dimensions as (rows, columns)
ctz_shape = CTZ_df.shape
print("CTZ dataframe shape: ", ctz_shape)

# Preview the first 5 rows
CTZ_df.head(5)

### **3) Separating each Drug Dataframe into 4 sections : Training (features and labels) and Testing (features and labels)**

#### **a) Creating Testing and Training datasets for each antibiotic drug**


In [None]:
# Separating each dataframe into Labels and Features for training and testing data
def Split_train_test(Drug_df,drug):
  """Split one drug's dataframe into training and testing features/labels.

  The `drug` column is used as the label; every other column is a feature.
  33% of the rows go to the test set, with a fixed random_state of 42 so the
  split is repeatable.

  Parameters
  ----------
  Drug_df : pandas.DataFrame
      Table containing the label column `drug` plus the feature columns.
  drug : str
      Name of the label column.

  Returns
  -------
  dict
      Keys, in order: 'labels_train', 'features_train', 'labels_test',
      'features_test'.
  """
  target = Drug_df[drug]
  predictors = Drug_df.drop(columns=[drug])
  X_train, X_test, y_train, y_test = train_test_split(
      predictors, target, test_size=0.33, random_state=42)

  # Insertion order is kept identical to what callers iterate over
  return {
    'labels_train': y_train,
    'features_train': X_train,
    'labels_test': y_test,
    'features_test': X_test,
  }

# Example: split the CTZ dataframe with Split_train_test()
CTZ_Train_test_dic = Split_train_test(CTZ_df,"CTZ")

# Print the shape of every dataframe/series stored in the CTZ dictionary
print("CTZ")
for chunk_name in CTZ_Train_test_dic:
  print(chunk_name, CTZ_Train_test_dic[chunk_name].shape)

# Retrieve one particular chunk of the split
CTZ_Train_test_dic["features_train"]

### **4) Creating different combinations of features before training**


In [None]:
# Codes of the feature combinations to try in the ML models; each code is
# interpreted by combo_feat()
combo_list = "G S GY SY GYS".split()

# making a function that creates different feature combinations of the predictor features
def combo_feat(features_df, drug, combo):
  """Select the predictor columns that make up one feature combination.

  Every column of `features_df` is assigned to a group by its name:
    * Y (year)                 - name starts with "Year"
    * S (population structure) - name starts with "cutoff"
    * G (gene presence)        - every remaining column except "Isolate"

  The "Isolate" identifier is never part of the result.

  Parameters
  ----------
  features_df : pandas.DataFrame
      Feature table whose column names are strings.
  drug : str
      Not used for the selection; kept so existing calls keep working.
  combo : str
      One of 'G', 'S', 'GY', 'SY' or 'GYS'.

  Returns
  -------
  pandas.DataFrame
      New dataframe holding only the requested columns. Column order is
      G; S; G then Y; S then Y; G then Y then S, for the respective codes.

  Raises
  ------
  ValueError
      If `combo` is not a supported code. Previously the function returned
      None in that case, which only failed later during model fitting.
  """
  year_cols, pop_struc_cols, gene_presc_cols = [], [], []

  # Single pass over the column names; the first matching rule wins, which
  # reproduces the original "not year, not cutoff, not Isolate" gene filter.
  for col in features_df.columns:
    if col.startswith("Year"):
      year_cols.append(col)
    elif col.startswith("cutoff"):
      pop_struc_cols.append(col)
    elif col != "Isolate":
      gene_presc_cols.append(col)

  columns_by_combo = {
    'G': gene_presc_cols,
    'S': pop_struc_cols,
    'GY': gene_presc_cols + year_cols,
    'SY': pop_struc_cols + year_cols,
    'GYS': gene_presc_cols + year_cols + pop_struc_cols,
  }

  if combo not in columns_by_combo:
    raise ValueError(
      "Unknown feature combination %r; expected one of %s"
      % (combo, list(columns_by_combo)))

  # Indexing with a list of labels returns a new dataframe
  return features_df[columns_by_combo[combo]]

# Example: apply combo_feat() to the CTZ training features with the "GYS" combination
ctz_train_features = CTZ_Train_test_dic['features_train']
CTZ_GYS_train_df = combo_feat(ctz_train_features,"CTZ","GYS")

# Show only the feature column names selected for "GYS" / "CTZ" (training data)
CTZ_GYS_train_df.columns

### **5) Creating Logistic regression model and training it per feature combination**


In [None]:
# creating Logistic regression model function
def run_LG(feat_train_df, lab_train, drug, combo):
  """Fit a logistic regression model on one drug / feature combination.

  Prints a progress line, then fits a LogisticRegression (lbfgs solver,
  C=1.0, at most 500 iterations, random_state 42). ConvergenceWarning is
  silenced while the function runs.

  Parameters
  ----------
  feat_train_df : training features passed to LogisticRegression.fit
  lab_train : training labels passed to LogisticRegression.fit
  drug : str
      Drug name, used only in the progress message.
  combo : str
      Feature combination code, used only in the progress message.

  Returns
  -------
  LogisticRegression
      The fitted model.
  """
  # Local import keeps this cell self-contained. The standard library context
  # manager replaces the decorator from the private sklearn.utils._testing
  # module, which is a test helper and not part of scikit-learn's public API.
  import warnings

  with warnings.catch_warnings():
    warnings.simplefilter("ignore", category=ConvergenceWarning)
    print(drug +" Training combo: "+ combo)
    LG = LogisticRegression(random_state = 42, solver= 'lbfgs', C=1.0, max_iter=500)
    LG = LG.fit(feat_train_df, lab_train)
  return LG

# Example: fit the model for drug CTZ using the "GYS" training features
LG_CTZ_GYS_model = run_LG(CTZ_GYS_train_df, CTZ_Train_test_dic['labels_train'],"CTZ","GYS")
LG_CTZ_GYS_model

# First row of the fitted parameters: beta_0 (intercept) and the beta_j's (coefficients)
ctz_intercept = LG_CTZ_GYS_model.intercept_[0]
ctz_coefficients = LG_CTZ_GYS_model.coef_[0]

# beta_0, i.e. the intercept of the model
print("Intercept:",ctz_intercept)

# every beta_j, i.e. the coefficients of the logistic regression model
print("All beta_j values:", ctz_coefficients)

# how many beta_j values there are
print("Number of beta_j values: ", len(ctz_coefficients))

### **6) Making predictions from Logistic regression model**


In [None]:
# creating a function using the model created and trained and the feature combinations from testing data
def predict(LG_combo_Model, features_test):
  """Return the labels that the trained model predicts for `features_test`.

  Parameters
  ----------
  LG_combo_Model : object exposing a `predict(features)` method
  features_test : feature table handed to that method unchanged

  Returns
  -------
  Whatever `LG_combo_Model.predict` returns for `features_test`.
  """
  return LG_combo_Model.predict(features_test)

# Example: apply combo_feat() to the CTZ testing features with the "GYS" combination
ctz_test_features = CTZ_Train_test_dic['features_test']
CTZ_GYS_test_df = combo_feat(ctz_test_features,"CTZ","GYS")

# Show only the feature column names selected for "GYS" / "CTZ" (testing data)
CTZ_GYS_test_df.columns

# Predict labels for the "GYS" test features with the fitted CTZ model
CTZ_GYS_labels_pred = predict(LG_CTZ_GYS_model,CTZ_GYS_test_df)

# Count how many predictions fell into each category ("R" and "S")
predicted_label_counts = np.unique(CTZ_GYS_labels_pred, return_counts=True)
print("Labels predicted: ", predicted_label_counts)

### **7) Evaluating our model using a confusion matrix and metrics**

In [None]:
# Creating a function that evaluates our model using our actual and predicted data
def evaluate(LG_combo_model, labels_test, labels_pred, cf= True):
  """Score predictions against the true labels and optionally plot a confusion matrix.

  Parameters
  ----------
  LG_combo_model : fitted classifier
      Only its `classes_` attribute is read, to order and label the
      confusion matrix.
  labels_test : true labels of the test set
  labels_pred : labels predicted for the test set
  cf : bool, default True
      When truthy, draw and show the confusion matrix.

  Returns
  -------
  list
      [accuracy, F1 score of class 'R' (resistant), F1 score of class 'S'
      (susceptible)], all read from sklearn's classification_report.

  Raises
  ------
  KeyError
      If the report has no 'accuracy', 'R' or 'S' entry. NOTE(review): this
      is expected when one of the two classes is absent from both the true
      and the predicted labels - confirm against the data.
  """
  report = classification_report(labels_test, labels_pred, output_dict = True)
  scores = [
    report['accuracy'],
    report['R']['f1-score'],  # Resistant
    report['S']['f1-score'],  # Susceptible
  ]

  # Plain truthiness test instead of comparing against True
  if cf:
    class_names = LG_combo_model.classes_
    cm = confusion_matrix(labels_test, labels_pred, labels=class_names)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    disp.plot()
    plt.show()

  return scores

# Example: evaluate the CTZ model trained on the "GYS" features
Model_Report = evaluate(LG_CTZ_GYS_model, CTZ_Train_test_dic['labels_test'],CTZ_GYS_labels_pred)
print("Results from Model for drug: CTZ")
print("Using feature combination: GYS")

# Model_Report holds, in order: accuracy, R f1-score, S f1-score
metric_captions = ("Accuracy: ", "R_f1_score: ", "S_f1_score: ")
for caption, value in zip(metric_captions, Model_Report):
  print(caption, value)

### **8) Use all functions and evaluate every drug in every feature combination!**

#### **a) Let's recall the list of drugs we have available and the combinations of features we are interested in**

In [None]:
# Recall the antibiotic names collected earlier from the merged table;
# the loop that follows iterates over them.
drug_list

# Recall the feature-combination codes defined earlier; every drug is
# modelled once per code.
combo_list

Index(['CTZ', 'CTX', 'AMP', 'AMX', 'AMC', 'TZP', 'CXM', 'CET', 'GEN', 'TBM',
       'TMP', 'CIP'],
      dtype='object')

#### **b) Create a loop that will go through all our functions using the lists above**

In [None]:
# Run every function defined above for each drug and feature combination,
# collecting all reports in a single dictionary keyed by "<drug>_<combo>"
LG_model_metrics = {}

for drug in drug_list:
  print(drug)
  Drug_df = makeDF(drug) # one dataframe per drug
  Test_Train_dic = Split_train_test(Drug_df, drug) # dictionary holding the training and testing chunks

  # The labels do not depend on the feature combination, so fetch them once per drug
  labels_train = Test_Train_dic["labels_train"]
  labels_test = Test_Train_dic["labels_test"]

  for combo in combo_list:
    # Feature tables for this combination (training and testing)
    features_train = combo_feat(Test_Train_dic["features_train"], drug, combo)
    features_test = combo_feat(Test_Train_dic["features_test"], drug, combo)

    # Train on the training features, then predict on the testing features
    LG_combo_model = run_LG(features_train, labels_train, drug, combo)
    labels_pred = predict(LG_combo_model, features_test)

    # Evaluate without drawing the confusion matrix and store the result
    report = evaluate(LG_combo_model, labels_test, labels_pred, cf=False)
    LG_model_metrics[f"{drug}_{combo}"] = report

    print(report)

CTZ
CTZ Training combo: G
[0.9358372456964006, 0.7421383647798743, 0.9633601429848079]
CTZ Training combo: S
[0.8497652582159625, 0.42857142857142855, 0.9135135135135134]
CTZ Training combo: GY
[0.9358372456964006, 0.7421383647798743, 0.9633601429848079]
CTZ Training combo: SY
[0.8356807511737089, 0.4198895027624309, 0.9042844120328168]
CTZ Training combo: GYS
[0.8341158059467919, 0.43010752688172044, 0.9029304029304028]
CTX
CTX Training combo: G
[0.9641693811074918, 0.9035087719298246, 0.978]
CTX Training combo: S
[0.8257328990228013, 0.5992509363295879, 0.8886576482830385]
CTX Training combo: GY
[0.9690553745928339, 0.9177489177489178, 0.9809428284854563]
CTX Training combo: SY
[0.8289902280130294, 0.6125461254612546, 0.890282131661442]
CTX Training combo: GYS
[0.8306188925081434, 0.6119402985074627, 0.8916666666666666]
AMP
AMP Training combo: G
[0.8237410071942446, 0.8852459016393444, 0.6201550387596899]
AMP Training combo: S
[0.6366906474820144, 0.7376623376623378, 0.40935672514619

#### **b) Store the metrics report for all drugs and features combinations as a csv file**

In [None]:
# Turn the metrics dictionary into a dataframe: one row per "<drug>_<combo>" key,
# with the key moved from the index into a "Drug_combo" column
metric_columns = ["Accuracy", "R_f1_score", "S_f1_score"]
LG_metrics = (
  pd.DataFrame.from_dict(LG_model_metrics, orient='index', columns=metric_columns)
  .reset_index()
  .rename(columns={'index': 'Drug_combo'})
)

# Save the metric results as a CSV file next to the input data
metrics_csv_path = filepath+"LG_metrics_df.csv"
LG_metrics.to_csv(metrics_csv_path, index= False)
LG_metrics


Unnamed: 0,Drug_combo,Accuracy,R_f1_score,S_f1_score
0,CTZ_G,0.935837,0.742138,0.96336
1,CTZ_S,0.849765,0.428571,0.913514
2,CTZ_GY,0.935837,0.742138,0.96336
3,CTZ_SY,0.835681,0.41989,0.904284
4,CTZ_GYS,0.834116,0.430108,0.90293
5,CTX_G,0.964169,0.903509,0.978
6,CTX_S,0.825733,0.599251,0.888658
7,CTX_GY,0.969055,0.917749,0.980943
8,CTX_SY,0.82899,0.612546,0.890282
9,CTX_GYS,0.830619,0.61194,0.891667


#### **c) Create a bar graph showing the accuracy and F1 scores of all drugs when using all features (GYS)**

In [None]:
# Keep only the rows whose Drug_combo ends in "GYS", i.e. the models trained on all features
GYS_filter = [drug_combo for drug_combo in LG_metrics['Drug_combo'] if drug_combo.endswith("GYS")]
GYS_df = LG_metrics.loc[LG_metrics["Drug_combo"].isin(GYS_filter)]

# Figure size
fig = plt.figure(figsize =(20, 8))

# Adding title
plt.title('Accuracy, R_f1_scores and S_f1_scores', fontsize = 12)

# Variables to be plotted: one group of three bars per drug
x = np.arange(len(GYS_df["Drug_combo"]))
acc = list(GYS_df["Accuracy"])
R_f1 = list(GYS_df["R_f1_score"])
# Fix: this previously read the "R_f1_score" column again, so the bars
# labelled S_f1_score were a copy of the R_f1_score bars
S_f1 = list(GYS_df["S_f1_score"])

# Plotting bar charts: accuracy left of the tick, R f1 centred on it, S f1 right of it
acc_bar=plt.bar(x-0.25, height= acc, width=0.25, color="grey", edgecolor="gray")
rf1_bar=plt.bar(x, height= R_f1, width=0.25, color="plum", align="center", edgecolor="gray")
sf1_bar=plt.bar(x+0.25, height= S_f1, width=0.25, color="lavenderblush", edgecolor="gray")

# One tick per drug, labelled with its Drug_combo name
plt.xticks(x, GYS_df["Drug_combo"], fontsize = 12)

# Legend
fig.legend([acc_bar,rf1_bar,sf1_bar],["Accuracy", "R_f1_score", "S_f1_score"], bbox_to_anchor=(0.4,-0.29, 0.04, 0.4), fontsize=12)

# Show plot
plt.show()