# Load Libraries and Functions

In [4]:
# System, Data, Time, and Spec Libraries
import os
import pandas as pd
import time
import random
from datetime import datetime, timedelta
import numpy as np 
from line_profiler import LineProfiler  # Code peformance
profiler = LineProfiler()
import warnings
warnings.filterwarnings('ignore')
import csv
#pd.set_option('display.max_colwidth', None)
import multiprocess as mp
num_cores = mp.cpu_count()

# Data Visualization Libraries
import matplotlib.pyplot as plt
from tabulate import tabulate
import plotly.express as px
import seaborn as sns
#from pandas.io.json import json_normalize  # Older version
from pandas import json_normalize  # Newer version
from pandas.plotting import parallel_coordinates


# Natural Language Processing Libraries
import json
import requests
import xml.etree.ElementTree as ET
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import words
import string
import nltk
from collections import OrderedDict
from nltk.tokenize import word_tokenize
from pandas import json_normalize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.util import ngrams
from nltk.corpus import stopwords
from string import punctuation
punctuation = set(punctuation)
punctuation.update({'_', '-','‘'})
english_words = set(words.words())
from fuzzywuzzy import process
#nltk.download('words')
#nltk.download('punkt')
#nltk.download('stopwords')

# SQL Interface Libraries
import pymysql as mysql
import mysql.connector
import pyodbc
import sqlite3
import sqlalchemy
from sqlalchemy import create_engine
from sqlalchemy import event
from string import punctuation

# Machine Learning Libraries
import scipy.stats as stats
from scipy.stats import zscore
from scipy.stats import yeojohnson
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.impute import KNNImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, ConfusionMatrixDisplay,roc_auc_score, roc_curve 
from sklearn.metrics import classification_report, mean_squared_error, f1_score
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, Pipeline
from dmba import classificationSummary, AIC_score, BIC_score, plotDecisionTree,gainsChart
from scikitplot.metrics import plot_lift_curve, plot_cumulative_gain
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.metrics import specificity_score, sensitivity_score
from imblearn.under_sampling import RandomUnderSampler
from scikitplot.metrics import plot_lift_curve
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import preprocessing
from sklearn import tree
from sklearn.model_selection import cross_val_score
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import KFold
from dmba import stepwise_selection, classificationSummary, backward_elimination
from sklearn.decomposition import PCA
from sklearn.utils.random import sample_without_replacement
from sklearn.metrics import precision_score, recall_score

# Importing Custom Functions
import nbimporter
from Functions import nan_info

### SQL Password

In [7]:
PASSWORD  = 'schweini'

# Get Dataset from SQL Database

In [8]:
# Connect to the MySQL server
connection = mysql.connector.connect(
    host="localhost", user="root", password=PASSWORD, database="pharma_db"
)

# Create a cursor object
cursor = connection.cursor()

In [9]:
# Master Query from Data

master_query = """SELECT 
                a.serious_outcome,
                a.expedited,
                a.age,
                a.sex,
                a.year
                a.weight
                r.outcome,
                p.unit_price,
                p.generic_brand,
                l.ingredients,
                l.rxcui,
                l.set_id,
                d.manu_num,
                d.unii
            FROM adverse_events a 
                INNER JOIN patients_reactions r ON a.event_id = r.event_id 
                INNER JOIN patients_drugs d ON r.event_id = d.event_id 
                INNER JOIN prices p ON d.ndc11 = p.ndc11
                INNER JOIN lables l ON p.ndc11 = l.ndc11
            WHERE y.year, a.year, etc. FROM 2020 TO 2024  # Create proper syntax here
            ORDER BY y.year"""  # Still need to test and figure out

In [23]:
connection = mysql.connector.connect(
    host="localhost", user="root", password=PASSWORD, database="pharma_db"
)

cursor = connection.cursor()
sql_query = """SELECT 
a.serious_outcome,
a.expedited,
a.age,
a.sex,
a.year,
a.weight,
r.outcome
FROM adverse_events a 
INNER JOIN patient_reactions r ON a.event_id = r.event_id
INNER JOIN patient_drugs d ON a.event_id = d.event_id
LIMIT 5"""
cursor.execute(sql_query)
result = cursor.fetchall()
column_names = [i[0] for i in cursor.description]
result_query_df = pd.DataFrame(result, columns=column_names)

cursor.close()
connection.close()

result_query_df

Unnamed: 0,serious_outcome,expedited,age,sex,year,weight,outcome
0,Serious,1,,2,2022,67,Recovered
1,Serious,1,,2,2022,67,Recovered
2,Death,1,,2,2022,49,Fatal
3,Death,1,,2,2022,49,Fatal
4,Death,1,,2,2022,49,Fatal


In [12]:
cursor.execute(master_query)
result = cursor.fetchall()
column_names = [i[0] for i in cursor.description]
master_query_df = pd.DataFrame(result, columns=column_names)
cursor.close()
connection.close()

ProgrammingError: 1064 (42000): You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near '.weight
                r.outcome,
                p.unit_price,
               ' at line 7

# Preparation for Data Split

In [None]:
master_query_df.head()

In [None]:
nan_info(master_query_df)

### Update Datatypes

In [None]:
master_query_df.info()

### Define numerical, text, and categorical variables

In [None]:
cats = ['drug_name']
nums = ['age', 'weight', 'price'] 
texts = ['']
all_vars = cats+nums+texts

### Create Transformation Pipeline

In [None]:
# Create a categorical processing pipeline that uses one-hot encoding
# Dropping binary columns and drop first of each level** NEED TO ADD**
cat_pipe = Pipeline([('encoder', OneHotEncoder(drop='if_binary'))])

# Create a numerical processing pipeline that uses skewness correction/center/scale.
num_pipe = Pipeline([('skew_standardize', PowerTransformer())])

# Create a text token processing step to vectorize tokens
text_pipe = Pipeline([('vector', tf_idf function())])

# Combine pipeline steps
all_pipe = make_pipeline(ColumnTransformer([('cat', cat_pipe, cats), 
                                            ('num', num_pipe, nums),
                                           ('text', text_pipe, texts)],
                                          verbose_feature_names_out=False))
# Verify steps
all_pipe.named_steps

# Split Data into Training/Validation/Testing

In [None]:
# Split the data into features (X) and the target variable (y).
X = master_query_df[[all_vars]]

#Define outcome variable
y = master_query_df[['outcome']]  # Need to Decide 5 Level or 3 Level

#Split data
X_train, X_rest, y_train, y_rest = train_test_split(X, y, train_size=0.8, random_state = 2)
X_val, X_test, y_val, y_test = train_test_split(X_rest, y_rest, test_size=0.5, random_state = 2)

## Apply Pipeline to Training Data

In [None]:
#Fit pipeline to resampled data
X_train_fit = all_pipe.fit(X_train)
#Get feature names out from fit and create as new list
X_train_cols = X_train_fit.get_feature_names_out().tolist()
X_train_pipe = pd.DataFrame(all_pipe.fit_transform(X_train), columns = X_train_cols)

## Apply Pipeline to Validation Data

In [None]:
#Fit pipeline to resampled data
X_val_fit = all_pipe.fit(X_val)
#Get feature names out from fit and create as new list
X_val_cols = X_val_fit.get_feature_names_out().tolist()
X_val_pipe = pd.DataFrame(all_pipe.fit_transform(X_val), columns = X_val_cols)

## Apply Pipeline to Testing Data

In [None]:
#Fit pipeline to resampled data
X_test_fit = all_pipe.fit(X_test)
#Get feature names out from fit and create as new list
X_test_cols = X_test_fit.get_feature_names_out().tolist()
X_test_pipe = pd.DataFrame(all_pipe.fit_transform(X_test), columns = X_test_cols)

## Undersample Training Data to Balance Outcome

In [None]:
# Create a RandomUnderSampler instance with a specified random seed and sampling strategy
rus = RandomUnderSampler(random_state = 1, sampling_strategy='majority')

# Perform random under-sampling on the training dataset
X_train_under, y_train_under = rus.fit_resample(X_train_pipe, y_train)

# Multiclass Classification Model Training

## White Box Models

### Logistic Regression Classic

In [4]:
log_reg = LogisticRegression(random_state=1)

### Logistic Regression L1 Regularization

In [5]:
log_l1 = LogisticRegression(solver='saga', penalty='l1', random_state=1)

### Logistic Regression L2 Regularization

In [6]:
log_l2 = LogisticRegression(solver='saga', penalty='l2', random_state=1)

### Logistic Regression L1 w/ 10-k CV

In [None]:
# Create Logistic Regression model with L1 regularization
log_l1_cv = LogisticRegressionCV(solver = 'saga', penalty = 'l1', cv = 10, random_state = 1)
# Fit the model to the training data
log_l1_cv.fit(X_train_under, y_train_under.values.ravel())

In [None]:
# Intercept Log-Odds and Odds
print(log_l1_cv.intercept_ , np.exp(log_l1_cv.intercept_))

In [None]:
# Create table of coefficient odds
d = {'Feature': pd.Series(X_train_under.columns), 'LogOdds': pd.Series(log_l1_cv.coef_[0])}
df = pd.DataFrame(data=d).reindex(d['LogOdds'].abs().sort_values(ascending=False).index)
df['Odds'] = np.exp(df['LogOdds'])
df

In [None]:
# Cross val performance

### Logistic Regression Elastic Net

In [None]:
param_grid = {
    'l1_ratio': [i / 9 for i in range(10)],  # 10 values from 0 to 1 (0, 0.1, 0.2, ..., 1.0)
    'C': [0.01, 0.1, 1, 10, 100]  # Different strengths of regularization
}

log_reg_elasticnet = LogisticRegression(
    penalty='elasticnet',  # Use Elastic Net regularization
    solver='saga',        # Solver that supports Elastic Net
    random_state=1
)

grid_search = GridSearchCV(
    estimator=log_reg_elasticnet,
    param_grid=param_grid,
    scoring='accuracy',   # Or another metric of choice
    cv=5,                 # Number of cross-validation folds
    n_jobs=-1              # Use all available CPUs
)

grid_search.fit(X_train_under, y_train_under.values.ravel())


best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

best_score = grid_search.best_score_
print(f"Best cross-validation score: {best_score:.4f}")

best_model = grid_search.best_estimator_

### Single Decision Tree with Grid Search and 10k-CV

In [None]:
# Define a grid of hyperparameters to search for in tree
param_grid = {
    'max_depth' : [1,2,3,4,5],
    'min_samples_leaf' : [1,2,3,4,5]
    
}
# Create a GridSearchCV object using and the defined parameter grid
tree1_search = GridSearchCV(DecisionTreeClassifier(random_state=1), param_grid, cv=10, n_jobs=-1)
# Fit the GridSearchCV to the balanced training data to find the best hyperparameters
tree1_search.fit(X_train_under, y_train_under.values.ravel())
# Get the best hyperparameters found by the GridSearch
tree1_search.best_params_

In [None]:
tree1 = DecisionTreeClassifier(max_depth = 4, min_samples_leaf = 4, random_state = 1).fit(X_train_under, y_train_under)

In [None]:
# Get feature importance scores
importances=tree1.feature_importances_
feature_importance_pairs=list(zip(X_train_under.columns, importances))

# Sort in descending order
sorted_feature_importance_pairs = sorted(feature_importance_pairs, key=lambda x: x[1], reverse=True)

# Print scores
print("Feature Importance Scores")
for feature, importance in sorted_feature_importance_pairs:
    print(f"{feature}: {importance:.4f}")

In [None]:
# Cross val performance

## Ensembles

### Random Forest Classifier w/ 10-k CV

In [None]:
#Random Forest
# Create a Random Forest classifier with 100 trees
random_forest = RandomForestClassifier(n_estimators=1000, random_state=1)  

# Fit (train) the Random Forest classifier on the balanced training data
random_forest.fit(X_train_under, y_train_under.values.ravel())

In [None]:
# Get feature importance scores
importances=random_forest.feature_importances_
feature_importance_pairs=list(zip(X_train_under.columns, importances))

# Sort in descending order
sorted_feature_importance_pairs = sorted(feature_importance_pairs, key=lambda x: x[1], reverse=True)

# Print scores
print("Feature Importance Scores")
for feature, importance in sorted_feature_importance_pairs:
    print(f"{feature}: {importance:.4f}")

In [None]:
# Extract feature names and importances
feature_names, importances = zip(*sorted_feature_importance_pairs)

# Create a horizontal bar plot
plt.figure(figsize=(14, 12))
#plt.barh(range(len(feature_names)), importances, align='center')
#plt.yticks(range(len(feature_names)), feature_names)

plt.barh(range(len(feature_names)), importances, align='center')
plt.yticks(range(len(feature_names)), feature_names)

plt.xlabel('Feature Importance')
plt.gca().invert_yaxis()  # Invert the y-axis to show the most important features at the top
#plt.show()
plt.savefig('featimp.png')

In [None]:
# Cross val peformance

## Gradient Boosted Tree w/ 10-k CV

In [None]:
# Define a grid of hyperparameters to search for in the Gradient Boosting Classifier
param_grid = {
    'max_depth' : [1,2,3,4,5],
    'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1],
    'n_estimators' : [20,21,22,23,24],
}
# Create a GridSearchCV object using GradientBoostingClassifier and the defined parameter grid
tree_search = GridSearchCV(GradientBoostingClassifier(random_state=1), param_grid, cv=10, n_jobs=-1)
# Fit the GridSearchCV to the balanced training data to find the best hyperparameters
tree_search.fit(X_train_under, y_train_under.values.ravel())
# Get the best hyperparameters found by the GridSearch
tree_search.best_params_

In [None]:
# Cross val performance

## Non-parametric

### K-Nearest Neighbors Classifier

In [None]:
# Initialize an empty list to store KNN results
knn_results = []
for k in range (1, 50):
    # Create a K-Nearest Neighbors model with k neighbors and fit it to the balanced training data
    knn_mod = KNeighborsClassifier(n_neighbors = k).fit(X_train_under, y_train_under.values.ravel())
    # Calculate and append results to the list
    knn_results.append({
        'k': k,
        'Sens': sensitivity_score(y_test_imp.values.ravel(), knn_mod.predict(X_test_combo)),
        'Acc': accuracy_score(y_test_imp.values.ravel(), knn_mod.predict(X_test_combo)) 
    })
# Create a DataFrame from the list of KNN results    
knn_results = pd.DataFrame(knn_results)
warnings.filterwarnings('ignore')

In [None]:
knn_results

In [None]:
# Create a K-Nearest Neighbors (KNN) classifier with 6 neighbors and fit it to the balanced training data
knn = KNeighborsClassifier(n_neighbors = 5).fit(X_train_under, y_train_under.values.ravel())

## Neural Network

In [None]:
# train

In [None]:
# feature importance

In [None]:
# 

# Validation Data Tuning

# Testing Performance

## Logistic Regression

### Predictions

### Confusion Matrix

In [None]:
#Logistic Regression
log_cm = confusion_matrix(y_true = y_test, y_pred = log_l1.predict(X_test_log), labels = log_l2_bal.classes_)
log_disp = ConfusionMatrixDisplay(confusion_matrix= log_cm_bal, display_labels=log_l2_bal.classes_)
log_disp.plot()
#plt.show()
plt.savefig('lr_cm.png')

In [None]:
# classification summary

In [None]:
# curve

In [None]:
#Sensitivity, specificity, and accuracy for log reg
logl1_sens = sensitivity_score(y_test, log_l1.predict(X_test))
logl1_spec = specificity_score(y_test, log_l1.predict(X_test))
logl1_acc = accuracy_score(y_test, log_l1.predict(X_test))
logl1_prec = precision_score(y_test, log_l1.predict(X_test)) 
logl1_rec = recall_score(y_test, log_l1.predict(X_test))
logl1_f1 = f1_score(y_test, log_l1.predict(X_test))
logl1_sens, logl1_spec, logl1_acc, logl1_prec, logl1_rec, logl1_f1

## Decision Tree

### Predictions

### Confusion Matrix

In [None]:
#Confusion matrix for tree
tree_cm = confusion_matrix(y_true = y_test, y_pred = tree1.predict(X_test), labels = tree1.classes_)
tree_disp = ConfusionMatrixDisplay(confusion_matrix= tree_cm, display_labels=tree1.classes_)
tree_disp.plot()
plt.show() 

In [None]:
# classification summary

In [None]:
# curve

In [None]:
#Sensitivity, specificity, and accuracy for decision tree
tree1_sens = sensitivity_score(y_test, tree1.predict(X_test))
tree1_spec = specificity_score(y_test, tree1.predict(X_test))
tree1_acc = accuracy_score(y_test, tree1.predict(X_test))
tree1_prec = precision_score(y_test, tree1.predict(X_test)) 
tree1_rec = recall_score(y_test, tree1.predict(X_test))
tree1_f1 = f1_score(y_test, tree1.predict(X_test))
tree1_sens, tree1_spec, tree1_acc, tree1_prec, tree1_rec, tree1_f1

## Random Forest

### Predictions

### Confusion Matrix

In [None]:
# Compute the confusion matrix for RF 
rf_cm = confusion_matrix(y_true = y_test, y_pred = random_forest.predict(X_test), labels = random_forest.classes_)
# Create a ConfusionMatrixDisplay object for visualization
rf_disp = ConfusionMatrixDisplay(confusion_matrix= rf_cm, display_labels=random_forest.classes_)
rf_disp.plot()
warnings.filterwarnings('ignore')
#plt.show()
#plt.savefig('rf_cm.png')

In [1]:
# classification summary

In [None]:
# curve

In [None]:
#Sensitivity, specificity, and accuracy for RF
rf_sens = sensitivity_score(y_test, random_forest.predict(X_test))
rf_spec = specificity_score(y_test, random_forest.predict(X_test))
rf_acc = accuracy_score(y_test, random_forest.predict(X_test))
rf_prec = precision_score(y_test, random_forest.predict(X_test)) 
rf_rec = recall_score(y_test, random_forest.predict(X_test))
rf_f1 = f1_score(y_test, random_forest.predict(X_test))
warnings.filterwarnings('ignore')
rf_sens, rf_spec, rf_acc, rf_prec, rf_rec, rf_f1

## Gradient Boosted Tree

### Predictions

### Confusion Matrix

In [None]:
#Confusion matrix for tree
tree_cm = confusion_matrix(y_true = y_test, y_pred = tree_search.predict(X_test), labels = tree_search.classes_)
tree_disp = ConfusionMatrixDisplay(confusion_matrix= tree_cm, display_labels=tree_search.classes_)
tree_disp.plot()
plt.show() 

In [2]:
# classification summary

In [None]:
# curve

In [None]:
#Sensitivity, specificity, and accuracy for decision tree 
tree_sens = sensitivity_score(y_test, tree_search.predict(X_test))
tree_spec = specificity_score(y_test, tree_search.predict(X_test))
tree_acc = accuracy_score(y_test, tree_search.predict(X_test))
tree_prec = precision_score(y_test, tree_search.predict(X_test)) 
tree_rec = recall_score(y_test, tree_search.predict(X_test))
tree_f1 = f1_score(y_test, tree_search.predict(X_test))
tree_sens, tree_spec, tree_acc, tree_prec, tree_rec, tree_f1

## K-Nearest Neighbors

### Predictions

### Confusion Matrix

In [None]:
#KNN
# Compute the confusion matrix for KNN 
knn_cm = confusion_matrix(y_true = y_test, y_pred = knn.predict(X_test), labels = knn.classes_)
# Create a ConfusionMatrixDisplay object for visualization
knn_disp = ConfusionMatrixDisplay(confusion_matrix= knn_cm, display_labels=knn.classes_)
knn_disp.plot()
warnings.filterwarnings('ignore')
plt.show()

In [None]:
# Classification summary

In [None]:
# Curve

In [None]:
#Sensitivity, specificity, and accuracy for KNN 
knn_sens = sensitivity_score(y_test, knn.predict(X_test))
knn_spec = specificity_score(y_test, knn.predict(X_test))
knn_acc = accuracy_score(y_test, knn.predict(X_test))
knn_prec = precision_score(y_test, knn.predict(X_test)) 
knn_rec = recall_score(y_test, knn.predict(X_test))
knn_f1 = f1_score(y_test, knn.predict(X_test))
warnings.filterwarnings('ignore')
knn_sens, knn_spec, knn_acc, knn_prec, knn_rec, knn_f1

## Neural Network

### Predictions

### Confusion Matrix

In [None]:
# Classification summary

In [None]:
# Curve

# Performance Metrics Table

In [None]:
# Create a table to display performance metrics for different models on the validation dataset

val_performance = [
{'Model': 'Neural Net', 'Test Sensitivity': nn_sens, 'Test Specificity':nn_spec, 
 'Accuracy': nn_acc, 'Test Precision': nn_prec, 'Test Recall': nn_rec, 'F1 Score': nn_f1},
    {'Model': 'Logistic Regression', 'Test Sensitivity': logl1_sens, 'Test Specificity':logl1_spec, 
 'Accuracy': logl1_acc, 'Test Precision': logl1_prec, 'Test Recall': logl1_rec,  'F1 Score': logl1_f1},
    {'Model': 'Boosted Tree', 'Test Sensitivity': tree_sens, 'Test Specificity':tree_spec, 
 'Accuracy': tree_acc, 'Test Precision': tree_prec, 'Test Recall': tree_rec,  'F1 Score': tree_f1},
    {'Model': 'Single Tree', 'Test Sensitivity': tree1_sens, 'Test Specificity':tree1_spec, 
 'Accuracy': tree1_acc, 'Test Precision': tree1_prec, 'Test Recall': tree1_rec, 'F1 Score': tree1_f1},
    {'Model': 'Random Forest', 'Test Sensitivity': rf_sens, 'Test Specificity':rf_spec, 
 'Accuracy': rf_acc, 'Test Precision': rf_prec, 'Test Recall': rf_rec, 'F1 Score': rf_f1},
    {'Model': 'K-Nearest Neighbors', 'Test Sensitivity': knn_sens, 'Test Specificity':knn_spec, 
 'Accuracy': knn_acc, 'Test Precision': knn_prec, 'Test Recall': knn_rec,  'F1 Score': knn_f1},
    
]
# Create a formatted table using tabulate and specify the format as 'fancy_grid'
table = tabulate(val_performance, headers='keys', tablefmt='fancy_grid')
# Display the comparison table
print(table)