In [None]:
# Resolve a kernel connection file via jupyter_client and print its path.
# The name must be assigned before it is printed, otherwise a fresh kernel
# raises NameError on the first line of the notebook.
from jupyter_client import find_connection_file

connection_file = find_connection_file()
print(connection_file)

In [None]:
import pyarrow
import pandas as pd
import sklearn

import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'

from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource

In [None]:
# Read the CSV under ../data into a DataFrame.
df = pd.read_csv(filepath_or_buffer="../data/GSE169159_covid2.csv")

In [None]:
# Show the loaded frame (notebook rich display) for a quick visual check.
df

In [None]:
# Restrict the frame to the rows of a single sample time (post-vaccine day).
# TODO: decide on the correct day per vaccine type
day_col = "Day"
sample_day = "FC.D21"
df = df.loc[df[day_col].eq(sample_day)]
day_col

In [None]:
# What counts as a "high" (= protective?) response is still an open question.
# Until that is settled, the mean response is used as the threshold.
response_col = "Response"
# Double brackets keep `responses` a one-column DataFrame (used later for
# plotting and concatenation).
responses = df[[response_col]]
# Take the mean of the column itself so the threshold is a plain scalar
# rather than a one-element Series (which prints as a Series repr and makes
# later comparisons depend on label alignment).
mean_response = responses[response_col].mean()
print(f'mean_response: {mean_response}')

In [None]:
feature_col = "IMMAGE"
# Note the double [[]], which keeps X as a DataFrame and not a Series -
# sklearn expects a 2D array-like structure, which a 1-feature DataFrame supplies.
X = df[[feature_col]]

# Boolean one-column DataFrame: True where the response is above the threshold.
high_response_thr = mean_response
y = df[[response_col]] > high_response_thr
# Convert the booleans to 0/1 and rename the column. The source column is
# referenced through response_col so the rename cannot silently become a
# no-op if the response column name is changed above.
y = y.astype(int).rename(columns={response_col: 'Labels'})



#### Looking at the distribution of response values

In [None]:
# Histogram of the response values using 50 bins.
sns.histplot(data=responses, bins=50)
plt.show()

In [None]:

# Column-wise join of the feature, the raw response and the 0/1 label.
data = pd.concat([X, responses, y], axis="columns")
data

In [None]:
# Response against IMMAGE, coloured by the binary label.
sns.scatterplot(
    x="IMMAGE",
    y="Response",
    hue="Labels",
    palette='Set1',
    data=data,
)
plt.show(block=False)

In [None]:
# IMMAGE values along the x axis, split by label via hue/dodge.
# y is a constant list of empty strings (one per row of df) - presumably so
# that all points fall in a single unlabeled category.
sns.stripplot(
    x='IMMAGE',
    y=[''] * len(df),
    hue='Labels',
    data=data,
    palette='Set1',
    jitter=False,
    dodge=True,
)
plt.show(block=False)

In [None]:
# Plotly version of the IMMAGE-vs-Response scatter.
px.scatter(data, x='IMMAGE', y='Response')

So we see that the assumption that the response is linear in IMMAGE doesn't hold.

In [None]:
# Scikit-learn related imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import randint
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from mlxtend.plotting import plot_decision_regions


# Classifiers used in the SVM cross-validation sections below.
# SVC() defaults to the RBF kernel; this model is reported as the *linear*
# SVM (svm_linear_score, "Linear SVM score"), so the kernel is set explicitly.
svm = SVC(kernel='linear')
# Polynomial kernel; SVC's default degree for 'poly' is 3.
svm_poly = SVC(kernel='poly')

### Testing out some basic predictions

#### Logistic regression

In [None]:
# Pull the label column out as a 1-D Series to use as the target.
labels = y.loc[:, "Labels"]

# Cross-validate a default logistic regression and show the per-fold scores.
log_regress = LogisticRegression()
regression_result = cross_validate(estimator=log_regress, X=X, y=labels)
regression_result['test_score']



In [None]:
# Hold out 20% of the samples for testing; the seed fixes the split.
X_train, X_test, labels_train, labels_test = train_test_split(
    X,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit on the training part, then score on the held-out part.
log_regress.fit(X_train, labels_train)
log_regress.score(X_test, labels_test)

#### Cross validation (SVM, linear)

In [None]:
# Use the 1-D `labels` Series as the target (as in the logistic regression
# cell) instead of the one-column `y` DataFrame, which sklearn treats as a
# column vector and warns about.
svm_result = cross_validate(svm, X, labels)
# Also fit on the full data so the fitted model can be inspected/plotted later.
svm.fit(X, labels)

In [None]:
# Mean cross-validation score, kept for the summary cell.
svm_linear_score = svm_result['test_score'].mean()
# Per-fold scores go last: only the final expression of a cell is displayed.
svm_result['test_score']

#### Cross validation (SVM, polynomial up to 3rd degree)

In [None]:
# Use the 1-D `labels` Series as the target instead of the one-column `y`
# DataFrame, which sklearn treats as a column vector and warns about.
svm_poly_result = cross_validate(svm_poly, X, labels)
# Mean cross-validation score, kept for the summary cell.
svm_poly_score = svm_poly_result['test_score'].mean()

In [None]:
# Report the per-fold scores and their mean for the polynomial SVM.
poly_fold_scores = svm_poly_result['test_score']
print(f"All fold scores:{poly_fold_scores}")
print(f"Mean score: {poly_fold_scores.mean()}")

#### CV with hyperparameter tuning (decision tree and random forest)

In [None]:
# Define the parameter space that will be searched over:
# max_depth is sampled from the integers 5..9 (randint's upper bound is exclusive).
param_distributions = {'max_depth': randint(5, 10)}
tree_search = RandomizedSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                             n_iter=10,
                             param_distributions=param_distributions,
                             random_state=0)
# The split cell names the training target `labels_train`; `y_train` is never defined.
tree_search.fit(X_train, labels_train)

In [None]:
# Score the tuned tree on the held-out split. The held-out target is named
# `labels_test`; `y_test` is never defined.
tree_score = tree_search.score(X_test, labels_test)

In [None]:
# Define the parameter space that will be searched over:
# n_estimators from 1..4 and max_depth from 5..9 (randint's upper bound is exclusive).
param_distributions = {'n_estimators': randint(1, 5),
                        'max_depth': randint(5, 10)}
forest_search = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=0),
                             n_iter=10,
                             param_distributions=param_distributions,
                             random_state=0)
# The split cell names the training target `labels_train`; `y_train` is never defined.
forest_search.fit(X_train, labels_train)

In [None]:
# Score the tuned forest on the held-out split. The search object is
# `forest_search` (not `forestSearch`) and the held-out target is `labels_test`.
forest_score = forest_search.score(X_test, labels_test)

In [None]:
# Summary of the scores collected so far.
summary_lines = (
    f"Linear SVM score: {svm_linear_score:.2}",
    f"Polynomial SVM score: {svm_poly_score:.2}",
    f"forest score: {forest_score}",
)
print("\n".join(summary_lines))

### Looking at decision boundaries/threshold

In [None]:
# Intercept and single feature coefficient of the fitted logistic model.
beta_0 = log_regress.intercept_[0]
beta_1 = log_regress.coef_[0, 0]

# Feature value at which beta_0 + beta_1 * x equals zero.
cutoff = -(beta_0 / beta_1)

print(f"The cutoff value for the feature is: {cutoff}")

In [None]:
# Display the logistic regression coefficient of the single feature.
beta_1

In [None]:
# Draw the best decision tree found by the randomized search.
best_tree = tree_search.best_estimator_
plot_tree(decision_tree=best_tree)

In [None]:
# Display the held-out labels. The split cell names them `labels_test`;
# `y_test` is never defined.
labels_test

In [None]:
# mlxtend expects a 1-D integer label array; y is a one-column DataFrame, so
# its (n, 1) array is flattened before plotting.
plot_decision_regions(X.to_numpy(), y.to_numpy().ravel(), clf=svm, legend=2)

# Adding axes annotations
plt.xlabel('IMMAGE score')
plt.title('SVM classification')

plt.show()

In [None]:
# NOTE(review): stray expression - only displays the seaborn module object;
# looks like leftover scratch and is a candidate for removal.
sns