In [None]:
# Ask jupyter_client for a kernel connection file and print whatever it returns.
# NOTE(review): called with no arguments, so which file is picked is decided by
# jupyter_client's defaults — presumably the one for this kernel; confirm.
from jupyter_client import find_connection_file
connection_file = find_connection_file()
print(connection_file)

In [None]:
#Plotting related
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.io as pio
pio.renderers.default='notebook'

# Data handling (pyarrow, pandas); the scikit-learn, SciPy and mlxtend imports follow below
import pyarrow
import pandas as pd
pd.options.mode.copy_on_write = True


from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from scipy.stats import randint, ttest_ind
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions

In [None]:
# Load ../data/all_vaccines.csv (the path is relative to the working directory)
# into `df`, then preview its first rows as the cell output.
df = pd.read_csv("../data/all_vaccines.csv")
df.head()

In [None]:
# Names of the columns of `df` that the rest of the notebook reads.
dataset_col = "Dataset"
uid_col = "uid"
age_col = "Age"
day_col = "Day"
response_col = "Response"
immage_col = "IMMAGE"

# Keep only rows that have every field the analysis needs. Rebinding `df`
# (instead of inplace=True) keeps the cell re-runnable and chain-friendly.
df = df.dropna(subset=[immage_col, dataset_col, day_col, response_col])

# Go through `dataset_col` rather than the hard-coded `df.Dataset` attribute so
# that changing the constant above is enough to follow a renamed column.
all_dataset_names = df[dataset_col].unique()

# Sanity check: print the first 10 rows of every dataset in full (no row/column
# truncation) to inspect the sampling-day values.
check_days = df[[dataset_col, uid_col, day_col]].groupby(dataset_col).head(10)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(check_days)

In [None]:
# Available datasets:
#
# [      'GSE125921.SDY1529', 'GSE13485.SDY1264', 'GSE13699.SDY1289',
#        'GSE169159', 'GSE190001', 'GSE201533', 'GSE41080.SDY212',
#        'GSE45735.SDY224', 'GSE47353.SDY80', 'GSE48018.SDY1276',
#        'GSE48023.SDY1276', 'GSE52245.SDY1260', 'GSE59635.SDY63',
#        'GSE59654.SDY180', 'GSE59654.SDY404', 'GSE59654.SDY520',
#        'GSE59743.SDY400', 'GSE65834.SDY1328', 'GSE79396.SDY984',
#        'GSE82152.SDY1294', 'SDY1325', 'SDY296', 'SDY67', 'SDY89']

In [None]:
# Choose one study by its position in `all_dataset_names` and keep only its rows.
# NOTE(review): the original inline note names "GSE125921.SDY1529"; the pick
# depends on the order of `all_dataset_names`, so confirm that position 2 is
# really that study.
dataset_name = all_dataset_names[2]
dataset = df[df[dataset_col] == dataset_name]

# Distinct sampling days present for the chosen study (shown as the cell output).
sample_days = dataset[day_col].unique()
sample_days

In [None]:
# Restrict the selected study to a single sampling day.
# If this value is not among `sample_days`, `dataset` ends up empty.
sample_day = "D90"
dataset = dataset[dataset[day_col] == sample_day]

In [None]:
# Response values for the selected study/day, kept as a one-column DataFrame
# because a later cell concatenates it with the features and labels.
responses = dataset[[response_col]]
mean_response = responses.mean()

# 30th and 70th percentiles of the response, unwrapped to plain scalars.
low_response_thr = responses.quantile(q=0.3).item()
high_response_thr = responses.quantile(q=0.7).item()

# Optional: uncomment to inspect the response distribution.
# sns.histplot(responses, bins=100)
# plt.show()

# Median response, displayed as the cell output.
responses.median().item()

In [None]:

# Feature matrix: the IMMAGE score as a one-column DataFrame.
X = dataset[[immage_col]]

# Binary labels as a one-column DataFrame named 'Labels': 1 when the response is
# at or above the 30th-percentile threshold, 0 otherwise.
# The comparison is >= (not >) on purpose, to reduce the chance that
# cross-validation ends up with an empty class in one of its partitions.
# The rename is keyed on `response_col` rather than a hard-coded 'Response'.
y = (
    (dataset[[response_col]] >= low_response_thr)
    .astype(int)
    .rename(columns={response_col: 'Labels'})
)

# Features, raw responses and labels side by side, for plotting and grouping.
data = pd.concat([X, responses, y], axis=1)

# Human-readable label for plot legends (vectorised lookup; labels are only 0/1).
data['Label text'] = data['Labels'].map({1: 'Responders', 0: 'Non-Responders'})

In [None]:
# Histogram of the IMMAGE score and of age for the selected study/day, one
# figure per column, to look at their dynamic range.
for column in (immage_col, age_col):
    sns.histplot(data=dataset, x=column, bins=50)
    plt.title(f' {column} values ({dataset_name})')
    plt.show()

In [None]:
# Scatter the response against the IMMAGE score; point colour (hue) marks the
# responder / non-responder label.
# Columns are addressed through `immage_col` / `response_col` instead of
# repeating the literal names, in line with the other cells.
sorted_data = data.sort_values(by=immage_col)
sns.scatterplot(
    data=sorted_data,
    x=immage_col,
    y=response_col,
    hue="Label text",
    palette='Set1',
)
plt.title(f'Vaccine response vs {immage_col} ({dataset_name})')
plt.show()

In [None]:
# Two-sample t-test on IMMAGE between subjects whose response is below the
# low-response threshold and those at or above it (the same split as the labels).
response_values = data[response_col]
low_group = data.loc[response_values < low_response_thr, immage_col]
high_group = data.loc[response_values >= low_response_thr, immage_col]
print(f"low group N: {low_group.shape[0]}, high group N: {high_group.shape[0]}")

# Run with SciPy's default settings; the result is displayed as the cell output.
ttest = ttest_ind(low_group, high_group)
ttest

In [None]:
# Logistic regression of the responder label on IMMAGE, scored by
# cross-validation with scikit-learn's default settings.
labels = y["Labels"]
log_regress = LogisticRegression()
regression_result = cross_validate(estimator=log_regress, X=X, y=labels)

# Mean test score across the folds (cell output).
regression_result['test_score'].mean()

In [None]:
# Hold out 20% of the samples for testing; the seed is fixed for reproducibility.
X_train, X_test, labels_train, labels_test = train_test_split(
    X, labels, test_size=0.2, random_state=42
)

# Fit on the training part and report the score on the held-out part.
# The score used to be computed and silently discarded because the call was not
# the last expression of the cell.
log_regress.fit(X_train, labels_train)
test_accuracy = log_regress.score(X_test, labels_test)
print(f"Held-out test accuracy: {test_accuracy:.3f}")

# Intercept and the single feature coefficient of the fitted model.
beta_0 = log_regress.intercept_[0]
beta_1 = log_regress.coef_[0][0]

# Decision boundary of a one-feature logistic model: the predicted probability
# is 0.5 where beta_0 + beta_1 * x = 0, i.e. at x = -beta_0 / beta_1.
cutoff = -beta_0 / beta_1

print(f"The cutoff value for the feature is: {cutoff}")

In [None]:
# Plot IMMAGE values in ascending order (x = rank after sorting, y = IMMAGE),
# coloured by label, together with the logistic-regression cutoff.
# `cutoff` is an IMMAGE value and IMMAGE is on the y-axis, so the cutoff must be
# a horizontal line; the previous vertical line at x=cutoff placed it on the
# rank axis, where the number has no meaning.
sns.scatterplot(
    data=data.sort_values(immage_col, ignore_index=True).reset_index(),
    x="index",
    y=immage_col,
    hue="Label text",
)
plt.axhline(y=cutoff)
plt.title(f'sorted IMMAGE vs Index ({dataset_name})')
plt.show()