In [None]:
import pandas as pd
import numpy as np

In [None]:
import os

# Locations of the inputs used by the cells below. Each path can be overridden
# through an environment variable so the notebook also runs on machines that do
# not share this directory layout; when a variable is unset, the original path
# is used unchanged.
# Directory handed to assemble_variable_database as the raw extraction location.
data_path = os.environ.get(
    'OPSUM_DATA_PATH',
    '/Users/jk1/stroke_datasets/stroke_unit_dataset/per_value/Extraction_20211110',
)
# Excel file with the stroke registry (later read with pd.read_excel).
admission_data_path = os.environ.get(
    'OPSUM_ADMISSION_DATA_PATH',
    '/Users/jk1/OneDrive - unige.ch/stroke_research/geneva_stroke_unit_dataset/data/stroke_registry/post_hoc_modified/stroke_registry_post_hoc_modified.xlsx',
)
# CSV file with the patient selection (later read with pd.read_csv).
patient_selection_path = os.environ.get(
    'OPSUM_PATIENT_SELECTION_PATH',
    '/Users/jk1/temp/opsum_extration_output/high_frequency_data_patient_selection.csv',
)

In [None]:
from preprocessing.variable_assembly.variable_database_assembly import assemble_variable_database

# Assemble the variable database from the three configured input locations
# (raw extraction, admission registry, patient selection).
feature_df = assemble_variable_database(
    data_path,
    admission_data_path,
    patient_selection_path,
)

In [None]:
from preprocessing.variable_assembly.relative_timestamps import transform_to_relative_timestamps

# Convert timestamps to relative ones, keeping the original columns and
# restricting the samples to the time range applied by the helper.
restricted_feature_df = transform_to_relative_timestamps(
    feature_df,
    restrict_to_time_range=True,
    drop_old_columns=False,
)

In [None]:
from preprocessing.normalisation.normalisation import normalise_data

# Normalise the time-restricted feature table; verbose output is enabled.
normalised_restricted_feature_df = normalise_data(
    restricted_feature_df,
    verbose=True,
)

In [None]:
# Preview the first rows of the normalised feature table.
normalised_restricted_feature_df.head()

In [None]:
# Assign every sample to a whole-number time bucket by flooring its relative
# sample date (presumably expressed in hours - confirm against preprocessing).
normalised_restricted_feature_df['relative_sample_date_hourly_cat'] = (
    normalised_restricted_feature_df['relative_sample_date'].pipe(np.floor)
)

In [None]:
# Preview again to check the newly added 'relative_sample_date_hourly_cat' column.
normalised_restricted_feature_df.head()

In [None]:
# Keep only the samples that fall into the first time bucket (category 0).
hour0_df = normalised_restricted_feature_df.loc[
    normalised_restricted_feature_df['relative_sample_date_hourly_cat'].eq(0)
]

In [None]:
# For simplicity, only the first row of each (case_admission_id, sample_label)
# pair is kept; ideally duplicates should be replaced by their
# median / mode / min / max in preprocessing.
# Note: this cell overwrites hour0_df with its wide form (one row per admission,
# one column per sample label), so it cannot be re-run on its own output.
hour0_df = (
    hour0_df
    .drop_duplicates(subset=['case_admission_id', 'sample_label'])
    .loc[:, ['case_admission_id', 'sample_label', 'value']]
    .pivot(index='case_admission_id', columns='sample_label', values='value')
)
hour0_df

In [None]:
# Turn the 'case_admission_id' index back into a regular column so it can be
# used as the merge key with the outcomes.
hour0_df = hour0_df.reset_index(drop=False)

In [None]:
from preprocessing.outcome_preprocessing.outcome_preprocessing import preprocess_outcomes

# Load the two raw inputs; the patient selection is read with every column as
# string, the registry is read from the Excel file.
patient_selection_df = pd.read_csv(filepath_or_buffer=patient_selection_path, dtype=str)
stroke_registry_df = pd.read_excel(io=admission_data_path)

# Derive the outcome table for the selected patients.
outcome_df = preprocess_outcomes(stroke_registry_df, patient_selection_df)

In [None]:
# Preview the first rows of the preprocessed outcomes.
outcome_df.head()

In [None]:
# Attach the outcomes to the hour-0 features. This is an inner join, so only
# admissions present in both tables are kept.
hour0_df_with_outcomes = hour0_df.merge(
    outcome_df,
    how='inner',
    left_on='case_admission_id',
    right_on='patient_admission_id',
)
hour0_df_with_outcomes.head()

In [None]:
# The model input is everything except the admission identifier and the
# columns that came from the outcome table.
columns_to_drop = ['case_admission_id', *outcome_df.columns]
input_hour0_df = hour0_df_with_outcomes.drop(columns=columns_to_drop)

In [None]:
# Factorize a column when its most frequent value is a string
def _is_string_valued(column):
    """Return True when the most frequent non-missing value of ``column`` is a string.

    A column without any non-missing value has an empty mode; it is reported as
    not string-valued so that it is passed through unchanged instead of raising
    on the lookup of the first mode.
    """
    modes = column.mode(dropna=True)
    return len(modes) > 0 and isinstance(modes.iloc[0], str)


# Replace string-valued columns by integer codes; all other columns are kept as they are.
# NOTE(review): pd.factorize encodes missing values as -1, so missing entries of a
# string column become the code -1 rather than NaN - confirm this is intended.
factorized_hour0_df = input_hour0_df.apply(
    lambda column: pd.factorize(column)[0] if _is_string_valued(column) else column
)

In [None]:
from sklearn.decomposition import PCA

# Standard PCA projection of the factorized features onto two components.
# NOTE(review): scikit-learn's PCA rejects NaN input - confirm that
# factorized_hour0_df has no missing values at this point.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X=factorized_hour0_df)

In [None]:
# Display the factorized feature table.
factorized_hour0_df

In [None]:
from ppca import PPCA
# Create the PPCA model with its default constructor arguments.
# Note: the instance name shadows the imported package name `ppca`.
ppca = PPCA()

In [None]:
# Cast every column to 64-bit floats before handing the table to PPCA.
factorized_hour0_df = factorized_hour0_df.astype('float64')

In [None]:
# (n_rows, n_columns) of the matrix that is passed to PPCA.
factorized_hour0_df.shape

In [None]:
# Number of PPCA components to fit. Three are required because the 3D scatter
# plot at the end of the notebook reads components 0, 1 and 2; the 2D plot only
# uses the first two.
n_components = 3

In [None]:
# Seed NumPy's global generator before fitting so repeated runs produce the
# same components.
# NOTE(review): presumably PPCA's fit starts from a random NumPy initialisation -
# confirm against the ppca package; if it does not, the seed is a harmless no-op.
np.random.seed(42)
# Fit PPCA with n_components dimensions on the feature matrix.
model = ppca.fit(data=factorized_hour0_df.to_numpy(), d=n_components, verbose=True)

In [None]:
# Keep a reference to the fitted model's `C` attribute
# (NOTE(review): presumably the component/loading matrix - confirm against the ppca package).
model_params = ppca.C

In [None]:
# Assign the saved parameters back onto the model. In a straight top-to-bottom
# run this is a no-op, because model_params was read from ppca.C just before.
ppca.C = model_params

In [None]:
# Project onto the fitted components.
# NOTE(review): transform() is called without data; presumably it projects the
# data seen during fit - confirm against the ppca package.
component_matrix = ppca.transform()

In [None]:
# Put the projected components (integer-named columns 0, 1, ...) side by side
# with the '3M mRS' outcome; rows are aligned on the index.
components_with_outcomes = pd.concat(
    objs=[
        pd.DataFrame(data=component_matrix),
        hour0_df_with_outcomes['3M mRS'],
    ],
    axis='columns',
)

In [None]:
# Preview the components together with the outcome column.
components_with_outcomes.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Scatter the samples in the plane of the first two components, coloured by
# the '3M mRS' outcome.
_fig, ax = plt.subplots()
sns.scatterplot(
    x=components_with_outcomes.loc[:, 0],
    y=components_with_outcomes.loc[:, 1],
    hue=components_with_outcomes.loc[:, '3M mRS'],
    ax=ax,
)
plt.show()

In [None]:
import plotly.express as px

# Interactive scatter of the projected samples, coloured by the '3M mRS' outcome.
# Plotly Express only accepts a column name or an array-like for `size`; a bare
# integer raises when no data_frame is given, so a constant marker size is set
# on the traces instead.
marker_size = 3  # constant marker size in pixels

if 2 in components_with_outcomes.columns:
    # A third component is available: draw the 3D scatter.
    fig = px.scatter_3d(
        x=components_with_outcomes[0],
        y=components_with_outcomes[1],
        z=components_with_outcomes[2],
        color=components_with_outcomes['3M mRS'],
        opacity=0.7,
    )
else:
    # Fewer than three components were fitted: fall back to a 2D scatter
    # instead of failing on the missing column.
    fig = px.scatter(
        x=components_with_outcomes[0],
        y=components_with_outcomes[1],
        color=components_with_outcomes['3M mRS'],
        opacity=0.7,
    )
fig.update_traces(marker={'size': marker_size})
fig.show()