# Initial Data Exploration on Preprocessed Data

In [None]:
# imports
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Change the current working directory to the content root.
# NOTE(review): '../..' presumes the notebook sits two levels below the root.
project_root_name = 'immun-ml'
# Only move up when we are not already in the root, so re-running this cell
# does not climb two more levels and trip the check below.
if os.path.basename(os.getcwd()) != project_root_name:
    os.chdir('../..')
# Check that this worked out. os.path.basename is used instead of splitting on
# '/' so the check also holds with Windows path separators.
assert os.path.basename(os.getcwd()) == project_root_name, os.getcwd()
# Silence pandas' chained-assignment warning: later cells assign columns on
# frames that were derived from other frames.
pd.options.mode.chained_assignment = None

In [None]:
# Load the target and control tables from their parquet files
# (paths are relative to the current working directory).
raw_target_df, raw_control_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'data/immun-ml_targets_v1.parquet',
        'data/immun-ml_control_v1.parquet',
    )
)

## Analyse available SARS-IgG Data

In [None]:
# Target group: keep the SARS-IgG columns, skipping any whose name contains
# 'dicho', and count the non-null entries per row.
sars_igg_columns = [col for col in raw_target_df.columns if 'SARS-IgG' in col and 'dicho' not in col]
# .copy() makes target_df an independent frame, so adding the helper column
# does not depend on the chained-assignment warning being silenced.
target_df = raw_target_df[sars_igg_columns].copy()
# The right-hand side is evaluated before the column exists, so only the
# selected measurement columns are counted.
target_df['non_null_count'] = target_df.notnull().sum(axis=1)

# Control group: same selection and per-row count.
sars_igg_columns = [col for col in raw_control_df.columns if 'SARS-IgG' in col and 'dicho' not in col]
control_df = raw_control_df[sars_igg_columns].copy()
control_df['non_null_count'] = control_df.notnull().sum(axis=1)

In [None]:
# Summary statistics of the per-row count of filled SARS-IgG columns (target group).
print('How many SARS-IgG values do we have per target patient?')
target_df.loc[:, 'non_null_count'].describe()

In [None]:
# Summary statistics of the per-row count of filled SARS-IgG columns (control group).
print('How many SARS-IgG values do we have per control patient?')
control_df.loc[:, 'non_null_count'].describe()

In [None]:
# Drop the last column (the 'non_null_count' helper) and shorten every column
# name to the part after its final '_'; that suffix becomes the time point label.
# rename() returns a new frame, so the slice is not mutated in place.
sub_control_df = control_df.iloc[:, :-1].rename(columns=lambda col: col.split('_')[-1])
sub_target_df = target_df.iloc[:, :-1].rename(columns=lambda col: col.split('_')[-1])

plt.figure(figsize=(15, 5))

# Long format: one row per (time_point, Value). Entries that cannot be parsed
# as numbers are coerced to NaN and dropped together with the missing ones.
# The 'dialysis' column tags the group a row came from.
target_df_melted = (
    pd.melt(sub_target_df, var_name='time_point', value_name='Value')
    .assign(Value=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .dropna(subset=['Value'])
    .assign(dialysis='yes')
)
control_df_melted = (
    pd.melt(sub_control_df, var_name='time_point', value_name='Value')
    .assign(Value=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .dropna(subset=['Value'])
    .assign(dialysis='no')
)
# ignore_index gives merged a unique RangeIndex; otherwise index labels from
# the two groups can repeat.
merged = pd.concat([target_df_melted, control_df_melted], ignore_index=True)

g = sns.boxenplot(data=merged, x='time_point', y='Value', hue='dialysis')
# NOTE(review): a log axis cannot show values <= 0 - confirm none are expected.
g.set_yscale("log")

plt.title('SARS-IgG distribution by time points')

In [None]:
# Line plot of Value over time_point, one line per 'dialysis' group, drawn on
# a logarithmic y-axis from the `merged` long-format frame.
plt.figure(figsize=(15, 5))
g = sns.lineplot(x='time_point', y='Value', hue='dialysis', data=merged)
g.set(yscale='log')
plt.title('SARS-IgG distribution by time points')

In [None]:
# Non-null entries per column for each group, placed side by side.
target_values, control_values = target_df.count(), control_df.count()
value_counts = pd.concat(
    [target_values, control_values],
    keys=['target', 'control'],
    axis=1,
)
value_counts

## Analyse available SARS-RAI Data

In [None]:
# Display the raw target frame as the cell output (bare expression, no assignment).
raw_target_df

In [None]:
# Target group: keep the columns containing 'RAI_', skipping any whose name
# contains 'strati', and count the non-null entries per row.
# NOTE: the variable name is carried over from the IgG section; here it holds
# the RAI column names.
sars_igg_columns = [col for col in raw_target_df.columns if 'RAI_' in col and 'strati' not in col]
# .copy() makes target_df an independent frame, so adding the helper column
# does not depend on the chained-assignment warning being silenced.
target_df = raw_target_df[sars_igg_columns].copy()
# The right-hand side is evaluated before the column exists, so only the
# selected measurement columns are counted.
target_df['non_null_count'] = target_df.notnull().sum(axis=1)

# Control group: same selection and per-row count.
sars_igg_columns = [col for col in raw_control_df.columns if 'RAI_' in col and 'strati' not in col]
control_df = raw_control_df[sars_igg_columns].copy()
control_df['non_null_count'] = control_df.notnull().sum(axis=1)

In [None]:
# Summary statistics of the per-row count of filled SARS-RAI columns (target group).
print('How many SARS-RAI values do we have per target patient?')
target_df.loc[:, 'non_null_count'].describe()

In [None]:
# Summary statistics of the per-row count of filled SARS-RAI columns (control group).
print('How many SARS-RAI values do we have per control patient?')
control_df.loc[:, 'non_null_count'].describe()

In [None]:
# Drop the last column (the 'non_null_count' helper) and shorten every column
# name to the part after its final '_'; that suffix becomes the time point label.
# rename() returns a new frame, so the slice is not mutated in place.
sub_control_df = control_df.iloc[:, :-1].rename(columns=lambda col: col.split('_')[-1])
sub_target_df = target_df.iloc[:, :-1].rename(columns=lambda col: col.split('_')[-1])

plt.figure(figsize=(15, 5))

# Long format: one row per (time_point, Value). Entries that cannot be parsed
# as numbers are coerced to NaN and dropped together with the missing ones.
# The 'dialysis' column tags the group a row came from.
target_df_melted = (
    pd.melt(sub_target_df, var_name='time_point', value_name='Value')
    .assign(Value=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .dropna(subset=['Value'])
    .assign(dialysis='yes')
)
control_df_melted = (
    pd.melt(sub_control_df, var_name='time_point', value_name='Value')
    .assign(Value=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .dropna(subset=['Value'])
    .assign(dialysis='no')
)
# ignore_index gives merged a unique RangeIndex; otherwise index labels from
# the two groups can repeat.
merged = pd.concat([target_df_melted, control_df_melted], ignore_index=True)

g = sns.boxenplot(data=merged, x='time_point', y='Value', hue='dialysis')
# NOTE(review): a log axis cannot show values <= 0 - confirm none are expected.
g.set_yscale("log")

plt.title('SARS-RAI distribution by time points')

In [None]:
# Line plot of Value over time_point, one line per 'dialysis' group, drawn on
# a logarithmic y-axis from the `merged` long-format frame.
plt.figure(figsize=(15, 5))
g = sns.lineplot(x='time_point', y='Value', hue='dialysis', data=merged)
g.set(yscale='log')
plt.title('SARS-RAI distribution by time points')

In [None]:
# Non-null entries per column for each group, placed side by side.
target_values, control_values = target_df.count(), control_df.count()
value_counts = pd.concat(
    [target_values, control_values],
    keys=['target', 'control'],
    axis=1,
)
value_counts

## NT_norm
TODO add me

### SFU_SARS

In [None]:
# Target group: keep the columns containing 'SFU_SARS', skipping any whose
# name contains 'dicho', and count the non-null entries per row.
# NOTE: the variable name is carried over from the IgG section; here it holds
# the SFU_SARS column names.
sars_igg_columns = [col for col in raw_target_df.columns if 'SFU_SARS' in col and 'dicho' not in col]
# .copy() makes target_df an independent frame, so adding the helper column
# does not depend on the chained-assignment warning being silenced.
target_df = raw_target_df[sars_igg_columns].copy()
# The right-hand side is evaluated before the column exists, so only the
# selected measurement columns are counted.
target_df['non_null_count'] = target_df.notnull().sum(axis=1)

# Control group: same selection and per-row count.
sars_igg_columns = [col for col in raw_control_df.columns if 'SFU_SARS' in col and 'dicho' not in col]
control_df = raw_control_df[sars_igg_columns].copy()
control_df['non_null_count'] = control_df.notnull().sum(axis=1)

In [None]:
# Summary statistics of the per-row count of filled SFU_SARS columns (target group).
print('How many SFU_SARS values do we have per target patient?')
target_df.loc[:, 'non_null_count'].describe()

In [None]:
# Summary statistics of the per-row count of filled SFU_SARS columns (control group).
print('How many SFU_SARS values do we have per control patient?')
control_df.loc[:, 'non_null_count'].describe()

In [None]:
# Drop the last column (the 'non_null_count' helper) and shorten every column
# name to the part after its final '_'; that suffix becomes the time point label.
# rename() returns a new frame, so the slice is not mutated in place.
sub_control_df = control_df.iloc[:, :-1].rename(columns=lambda col: col.split('_')[-1])
sub_target_df = target_df.iloc[:, :-1].rename(columns=lambda col: col.split('_')[-1])

plt.figure(figsize=(15, 5))

# Long format: one row per (time_point, Value). Entries that cannot be parsed
# as numbers are coerced to NaN and dropped together with the missing ones.
# The 'dialysis' column tags the group a row came from.
target_df_melted = (
    pd.melt(sub_target_df, var_name='time_point', value_name='Value')
    .assign(Value=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .dropna(subset=['Value'])
    .assign(dialysis='yes')
)
control_df_melted = (
    pd.melt(sub_control_df, var_name='time_point', value_name='Value')
    .assign(Value=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .dropna(subset=['Value'])
    .assign(dialysis='no')
)
# ignore_index gives merged a unique RangeIndex; otherwise index labels from
# the two groups can repeat.
merged = pd.concat([target_df_melted, control_df_melted], ignore_index=True)

g = sns.boxenplot(data=merged, x='time_point', y='Value', hue='dialysis')
# NOTE(review): a log axis cannot show values <= 0 - confirm none are expected.
g.set_yscale("log")

plt.title('SFU_SARS distribution by time points')

In [None]:
# Line plot of Value over time_point, one line per 'dialysis' group, drawn on
# a logarithmic y-axis from the `merged` long-format frame.
plt.figure(figsize=(15, 5))
g = sns.lineplot(x='time_point', y='Value', hue='dialysis', data=merged)
g.set(yscale='log')
plt.title('SFU_SARS distribution by time points')

In [None]:
# Non-null entries per column for each group, placed side by side.
target_values, control_values = target_df.count(), control_df.count()
value_counts = pd.concat(
    [target_values, control_values],
    keys=['target', 'control'],
    axis=1,
)
value_counts

### SFUsub_SARS

In [None]:
# Target group: keep the columns containing 'SFUsub_SARS', skipping any whose
# name contains 'dicho', and count the non-null entries per row.
# NOTE: the variable name is carried over from the IgG section; here it holds
# the SFUsub_SARS column names.
sars_igg_columns = [col for col in raw_target_df.columns if 'SFUsub_SARS' in col and 'dicho' not in col]
# .copy() makes target_df an independent frame, so adding the helper column
# does not depend on the chained-assignment warning being silenced.
target_df = raw_target_df[sars_igg_columns].copy()
# The right-hand side is evaluated before the column exists, so only the
# selected measurement columns are counted.
target_df['non_null_count'] = target_df.notnull().sum(axis=1)

# Control group: same selection and per-row count.
sars_igg_columns = [col for col in raw_control_df.columns if 'SFUsub_SARS' in col and 'dicho' not in col]
control_df = raw_control_df[sars_igg_columns].copy()
control_df['non_null_count'] = control_df.notnull().sum(axis=1)

In [None]:
# Summary statistics of the per-row count of filled SFUsub_SARS columns (target group).
print('How many SFUsub_SARS values do we have per target patient?')
target_df.loc[:, 'non_null_count'].describe()

In [None]:
# Summary statistics of the per-row count of filled SFUsub_SARS columns (control group).
print('How many SFUsub_SARS values do we have per control patient?')
control_df.loc[:, 'non_null_count'].describe()

In [None]:
# Drop the last column (the 'non_null_count' helper) and shorten every column
# name to the part after its final '_'; that suffix becomes the time point label.
# rename() returns a new frame, so the slice is not mutated in place.
sub_control_df = control_df.iloc[:, :-1].rename(columns=lambda col: col.split('_')[-1])
sub_target_df = target_df.iloc[:, :-1].rename(columns=lambda col: col.split('_')[-1])

plt.figure(figsize=(15, 5))

# Long format: one row per (time_point, Value). Entries that cannot be parsed
# as numbers are coerced to NaN and dropped together with the missing ones.
# The 'dialysis' column tags the group a row came from.
target_df_melted = (
    pd.melt(sub_target_df, var_name='time_point', value_name='Value')
    .assign(Value=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .dropna(subset=['Value'])
    .assign(dialysis='yes')
)
control_df_melted = (
    pd.melt(sub_control_df, var_name='time_point', value_name='Value')
    .assign(Value=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .dropna(subset=['Value'])
    .assign(dialysis='no')
)
# ignore_index gives merged a unique RangeIndex; otherwise index labels from
# the two groups can repeat.
merged = pd.concat([target_df_melted, control_df_melted], ignore_index=True)

g = sns.boxenplot(data=merged, x='time_point', y='Value', hue='dialysis')
# NOTE(review): a log axis cannot show values <= 0 - confirm none are expected.
g.set_yscale("log")

plt.title('SFUsub_SARS distribution by time points')

In [None]:
# Line plot of Value over time_point, one line per 'dialysis' group, drawn on
# a logarithmic y-axis from the `merged` long-format frame.
plt.figure(figsize=(15, 5))
g = sns.lineplot(x='time_point', y='Value', hue='dialysis', data=merged)
g.set(yscale='log')
plt.title('SFUsub_SARS distribution by time points')

In [None]:
# Non-null entries per column for each group, placed side by side.
target_values, control_values = target_df.count(), control_df.count()
value_counts = pd.concat(
    [target_values, control_values],
    keys=['target', 'control'],
    axis=1,
)
value_counts

### SI_SARS

In [None]:
# Target group: keep the columns containing 'SI_SARS', skipping any whose
# name contains 'dicho', and count the non-null entries per row.
# NOTE: the variable name is carried over from the IgG section; here it holds
# the SI_SARS column names.
sars_igg_columns = [col for col in raw_target_df.columns if 'SI_SARS' in col and 'dicho' not in col]
# .copy() makes target_df an independent frame, so adding the helper column
# does not depend on the chained-assignment warning being silenced.
target_df = raw_target_df[sars_igg_columns].copy()
# The right-hand side is evaluated before the column exists, so only the
# selected measurement columns are counted.
target_df['non_null_count'] = target_df.notnull().sum(axis=1)

# Control group: same selection and per-row count.
sars_igg_columns = [col for col in raw_control_df.columns if 'SI_SARS' in col and 'dicho' not in col]
control_df = raw_control_df[sars_igg_columns].copy()
control_df['non_null_count'] = control_df.notnull().sum(axis=1)

In [None]:
# Summary statistics of the per-row count of filled SI_SARS columns (target group).
print('How many SI_SARS values do we have per target patient?')
target_df.loc[:, 'non_null_count'].describe()

In [None]:
# Summary statistics of the per-row count of filled SI_SARS columns (control group).
print('How many SI_SARS values do we have per control patient?')
control_df.loc[:, 'non_null_count'].describe()

In [None]:
# Drop the last column (the 'non_null_count' helper) and shorten every column
# name to the part after its final '_'; that suffix becomes the time point label.
# rename() returns a new frame, so the slice is not mutated in place.
sub_control_df = control_df.iloc[:, :-1].rename(columns=lambda col: col.split('_')[-1])
sub_target_df = target_df.iloc[:, :-1].rename(columns=lambda col: col.split('_')[-1])

plt.figure(figsize=(15, 5))

# Long format: one row per (time_point, Value). Entries that cannot be parsed
# as numbers are coerced to NaN and dropped together with the missing ones.
# The 'dialysis' column tags the group a row came from.
target_df_melted = (
    pd.melt(sub_target_df, var_name='time_point', value_name='Value')
    .assign(Value=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .dropna(subset=['Value'])
    .assign(dialysis='yes')
)
control_df_melted = (
    pd.melt(sub_control_df, var_name='time_point', value_name='Value')
    .assign(Value=lambda df: pd.to_numeric(df['Value'], errors='coerce'))
    .dropna(subset=['Value'])
    .assign(dialysis='no')
)
# ignore_index gives merged a unique RangeIndex; otherwise index labels from
# the two groups can repeat.
merged = pd.concat([target_df_melted, control_df_melted], ignore_index=True)

g = sns.boxenplot(data=merged, x='time_point', y='Value', hue='dialysis')
# NOTE(review): a log axis cannot show values <= 0 - confirm none are expected.
g.set_yscale("log")

plt.title('SI_SARS distribution by time points')

In [None]:
# Line plot of Value over time_point, one line per 'dialysis' group, drawn on
# a logarithmic y-axis from the `merged` long-format frame.
plt.figure(figsize=(15, 5))
g = sns.lineplot(x='time_point', y='Value', hue='dialysis', data=merged)
g.set(yscale='log')
plt.title('SI_SARS distribution by time points')

In [None]:
# Non-null entries per column for each group, placed side by side.
target_values, control_values = target_df.count(), control_df.count()
value_counts = pd.concat(
    [target_values, control_values],
    keys=['target', 'control'],
    axis=1,
)
value_counts

### Correlation

In [None]:
### The dataset is still too polluted for this; it requires more preprocessing...