# ML Pipeline

In [74]:
# imports
import pandas as pd
import numpy as np
import random

from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn import linear_model


In [42]:
# read in df
path_win = 'G://My Drive//Forschung//Mitarbeiter//Allgaier//23-12-06_Immun-ML//04_Data//03_ML//'

df = pd.read_excel(path_win + '2024.04.24_Mastertabelle_ML.xlsx', index_col='Unnamed: 0')

## Prepare

In [43]:
# drop rows where target is None
df = df[df['future_measurement_val'].notna()]

In [44]:
# create new feature
df['n_events_so_far'] = df['n_vaccinations_so_far'] + df['n_infections_so_far']

In [67]:
# define feature list
features = ['Alter', 'Geschlecht', 'Dialyse_x', 'n_events_so_far', 'SARS-IgG', 'vaccination']

In [46]:
# define target
target = ['future_measurement_val']

## Define train and test users

In [81]:
# define train and test users
random.seed(1994)

all_users = df['ID'].unique().tolist()
train_users = random.sample(all_users, int(len(all_users)*0.8))
test_users = [user for user in all_users if user not in train_users]

assert set(train_users + test_users) == set(all_users)

In [82]:
# define train and test dataframes
df_train = df[df['ID'].isin(train_users)]
df_test = df[df['ID'].isin(test_users)]

assert df_train.shape[0] + df_test.shape[0] == df.shape[0]

## Transform

In [83]:
# Alter
bins = np.arange(0, 120, 10)
df_train.loc[:,'Alter_bins'] = pd.cut(df_train.Alter, bins)

le_age = LabelEncoder()
df_train.loc[:,'Alter_bins'] = le_age.fit_transform(df_train['Alter_bins'].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.loc[:,'Alter_bins'] = pd.cut(df_train.Alter, bins)


In [84]:
#SARS-IgG
rs = RobustScaler()
rs.fit(df_train['SARS-IgG'].values.reshape(-1, 1))
df_train.loc[:, 'SARS-IgG'] = rs.transform(df_train.loc[:,'SARS-IgG'].values.reshape(-1, 1))

## Group Cross Validate

In [105]:
# define features and target
X = df_train[features]
y = df_train[target]

# Retrieve an identifier column 'ID' from the dataframe
groups = df_train['ID']
# Initialize a Lasso regression model.
lasso = linear_model.Lasso()
# Perform 5-fold group cross-validation to evaluate model using negative MSE.
scores = list(cross_val_score(lasso, X, y, groups=groups,cv=5, scoring='neg_mean_squared_error'))
# Convert MSE scores to RMSE scores.
rmse_scores = [(-1*score)**0.5 for score in scores]
print('mean RMSE:\t', int(np.array(rmse_scores).mean()))
print('std RMSE:\t', int(np.array(rmse_scores).std()))

#TODO: go to hold-out test set


mean RMSE:	 2525
std RMSE:	 222


## How good is a simple heuristic?

In [113]:
m = y.mean().iloc[0]

y_pred = [m for i in range(y.shape[0])]
y_true = y

from sklearn.metrics import mean_squared_error
int(mean_squared_error(y, y_pred)**0.5)

2444