# ML Pipeline

In [45]:
# imports
import pandas as pd
import numpy as np
import random

from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [46]:
# read in df
path_win = 'G://My Drive//Forschung//Mitarbeiter//Allgaier//23-12-06_Immun-ML//04_Data//03_ML//'
path_mac = '/Users/johannesallgaier/My Drive/Forschung/Mitarbeiter/Allgaier/23-12-06_Immun-ML/04_Data/03_ML'

df = pd.read_excel('../../data/03_ml/2024.04.24_Mastertabelle_ML.xlsx', index_col='Unnamed: 0')

## Prepare

In [47]:
# drop rows where target is None
df = df[df['future_measurement_val'].notna()]

In [48]:
# create new feature
df['n_events_so_far'] = df['n_vaccinations_so_far'] + df['n_infections_so_far']

In [49]:
# define feature list
features = ['Alter', 'Geschlecht', 'Dialyse_x', 'n_vaccinations_so_far', 'n_infections_so_far','SARS-IgG', 'vaccination']

In [50]:
# define target
target = ['future_measurement_val']

## Define train and test users

In [51]:
# define train and test users
random.seed(1994)

all_users = df['ID'].unique().tolist()
train_users = random.sample(all_users, int(len(all_users)*0.8))
test_users = [user for user in all_users if user not in train_users]

assert set(train_users + test_users) == set(all_users)

In [52]:
# define train and test dataframes
df_train = df[df['ID'].isin(train_users)]
df_test = df[df['ID'].isin(test_users)]

assert df_train.shape[0] + df_test.shape[0] == df.shape[0]

## Transform

In [53]:
# Alter
bins = np.arange(0, 120, 10)
df_train.loc[:,'Alter_bins'] = pd.cut(df_train.Alter, bins)

le_age = LabelEncoder()
df_train.loc[:,'Alter_bins'] = le_age.fit_transform(df_train['Alter_bins'].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train.loc[:,'Alter_bins'] = pd.cut(df_train.Alter, bins)


In [54]:
#SARS-IgG
rs = RobustScaler()
rs.fit(df_train['SARS-IgG'].values.reshape(-1, 1))
df_train.loc[:, 'SARS-IgG'] = rs.transform(df_train.loc[:,'SARS-IgG'].values.reshape(-1, 1))

## Group Cross Validate

In [57]:
# define features and target
X = df_train[features].dropna()
y = df_train.loc[X.index, target].values.ravel()

total_rmse_scores=list()

# Retrieve an identifier column 'ID' from the dataframe
groups = df_train.loc[X.index, 'ID']

for rs in range(100):

    # Initialize a Lasso regression model.
    model = RandomForestRegressor(random_state=1994)
    # Perform 5-fold group cross-validation to evaluate model using negative MSE.
    total_rmse_scores.extend(list(cross_val_score(model, X, y, groups=groups,cv=5, scoring='neg_mean_squared_error')))

# Convert MSE scores to RMSE scores.
rmse_scores = [(-1*score)**0.5 for score in total_rmse_scores]
print('mean RMSE:\t', int(np.array(rmse_scores).mean()))
print('std RMSE:\t', int(np.array(rmse_scores).std()))

#TODO: go to hold-out test set


mean RMSE:	 2508
std RMSE:	 227


### Performance Log of Mean RMSE (Std):
```
features = ['Alter', 'Geschlecht', 'Dialyse_x', 'n_events_so_far','SARS-IgG', 'vaccination']
Mean    2329
Lasso   2525
RF      2220 (495)
RF      2599 (340)
SVR     2446 (500)
RF (default params)     2545 (339)
Adaboost    2642 (327)

features = ['Alter', 'Geschlecht', 'Dialyse_x', 'n_vaccinations_so_far', 'n_infections_so_far','SARS-IgG', 'vaccination']
RF  2508 (227)
```


## How good is a simple heuristic?

In [56]:
m = y.mean()

y_pred = [m for i in range(y.shape[0])]
y_true = y

from sklearn.metrics import mean_squared_error
print('RMSE\t', int(mean_squared_error(y, y_pred)**0.5))

RMSE	 2329
