In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from xgboost import XGBRegressor

from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import BaggingRegressor, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

from utils import reg_scores

In [None]:
# Display options: show up to 1000 columns and render floats with three decimals.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.float_format', '{:.3f}'.format)
sns.set_theme(style='whitegrid')

# Experiment configuration.
model_types = ['extratree', 'gradientboost']
cls = model_types[0]        # currently selected entry of model_types
thres = 1.03                # cut-off compared against 'fnc_ber_com_composite'
random_state = 42           # seed shared by the data split and the estimators
n_iter = 30                 # NOTE(review): presumably a search iteration budget - confirm

bayes = True                # NOTE(review): presumably toggles Bayesian tuning - confirm
ptau217_index = 0           # position in the ptau217 list of the biomarker to use

In [None]:
# Read the raw table from a path relative to the notebook's working directory.
load_bf2_df = pd.read_csv(filepath_or_buffer='csv/BF2_R.csv')

In [None]:
# Candidate p-tau217 measurement columns; one of them is picked by ptau217_index.
ptau217 = [
    'Plasma WashU %p-tau217',
    'Plasma Lilly p-tau217',
    'CSF Lilly p-tau217',
    'CSF WashU p-tau217',
]

# Columns selected alongside the chosen p-tau217 column (the last one is the target).
common = [
    'CSF AB42/AB40',
    'Age',
    'APOE',
    'ADAS',
    'Education',
    'Sex',
    'Cognitive status',
    'MMSE',
    'CSF Abnormal Ratio',
    'Diagnosis status',
    'fnc_ber_com_composite',
]

# Columns removed before modelling. 'Age' and 'APOE' are deliberately not listed,
# so they stay in the feature set.
cd_drop = [
    'ADAS',
    'Education',
    'Sex',
    'Cognitive status',
    'MMSE',
    'CSF Abnormal Ratio',
    'Diagnosis status',
]

# Short labels, one per entry of ptau217.
# NOTE(review): presumably listed in the same order as ptau217 - confirm.
name = [
    'BF2-P-MS',
    'BF2-P-IA',
    'BF2-C-IA',
    'BF2-C-MS',
]

In [None]:
# Columns to keep: the selected p-tau217 measurement followed by the shared columns.
features = [ptau217[ptau217_index], *common]

# Keep only rows with no missing value in any selected column, renumbered from zero.
select_df = (
    load_bf2_df[features]
    .dropna(how='any')
    .reset_index(drop=True)
)
select_df.info()

In [None]:
# Boolean masks: rows at or below the threshold versus rows above it.
neg_idx = select_df['fnc_ber_com_composite'] <= thres
pos_idx = ~neg_idx
neg_df, pos_df = select_df[neg_idx], select_df[pos_idx]

# Hold out 20% of the above-threshold rows as the test set.
pos_tv_df, pos_test_df = train_test_split(
    pos_df,
    test_size=0.2,
    random_state=random_state,
)

# Remove the cd_drop columns from both partitions.
tv_df = pos_tv_df.drop(columns=cd_drop)
test_df = pos_test_df.drop(columns=cd_drop)

# Separate the test features from the target column.
y_test = test_df['fnc_ber_com_composite']
X_test = test_df.drop(columns=['fnc_ber_com_composite'])

In [None]:
# Print a summary of the train/validation frame left after dropping the cd_drop columns.
tv_df.info()

In [None]:
# Candidate regressors; every one is scored with the same manual k-fold split below.
model_list = [
    SVR(kernel='poly'),
    Ridge(alpha=0.5, random_state=random_state),
    AdaBoostRegressor(n_estimators=450, random_state=random_state),
    KNeighborsRegressor(n_neighbors=10),
    GradientBoostingRegressor(max_depth=2, n_estimators=50, random_state=random_state),
    DecisionTreeRegressor(max_depth=3, random_state=random_state),
    BaggingRegressor(n_estimators=100, random_state=random_state),
    XGBRegressor(max_depth=2, n_estimators=50, eta=0.09, random_state=random_state),
    ExtraTreesRegressor(max_depth=6, random_state=random_state),
    RandomForestRegressor(max_depth=5, random_state=random_state),
]

# One entry per (model, fold): [model name, fold index, *scores from reg_scores].
kf_scores = []
kf = 5

# Fold boundaries computed with integer arithmetic so the kf contiguous folds cover
# every row of tv_df exactly once and differ in size by at most one row.
# (round(len/kf) as a fixed fold size silently left trailing rows out of validation
# whenever the length rounded down, and produced an undersized last fold otherwise.)
n_rows = len(tv_df)
fold_bounds = [n_rows * k // kf for k in range(kf + 1)]

for model in model_list:
    model_name = type(model).__name__
    for k in range(kf):
        # Fold k is the validation set; all remaining rows form the training set.
        val_df = tv_df.iloc[fold_bounds[k]:fold_bounds[k + 1], :]
        train_df = tv_df.drop(val_df.index)

        # NOTE: no feature scaling is applied; the estimators see the raw columns.
        X_train = train_df.drop(['fnc_ber_com_composite'], axis=1)
        y_train = train_df['fnc_ber_com_composite']

        X_val = val_df.drop(['fnc_ber_com_composite'], axis=1)
        y_val = val_df['fnc_ber_com_composite']

        # fit() retrains the estimator on this fold's training rows.
        model.fit(X_train, y_train)
        # NOTE(review): reg_scores comes from utils; presumably it returns the
        # train/validation R2 and MAPE values in the column order used later - confirm.
        kf_scores.append([model_name, k] + list(reg_scores(model, X_train, y_train, X_val, y_val)))

In [None]:
# One row per (model, fold).
# NOTE(review): assumes reg_scores yields exactly these four values in this order - confirm.
score_cols = ['Train_R2', 'Val_R2', 'Train_MAPE', 'Val_MAPE']
result_df = pd.DataFrame(kf_scores, columns=['Model', 'kfold', *score_cols])

# Average only the score columns per model; 'kfold' is a fold index, not a metric,
# so it is excluded from the summary. Best validation R2 first.
result_df.groupby('Model')[score_cols].mean().sort_values(by='Val_R2', ascending=False)