In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import cohen_kappa_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import class_weight
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.metrics import cohen_kappa_score
from numpy import linspace

In [None]:
# Silence convergence notices and generic user warnings for the rest of the notebook.
import warnings
from warnings import simplefilter  # name kept bound in the notebook namespace

from sklearn.exceptions import ConvergenceWarning

# Filters are installed in this order, so the UserWarning rule ends up first in the filter list.
for ignored_category in (ConvergenceWarning, UserWarning):
    warnings.filterwarnings(action="ignore", category=ignored_category)

## Parameters

#### test run

In [None]:
# Quick-iteration switch (read by the sub-sampling cell and the final status cell).
# True:  work on a stratified random sample of 500 compounds from the training data
# False: use the complete training data

test_run = False
#test_run = True

#### predict test set

In [None]:
# True:  load the test set and submission template, and predict the test set
# False: skip loading and predicting the test set

pred = True
#pred = False

#### StandardScaler during modeling

In [None]:
# True:  standardise the features with StandardScaler before fitting / predicting
# False: feed the features to the model unscaled

StdSca = True
#StdSca = False

#### save prediction as csv

In [None]:
# True:  intended to save the prediction as a csv file
#        (NOTE: the final cell currently only prints a notice; its to_csv call is commented out)
# False: do not save

save = False
#save = True

## Reading data sets

In [None]:
# Training data; the target column is downcast to the smallest integer dtype pandas can use.
df = pd.read_csv('df_train_set.csv').assign(
    sol_category=lambda frame: pd.to_numeric(frame['sol_category'], downcast='integer')
)

# The test features and the submission template are only needed when predicting.
if pred:
    df_test_set, sub_template = (
        pd.read_csv(csv_path)
        for csv_path in ('df_test_set.csv', 'Data/submission_template_rdm.csv')
    )

#### checking the composition of the data set

In [None]:
# Number of rows per sol_category class (shows how balanced the target is).
df['sol_category'].value_counts()

In [None]:
# Optional quick run: shrink the training frame to a stratified sample of 500 compounds.
# - random_state makes the sample reproducible (same seed as the train/validation split).
# - The size guard keeps the cell re-runnable: once df already holds 500 rows (or fewer),
#   train_test_split could not carve out a non-empty remainder and would raise.
# - Only the sampled part is kept, so IPython's `_`, `__`, `___` output-history
#   names are no longer overwritten.
if test_run and len(df) > 500:
    df = train_test_split(
        df,
        train_size=500,
        stratify=df['sol_category'],
        random_state=1,
    )[0]

## preparing model training data

#### defining features X

In [None]:
# Feature matrix: every column except the identifiers and the target.
# DataFrame.drop returns a new frame, so df itself is left untouched.
non_feature_columns = ['Id', 'smiles', 'sol_category']
X = df.drop(columns=non_feature_columns)

#### defining category list y

In [None]:
# Target vector. Selected by name rather than by column position so that a
# change in the CSV column order cannot silently pick the wrong column.
y = df['sol_category']

## training model

#### defining parameters for the xgboost model

In [None]:
# Keyword arguments handed to XGBClassifier(**params).
# Trailing notes record alternatives / search ranges that were considered earlier.
params = {
    'eta': 0.001,             # boosting learning rate (tried: 0.0001 ... 0.5)
    'gamma': 0.5,             # min loss reduction required for a further partition on a leaf node
                              #   (range considered: linspace(1e-9, 1.0, num=11))
    'max_depth': 7,           # max tree depth for base learners (range considered: np.arange(1, 11, 2))
    'n_estimators': 100,      # number of trees (range considered: np.arange(50, 550, 50); maybe 250?)
    'min_child_weight': 1,    # min sum of instance weight in a child
    'max_delta_step': 0,      # max delta step allowed for each tree's weight estimate
    'subsample': 0.5,         # subsample ratio of training instances (alternatives: 0.5, 1)
    'colsample_bytree': 1,    # subsample ratio of columns when constructing each tree
    'base_score': 0.5,        # initial prediction score, global bias
    'n_jobs': 5,              # parallel threads
}
# Not set (left at library defaults): booster, objective, eval_metric, silent,
# seed / random_state, scale_pos_weight.

#### applying model on training data set

In [None]:
# Hold out a stratified validation split BEFORE any scaling, so the validation
# rows never contribute to the statistics that are later used to transform them.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=1, shuffle=True, stratify=y
)

# Standardise the features: fit the scaler once, on the training split only,
# then apply that same transformation to both splits. X itself is not
# overwritten, so re-running this cell starts from the same input.
if StdSca:
    scaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)

# Per-sample weights that balance the classes of the training split.
classes_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train
    )

# Define the model and fit it on the (weighted) training split.
model = XGBClassifier(**params)
model.fit(X_train, y_train, sample_weight=classes_weights)

# Validate on the held-out split.
valPredictions = model.predict(X_val)

# Quadratically weighted Cohen's kappa between true and predicted categories.
sk_quad_kappa = cohen_kappa_score(y_val, valPredictions, weights='quadratic')

print(f'quadratically weighted kappa validation score: {sk_quad_kappa}')

## predicting test set

In [None]:
if pred:
    # Build the test feature matrix without mutating df_test_set, so this cell
    # can be re-run without failing on already-dropped columns.
    X_test = df_test_set.drop(columns=['Id', 'smiles'])

    if StdSca:
        # Reuse the scaler fitted in the training cell. Fitting a fresh scaler
        # on the test set would standardise the test features with different
        # means / variances than the ones the model was trained on.
        X_test = scaler.transform(X_test)

    # Predict and store the result in the submission template.
    testPredictions = model.predict(X_test)
    sub_template['pred'] = testPredictions

In [None]:
# Distinct classes that occur in the test prediction.
# Written as a single top-level expression: Jupyter only displays the value of
# a cell's last top-level expression, so an expression nested inside an `if`
# block would never be shown. Evaluates to None (no output) when pred is False.
set(testPredictions) if pred else None

In [None]:
# Number of test compounds per predicted class.
# Written as a single top-level expression: Jupyter only displays the value of
# a cell's last top-level expression, so an expression nested inside an `if`
# block would never be shown. Evaluates to None (no output) when pred is False.
sub_template['pred'].value_counts() if pred else None

In [None]:
# Report what kind of run this was. Saving is not implemented yet; the intended call is
#   sub_template.to_csv('Submissions/submission_20_12_C-lab.csv', index=False)
if save:
    run_status = 'saving as csv planned'
elif test_run:
    run_status = 'test run'
else:
    run_status = 'unsaved run'

print(run_status)