In [1]:
import pandas as pd
import src.utils as utils

## Read config

## Load the data

In [4]:

data = utils.load_data('data/training.csv')
data = data.drop(['patient_id','patient_gender'],axis=1)

## Preprocessing

In [5]:
import src.data_preprocessing as prep
    
X_train, X_test, y_train, y_test = prep.split_data(data)

cat_features = X_train.select_dtypes(include=['object', 'category']).columns
num_features = X_train.select_dtypes(include=['float64', 'int64']).columns

filler = prep.get_fill_for_na(X_train, cat_features, fill_strategy='mode')
filler.update(prep.get_fill_for_na(X_train, num_features, fill_strategy='mean'))

for c in X_train.columns:
    X_train[c].fillna(filler[c],inplace=True)
    X_test[c].fillna(filler[c],inplace=True)

## Training

In [6]:
from catboost import CatBoostClassifier


cat_model = CatBoostClassifier(
    #custom_loss=["AUC"],
    cat_features=cat_features.values,
    use_best_model=True,
    random_state=42,
    verbose=100,    
)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


Learning rate set to 0.056407
0:	learn: 0.6665962	test: 0.6658559	best: 0.6658559 (0)	total: 149ms	remaining: 2m 29s
100:	learn: 0.4493351	test: 0.4630768	best: 0.4628633 (86)	total: 5.36s	remaining: 47.7s
200:	learn: 0.4277497	test: 0.4623088	best: 0.4620586 (158)	total: 9.76s	remaining: 38.8s
300:	learn: 0.4078392	test: 0.4630826	best: 0.4620175 (217)	total: 14.5s	remaining: 33.7s
400:	learn: 0.3903937	test: 0.4638073	best: 0.4620175 (217)	total: 18.4s	remaining: 27.5s
500:	learn: 0.3746060	test: 0.4647873	best: 0.4620175 (217)	total: 23.9s	remaining: 23.8s
600:	learn: 0.3616640	test: 0.4662390	best: 0.4620175 (217)	total: 28s	remaining: 18.6s
700:	learn: 0.3488693	test: 0.4669356	best: 0.4620175 (217)	total: 32.5s	remaining: 13.9s
800:	learn: 0.3363025	test: 0.4694454	best: 0.4620175 (217)	total: 36.9s	remaining: 9.17s
900:	learn: 0.3238301	test: 0.4706260	best: 0.4620175 (217)	total: 41.3s	remaining: 4.54s
999:	learn: 0.3132417	test: 0.4725280	best: 0.4620175 (217)	total: 45.9s	rem

<catboost.core.CatBoostClassifier at 0x11b345c00>

In [9]:
y_pred = cat_model.predict(X_test)
y_pred

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


array([1, 1, 1, ..., 0, 1, 0])

## Evaluation

In [23]:
import src.evaluation as eval

model_eval = pd.DataFrame.from_dict(eval.evaluate_model(y_test,y_pred), orient='index')
model_eval

Unnamed: 0,0
Accuracy,0.816421
Precision,0.790544
Recall,0.961657
F1 Score,0.867746
ROC AUC,0.767357


## Create Kaggle submision

In [11]:
sub_X = utils.load_data('data/test.csv')

for c in X_train.columns:
    sub_X[c].fillna(filler[c],inplace=True)

sub_y = pd.DataFrame(cat_model.predict_proba(sub_X[X_train.columns])[:,1], index=sub_X.patient_id,columns=['DiagPeriodL90D'])

  self._init_pool(data, label, cat_features, text_features, embedding_features, embedding_features_data, pairs, weight,


## Save results

In [28]:
run_dir = utils.create_directory()

X_train['DiagPeriodL90D'] = y_train
X_test['DiagPeriodL90D'] = y_test
X_train.to_csv(run_dir+'/training_data.csv')
X_test.to_csv(run_dir+'/test_data.csv')
sub_X.to_csv(run_dir+'/X_submission.csv')
sub_y.to_csv(run_dir+'/submission.csv')
model_eval.to_csv(run_dir+'/metrics.csv')

utils.save_model_as_pickle(cat_model,run_dir+'/model.pickle')