In [None]:
import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
import preprocessing as prep

In [None]:
# Read the run configuration from config.json in the working directory.
path_config = Path("config.json")
with path_config.open('r') as file:
    config = json.loads(file.read())

# Value stored under features.catch_id; not referenced again in the cells
# visible here (presumably an identifier column name — confirm).
catch_id = config['features']['catch_id']

In [None]:
# Run the preprocessing entry point before anything is read from disk.
# NOTE(review): presumably this writes the files named under config['output']
# that later cells load — confirm in preprocessing.main.
prep.main()

# Label column name and training-file name, both taken from the config.
target_col = config['features']['target_col']
path = config['output']['train']

In [None]:
# Load the training table from <data_path>/<train file>; the first CSV column
# is used as the row index.
df = pd.read_csv(Path(config['data_path'], path), index_col=0)

In [None]:
# Separate the label from the features.
# Y is selected with double brackets, so it stays a one-column DataFrame
# rather than a Series; X is every remaining column.
Y = df[[target_col]]
X = df.drop(columns=target_col)

In [None]:
# Stratified K-fold splitter: fold count comes from the config, and shuffling
# is seeded so the same folds are produced on every run.
cv = StratifiedKFold(n_splits=config['features']['folds'], random_state=1, shuffle=True)

# Seed the tree with the same value as the splitter. scikit-learn permutes the
# candidate features at every split, so an unseeded tree can pick different
# splits (and report different scores) on identical data when several splits
# tie on the criterion.
model = DecisionTreeClassifier(random_state=1)

# Train and test scores for each listed metric on every fold, using all cores.
scores = cross_validate(model, X, Y, scoring=['roc_auc','f1','accuracy', 'recall', 'precision'],
                         cv=cv, n_jobs=-1, return_train_score=True)

In [None]:
# The score values are array objects, which json cannot serialise, so each one
# is converted to a plain list. Going through np.asarray (instead of calling
# .tolist() on the stored value in place) keeps this cell re-runnable: on a
# second run the values are already lists, which have no .tolist() method.
scores = {key: np.asarray(values).tolist() for key, values in scores.items()}

# Write the metrics to <metrics_path>/metrics.json, creating the directory
# if it does not exist yet.
metrics_dir = Path(config['metrics_path'])
metrics_dir.mkdir(parents=True, exist_ok=True)
with open(metrics_dir / 'metrics.json', 'w') as fp:
    json.dump(scores, fp, sort_keys=True, indent=4, separators=(',', ':'))

In [None]:
# Fit one decision tree per CV fold, using only that fold's training rows.
# The held-out indices produced by the splitter are not needed here.
models = []
for train_idx, _holdout_idx in cv.split(X, Y):
    x_train, y_train = X.iloc[train_idx], Y.iloc[train_idx]
    # Fixed seed (same value the splitter uses) so refitting on the same fold
    # always yields the same tree; an unseeded tree may break split ties
    # differently from run to run.
    model = DecisionTreeClassifier(random_state=1)
    models.append(model.fit(x_train, y_train))

In [None]:
# Load the test table from <data_path>/<test file>; the first CSV column is
# used as the row index.
path = config['output']['test']
df_test = pd.read_csv(Path(config['data_path'], path), index_col=0)

# NOTE: this rebinds `df`, which held the training table until now, to the
# result of prep.read_df for the configured test input. Re-running the earlier
# X/Y cell after this one would therefore split the wrong frame.
# NOTE(review): presumably the unprocessed test data — confirm in preprocessing.read_df.
df = prep.read_df(config['data_path'], config['input']['test'])

In [None]:
# Persist every fold model, combine their test-set predictions, and export the
# predictions together with the per-fold feature importances.
model_dir = Path(config['model_path'])
model_dir.mkdir(parents=True, exist_ok=True)

# Predict on the feature columns only. Dropping a 'predictions' column left
# over from an earlier run of this cell keeps the cell re-runnable; on a first
# run this is simply df_test.
test_features = df_test.drop(columns='predictions', errors='ignore')

preds = []
importances = []
for i, model in enumerate(models):
    # One pickle per fold: <model_path>/decision_tree_fold_<i>.pkl
    with open(model_dir / f'decision_tree_fold_{i}.pkl', 'wb') as f:
        pickle.dump(model, f)
    preds.append(model.predict(test_features))
    importances.append(np.round(model.feature_importances_, 4))

# One row per fold, one column per feature.
df_importances = pd.DataFrame(importances, columns=test_features.columns)

# Average the fold predictions and round to the nearest integer. A plain
# .astype(int) truncates toward zero, so with 0/1 labels a row would only be
# labelled 1 when every single fold agreed. Rounding makes this a majority
# vote for 0/1 labels (an exact tie rounds to the even value, i.e. 0).
# NOTE(review): assumes integer-encoded class labels — confirm.
final_pred = np.rint(np.mean(preds, axis=0)).astype(int)
df_test['predictions'] = final_pred

# Rows sharing an index value are collapsed to one prediction, using the same
# round-to-nearest rule instead of truncation.
df_final = pd.DataFrame(
    df_test['predictions'].groupby(df_test.index).mean().round().astype(int)
)

# Attach the predictions by index. Dropping a stale 'predictions' column first
# avoids the overlapping-column error when this cell is run again.
df = df.drop(columns='predictions', errors='ignore').join(df_final)

# Write CSV and JSON side by side. with_suffix only touches the file
# extension, whereas a string replace of 'csv' would also rewrite directory
# names containing "csv" and would leave the path unchanged (overwriting the
# CSV) when the configured name has no "csv" in it.
prediction_csv = Path(config['data_path']) / config['output']['prediction']
df.to_csv(prediction_csv)
df.to_json(prediction_csv.with_suffix('.json'))
df_importances.to_json(Path(config['metrics_path']) / 'feature_importances.json')