In [None]:
import sys
from sklearn import preprocessing
import pandas as pd
sys.path.append("..")

from preprocessing import get_sparse_matrix
from feature_adder import FeatureAdder
from linear_predictor import LogisticPredictor, SVMPredictor, XGBPredictor
from tuning import tune, bayesian_optimization
from ensembler import *

# Get data sets and define parameters

In [None]:
# Raw train / test tables, read from the shared data directory.
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

# One binary target column per tag; tuning results are written to `write_to`.
TAGS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
write_to = '../data/tuning.txt'

# Per-tag label arrays, plus the row identifiers of both splits.
train_ys = dict((tag, train[tag].values) for tag in TAGS)
train_id = train['id']
test_id = test['id']

# Sparse representation of both splits (load=True is passed to the helper).
train_x_sparce, test_x_sparce = get_sparse_matrix(load=True, data_dir='../data')

# Engineered features for both splits.
train_x_ft, test_x_ft = FeatureAdder(data_dir='../data').get_features(load=True)

# Restrict train to the entries test_x_ft iterates over, in that order
# (presumably its column names -- both are assumed to be DataFrames; confirm),
# then remove the 'id' entry from both.
train_x_ft = train_x_ft[list(test_x_ft)]
del train_x_ft['id']
del test_x_ft['id']

# Linear Predictor

In [None]:
# Search space: continuous C and the boolean `dual` flag.
params = [
    dict(name="C", type="continuous", domain=(0.1, 6.0)),
    dict(name="dual", type="discrete", domain=[False, True]),
]

# Tune LogisticPredictor on the sparse matrix.
best_params_s, best_score = bayesian_optimization(
    LogisticPredictor,
    train_x_sparce,
    train_ys,
    params,
    model_type='GP',
    acquisition_type='EI',
    acquisition_weight=2,
    max_iter=10,
    max_time=None,
    silent=True,
    persist=False,
    write_to=write_to,
)

# Build the predictor from the tuned parameters and produce its ensemble output.
linear_predictor_ = LogisticPredictor(**best_params_s)
create_ensemble_output(
    linear_predictor_,
    train_x_sparce,
    train_ys,
    test_x_sparce,
    train_id,
    test_id,
    data_source_nature='sparce_matrix',
    write_to='../data/output',
)

# SVM predictor

In [None]:
# Fit the scaler on the training features only, then scale both splits once.
# The original re-ran scaler_ft.transform(train_x_ft) for tuning and again for
# the final fit; the scaled matrices are now computed a single time and reused.
scaler_ft = preprocessing.StandardScaler().fit(train_x_ft)
train_x_ft_scaled = scaler_ft.transform(train_x_ft)
test_x_ft_scaled = scaler_ft.transform(test_x_ft)

# Search space: continuous C and the boolean `dual` flag.
params = [{"name": "C", "type":"continuous", "domain": (0.1, 6.0)},
      {"name": "dual", "type":"discrete", "domain": [False, True]}]

# Tune SVMPredictor on the sparse matrix.
best_params_s, best_score = bayesian_optimization(SVMPredictor, train_x_sparce, train_ys,
                                                  params, model_type='GP', acquisition_type='EI', acquisition_weight=2,
                                                  max_iter=10, max_time=None, silent=True, persist=False, write_to=write_to)

# Tune SVMPredictor on the scaled engineered features.
# Note: best_score is overwritten here, so it holds the score of this second run.
best_params_f, best_score = bayesian_optimization(SVMPredictor, train_x_ft_scaled, train_ys,
                                                  params, model_type='GP', acquisition_type='EI', acquisition_weight=2,
                                                  max_iter=10, max_time=None, silent=True, persist=False, write_to=write_to)

# Ensemble output for the sparse-matrix model.
SVC_predictor_ = SVMPredictor(**best_params_s)
create_ensemble_output(SVC_predictor_, train_x_sparce, train_ys, test_x_sparce, train_id, test_id,
                  write_to='../data/output', data_source_nature='sparce_matrix')

# Ensemble output for the scaled-features model.
SVC_predictor_ = SVMPredictor(**best_params_f)
create_ensemble_output(SVC_predictor_, train_x_ft_scaled, train_ys, test_x_ft_scaled, train_id, test_id,
                 write_to='../data/output', data_source_nature='features')

# XGBoost predictor

In [None]:
# Search space for the XGBoost hyper-parameters.
params = [
        {'name': 'learning_rate', 'type': 'continuous', 'domain': (0, 1)},
        {'name': 'gamma', 'type': 'continuous', 'domain': (0, 5)},
        # A 'discrete' domain lists the allowed values (compare the [False, True]
        # domain used for "dual" in this notebook), so the previous (1, 50) only
        # ever offered depths 1 and 50. Enumerate every depth from 1 to 50.
        # NOTE(review): assumes bayesian_optimization treats discrete domains as
        # an enumeration of candidates -- confirm against tuning.py.
        {'name': 'max_depth', 'type': 'discrete', 'domain': tuple(range(1, 51))}
     ]

# Tune XGBPredictor on the sparse matrix.
best_params_s, best_score = bayesian_optimization(XGBPredictor, train_x_sparce, train_ys, params, model_type='GP',
                                                  acquisition_type='EI', acquisition_weight=2, max_iter=10, max_time=None,
                                                  silent=True, persist=False, write_to=write_to)

# Tune XGBPredictor on the engineered features.
# Note: best_score is overwritten here, so it holds the score of this second run.
best_params_f, best_score = bayesian_optimization(XGBPredictor, train_x_ft, train_ys, params, model_type='GP',
                                                  acquisition_type='EI', acquisition_weight=2, max_iter=10, max_time=None,
                                                  silent=True, persist=False, write_to=write_to)

# Ensemble output for the sparse-matrix model.
XGBPredictor_ = XGBPredictor(**best_params_s)
create_ensemble_output(XGBPredictor_, train_x_sparce, train_ys, test_x_sparce, train_id, test_id,
                  write_to='../data/output', data_source_nature='sparce_matrix')

# Ensemble output for the engineered-features model.
XGBPredictor_ = XGBPredictor(**best_params_f)
create_ensemble_output(XGBPredictor_, train_x_ft, train_ys, test_x_ft, train_id, test_id, write_to='../data/output',
                  data_source_nature='features')

# Call ensemblers

In [None]:
# Mean ensemble built from the outputs under ../data/output.
averages_ensemble = Ensemble(train_ys, test_id, train_id, TAGS, data_dir='../data/output').mean_ensembler()

# Search space handed to the meta-learner.
params = [
        {'name': 'learning_rate', 'type': 'continuous', 'domain': (0, 1)},
        {'name': 'gamma', 'type': 'continuous', 'domain': (0, 5)},
        # A 'discrete' domain lists the allowed values (compare the [False, True]
        # domain used for "dual" in this notebook), so the previous (1, 50) only
        # ever offered depths 1 and 50. Enumerate every depth from 1 to 50.
        # NOTE(review): assumes meta_learner forwards this list to the same
        # Bayesian-optimisation routine as the other cells -- confirm in ensembler.py.
        {'name': 'max_depth', 'type': 'discrete', 'domain': tuple(range(1, 51))}
]

# A fresh Ensemble is constructed for the meta-learner, as in the original cell.
Ensemble(train_ys, test_id, train_id, TAGS, data_dir='../data/output').meta_learner(params, XGBPredictor)