In [30]:
from functions import *
from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
import sklearn
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.metrics import classification_report
import random
from sklearn.model_selection import cross_validate as cvt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from copy import deepcopy as dp

def tokenize(text):
    return text.split(' ')

In [33]:
test, train = load_pkl(r'C:\Users\HEndo\Documents\GitHub\zemi\cleaned_corpus\base_cleaned.pkl')
test_a, train_a = load_pkl(r'C:\Users\HEndo\Documents\GitHub\zemi\cleaned_corpus\back_up\org_answer_data.pkl')
train_X = dp(train)
train_Y = dp(train_a)
sw = stopwords.words('english') + ['hes']

In [47]:
best_params = []

In [34]:
import optuna

In [39]:
def objective(trial):
    params = {
    '_min': trial.suggest_uniform('_min', 0.001, 0.005),
    'n_est': trial.suggest_discrete_uniform('n_est', 50, 500, 50),
    'mx_depth': trial.suggest_discrete_uniform('mx_depth', 5, 150, 5),
    'mn_sp': trial.suggest_uniform('mn_sp', 0.1, 0.3),
    'mx_feat': trial.suggest_uniform('mx_feat', 0.7, 1.0)
    }
    data = list(zip(train_X, train_Y))
    random.shuffle(data)
    ff_train, ff_train_a = zip(*data)
    ff_train = [' '.join(text) for text in ff_train]
    vect = TfidfVectorizer(stop_words=sw,tokenizer=tokenize, 
                          min_df= params['_min'], max_df=0.85, 
                          ngram_range=(1,2))
    tfidf = vect.fit_transform(ff_train)
    tree = RandomForestClassifier(n_estimators=abs(params['n_est'].astype('int8')), 
                                 max_depth=params['mx_depth'], 
               min_samples_split= params['mn_sp'],
               max_features= params['mx_feat']
              )

    scores = cvt(tree, tfidf, ff_train_a, cv = 4, return_train_score= True, scoring = 'f1_macro', )
    test_s = scores['test_score'].mean()
    train_s = scores['train_score'].mean()
    f_score = 1.0 - float(test_s) + float(abs(train_s-test_s))/2.0
    return f_score

In [40]:
import warnings
warnings.filterwarnings('ignore')
optuna.logging.enable_default_handler()


In [41]:
study = optuna.create_study()
study.optimize(objective, n_trials=500, n_jobs = -1, verb_pace = 50)


[I 2019-02-27 18:03:16,503] Finished trial 50 / 500. Current best value is 0.25076865943319804 with parameters: {'_min': 0.001982792983953625, 'n_est': 300.0, 'mx_depth': 55.0, 'mn_sp': 0.12057953801653995, 'mx_feat': 0.9380754044722736}.
[I 2019-02-27 18:03:17,634] Finished trial 50 / 500. Current best value is 0.25076865943319804 with parameters: {'_min': 0.001982792983953625, 'n_est': 300.0, 'mx_depth': 55.0, 'mn_sp': 0.12057953801653995, 'mx_feat': 0.9380754044722736}.
[I 2019-02-27 18:08:00,402] Finished trial 100 / 500. Current best value is 0.24995134025131455 with parameters: {'_min': 0.004398091343846016, 'n_est': 150.0, 'mx_depth': 35.0, 'mn_sp': 0.10665131241390491, 'mx_feat': 0.9633833625884614}.
[I 2019-02-27 18:08:01,452] Finished trial 100 / 500. Current best value is 0.24995134025131455 with parameters: {'_min': 0.004398091343846016, 'n_est': 150.0, 'mx_depth': 35.0, 'mn_sp': 0.10665131241390491, 'mx_feat': 0.9633833625884614}.
[I 2019-02-27 18:08:02,510] Finished trial

In [42]:
print(study.best_params)
print(study.best_value)
print(study.best_trial)

{'_min': 0.0033950443296979203, 'n_est': 400.0, 'mx_depth': 20.0, 'mn_sp': 0.1268469903355746, 'mx_feat': 0.9097354956132974}
0.24299381679791526
FrozenTrial(trial_id=439, state=<TrialState.COMPLETE: 1>, value=0.24299381679791526, datetime_start=datetime.datetime(2019, 2, 27, 18, 40, 47, 114798), datetime_complete=datetime.datetime(2019, 2, 27, 18, 41, 22, 879564), params={'_min': 0.0033950443296979203, 'n_est': 400.0, 'mx_depth': 20.0, 'mn_sp': 0.1268469903355746, 'mx_feat': 0.9097354956132974}, user_attrs={}, system_attrs={}, intermediate_values={}, params_in_internal_repr={'_min': 0.0033950443296979203, 'n_est': 400.0, 'mx_depth': 20.0, 'mn_sp': 0.1268469903355746, 'mx_feat': 0.9097354956132974})


In [48]:
best_params.append(study.best_params)

In [46]:
def objective2(trial):
    params = {
    'n_est': trial.suggest_discrete_uniform('n_est', 50, 500, 50),
    'mx_depth': trial.suggest_discrete_uniform('mx_depth', 10, 50, 10),
    }
    sw = stopwords.words('english') + ['hes']
    data = list(zip(train_X, train_Y))
    random.shuffle(data)
    ff_train, ff_train_a = zip(*data)
    ff_train = [' '.join(text) for text in ff_train]
    vect = TfidfVectorizer(stop_words=sw,tokenizer=tokenize, 
                          min_df= 0.003, max_df=0.85, 
                          ngram_range=(1,2))
    tfidf = vect.fit_transform(ff_train)
    tree = RandomForestClassifier(
               n_estimators= abs(params['n_est'].astype('int8')),
               max_depth=abs(params['mx_depth']), 
               min_samples_split= 0.12,
               max_features= 0.915
              )

    scores = cvt(tree, tfidf, ff_train_a, cv = 4, return_train_score= True, scoring = 'f1_macro')
    test_s = scores['test_score'].mean()
    train_s = scores['train_score'].mean()
    f_score = 1.0 - float(test_s) + float(abs(train_s-test_s))/2.0
    
    return f_score

In [49]:
study2 = optuna.create_study()
study2.optimize(objective2, n_trials=150, n_jobs = -1, verb_pace = 30)
best_params.append(study2.best_params)

[I 2019-02-27 19:23:01,444] Finished trial 30 / 150. Current best value is 0.24272488863060032 with parameters: {'n_est': 50.0, 'mx_depth': 30.0}.
[I 2019-02-27 19:23:02,485] Finished trial 30 / 150. Current best value is 0.24272488863060032 with parameters: {'n_est': 50.0, 'mx_depth': 30.0}.
[I 2019-02-27 19:23:03,529] Finished trial 30 / 150. Current best value is 0.24272488863060032 with parameters: {'n_est': 50.0, 'mx_depth': 30.0}.
[I 2019-02-27 19:23:04,533] Finished trial 30 / 150. Current best value is 0.24272488863060032 with parameters: {'n_est': 50.0, 'mx_depth': 30.0}.
[I 2019-02-27 19:26:57,326] Finished trial 60 / 150. Current best value is 0.24272488863060032 with parameters: {'n_est': 50.0, 'mx_depth': 30.0}.
[I 2019-02-27 19:26:58,437] Finished trial 60 / 150. Current best value is 0.24272488863060032 with parameters: {'n_est': 50.0, 'mx_depth': 30.0}.
[I 2019-02-27 19:26:59,485] Finished trial 60 / 150. Current best value is 0.24272488863060032 with parameters: {'n_es

In [54]:
def objective_class_weight(trial):
    params = {
    'n_est': trial.suggest_discrete_uniform('n_est', 10, 100, 10),
    'mx_depth': trial.suggest_categorical('mx_depth', [20, 30])
    }
    sw = stopwords.words('english') + ['hes']
    data = list(zip(train_X, train_Y))
    random.shuffle(data)
    ff_train, ff_train_a = zip(*data)
    ff_train = [' '.join(text) for text in ff_train]
    vect = TfidfVectorizer(stop_words=sw,tokenizer=tokenize, 
                          min_df= 0.003, max_df=0.85, 
                          ngram_range=(1,2))
    tfidf = vect.fit_transform(ff_train)
    tree = RandomForestClassifier(
               n_estimators= abs(params['n_est'].astype('int8')),
               max_depth=abs(params['mx_depth']), 
               min_samples_split= 0.12,
               max_features= 0.915,
               class_weight= 'balanced'
              )

    scores = cvt(tree, tfidf, ff_train_a, cv = 4, return_train_score= True, scoring = 'f1_macro')
    test_s = scores['test_score'].mean()
    train_s = scores['train_score'].mean()
    f_score = 1.0 - float(test_s) + float(abs(train_s-test_s))/2.0
    
    return f_score

In [55]:
study3 = optuna.create_study()
study3.optimize(objective_class_weight, n_trials=30, n_jobs = -1, verb_pace = 30)
best_params.append(study3.best_params)

In [56]:
print(study3.best_params)
print(study3.best_value)

{'n_est': 60.0, 'mx_depth': 30}
0.2504425910636287


In [60]:
best_params

[{'_min': 0.0033950443296979203,
  'n_est': 400.0,
  'mx_depth': 20.0,
  'mn_sp': 0.1268469903355746,
  'mx_feat': 0.9097354956132974},
 {'n_est': 50.0, 'mx_depth': 30.0},
 {'n_est': 60.0, 'mx_depth': 30}]

In [66]:
best = {'_min': 0.0033950443296979203,
  '_max':0.85,
  'n_est': 50,
  'mx_depth': 20.0,
  'mn_sp': 0.1268469903355746,
  'mx_feat': 0.9097354956132974}
save_pkl('tuned_parameters/random_forest.pkl', best)

In [67]:
best_class_weight = {'_min': 0.003,
  '_max':0.85,
  'n_est': 60,
  'mx_depth': 30,
  'mn_sp': 0.12,
  'mx_feat': 0.915}
save_pkl('tuned_parameters/random_forest_class_weight.pkl', best_class_weight)