In [1]:
import pandas as pd
import numpy as np
from pprint import pprint
from comet_ml import Experiment
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import json
from joblib import dump

In [2]:
with open('../data/raw/comet_cred.json') as  f:
    comet_creds = json.load(f)

In [3]:
train_data = np.load("../data/interim/train_data.npy", allow_pickle=True)
dev_data = np.load("../data/interim/dev_data.npy", allow_pickle=True)
save_model_path = '../models/sklearn_model.pkl'
n_experiments = 20

In [4]:
def convert_category_to_number(category):
    if category == "positive":
        return 2
    elif category == "negative":
        return 0
    else: return 1

In [5]:
all_data = np.concatenate([train_data, dev_data])
all_labels = [convert_category_to_number(x) for x in all_data[:,1]]
all_text = all_data[:, 10]

In [6]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier())
])

parameters = {
    'tfidf__max_df': (0.975, 0.99),
    'tfidf__min_df': (0.025, 0.01),
    'tfidf__stop_words': ('english', None),
    'tfidf__max_features': (50000, 60000),
    'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__norm': ('l1', 'l2'),
    'clf__n_estimators': (4000, 5000, 6000),
    'clf__max_depth': (None, 2),
    'clf__min_samples_split': (2, 3)
}

random_search = RandomizedSearchCV(pipeline, parameters, n_iter=n_experiments, verbose=1, cv=5,
                                    scoring="f1_weighted", random_state=42)

In [7]:
print("Performing random search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
random_search.fit(all_text, all_labels)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % random_search.best_score_)
print("Best parameters set:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing random search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__max_depth': (None, 2),
 'clf__min_samples_split': (2, 3),
 'clf__n_estimators': (4000, 5000, 6000),
 'tfidf__max_df': (0.975, 0.99),
 'tfidf__max_features': (50000, 60000),
 'tfidf__min_df': (0.025, 0.01),
 'tfidf__ngram_range': ((1, 1), (1, 2)),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__stop_words': ('english', None)}
Fitting 5 folds for each of 20 candidates, totalling 100 fits
done in 14256.547s

Best score: 0.735
Best parameters set:
	clf__max_depth: None
	clf__min_samples_split: 3
	clf__n_estimators: 5000
	tfidf__max_df: 0.975
	tfidf__max_features: 50000
	tfidf__min_df: 0.01
	tfidf__ngram_range: (1, 1)
	tfidf__norm: 'l1'
	tfidf__stop_words: None


In [8]:
for i in range(len(random_search.cv_results_['params'])):
    experiment = Experiment(api_key=comet_creds["api_key"], project_name=comet_creds["project_name"],
                        workspace=comet_creds["workspace"])
    for k,v in random_search.cv_results_.items():
        if k == "params":
            experiment.log_parameters(v[i])
        else:
            experiment.log_metric(k,v[i])

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/jahnic/general/0d0a19f0b08c4db6baf6d912cc8e095f

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/jahnic/general/0d0a19f0b08c4db6baf6d912cc8e095f
COMET INFO:   Metrics:
COMET INFO:     mean_fit_time                : 14.65947904586792
COMET INFO:     mean_score_time              : 1.3476103782653808
COMET INFO:     mean_test_score              : 0.48124086048283166
COMET INFO:     param_clf__max_depth         : 2
COMET INFO:     param_clf__min_samples_split : 3
COMET INFO:     param_clf__n_estimators      : 5000
COMET INFO:     param_tfidf__max_df          : 0.975
COMET INFO:     param_tfidf__max_features    : 60000
COMET INFO:     param_tfidf__min_df          : 0.01
COMET INFO:     param_tfidf__ngram_range     : (1, 2)
COMET INFO:  

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/jahnic/general/778c293559c640fc9ca0435840911d84

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/jahnic/general/778c293559c640fc9ca0435840911d84
COMET INFO:   Metrics:
COMET INFO:     mean_fit_time                : 12.75280466079712
COMET INFO:     mean_score_time              : 1.1686270713806153
COMET INFO:     mean_test_score              : 0.48137849040394637
COMET INFO:     param_clf__max_depth         : 2
COMET INFO:     param_clf__min_samples_split : 3
COMET INFO:     param_clf__n_estimators      : 5000
COMET INFO:     param_tfidf__max_df          : 0.99
COMET INFO:     param_tfidf__max_features    : 60000
COMET INFO:     param_tfidf__min_df          : 0.025
COMET INFO:     param_tfidf__ngram_range     : (1, 1)
COMET INFO:  

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/jahnic/general/c0111b5c420b43979438ddcc2e58a95b

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/jahnic/general/c0111b5c420b43979438ddcc2e58a95b
COMET INFO:   Metrics:
COMET INFO:     mean_fit_time                : 428.1770541667938
COMET INFO:     mean_score_time              : 3.7163039684295653
COMET INFO:     mean_test_score              : 0.7080003425223959
COMET INFO:     param_clf__max_depth         : 1
COMET INFO:     param_clf__min_samples_split : 2
COMET INFO:     param_clf__n_estimators      : 5000
COMET INFO:     param_tfidf__max_df          : 0.99
COMET INFO:     param_tfidf__max_features    : 50000
COMET INFO:     param_tfidf__min_df          : 0.025
COMET INFO:     param_tfidf__ngram_range     : (1, 1)
COMET INFO:   

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/jahnic/general/874590409a154adc871467185c49cc3d

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/jahnic/general/874590409a154adc871467185c49cc3d
COMET INFO:   Metrics:
COMET INFO:     mean_fit_time                : 192.73350257873534
COMET INFO:     mean_score_time              : 3.7239892959594725
COMET INFO:     mean_test_score              : 0.6501254408339037
COMET INFO:     param_clf__max_depth         : 1
COMET INFO:     param_clf__min_samples_split : 2
COMET INFO:     param_clf__n_estimators      : 6000
COMET INFO:     param_tfidf__max_df          : 0.975
COMET INFO:     param_tfidf__max_features    : 60000
COMET INFO:     param_tfidf__min_df          : 0.025
COMET INFO:     param_tfidf__ngram_range     : (1, 2)
COMET INFO: 

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/jahnic/general/fdf54b6a578744f5907ba36ba9441e9d

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/jahnic/general/fdf54b6a578744f5907ba36ba9441e9d
COMET INFO:   Metrics:
COMET INFO:     mean_fit_time                : 327.432745552063
COMET INFO:     mean_score_time              : 3.0015565395355224
COMET INFO:     mean_test_score              : 0.7098143734146847
COMET INFO:     param_clf__max_depth         : 1
COMET INFO:     param_clf__min_samples_split : 3
COMET INFO:     param_clf__n_estimators      : 4000
COMET INFO:     param_tfidf__max_df          : 0.975
COMET INFO:     param_tfidf__max_features    : 50000
COMET INFO:     param_tfidf__min_df          : 0.025
COMET INFO:     param_tfidf__ngram_range     : (1, 2)
COMET INFO:   

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/jahnic/general/db4998acce0e4e949a6ba078cdee7276

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/jahnic/general/db4998acce0e4e949a6ba078cdee7276
COMET INFO:   Metrics:
COMET INFO:     mean_fit_time                : 184.7847680568695
COMET INFO:     mean_score_time              : 3.5835218906402586
COMET INFO:     mean_test_score              : 0.6503239817799953
COMET INFO:     param_clf__max_depth         : 1
COMET INFO:     param_clf__min_samples_split : 3
COMET INFO:     param_clf__n_estimators      : 6000
COMET INFO:     param_tfidf__max_df          : 0.99
COMET INFO:     param_tfidf__max_features    : 50000
COMET INFO:     param_tfidf__min_df          : 0.025
COMET INFO:     param_tfidf__ngram_range     : (1, 2)
COMET INFO:   

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/jahnic/general/ab01fa5195734d2f877967c00005cb84

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/jahnic/general/ab01fa5195734d2f877967c00005cb84
COMET INFO:   Metrics:
COMET INFO:     mean_fit_time                : 10.57558078765869
COMET INFO:     mean_score_time              : 0.9936769485473633
COMET INFO:     mean_test_score              : 0.48124086048283166
COMET INFO:     param_clf__max_depth         : 2
COMET INFO:     param_clf__min_samples_split : 3
COMET INFO:     param_clf__n_estimators      : 4000
COMET INFO:     param_tfidf__max_df          : 0.975
COMET INFO:     param_tfidf__max_features    : 60000
COMET INFO:     param_tfidf__min_df          : 0.01
COMET INFO:     param_tfidf__ngram_range     : (1, 2)
COMET INFO:  

In [9]:
dump(random_search.best_estimator_, save_model_path)

['../models/sklearn_model.pkl']

In [10]:
experiment.end()

COMET INFO: ---------------------------
COMET INFO: Comet.ml Experiment Summary
COMET INFO: ---------------------------
COMET INFO:   Data:
COMET INFO:     display_summary_level : 1
COMET INFO:     url                   : https://www.comet.ml/jahnic/general/312957eda19d464f8cba57021d499093
COMET INFO:   Metrics:
COMET INFO:     mean_fit_time                : 325.49981050491334
COMET INFO:     mean_score_time              : 3.0371742725372313
COMET INFO:     mean_test_score              : 0.7068271015822272
COMET INFO:     param_clf__max_depth         : 1
COMET INFO:     param_clf__min_samples_split : 3
COMET INFO:     param_clf__n_estimators      : 4000
COMET INFO:     param_tfidf__max_df          : 0.975
COMET INFO:     param_tfidf__max_features    : 60000
COMET INFO:     param_tfidf__min_df          : 0.025
COMET INFO:     param_tfidf__ngram_range     : (1, 2)
COMET INFO:     param_tfidf__norm            : l1
COMET INFO:     param_tfidf__stop_words      : 1
COMET INFO:     rank_test_