In [5]:
import pandas as pd
import numpy as np
from pprint import pprint
from comet_ml import Experiment
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import  TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
import json
from joblib import dump

In [6]:
with open('../data/raw/comet_cred.json') as  f:
    comet_creds = json.load(f)

In [3]:
train_data = np.load("../data/interim/train_data.npy", allow_pickle=True)
dev_data = np.load("../data/interim/dev_data.npy", allow_pickle=True)
save_model_path = '../models/sklearn_model.pkl'
n_experiments = 25

In [4]:
def convert_category_to_number(category):
    if category == "positive":
        return 2
    elif category == "negative":
        return 0
    else: return 1

In [5]:
all_data = np.concatenate([train_data, dev_data])
all_labels = [convert_category_to_number(x) for x in all_data[:,1]]
all_text = all_data[:, 10]

In [12]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier())
])

parameters = {
    'tfidf__max_df': (0.9, 0.95, 1.0),
    'tfidf__min_df': (0.1, 0.05, 0.0),
    'tfidf__stop_words': ('english', None),
    'tfidf__max_features': (None, 5000, 10000, 50000),
    'tfidf__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'tfidf__norm': ('l1', 'l2'),
    'clf__n_estimators': (100, 1000, 5000),
    'clf__max_depth': (None, 5, 10, 15),
    'clf__min_samples_split': (2, 4, 6)
}

random_search = RandomizedSearchCV(pipeline, parameters, n_iter=n_experiments, n_jobs=-1, verbose=1,cv=5,
                                    scoring="f1_weighted", random_state=42)

In [13]:
print("Performing random search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
random_search.fit(all_text, all_labels)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % random_search.best_score_)
print("Best parameters set:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing random search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__max_depth': (None, 5, 10, 15),
 'clf__min_samples_split': (2, 4, 6),
 'clf__n_estimators': (100, 1000, 5000),
 'tfidf__max_df': (0.9, 0.95, 1.0),
 'tfidf__max_features': (None, 5000, 10000, 50000),
 'tfidf__min_df': (0.1, 0.05, 0.0),
 'tfidf__ngram_range': ((1, 1), (1, 2)),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__stop_words': ('english', None)}
Fitting 5 folds for each of 25 candidates, totalling 125 fits


exception calling callback for <Future at 0x7ff68dcfcc10 state=finished raised TerminatedWorkerError>
Traceback (most recent call last):
  File "/home/jahnic/Git/Portfolio/AgileProject/twitter_sentiment/env/lib/python3.8/site-packages/joblib/externals/loky/_base.py", line 625, in _invoke_callbacks
    callback(self)
  File "/home/jahnic/Git/Portfolio/AgileProject/twitter_sentiment/env/lib/python3.8/site-packages/joblib/parallel.py", line 359, in __call__
    self.parallel.dispatch_next()
  File "/home/jahnic/Git/Portfolio/AgileProject/twitter_sentiment/env/lib/python3.8/site-packages/joblib/parallel.py", line 792, in dispatch_next
    if not self.dispatch_one_batch(self._original_iterator):
  File "/home/jahnic/Git/Portfolio/AgileProject/twitter_sentiment/env/lib/python3.8/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
    self._dispatch(tasks)
  File "/home/jahnic/Git/Portfolio/AgileProject/twitter_sentiment/env/lib/python3.8/site-packages/joblib/parallel.py", line

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}