In [1]:
import numpy as np
import pandas as pd
from pprint import pprint
from time import time
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from joblib import dump

In [2]:
# Interim data directory, expressed relative to the notebook's working directory
# (the same convention the model-dump cell uses for '../models/...'), so the
# notebook is not tied to one user's home directory.
DATA_DIR = "../data/interim"

# NOTE: allow_pickle=True lets np.load unpickle arbitrary Python objects, which
# can execute code. Only load .npy files produced by this project.
train_data = np.load(f"{DATA_DIR}/train_data.npy", allow_pickle=True)
dev_data = np.load(f"{DATA_DIR}/dev_data.npy", allow_pickle=True)

In [3]:
def convert_sentiment(category):
    """Encode a sentiment category as an integer class id.

    Args:
        category: Sentiment label to encode; compared with ``==`` against the
            known label strings.

    Returns:
        2 for "positive", 1 for "neutral", and 0 for anything else
        (including "negative" and any unrecognised value).
    """
    # Checked in order; the first matching label wins.
    known_labels = (("positive", 2), ("neutral", 1))
    for label, class_id in known_labels:
        if category == label:
            return class_id
    # Fallback bucket: every value that is neither positive nor neutral.
    return 0

In [4]:
# Stack the train rows followed by the dev rows into one array (row-wise join).
all_data = np.concatenate((train_data, dev_data), axis=0)

In [5]:
# Quick look at the combined array; NumPy abbreviates the printout with "...".
print(all_data)   # column index 1 holds the sentiment category string

[[569731104070115329 'positive' 1.0 ... '2015-02-22 21:30:54 -0800'
  'Washington D.C. ' 'Eastern Time (US & Canada)']
 [569263373092823040 'negative' 1.0 ... '2015-02-21 14:32:19 -0800' nan
  nan]
 [568818669024907264 'negative' 1.0 ... '2015-02-20 09:05:13 -0800'
  'Arlington, VA' 'Atlantic Time (Canada)']
 ...
 [569964335038124033 'negative' 1.0 ... '2015-02-23 12:57:41 -0800' nan
  nan]
 [569208236487745536 'negative' 0.6765 ... '2015-02-21 10:53:13 -0800'
  nan 'Central Time (US & Canada)']
 [569489032100614144 'neutral' 0.6416 ... '2015-02-22 05:29:00 -0800'
  'Virginia, USA' nan]]


In [6]:
# Column 1 carries the sentiment category string; encode each row as an integer class id.
all_labels = list(map(convert_sentiment, all_data[:, 1]))
# Column 10 is what gets vectorised downstream — presumably the raw tweet text; confirm against the dataset schema.
all_text = all_data[:, 10]

In [7]:
# One seed shared by the classifier and the search so that a fresh
# Restart & Run All reproduces the same CV scores and the same saved model.
SEED = 42

# Text-classification pipeline: TF-IDF features fed into a random forest.
# random_state is fixed on the forest because its bootstrap/feature sampling is
# otherwise re-randomised on every fit, making the reported best score and the
# persisted best estimator differ from run to run.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(random_state=SEED)),
])

# Hyperparameter search space; keys are '<pipeline step>__<parameter>'.
# NOTE(review): the recorded run reports 80 failed fits (16 candidates), raised
# from the vectorizer's fit_transform step. Presumably the aggressive min_df
# values prune away every term for some combinations (e.g. bigrams only) —
# confirm by re-running with error_score='raise' before narrowing the space.
parameters = {
    'tfidf__max_df': (0.9, 0.95, 1.0),
    'tfidf__min_df': (0.1, 0.05, 0.0),
    'tfidf__stop_words': ('english', None),
    'tfidf__max_features': (None, 5000, 10000, 50000),
    'tfidf__ngram_range': ((1, 1), (1, 2), (2, 2)),  # unigrams only, unigrams + bigrams, bigrams only
    'tfidf__norm': ('l1', 'l2'),
    'clf__n_estimators': (100, 1000, 5000),
    'clf__max_depth': (None, 5, 10, 15),
    'clf__min_samples_split': (2, 4, 6),
}

# Randomised search: 100 sampled candidates x 5-fold CV, scored by weighted F1,
# using every available core. random_state fixes which candidates are sampled.
random_search = RandomizedSearchCV(pipeline, parameters, n_iter=100, n_jobs=-1, verbose=1, cv=5,
                                   scoring='f1_weighted', random_state=SEED)

print("Performing random search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)

# Wall-clock timing of the whole search (this is the expensive step of the notebook).
t0 = time()
random_search.fit(all_text, all_labels)
print("done in %0.3fs" % (time() - t0))
print()

# Report the best mean CV score and the tuned values of the searched parameters only.
print("Best score: %0.3f" % random_search.best_score_)
print("Best parameters set:")
best_parameters = random_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


Performing random search...
pipeline: ['tfidf', 'clf']
parameters:
{'clf__max_depth': (None, 5, 10, 15),
 'clf__min_samples_split': (2, 4, 6),
 'clf__n_estimators': (100, 1000, 5000),
 'tfidf__max_df': (0.9, 0.95, 1.0),
 'tfidf__max_features': (None, 5000, 10000, 50000),
 'tfidf__min_df': (0.1, 0.05, 0.0),
 'tfidf__ngram_range': ((1, 1), (1, 2), (2, 2)),
 'tfidf__norm': ('l1', 'l2'),
 'tfidf__stop_words': ('english', None)}
Fitting 5 folds for each of 100 candidates, totalling 500 fits


80 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/mehradghassemi/twitter_sentiment/env/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/mehradghassemi/twitter_sentiment/env/lib/python3.9/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/Users/mehradghassemi/twitter_sentiment/env/lib/python3.9/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/Users/mehradghassemi/twitter_sentiment/

done in 2558.738s

Best score: 0.741
Best parameters set:
	clf__max_depth: None
	clf__min_samples_split: 4
	clf__n_estimators: 100
	tfidf__max_df: 1.0
	tfidf__max_features: 10000
	tfidf__min_df: 0.0
	tfidf__ngram_range: (1, 2)
	tfidf__norm: 'l1'
	tfidf__stop_words: None


In [9]:
# Persist the best pipeline found by the search (vectorizer + classifier together)
# so it can be reloaded later with joblib.load.
dump(value=random_search.best_estimator_, filename='../models/sklearn_model.pkl')

['../models/sklearn_model.pkl']