<a href="https://colab.research.google.com/github/Jefffish09/MachineLearning/blob/dev/Classification/binary/rfc_tfidf_binary_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

References:

* https://www.kaggle.com/onadegibert/sentiment-analysis-with-tfidf-and-random-forest


In [1]:
import numpy as np
from tensorflow.keras.datasets import imdb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

In [2]:
seed = 2021
index_from = 3
vocab_size = 5000
max_len = 200

In [3]:
# Load the dataset
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size, index_from=index_from)

# Restore original text from imdb dataset
# https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
word2idx = imdb.get_word_index()
word2idx = {k: (v+index_from) for k, v in word2idx.items()}
word2idx["<PAD>"] = 0
word2idx["<START>"] = 1
word2idx["<UNK>"] = 2
word2idx["<UNUSED>"] = 3
idx2word = {value: key for key,value in word2idx.items()}

x_train_words = [" ".join(idx2word[id] for id in sent) for sent in x_train]
x_test_words = [" ".join(idx2word[id] for id in sent) for sent in x_test]


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [4]:
tfidf_vec = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.95,
    min_df=5,
    max_features=vocab_size,
    sublinear_tf=True
)

x_train_tfidf = tfidf_vec.fit_transform(x_train_words)
x_test_tfidf = tfidf_vec.transform(x_test_words)

In [5]:
rf = RandomForestClassifier(n_estimators=100, max_depth=None, n_jobs=-1, random_state=seed, verbose=1)
rf.fit(X=x_train_tfidf, y=y_train)

preds = rf.predict(X=x_test_tfidf)
report = classification_report(y_true=y_test, y_pred=preds, digits=4)
print(report)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   54.0s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.5s


              precision    recall  f1-score   support

           0     0.8390    0.8430    0.8410     12500
           1     0.8422    0.8382    0.8402     12500

    accuracy                         0.8406     25000
   macro avg     0.8406    0.8406    0.8406     25000
weighted avg     0.8406    0.8406    0.8406     25000



[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.1s finished


In [6]:
scores = cross_val_score(rf, X=x_train_tfidf, y=y_train, cv=10, verbose=1)
print(scores)
print(scores.mean())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   47.7s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   46.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Pa

[0.8296 0.8532 0.8428 0.8384 0.844  0.8404 0.8348 0.844  0.8448 0.8268]
0.83988


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  7.9min finished
