## Sense check: Use Random Vectors

To give a sense of the improvement the Word2Vec vectors, which effectively embed words in a latent _semantic space_, give over a baseline, I repeat the exact same procedure using vectors ($\in \mathbb{R}^{300}$) whose elements are drawn from a random uniform distribution supported over $[-10, 10]$.

In [45]:
# Imports and Set Options

import csv  # for slang
import os
import re  # regex
import string  # punct
from pprint import pprint

import gensim
import keras
import lightgbm as lgb
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
from gensim.models import KeyedVectors, Word2Vec
from IPython.display import Image
from matplotlib import pyplot as plt
from nltk.corpus import stopwords  # stopwords
from nltk.stem import PorterStemmer  # stemming
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn import svm, tree
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              GradientBoostingClassifier,
                              RandomForestClassifier, RandomForestRegressor,
                              StackingClassifier)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import (accuracy_score, auc, average_precision_score,
                             brier_score_loss, classification_report,
                             confusion_matrix, f1_score, fbeta_score,
                             make_scorer, plot_precision_recall_curve,
                             precision_recall_curve, precision_score,
                             recall_score, roc_auc_score, roc_curve)
from sklearn.model_selection import (GridSearchCV, KFold, RandomizedSearchCV,
                                     cross_val_score, train_test_split)
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.svm import SVC  # "Support vector classifier"
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

import xgboost as xgb

# pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

%matplotlib inline

Using TensorFlow backend.


In [47]:
print(os.getcwd())
# os.listdir('..')

/content


In [3]:
# Read in cleaned data processed earlier

tweets = pd.read_csv("https://github.com/anilkeshwani/StatLearnProj/raw/master/Anil/clean_tweets_no_stemming.csv")

In [4]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


## Add clean text field containing only words known to pretrained Word2Vec

In [5]:
tweets["text_clean_known"] = tweets.text_clean.apply(func=lambda tweet: ' '.join([word for word in tweet.split() \
                                                                               if word in wv.vocab]))
print(f"Count of text_clean_known entries which are null: {sum(tweets.text_clean_known.isnull())}")
print(f"Count of text_clean_known entries which empty: \
# {sum(tweets.text_clean_known.apply(func=lambda x: x.strip() == ''))}")

# Remove both rows with either null or empty `text_clean_known` entries

tweets = tweets.loc[(~tweets.text_clean_known.isnull()), :]
tweets = tweets.loc[~tweets.text_clean_known.apply(func=lambda x: x.strip() == ''), ]

print("After cleaning:", end="\n")
print(f"Count of text_clean_known entries which are null: {sum(tweets.text_clean_known.isnull())}")
print(f"Count of text_clean_known entries which empty: {sum(tweets.text_clean_known.apply(func=lambda x: x.strip() == ''))}")

Count of text_clean_known entries which are null: 0
Count of text_clean_known entries which empty: # 3
After cleaning:
Count of text_clean_known entries which are null: 0
Count of text_clean_known entries which empty: 0


In [24]:
# Generate a dictionary containing all the words in the corpus of tweets

# {k: v for k, v in 
 
vocab = set()

for tweet in tweets.text_clean_known:
  for word in tweet.split():
    vocab.add(word.strip())

len(vocab)

14230

In [27]:
# Create random vector representations

rv = {k: np.random.uniform(low=-10.0, high=10.0, size=300) for k in vocab}

# Then we have our vectors infilled with random uniformly distributed elements

len(rv["hello"])

300

In [29]:
def RandomVectorizeTweet(tweet, rv=rv):
    tweet_vector = np.zeros(shape=(300,), dtype="float32")
    n_vectorizable = 0
    for word in tweet.split():
        try:
#             print(f"Adding {word} to word representation")
            tweet_vector = np.add(tweet_vector, rv[word])
            n_vectorizable += 1
        except KeyError:
            print(f"Could not vectorize {word}")
    return (tweet_vector/n_vectorizable)

In [30]:
tweets["rv"] = tweets.text_clean_known.apply(func=RandomVectorizeTweet)

In [31]:
tweets.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,username,user_handle,date,retweets,favorites,text,label,text_clean,text_clean_known,rv
0,0,0,WWF Climate & Energy,climateWWF,2020-04-28,11,22,Economic recovery and national climate pledges...,0,economic recovery national climate pledges mus...,economic recovery national climate pledges mus...,"[-1.6829878142677979, 0.8212961599045651, 0.30..."
1,1,1,WWF Climate & Energy,climateWWF,2020-04-22,6,16,"In this difficult time, it’s hard to connect w...",0,difficult time hard connect natural world eart...,difficult time hard connect natural world eart...,"[0.6293775065181396, 0.14288798573284223, 1.28..."
2,2,2,WWF Climate & Energy,climateWWF,2020-04-01,43,69,"The decision to postpone # COP26, is unavoidab...",0,decision postpone cop unavoidable collective p...,decision postpone cop unavoidable collective p...,"[-1.0888818806897693, -1.0935972221220953, 0.0..."
3,3,3,WWF Climate & Energy,climateWWF,2020-03-30,24,30,Japan - the world’s fifth largest emitter of g...,0,japan worlds fifth largest emitter greenhouse ...,japan worlds fifth largest emitter greenhouse ...,"[1.3617367872821422, -1.3378618822394823, 0.32..."
4,4,4,WWF Climate & Energy,climateWWF,2020-03-30,22,40,How can countries include # NatureBasedSolutio...,0,countries include naturebasedsolutions climate...,countries include climate plans new guidance o...,"[0.11711354888495744, 1.3867587750494383, 4.59..."


In [39]:
rv_train, rv_test, Y_train, Y_test = train_test_split(np.array(tweets.rv.tolist()), tweets.label, 
                                                    test_size=0.2, random_state=17, 
                                                    shuffle=True) # explicit default

In [41]:
rf_clf = RandomForestClassifier(oob_score=True)

rf_clf.fit(rv_train, Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=None,
                       verbose=0, warm_start=False)

In [48]:
rf_clf_cv_score = cross_val_score(rf_clf, rv_train, Y_train)

print(f"Training Set Accuracy: {rf_clf.score(rv_train, Y_train)}")
print(f"Out-of-Bag Score: {rf_clf.oob_score_}")

print(f"Cross-validated accuracy : {rf_clf_cv_score}") 
print(f"Mean CV accuracy : {np.round(np.mean(rf_clf_cv_score), 3)}")

print(f"Test set score : {rf_clf.score(rv_test, Y_test)}")

Training Set Accuracy: 0.9997917245209664
Out-of-Bag Score: 0.7706192724243266
Cross-validated accuracy : [0.7965984  0.79451579 0.7764665  0.78028462 0.77083333]
Mean CV accuracy : 0.784
Test set score : 0.7875590113857261


In [49]:
# Set up Hyperparameter Search Space

param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 100, None],
    'max_features': ['sqrt', 'log2'],
    'min_samples_leaf': [1, 3, 5],
    'min_samples_split': [2, 5, 10],
    'n_estimators': [50, 100, 300]
}

# Set Cross-Validation Process

kfcv = KFold(n_splits=5, shuffle=True, random_state=101)

In [50]:
# Instantiate the grid search model
rf_grid_search = GridSearchCV(estimator = rf_clf, param_grid = param_grid, 
                              cv = kfcv, n_jobs = -1, verbose = 2)

In [52]:
rf_grid_search.fit(rv_train, Y_train)

Fitting 5 folds for each of 162 candidates, totalling 810 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 12.0min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 48.5min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 98.7min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 170.9min
[Parallel(n_jobs=-1)]: Done 810 out of 810 | elapsed: 203.8min finished


GridSearchCV(cv=KFold(n_splits=5, random_state=101, shuffle=True),
             error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_f...
                                              n_estimators=100, n_jobs=None,
                                              oob_score=True, rando

In [53]:
# Best Score

print(f"Best Score: {rf_grid_search.best_score_}", end="\n"*2)

# Best Parameters

print("Best parameters:")
for k, v in rf_grid_search.best_params_.items():
    print(str(k) + ": " + str(v))

Best Score: 0.7926263305565198

Best parameters:
bootstrap: True
max_depth: None
max_features: sqrt
min_samples_leaf: 1
min_samples_split: 2
n_estimators: 300
