# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [2]:
%pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [6]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

### Create word2vec Vectors

In [8]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [11]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']])

  X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])


In [12]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [13]:
# What does the unaveraged version look like?
X_train_vect[0]

array([[-0.00861969,  0.00366574,  0.00518988, ..., -0.00239151,
        -0.00951009,  0.00450588],
       [-0.00053623,  0.00023643,  0.00510335, ..., -0.00704156,
         0.00090146,  0.00639253],
       [-0.00861969,  0.00366574,  0.00518988, ..., -0.00239151,
        -0.00951009,  0.00450588],
       ...,
       [-0.00053623,  0.00023643,  0.00510335, ..., -0.00704156,
         0.00090146,  0.00639253],
       [-0.00053623,  0.00023643,  0.00510335, ..., -0.00704156,
         0.00090146,  0.00639253],
       [-0.00053623,  0.00023643,  0.00510335, ..., -0.00704156,
         0.00090146,  0.00639253]], dtype=float32)

In [14]:
# What does the averaged version look like?
X_train_vect_avg[0]

array([-0.00521613,  0.00222182,  0.00515345,  0.00711766,  0.00040592,
       -0.00656731,  0.00335962,  0.00727916, -0.003756  , -0.00515872,
        0.00287008, -0.00549085, -0.00515227,  0.00687275, -0.00010544,
        0.00341864,  0.00514818,  0.00477753, -0.00568223, -0.00430371,
        0.00443823, -0.00048143,  0.00770198, -0.00538615,  0.00659043,
        0.00025346, -0.00325434,  0.00497519, -0.00417413,  0.00222823,
        0.00260635, -0.00291722,  0.00366906, -0.00637923,  0.00124678,
        0.00079742,  0.0073906 ,  0.00103499,  0.0055413 ,  0.00336725,
        0.00052897, -0.00193822, -0.00898901, -0.00205515, -0.00180942,
        0.00444568,  0.00021234,  0.00315346,  0.00297248,  0.00492398,
        0.00109105, -0.00361293, -0.00186092,  0.00235566,  0.00303475,
        0.00297379,  0.00711888,  0.00092663, -0.00190609,  0.00842118,
       -0.00560521,  0.00199206, -0.00442858, -0.00618105,  0.00144325,
        0.00414906,  0.00412423, -0.00100854,  0.00314859,  0.00

### Fit RandomForestClassifier On Top Of Word Vectors

In [15]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [16]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [17]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.55 / Recall: 0.22 / Accuracy: 0.871
