# Compare NLP Techniques: Build Model On word2vec Vectors

### Read In Cleaned Text

In [1]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('../../../data/X_train.csv')
X_test = pd.read_csv('../../../data/X_test.csv')
y_train = pd.read_csv('../../../data/y_train.csv')
y_test = pd.read_csv('../../../data/y_test.csv')

### Create word2vec Vectors

In [2]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   vector_size=100,
                                   window=5,
                                   min_count=2)

In [4]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index_to_key)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']], dtype=object)
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']], dtype=object)

In [5]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [9]:
# What does the unaveraged version look like?
X_train_vect[1]

array([[-8.6196875e-03,  3.6657380e-03,  5.1898835e-03,  5.7419371e-03,
         7.4669169e-03, -6.1676763e-03,  1.1056137e-03,  6.0472824e-03,
        -2.8400517e-03, -6.1735227e-03, -4.1022300e-04, -8.3689503e-03,
        -5.6000138e-03,  7.1045374e-03,  3.3525396e-03,  7.2256685e-03,
         6.8002464e-03,  7.5307419e-03, -3.7891555e-03, -5.6180713e-04,
         2.3483753e-03, -4.5190332e-03,  8.3887316e-03, -9.8581649e-03,
         6.7646410e-03,  2.9144168e-03, -4.9328329e-03,  4.3981862e-03,
        -1.7395759e-03,  6.7113829e-03,  9.9648498e-03, -4.3624449e-03,
        -5.9933902e-04, -5.6956387e-03,  3.8508223e-03,  2.7866268e-03,
         6.8910765e-03,  6.1010956e-03,  9.5384959e-03,  9.2734173e-03,
         7.8980681e-03, -6.9895051e-03, -9.1558648e-03, -3.5575390e-04,
        -3.0998420e-03,  7.8943158e-03,  5.9385728e-03, -1.5456629e-03,
         1.5109634e-03,  1.7900396e-03,  7.8175711e-03, -9.5101884e-03,
        -2.0553112e-04,  3.4691954e-03, -9.3897345e-04,  8.38177

In [10]:
# What does the averaged version look like?
X_train_vect_avg[1]

array([-3.7696115e-03,  1.6081532e-03,  5.1379632e-03,  7.7023380e-03,
       -2.5950030e-03, -6.7371563e-03,  4.3175681e-03,  7.8027053e-03,
       -4.1452772e-03, -4.7274330e-03,  4.2642136e-03, -4.2676637e-03,
       -4.9619740e-03,  6.7742453e-03, -1.5750803e-03,  1.8006569e-03,
        4.4460464e-03,  3.6074209e-03, -6.4867912e-03, -5.8940141e-03,
        5.3264098e-03,  1.2345440e-03,  7.4101090e-03, -3.4855469e-03,
        6.5163905e-03, -8.7745284e-04, -2.5409746e-03,  5.2204183e-03,
       -5.2088136e-03,  3.2289009e-04, -5.2100944e-04, -2.3030031e-03,
        5.4831356e-03, -6.6697551e-03,  1.4006710e-04, -4.7994639e-05,
        7.6028914e-03, -1.1180993e-03,  3.8424954e-03,  8.5712597e-04,
       -2.6029032e-03,  2.0857379e-04, -8.9180982e-03, -2.7773969e-03,
       -1.2609968e-03,  2.9800166e-03, -2.2213149e-03,  5.1505798e-03,
        3.5936192e-03,  6.2559014e-03, -1.7677225e-03, -1.1065968e-03,
       -2.5644589e-03,  1.8823992e-03,  4.7235815e-03,  6.7540200e-04,
      

### Fit RandomForestClassifier On Top Of Word Vectors

In [11]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [12]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [13]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.559 / Recall: 0.204 / Accuracy: 0.861
