In [1]:
# Load the cleaned training and test sets
import gensim
import numpy as np
import pandas as pd

X_train = pd.read_csv('Data/X_train.csv')
X_test = pd.read_csv('Data/X_test.csv')
y_train = pd.read_csv('Data/y_train.csv')
y_test = pd.read_csv('Data/y_test.csv')

In [2]:
# Train a basic word2vec model
w2v_model = gensim.models.Word2Vec(X_train,
                                   size=100,
                                   window=5,
                                   min_count=2)

In [3]:
# Replace the words in each text message with the learned word vector
words = set(w2v_model.wv.index2word)
X_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_train['clean_text']])
X_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                         for ls in X_test['clean_text']])

In [4]:
# Average the word vectors for each sentence (and assign a vector of zeros if the model
# did not learn any of the words in the text message during training
X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(100, dtype=float))
        
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(100, dtype=float))

In [5]:
# What does the unaveraged version look like?
X_train_vect[0]

array([[-0.00468796, -0.0004517 , -0.00069944, ...,  0.00228134,
        -0.00179504, -0.00219155],
       [-0.00468796, -0.0004517 , -0.00069944, ...,  0.00228134,
        -0.00179504, -0.00219155],
       [ 0.00480211, -0.00120003,  0.00050045, ..., -0.00352571,
         0.00084311, -0.00253902],
       ...,
       [ 0.00480211, -0.00120003,  0.00050045, ..., -0.00352571,
         0.00084311, -0.00253902],
       [ 0.00480211, -0.00120003,  0.00050045, ..., -0.00352571,
         0.00084311, -0.00253902],
       [-0.00468796, -0.0004517 , -0.00069944, ...,  0.00228134,
        -0.00179504, -0.00219155]], dtype=float32)

In [6]:
# What does the averaged version look like?
X_train_vect_avg[0]

array([ 1.15207932e-03, -9.12214280e-04,  3.89521265e-05,  2.28104834e-03,
        1.19190954e-03,  2.46128417e-03, -1.85332645e-03, -1.03678212e-04,
       -4.83019650e-03,  4.56851150e-04, -9.48049419e-04,  3.21314624e-03,
        5.34684004e-05, -3.63794109e-03, -3.54786543e-03, -2.08948506e-03,
       -1.73820730e-03, -6.77558244e-04, -5.22984425e-04,  1.02996346e-04,
       -2.23536248e-04,  2.81362096e-03,  6.09840499e-04,  2.62840558e-03,
        9.01742198e-04,  4.16102877e-04, -2.20982730e-03, -1.93315977e-03,
       -1.65634803e-04, -2.82433908e-03, -3.25160893e-03,  7.89637095e-04,
       -2.32550222e-03,  3.34539916e-03, -1.57583982e-03,  3.28973331e-03,
       -2.20187171e-03, -3.19124549e-03,  2.59583467e-03,  1.87047431e-03,
        4.29407111e-04, -3.37519706e-03, -2.46620597e-03, -3.25367693e-03,
        1.68316509e-03, -1.29128795e-03,  2.05495418e-03,  1.96213252e-03,
        2.68641883e-03,  1.67605851e-03, -2.77697691e-03,  1.23540103e-03,
        1.35625922e-03,  

In [7]:
# Instantiate and fit a basic Random Forest model on top of the vectors
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

In [8]:
# Use the trained model to make predictions on the test data
y_pred = rf_model.predict(X_test_vect_avg)

In [9]:
# Evaluate the predictions of the model on the holdout test set
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test['label']).sum()/len(y_pred), 3)))

Precision: 0.544 / Recall: 0.224 / Accuracy: 0.857
