In [1]:
# Loading the cleaned train and test sets
import ast

import gensim
import numpy as np
import pandas as pd

# Read the four pre-split CSV files (train/test features and labels) into DataFrames,
# in the same order as the names on the left-hand side.
x_train, y_train, x_test, y_test = (
    pd.read_csv(csv_name)
    for csv_name in ('x_train.csv', 'y_train.csv', 'x_test.csv', 'y_test.csv')
)

In [2]:
# Traning a Word2Vec model using gensim
w2v_model = gensim.models.Word2Vec(x_train,
                                   vector_size = 100,
                                   window = 5,
                                   min_count = 2)

In [3]:
# Replacing the words in all the text messages with their learned vectors through the Word_to_Vec model
words = set(w2v_model.wv.index_to_key)
x_train_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                    for ls in x_train['cleanned_text']], dtype=object)
x_test_vect = np.array([np.array([w2v_model.wv[i] for i in ls if i in words])
                    for ls in x_test['cleanned_text']], dtype=object)

In [4]:
# Computing the average of the word vectors for the words contained in each text message.
# Averaging collapses the set of word vectors of a message into one single vector.
def average_vectors(message_vectors, size):
    """Average the word vectors of every message into one fixed-length vector.

    Parameters
    ----------
    message_vectors : iterable of numpy arrays
        One array per message, holding one row per word vector (possibly empty).
    size : int
        Length of the zero vector used for messages with no word vectors.

    Returns
    -------
    list of numpy arrays
        One vector per message: the column-wise mean of its word vectors, or an array
        of `size` zeros if none of the words in the message were learned by the model.
    """
    return [vect.mean(axis = 0) if len(vect) != 0 else np.zeros(size, dtype = float)
            for vect in message_vectors]


# Take the dimensionality from the trained model instead of repeating the literal 100,
# so the zero vectors always match the length of the learned vectors.
vector_size = w2v_model.wv.vector_size

x_train_vect_avg = average_vectors(x_train_vect, vector_size)
x_test_vect_avg = average_vectors(x_test_vect, vector_size)

In [5]:
# Fit a Random Forest classifier over the training set vectors
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state makes the forest (bootstrap samples and feature sub-sampling)
# identical on every run, so the reported metrics can be reproduced.
rfc = RandomForestClassifier(random_state = 42)
# y_train is a one-column DataFrame; ravel() flattens it to the 1-D label array fit() expects.
rfc_model = rfc.fit(x_train_vect_avg, y_train.values.ravel())

In [6]:
# Predicting labels for the test set and evaluating the output
from sklearn.metrics import accuracy_score, precision_score, recall_score

y_pred = rfc_model.predict(x_test_vect_avg)
# Flatten the label frame once, the same way the training labels were flattened, so all
# three metrics are computed against the same 1-D array.
y_true = y_test.values.ravel()

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                         round(recall, 3),
                                                         round(accuracy, 3)))

Precision: 0.613 / Recall: 0.354 / Accuracy: 0.874


In [None]:
# The Word2Vec results are much worse than those of TF-IDF on all three metrics (precision, recall and accuracy).
# This is expected, as Word2Vec is not intended to create representations for sentences. Here the word vectors of
# each sentence are averaged to create a representation for the sentence.