In [1]:
# Load the cleaned train and test sets
import ast

import gensim
import numpy as np
import pandas as pd

# Read the four pre-split CSV files (features and labels for the train and
# test sets) from the current working directory, in the order they are unpacked.
x_train, y_train, x_test, y_test = (
    pd.read_csv(csv_name)
    for csv_name in ('x_train.csv', 'y_train.csv', 'x_test.csv', 'y_test.csv')
)

In [2]:
# Creating tagged document vectots to prepare to train and test the model
tagged_docs_train = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(x_train['cleanned_text'])]
tagged_docs_test = [gensim.models.doc2vec.TaggedDocument(v, [i]) for i, v in enumerate(x_test['cleanned_text'])]

# Train a doc2vec model
d2v_model = gensim.models.Doc2Vec(tagged_docs_train,
                                vector_size = 100,
                                window = 5,
                                min_count = 2)

In [3]:
# Infer the vectors to be used in trainging and testing
train_vectors = [d2v_model.infer_vector(eval(v.words)) for v in tagged_docs_train]
test_vectors = [d2v_model.infer_vector(eval(v.words)) for v in tagged_docs_test]

In [4]:
# Fit a Random Forest classifier on the training-set vectors and evaluate it
# on the test-set vectors.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# A fixed random_state makes the forest (and therefore the reported metrics)
# reproducible across notebook runs.
rfc = RandomForestClassifier(random_state=42)
rfc_model = rfc.fit(train_vectors, y_train.values.ravel())

# Predict labels for the test set.
y_pred = rfc_model.predict(test_vectors)

# Use the one-dimensional 'label' column as ground truth for every metric, so
# that precision, recall and accuracy are all computed against the same object.
y_true = y_test['label']
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(round(precision, 3),
                                                  round(recall, 3), round(accuracy, 3)))

Precision: 0.925 / Recall: 0.385 / Accuracy: 0.907


In [None]:
# The Doc_2_Vec results are better than those of the Word_2_Vec model on all three metrics (precision, recall and
# accuracy), which is expected given Word_2_Vec's drawback of averaging the word vectors of each text message.
# However, the model has not beaten the TFIDF model. So far, adding more complexity to the model, as both the
# Word_2_Vec and Doc_2_Vec models do, has not resulted in any improvement over the TFIDF model.