# THE GENSIM DOC2VEC MODEL

In [113]:
import gensim
#READING A DATA FILE (TAGS range from 0 to len(data))
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def read_corpus(fname, tokens_only=False):
    with open(fname, encoding="iso-8859-1") as f:
        for i, line in enumerate(f):
            tokens = gensim.utils.simple_preprocess(line)
            if tokens_only:
                yield tokens
            else:
                # For training data, add tags
                yield gensim.models.doc2vec.TaggedDocument(tokens, [i])


#THIS CREATES A TRAINING CORPUS
train_corpus = list(read_corpus("train.txt"))
#Create Validation corpus
validation_corpus = list(read_corpus("validation.txt"))
#text corpus
test_corpus = list(read_corpus("test.txt"))

In [114]:
#INSTANTIATE THE GENSIM DOC2VEC MODEL
#CREATE VOCABULARY
#TRAIN THE GENSIM MODEL USING train_corpus
#https://radimrehurek.com/gensim/auto_examples/tutorials/run_doc2vec_lee.html
model = gensim.models.doc2vec.Doc2Vec(vector_size=40, min_count=2, epochs=50)
model.build_vocab(train_corpus)
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)


# FEATURE VECTORS for a specific training instance in train_corpus using infer_vector()

In [115]:
vector = model.infer_vector(['the', 'welcome', 'housekeeping', 'room'])
print(vector)

[ 0.1305031  -0.05924479 -0.6960198   0.11471944 -0.01019582 -0.06324498
 -0.36004782 -0.48733127 -0.09323093 -0.3469856   0.40649232 -0.07180716
 -0.13098419  0.12916279  0.0580754   0.15191723  0.22735171  0.27661857
  0.02000582  0.33443126  0.02258343  0.0449286   0.09695464  0.593966
 -0.23650116  0.00817174 -0.5920629  -0.08337425  0.3251718  -0.00323302
  0.09217398 -0.2983118  -0.6494542   0.00921688 -0.09935842  0.4955143
  0.20736198 -0.23427123  0.40181515 -0.2439776 ]


In [116]:
inferred_vector = model.infer_vector(train_corpus[doc_id].words)


In [117]:
#stack all the vectors using numpy.vstack() to obtain X (features for classifier)
import numpy as np
features = np.vstack([model.infer_vector(doc.words) for doc in train_corpus])

In [118]:
#Read the labels in trainlabels.txt to obtain Y
y_labels = np.loadtxt("trainlabels.txt", dtype=int)

In [119]:
#stack all the vectors using numpy.vstack() to obtain X (features for classifier)
validation_features = np.vstack([model.infer_vector(doc.words) for doc in validation_corpus])
test_features = np.vstack([model.infer_vector(doc.words) for doc in test_corpus])
#Read the labels in validatelabels.txt to obtain V
v_labels = np.loadtxt('validationlabels.txt', dtype=int)
#Read the labels in testlabels.txt to obtain t
t_labels = np.loadtxt('testlabels.txt', dtype=int)

# NNET,LR,RF on train features

In [120]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate
import warnings 
warnings.filterwarnings("ignore")
NNet = MLPClassifier().fit(features, y_labels)
LR = LogisticRegression().fit(features, y_labels)
RF = RandomForestClassifier().fit(features, y_labels)

# NNet perfrmace on Validation dataset

In [121]:
#1.NNet on validation data
EvluationMetrics =  ['f1', 'precision', 'recall']
LRates = ["constant", "invscaling", "adaptive"]
validationScores = list()
for lr in LRates:
    NNet = MLPClassifier(learning_rate=lr)
    NNet_Per = cross_validate(NNet,validation_features, v_labels, cv=5, 
                                scoring=EvluationMetrics)
    validationScores.append(NNet_Per)
for score in validationScores:
    print('F1 Score: '+str(score['test_f1'].mean()))
    print('Precision_Score: ' +str(score['test_precision'].mean()))
    print('Recall_Score: '+str(score['test_recall'].mean()))




F1 Score: 0.6373862433862434
Precision_Score: 0.6551282051282051
Recall_Score: 0.6333333333333334
F1 Score: 0.6458603425559947
Precision_Score: 0.6482983682983682
Recall_Score: 0.65
F1 Score: 0.6814814814814814
Precision_Score: 0.6512820512820513
Recall_Score: 0.7166666666666667


# Random Forest perfrmace on Validation dataset

In [122]:
NE = [30,40,50]
for estimator in NE:
    RF = RandomForestClassifier(n_estimators=estimator,criterion='gini').fit(features, y_labels)
    RF_Per = cross_validate(RF,validation_features, v_labels, cv=5, 
                                scoring=EvluationMetrics)
    validationScores.append(RF_Per)

for score in validationScores:
    print('F1 Score: '+str(score['test_f1'].mean()))
    print('Precision_Score: ' +str(score['test_precision'].mean()))
    print('Recall_Score: '+str(score['test_recall'].mean()))

F1 Score: 0.6373862433862434
Precision_Score: 0.6551282051282051
Recall_Score: 0.6333333333333334
F1 Score: 0.6458603425559947
Precision_Score: 0.6482983682983682
Recall_Score: 0.65
F1 Score: 0.6814814814814814
Precision_Score: 0.6512820512820513
Recall_Score: 0.7166666666666667
F1 Score: 0.5659654185969976
Precision_Score: 0.625079365079365
Recall_Score: 0.5333333333333334
F1 Score: 0.5893333333333333
Precision_Score: 0.6044871794871794
Recall_Score: 0.5833333333333333
F1 Score: 0.5806493506493507
Precision_Score: 0.6202564102564103
Recall_Score: 0.55


# Logistic Regression perfrmace on Validation dataset

In [123]:

classRatio = [0,1]
lrValidationScores = list()
for ratio in classRatio:
    RF = LogisticRegression(l1_ratio=ratio, penalty='elasticnet', solver='saga')
    RF_Per = cross_validate(RF,validation_features, v_labels, cv=5, 
                                scoring=EvluationMetrics)
    lrValidationScores.append(RF_Per)
for score in lrValidationScores:
    print('F1 Score: '+str(score['test_f1'].mean()))
    print('Precision_Score: ' +str(score['test_precision'].mean()))
    print('Recall_Score: '+str(score['test_recall'].mean()))


F1 Score: 0.6438431577861864
Precision_Score: 0.6388591800356507
Recall_Score: 0.6666666666666667
F1 Score: 0.6661523783562764
Precision_Score: 0.6869477581242288
Recall_Score: 0.6666666666666667


# NNet perfrmace on Test dataset

In [124]:
EvluationMetrics =  ['f1', 'precision', 'recall']
LRates = ["constant", "invscaling", "adaptive"]
validationScores = list()
for lr in LRates:
    NNet = MLPClassifier(learning_rate=lr)
    NNet_Per = cross_validate(NNet,test_features, t_labels, cv=5, 
                                scoring=EvluationMetrics)
    validationScores.append(NNet_Per)
for score in validationScores:
    print('F1 Score: '+str(score['test_f1'].mean()))
    print('Precision_Score: ' +str(score['test_precision'].mean()))
    print('Recall_Score: '+str(score['test_recall'].mean()))

F1 Score: 0.7257575757575758
Precision_Score: 0.7271464646464647
Recall_Score: 0.7393939393939395
F1 Score: 0.6877846790890269
Precision_Score: 0.6926975638740345
Recall_Score: 0.7045454545454546
F1 Score: 0.7119047619047619
Precision_Score: 0.7059376571141277
Recall_Score: 0.740909090909091


# Random Forst perfrmace on Test dataset

In [125]:
NE = [30,40,50]
for estimator in NE:
    RF = RandomForestClassifier(n_estimators=estimator,criterion='gini').fit(features, y_labels)
    RF_Per = cross_validate(RF,test_features, t_labels, cv=5, 
                                scoring=EvluationMetrics)
    validationScores.append(RF_Per)

for score in validationScores:
    print('F1 Score: '+str(score['test_f1'].mean()))
    print('Precision_Score: ' +str(score['test_precision'].mean()))
    print('Recall_Score: '+str(score['test_recall'].mean()))

F1 Score: 0.7257575757575758
Precision_Score: 0.7271464646464647
Recall_Score: 0.7393939393939395
F1 Score: 0.6877846790890269
Precision_Score: 0.6926975638740345
Recall_Score: 0.7045454545454546
F1 Score: 0.7119047619047619
Precision_Score: 0.7059376571141277
Recall_Score: 0.740909090909091
F1 Score: 0.542855320118478
Precision_Score: 0.6023448773448774
Recall_Score: 0.5121212121212121
F1 Score: 0.6512987012987013
Precision_Score: 0.6778554778554777
Recall_Score: 0.6363636363636364
F1 Score: 0.5843603455368162
Precision_Score: 0.6814141414141414
Recall_Score: 0.55


# Logistic Regression perfrmace on test dataset

In [126]:
classRatio = [0,1]
lrValidationScores = list()
for ratio in classRatio:
    RF = LogisticRegression(l1_ratio=ratio, penalty='elasticnet', solver='saga')
    RF_Per = cross_validate(RF,test_features, t_labels, cv=5, 
                                scoring=EvluationMetrics)
    lrValidationScores.append(RF_Per)
for score in lrValidationScores:
    print('F1 Score: '+str(score['test_f1'].mean()))
    print('Precision_Score: ' +str(score['test_precision'].mean()))
    print('Recall_Score: '+str(score['test_recall'].mean()))

F1 Score: 0.7028706945228684
Precision_Score: 0.6968831168831169
Recall_Score: 0.7212121212121212
F1 Score: 0.7159762845849802
Precision_Score: 0.7240026640026639
Recall_Score: 0.7196969696969697


In this exercise, three classification algorithms were compared in a binary classification job.
When compared to Random forest and logistic regression classifiers, the MLP Classifier with the adjustable learning rate performs better.
More parameters can be experimented with to improve the performance of the classifiers using techniques such as GridSearch CV