<h4> Import Library

In [1]:
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,classification_report

<h4> Load and Prepare Train and Test Datasets from Pre-Split Dataset

In [2]:
X_train = pd.read_csv('../data/(E) X_train.csv')
X_test = pd.read_csv('../data/(F) X_test.csv')
y_train = pd.read_csv('../data/(G) y_train.csv')
y_test = pd.read_csv('../data/(H) y_test.csv')

In [3]:
y_train = y_train.ensemble_sentiment
y_test = y_test.ensemble_sentiment

<h4> Train the Word2Vec Model

In [4]:
w2v_model = Word2Vec(sentences=X_train['tweet'], vector_size=100, window=5, min_count=1, workers=4)
word_vectors = w2v_model.wv


<h4> Define Function to Calculate Average Word2Vec

In [5]:
def get_avg_word2vec(text, model, num_features):
    """ Calculate the average word2vec for each document """
    feature_vec = np.zeros((num_features,), dtype='float32')
    n_words = 0
    for word in text:
        if word in model.wv.index_to_key:
            n_words += 1
            feature_vec = np.add(feature_vec, model.wv[word])
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec


<h4> Get Feature Vectors for Train and Test Data

In [6]:
X_train_vecs = np.array([get_avg_word2vec(doc, w2v_model, 100) for doc in X_train['tweet']])
X_test_vecs = np.array([get_avg_word2vec(doc, w2v_model, 100) for doc in X_test['tweet']])

In [7]:
X_train_vecs = np.array([get_avg_word2vec(doc, w2v_model, 100) for doc in X_train['tweet']])
X_test_vecs = np.array([get_avg_word2vec(doc, w2v_model, 100) for doc in X_test['tweet']])

<h4> Decision Tree

In [8]:
clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train_vecs, y_train)
y_pred = clf.predict(X_test_vecs)
accuracy = accuracy_score(y_test, y_pred)
print("Classification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

    negative       0.66      0.64      0.65      1709
     neutral       0.33      0.35      0.34       704
    positive       0.27      0.28      0.27       504

    accuracy                           0.50      2917
   macro avg       0.42      0.42      0.42      2917
weighted avg       0.51      0.50      0.51      2917



<h4> Logistic Regression

In [9]:
lr_classifier = LogisticRegression( random_state=0)
lr_classifier.fit(X_train_vecs, y_train)
y_pred = lr_classifier.predict(X_test_vecs)
print("Classification Report:\n", classification_report(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

    negative       0.64      0.96      0.77      1709
     neutral       0.50      0.17      0.26       704
    positive       0.67      0.15      0.24       504

    accuracy                           0.63      2917
   macro avg       0.60      0.43      0.42      2917
weighted avg       0.61      0.63      0.55      2917



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<h4> Random Forest

In [10]:
rf_classifier = RandomForestClassifier( random_state=0)
rf_classifier.fit(X_train_vecs, y_train)
y_pred = rf_classifier.predict(X_test_vecs)
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

    negative       0.67      0.91      0.77      1709
     neutral       0.52      0.30      0.38       704
    positive       0.57      0.20      0.30       504

    accuracy                           0.64      2917
   macro avg       0.58      0.47      0.48      2917
weighted avg       0.61      0.64      0.60      2917

