FinalW2vSvmCrossVal.py

In [21]:
# import sys
# sys.path.append('/content/drive/My Drive/University/FYP/Sentiment Analysis/Implementation/Sentiment-tagger')

In [22]:
# Root directory that the corpus CSV path and the Word2Vec model path are
# built from (see run_cross_val).
# NOTE(review): the path looks like a Colab Google Drive mount - confirm the
# drive is mounted before running.
folder_path = '/content/drive/My Drive/University/FYP/Sentiment Analysis/Implementation/'

In [23]:
# import W2VVectorizer
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, KFold

from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from prettytable import PrettyTable
from sklearn.svm import SVC

# Implementation

## W2VVectorizer

In [24]:
# -*- coding: utf-8 -*-
"""W2VVectorizer.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1dE0K8h8k4N_8gxyQmyLQ68FKS9A0F1PP
"""

import math
from collections import defaultdict

import numpy as np
from gensim.models import word2vec


class W2VVectorizer:
    """Vectorize comments by averaging the Word2Vec vectors of their words.

    Each comment is split on whitespace; words missing from the loaded
    Word2Vec vocabulary are ignored. When ``tfidf`` is truthy the average is
    weighted by inverse document frequency scores learned in
    ``fit_transform``.
    """

    def __init__(self, w2v_model_name, tfidf):
        """Load the Word2Vec model and store the weighting mode.

        Args:
            w2v_model_name: Path handed to ``word2vec.Word2Vec.load``.
            tfidf: Truthy to weight word vectors by IDF, falsy for a plain
                (unweighted) average.
        """
        self.model = word2vec.Word2Vec.load(w2v_model_name)
        self.idf_dict = None
        self.max_idf_score = 1
        self.tfidf = tfidf

    def fit_transform(self, train_documents):
        """Learn IDF weights (only in tfidf mode) and vectorize the documents.

        Args:
            train_documents: Iterable of whitespace-separated comment strings.

        Returns:
            A float32 array with one row per document and
            ``self.model.vector_size`` columns.
        """
        if self.tfidf:
            self.idf_dict, self.max_idf_score = self.calculate_idf(train_documents)
        tokenized = [comment.split() for comment in train_documents]
        return self.get_avg_feature_vectors(tokenized)

    def transform(self, test_documents):
        """Vectorize documents using the weights stored by ``fit_transform``.

        In tfidf mode ``fit_transform`` must have been called first, because
        the IDF table is only populated there.

        Args:
            test_documents: Iterable of whitespace-separated comment strings.

        Returns:
            A float32 array with one row per document and
            ``self.model.vector_size`` columns.
        """
        tokenized = [comment.split() for comment in test_documents]
        return self.get_avg_feature_vectors(tokenized)

    def calculate_idf(self, train_comments):
        """Compute ``log(N / df)`` for every word seen in the training comments.

        Args:
            train_comments: Iterable of whitespace-separated comment strings
                (any iterable works, not only pandas/numpy containers).

        Returns:
            A tuple ``(idf_dict, max_idf_score)``: the word -> IDF mapping and
            the largest IDF value in it (1.0 when no words were seen).
        """
        df_dict = defaultdict(int)
        # Count documents while iterating so plain lists are supported as
        # well as containers exposing ``.size``.
        n_documents = 0
        for comment in train_comments:
            n_documents += 1
            # set(): a word counts once per document, however often it repeats.
            for word in set(comment.split()):
                df_dict[word] += 1

        idf_dict = {
            word: math.log(n_documents / float(doc_freq))
            for word, doc_freq in df_dict.items()
        }
        # default= keeps an empty corpus from raising ValueError in max().
        max_idf_score = max(idf_dict.values(), default=1.0)
        return idf_dict, max_idf_score

    # get a list of feature vectors for all comments
    def get_avg_feature_vectors(self, reviews):
        """Build the feature matrix for a list of tokenized comments.

        Args:
            reviews: Sized sequence of word lists, one per comment.

        Returns:
            A float32 array of shape ``(len(reviews), self.model.vector_size)``.
        """
        # Build the vocabulary set once per batch instead of once per comment.
        vocabulary = set(self.model.wv.index2word)
        review_feature_vecs = np.zeros((len(reviews), self.model.vector_size), dtype="float32")
        for row, review in enumerate(reviews):
            review_feature_vecs[row] = self.comment_to_feature_vector(review, vocabulary)
        return review_feature_vecs

    # make a feature vector from a single comment
    def comment_to_feature_vector(self, words, vocabulary=None):
        """Return the (optionally IDF-weighted) mean vector of one comment.

        Args:
            words: Iterable of word strings belonging to one comment.
            vocabulary: Optional pre-built set of words known to the model;
                built from ``self.model.wv.index2word`` when omitted.

        Returns:
            A vector of length ``self.model.vector_size``. It is all zeros
            when no word contributes a non-zero weight.
        """
        if vocabulary is None:
            vocabulary = set(self.model.wv.index2word)
        # Look vectors up through ``wv``; indexing the model object itself is
        # the deprecated access path.
        word_vectors = self.model.wv

        feature_vec = np.zeros((self.model.vector_size,), dtype="float32")
        weight_sum = 0.
        for word in words:
            if word not in vocabulary:
                continue
            if self.tfidf:
                # Words unseen during fitting get the largest known IDF.
                weight = self.idf_dict.get(word, self.max_idf_score)
            else:
                weight = 1.
            weight_sum = weight_sum + weight
            feature_vec = np.add(feature_vec, word_vectors[word] * weight)

        # Some comments contain no word known to the model (or only
        # zero-weight words); leave their vector as zeros instead of
        # dividing by zero.
        # TODO: expand the original model or drop such comments.
        if weight_sum != 0:
            feature_vec = np.divide(feature_vec, weight_sum)

        return feature_vec

In [25]:
def main():
    """Run the cross-validation experiment and print the elapsed wall time."""
    started_at = time.time()
    run_cross_val()
    elapsed = time.time() - started_at
    print(f"Time taken for the process: {elapsed}")

In [26]:
def run_cross_val():
    """Run 10-fold cross-validation of the Word2Vec + SVM pipeline.

    Reads the tagged corpus CSV (``;``-separated, with ``comment`` and
    ``label`` columns), trains and evaluates one model per fold through
    ``fit_models``, then prints a summary table with one row per fold plus a
    ``final`` row computed over all out-of-fold predictions, followed by the
    overall confusion matrix.
    """
    w2v_model_path = folder_path + "word_embedding/word2vec/old/"
    # Pass the separator by keyword: recent pandas releases no longer accept
    # it as a positional argument.
    comments = pd.read_csv(folder_path + "corpus/analyzed/lankadeepa_tagged_2.csv", sep=";")
    pretty_table = PrettyTable(["Algorithm", "Accuracy", "Precision", "Recall", "F1_Score"])

    # Loading the Word2Vec model does not depend on the fold, so do it once.
    # fit_transform recomputes any fold-specific state on each call.
    vectorizer = W2VVectorizer(w2v_model_path + "word2vec_model_skipgram_remove300_10", False)

    all_predictions = []
    kf = KFold(n_splits=10)
    # start=1 so that the folds are reported as iteration1..iteration10.
    for fold, (train_index, test_index) in enumerate(kf.split(comments), start=1):
        # KFold yields positional indices, hence .iloc rather than label lookup.
        train_data_comments = comments["comment"].iloc[train_index]
        test_data_comments = comments["comment"].iloc[test_index]
        train_data_labels = comments["label"].iloc[train_index]
        test_data_labels = comments["label"].iloc[test_index]

        predictions = fit_models(vectorizer, train_data_comments, test_data_comments, train_data_labels, test_data_labels)
        all_predictions.extend(predictions.tolist())

        evaluation_metrics(test_data_labels, predictions, pretty_table, "iteration" + str(fold))

    # KFold without shuffling emits test folds in row order, so the
    # concatenated predictions line up with the original label column.
    evaluation_metrics(comments["label"], all_predictions, pretty_table, "final")
    print(pretty_table)
    print_confusion_matrix(comments["label"], all_predictions)

In [27]:
def fit_models(vectorizer, train_data_comments, test_data_comments, train_data_labels, test_data_labels):
    """Train a linear SVM on one train/test split and report its results.

    The comments are vectorized with ``vectorizer`` (fitted on the training
    comments), a linear-kernel ``SVC`` with ``C=1`` is trained, and the
    confusion matrix plus a one-row metrics table for the test split are
    printed.

    Returns:
        The predictions of the trained model for the test comments.
    """
    train_features = vectorizer.fit_transform(train_data_comments)
    test_features = vectorizer.transform(test_data_comments)

    classifier = SVC(C=1, kernel='linear')
    classifier.fit(train_features, train_data_labels)
    predictions = classifier.predict(test_features)

    results_table = PrettyTable(["Algorithm", "Accuracy", "Precision", "Recall", "F1_Score"])
    evaluation_metrics(test_data_labels, predictions, results_table, "SVM")
    print_confusion_matrix(test_data_labels, predictions)

    print(results_table)
    print("")
    return predictions

In [28]:
def evaluation_metrics(true_sentiment, predicted_sentiment, pretty_table, algorithm):
    """Append one row of accuracy/precision/recall/F1 to ``pretty_table``.

    Accuracy is computed on the raw labels; precision, recall and F1 are
    computed on labels binary-encoded by a ``LabelBinarizer`` fitted on
    ``'NEGATIVE'`` and ``'POSITIVE'``. All scores are stored as strings.

    Args:
        true_sentiment: Ground-truth labels.
        predicted_sentiment: Predicted labels, aligned with ``true_sentiment``.
        pretty_table: Table object receiving the new row via ``add_row``.
        algorithm: Text for the first column of the row.
    """
    binarizer = preprocessing.LabelBinarizer().fit(['NEGATIVE', 'POSITIVE'])
    encoded_truth = binarizer.transform(true_sentiment)
    encoded_prediction = binarizer.transform(predicted_sentiment)

    row = [algorithm, str(accuracy_score(true_sentiment, predicted_sentiment))]
    for metric in (precision_score, recall_score, f1_score):
        row.append(str(metric(encoded_truth, encoded_prediction)))
    pretty_table.add_row(row)

In [29]:
def print_confusion_matrix(label, prediction):
    """Print the confusion matrix twice: flattened first, then as a 2-D array."""
    matrix = confusion_matrix(label, prediction)
    for view in (matrix.ravel(), matrix):
        print(view)


In [30]:
# Kick off the experiment: runs the cross-validation and prints the timing.
main()

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[246  24  71 160]
[[246  24]
 [ 71 160]]
+-----------+-------------------+--------------------+--------------------+--------------------+
| Algorithm |      Accuracy     |     Precision      |       Recall       |      F1_Score      |
+-----------+-------------------+--------------------+--------------------+--------------------+
|    SVM    | 0.810379241516966 | 0.8695652173913043 | 0.6926406926406926 | 0.7710843373493975 |
+-----------+-------------------+--------------------+--------------------+--------------------+



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[225  13  51 212]
[[225  13]
 [ 51 212]]
+-----------+-------------------+--------------------+--------------------+--------------------+
| Algorithm |      Accuracy     |     Precision      |       Recall       |      F1_Score      |
+-----------+-------------------+--------------------+--------------------+--------------------+
|    SVM    | 0.872255489021956 | 0.9422222222222222 | 0.8060836501901141 | 0.8688524590163934 |
+-----------+-------------------+--------------------+--------------------+--------------------+



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[181  15  37 268]
[[181  15]
 [ 37 268]]
+-----------+--------------------+--------------------+--------------------+--------------------+
| Algorithm |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+-----------+--------------------+--------------------+--------------------+--------------------+
|    SVM    | 0.8962075848303394 | 0.9469964664310954 | 0.8786885245901639 | 0.9115646258503401 |
+-----------+--------------------+--------------------+--------------------+--------------------+



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[206  12  63 220]
[[206  12]
 [ 63 220]]
+-----------+--------------------+--------------------+--------------------+--------------------+
| Algorithm |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+-----------+--------------------+--------------------+--------------------+--------------------+
|    SVM    | 0.8502994011976048 | 0.9482758620689655 | 0.7773851590106007 | 0.8543689320388349 |
+-----------+--------------------+--------------------+--------------------+--------------------+



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[249  13  54 185]
[[249  13]
 [ 54 185]]
+-----------+--------------------+--------------------+--------------------+--------------------+
| Algorithm |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+-----------+--------------------+--------------------+--------------------+--------------------+
|    SVM    | 0.8662674650698603 | 0.9343434343434344 | 0.7740585774058577 | 0.8466819221967962 |
+-----------+--------------------+--------------------+--------------------+--------------------+



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[228  14  59 200]
[[228  14]
 [ 59 200]]
+-----------+--------------------+--------------------+--------------------+--------------------+
| Algorithm |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+-----------+--------------------+--------------------+--------------------+--------------------+
|    SVM    | 0.8542914171656687 | 0.9345794392523364 | 0.7722007722007722 | 0.8456659619450319 |
+-----------+--------------------+--------------------+--------------------+--------------------+



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[211  22  36 232]
[[211  22]
 [ 36 232]]
+-----------+--------------------+--------------------+--------------------+--------------------+
| Algorithm |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+-----------+--------------------+--------------------+--------------------+--------------------+
|    SVM    | 0.8842315369261478 | 0.9133858267716536 | 0.8656716417910447 | 0.8888888888888888 |
+-----------+--------------------+--------------------+--------------------+--------------------+



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[202  10  51 238]
[[202  10]
 [ 51 238]]
+-----------+--------------------+--------------------+--------------------+--------------------+
| Algorithm |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+-----------+--------------------+--------------------+--------------------+--------------------+
|    SVM    | 0.8782435129740519 | 0.9596774193548387 | 0.8235294117647058 | 0.8864059590316573 |
+-----------+--------------------+--------------------+--------------------+--------------------+



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[288  15  40 158]
[[288  15]
 [ 40 158]]
+-----------+--------------------+--------------------+-------------------+--------------------+
| Algorithm |      Accuracy      |     Precision      |       Recall      |      F1_Score      |
+-----------+--------------------+--------------------+-------------------+--------------------+
|    SVM    | 0.8902195608782435 | 0.9132947976878613 | 0.797979797979798 | 0.8517520215633424 |
+-----------+--------------------+--------------------+-------------------+--------------------+



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


[321  25  38 117]
[[321  25]
 [ 38 117]]
+-----------+-------------------+-------------------+--------------------+-------------------+
| Algorithm |      Accuracy     |     Precision     |       Recall       |      F1_Score     |
+-----------+-------------------+-------------------+--------------------+-------------------+
|    SVM    | 0.874251497005988 | 0.823943661971831 | 0.7548387096774194 | 0.787878787878788 |
+-----------+-------------------+-------------------+--------------------+-------------------+

+-------------+--------------------+--------------------+--------------------+--------------------+
|  Algorithm  |      Accuracy      |     Precision      |       Recall       |      F1_Score      |
+-------------+--------------------+--------------------+--------------------+--------------------+
|  iteration2 | 0.810379241516966  | 0.8695652173913043 | 0.6926406926406926 | 0.7710843373493975 |
|  iteration3 | 0.872255489021956  | 0.9422222222222222 | 0.8060836501901141 | 0.86