Name in Repo : SentimentAnalyzerW2V2.py

# Imports

In [None]:
# Mount Google Drive at /content/drive so the corpus and embedding files
# referenced by the absolute paths below are readable. Colab-only: requires
# interactive authorization when the cell runs.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import math
from gensim.models import word2vec
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score, precision_score
from gensim.models.fasttext import FastText

In [None]:
# Load the semicolon-separated comment corpus. Downstream cells read the
# "comment" and "label" columns from it.
# `sep` is passed by keyword: recent pandas releases accept only the path
# positionally.
comments = pd.read_csv(
    "/content/drive/My Drive/University/FYP/Sentiment Analysis/Implementation/corpus/analyzed/comments_all_remove_all_punc_keep_question.csv",
    sep=";",
)

# 80/20 train/test split; the fixed seed keeps the split reproducible.
trainData, testData = train_test_split(comments, test_size=0.2, random_state=0)

# Number of training comments (rows), used as N in idf = log10(N / df).
# DataFrame.size would be rows * columns and therefore overstate N.
no_of_train_samples = len(trainData)

In [None]:
def main():
    """Run every classifier on plain-averaged and then TF-IDF-weighted vectors.

    For each weighting mode (plain first, TF-IDF second) the train/test
    feature matrices are rebuilt, their ``.size`` (total element count, i.e.
    rows * columns) is printed, and the four classifiers are fitted and
    evaluated in a fixed order: random forest, SVM, logistic regression,
    naive Bayes.
    """
    # generate_word2vec_model()
    train_labels = trainData["label"]
    test_labels = testData["label"]

    evaluators = (
        clssify_using_random_forest,
        clssify_using_svm,
        clssify_using_logistic_regression,
        clssify_using_naive_bayes,
    )

    for use_tfidf in (False, True):
        train_data_vecs, test_data_vecs = get_train_test_data_vecs(use_tfidf)
        print("train data size = %d, test data size = %d\n" % (train_data_vecs.size, test_data_vecs.size))
        for evaluate in evaluators:
            evaluate(train_data_vecs, test_data_vecs, train_labels, test_labels)

In [None]:
def calculate_idf(train_comments):
    """Count, for every word, how many training comments contain it.

    Despite the name, the returned values are raw document frequencies; the
    logarithmic idf itself is computed by the caller.

    Each comment contributes at most 1 to a word's count, no matter how many
    times the word is repeated inside that comment.

    Args:
        train_comments: iterable of comment strings (whitespace-tokenised
            with ``str.split()``).

    Returns:
        dict mapping word -> number of comments in which the word occurs.
    """
    print("calculation idf scores")
    word2doc_frequency = {}
    for comment in train_comments:
        # dict.fromkeys de-duplicates the tokens of this comment while keeping
        # first-occurrence order, so the resulting dict order is deterministic.
        for word in dict.fromkeys(comment.split()):
            word2doc_frequency[word] = word2doc_frequency.get(word, 0) + 1
    return word2doc_frequency

In [None]:
# split a comment into sentences of words
def to_separate_sentences(comment):
    """Split a comment on '.' and tokenise each resulting piece.

    The comment is converted with ``str()`` first. Pieces of two characters
    or fewer (counted before tokenising, whitespace included) are dropped.

    Returns:
        list of word lists, one per retained piece.
    """
    return [
        piece.split()
        for piece in str(comment).split(".")
        if len(piece) > 2
    ]

In [None]:
# to word list
def to_word_list(comment):
    """Return the whitespace-separated tokens of ``comment`` as a list."""
    words = comment.split()
    return words

In [None]:
# make a feature vector from a single comment
def make_feature_vec(words, model, num_features):
    """Average the embedding vectors of the in-vocabulary words of one comment.

    Args:
        words: iterable of tokens for a single comment.
        model: trained gensim embedding model exposing ``model.wv``.
        num_features: dimensionality of the embedding vectors.

    Returns:
        1-D array of length ``num_features``: the mean vector of the known
        words, or all zeros when no word of the comment is in the vocabulary.
    """
    keyed_vectors = model.wv
    # Membership is tested against the model's existing word->index dict
    # instead of building a fresh set of the whole vocabulary on every call.
    # gensim >= 4 names the dict `key_to_index`; gensim 3.x names it `vocab`.
    vocabulary = getattr(keyed_vectors, "key_to_index", None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab

    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    for word in words:
        if word in vocabulary:
            nwords += 1.
            # Index through `.wv`: indexing the model object itself is
            # deprecated in gensim 3.x and removed in gensim 4.
            feature_vec = np.add(feature_vec, keyed_vectors[word])

    # we have some one word comments that is not included in original model, todo expand original model or remove them
    if nwords != 0:
        feature_vec = np.divide(feature_vec, nwords)

    return feature_vec

In [None]:
# make a tfidf feature vector from a single comment
def make_feature_vec_tfidf(words, model, num_features, word2doc_frequency, num_docs=None):
    """Average idf-weighted embedding vectors of the known words of one comment.

    A word contributes only if it is both in the embedding vocabulary and in
    ``word2doc_frequency``. Each contributing vector is scaled by
    ``log10(num_docs / document_frequency)`` and the sum is divided by the
    number of contributing words.

    Args:
        words: iterable of tokens for a single comment.
        model: trained gensim embedding model exposing ``model.wv``.
        num_features: dimensionality of the embedding vectors.
        word2doc_frequency: dict mapping word -> number of training comments
            containing it.
        num_docs: total number of training comments used as N in the idf
            formula. Defaults to the module-level ``no_of_train_samples``.

    Returns:
        1-D array of length ``num_features``; all zeros when no word
        contributes.
    """
    if num_docs is None:
        num_docs = no_of_train_samples

    keyed_vectors = model.wv
    # Membership is tested against the model's existing word->index dict
    # instead of building a fresh set of the whole vocabulary on every call.
    # gensim >= 4 names the dict `key_to_index`; gensim 3.x names it `vocab`.
    vocabulary = getattr(keyed_vectors, "key_to_index", None)
    if vocabulary is None:
        vocabulary = keyed_vectors.vocab

    feature_vec = np.zeros((num_features,), dtype="float32")
    nwords = 0.
    for word in words:
        if word in vocabulary and word in word2doc_frequency:
            nwords += 1.
            idf = math.log10(num_docs / word2doc_frequency[word])
            # Index through `.wv`: indexing the model object itself is
            # deprecated in gensim 3.x and removed in gensim 4.
            feature_vec = np.add(feature_vec, keyed_vectors[word] * idf)

    # we have some one word comments that is not included in original model, todo expand original model or remove them
    if nwords != 0:
        feature_vec = np.divide(feature_vec, nwords)

    return feature_vec

In [None]:
# get a list of feature vectors for all comments
def get_avg_feature_vecs(reviews, model, num_features):
    """Build one averaged feature vector per tokenised comment.

    Args:
        reviews: sized sequence of token lists, one per comment.
        model: embedding model forwarded to ``make_feature_vec``.
        num_features: dimensionality of the embedding vectors.

    Returns:
        float32 array of shape ``(len(reviews), num_features)``; row *i* is
        the vector computed for ``reviews[i]``.
    """
    total = len(reviews)
    review_feature_vecs = np.zeros((total, num_features), dtype="float32")
    for position, review in enumerate(reviews):
        # Progress line every 1000 comments (including the very first one).
        if position % 1000 == 0:
            print("Review %d of %d" % (position, total))
        review_feature_vecs[position] = make_feature_vec(review, model, num_features)
    return review_feature_vecs

In [None]:
# get a list of tf-idf weighted feature vectors for all comments
def get_avg_feature_vecs_tfidf(reviews, model, num_features, word2doc_frequency):
    """Build one TF-IDF-weighted feature vector per tokenised comment.

    Args:
        reviews: sized sequence of token lists, one per comment.
        model: embedding model forwarded to ``make_feature_vec_tfidf``.
        num_features: dimensionality of the embedding vectors.
        word2doc_frequency: dict mapping word -> document frequency,
            forwarded unchanged to ``make_feature_vec_tfidf``.

    Returns:
        float32 array of shape ``(len(reviews), num_features)``; row *i* is
        the vector computed for ``reviews[i]``.
    """
    total = len(reviews)
    review_feature_vecs = np.zeros((total, num_features), dtype="float32")
    for position, review in enumerate(reviews):
        # Progress line every 1000 comments (including the very first one).
        if position % 1000 == 0:
            print("Review %d of %d" % (position, total))
        review_feature_vecs[position] = make_feature_vec_tfidf(
            review, model, num_features, word2doc_frequency
        )
    return review_feature_vecs


In [None]:
def get_train_test_data_vecs(tfidf):
    """Load the FastText model and vectorise the train and test comments.

    Args:
        tfidf: when truthy, vectors are idf-weighted using document
            frequencies computed from the training comments only; otherwise
            plain averages are used.

    Returns:
        ``(train_data_vecs, test_data_vecs)``: two arrays with one
        300-dimensional row per comment of ``trainData`` / ``testData``.
    """
    num_features = 300
    # model = word2vec.Word2Vec.load("/content/drive/My Drive/UNI/FYP/Sentiment Analysis/Sentiment-tagger/corpus/analyzed/saved_models/Word2Vec/word2vec_model_skipgram_remove_1_300_10")
    model = FastText.load("/content/drive/My Drive/University/FYP/Sentiment Analysis/Implementation/word_embedding/fasttext/from_lankadeepa_comments_all_punct_removed/comments_all_skipgram_remove_puncuation_removed_300_10")

    # Whitespace-tokenise every comment up front.
    clean_train_comments = [comment.split() for comment in trainData["comment"]]
    clean_test_comments = [comment.split() for comment in testData["comment"]]

    if not tfidf:
        train_data_vecs = get_avg_feature_vecs(clean_train_comments, model, num_features)
        test_data_vecs = get_avg_feature_vecs(clean_test_comments, model, num_features)
        return train_data_vecs, test_data_vecs

    # Document frequencies come from the training split and are reused for
    # the test split.
    word2doc_frequency = calculate_idf(trainData["comment"])
    train_data_vecs = get_avg_feature_vecs_tfidf(clean_train_comments, model, num_features, word2doc_frequency)
    test_data_vecs = get_avg_feature_vecs_tfidf(clean_test_comments, model, num_features, word2doc_frequency)
    return train_data_vecs, test_data_vecs

In [None]:
def clssify_using_random_forest(train_data_vec, test_data_vec, train_labels, test_labels, random_state=0):
    """Fit a 100-tree random forest and print its test-set evaluation.

    Prints the confusion matrix (actual vs. predicted labels) followed by
    accuracy, precision and F1. The metrics are computed on labels binarised
    with a ``LabelBinarizer`` fitted on ``['NEGATIVE', 'POSITIVE']``.

    Args:
        train_data_vec: feature matrix for the training comments.
        test_data_vec: feature matrix for the test comments.
        train_labels: labels aligned with ``train_data_vec``.
        test_labels: labels aligned with ``test_data_vec``.
        random_state: seed forwarded to ``RandomForestClassifier`` so that
            repeated runs give the same forest and the same scores. Pass
            ``None`` for the previous unseeded behaviour.

    Returns:
        None. Results are only printed.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=random_state)
    print("Fitting a random forest to labeled training data...")
    forest = forest.fit(train_data_vec, train_labels)
    result = forest.predict(test_data_vec)

    confusion_matrix = pd.crosstab(test_labels, result, rownames=["Actual"], colnames=["Predicted"])
    print(confusion_matrix)

    # Encode the string labels as 0/1 so the binary metrics can be computed.
    label_binarizer = preprocessing.LabelBinarizer()
    label_binarizer.fit(['NEGATIVE', 'POSITIVE'])
    test_sentiment = label_binarizer.transform(test_labels)
    predict_sentiment = label_binarizer.transform(result)
    accuracy_str = str(accuracy_score(test_sentiment, predict_sentiment))
    precision_str = str(precision_score(test_sentiment, predict_sentiment))
    f1_score_str = str(f1_score(test_sentiment, predict_sentiment))
    print("Accuracy = %s \nPrecision = %s \nF1score = %s \n" % (accuracy_str, precision_str, f1_score_str))
    return

In [None]:
def clssify_using_svm(train_data_vec, test_data_vec, train_labels, test_labels):
    """Fit a linear-kernel SVM (C=1) and print its test-set evaluation.

    Prints the confusion matrix (actual vs. predicted labels) followed by
    accuracy, precision and F1. The metrics are computed on labels binarised
    with a ``LabelBinarizer`` fitted on ``['NEGATIVE', 'POSITIVE']``.

    Returns:
        None. Results are only printed.
    """
    classifier = SVC(C=1, kernel='linear')
    print("Fitting a SVM to labeled training data...")
    predictions = classifier.fit(train_data_vec, train_labels).predict(test_data_vec)

    print(pd.crosstab(test_labels, predictions, rownames=["Actual"], colnames=["Predicted"]))

    # Encode the string labels as 0/1 so the binary metrics can be computed.
    encoder = preprocessing.LabelBinarizer().fit(['NEGATIVE', 'POSITIVE'])
    y_true = encoder.transform(test_labels)
    y_pred = encoder.transform(predictions)
    scores = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )
    print("Accuracy = %s \nPrecision = %s \nF1score = %s \n" % scores)

In [None]:
def clssify_using_logistic_regression(train_data_vec, test_data_vec, train_labels, test_labels):
    """Fit a default-configured logistic regression and print its evaluation.

    Prints the confusion matrix (actual vs. predicted labels) followed by
    accuracy, precision and F1. The metrics are computed on labels binarised
    with a ``LabelBinarizer`` fitted on ``['NEGATIVE', 'POSITIVE']``.

    Returns:
        None. Results are only printed.
    """
    classifier = LogisticRegression()
    print("Fitting a logistic regression to labeled training data...")
    predictions = classifier.fit(train_data_vec, train_labels).predict(test_data_vec)

    print(pd.crosstab(test_labels, predictions, rownames=["Actual"], colnames=["Predicted"]))

    # Encode the string labels as 0/1 so the binary metrics can be computed.
    encoder = preprocessing.LabelBinarizer().fit(['NEGATIVE', 'POSITIVE'])
    y_true = encoder.transform(test_labels)
    y_pred = encoder.transform(predictions)
    scores = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )
    print("Accuracy = %s \nPrecision = %s \nF1score = %s \n" % scores)

In [None]:
def clssify_using_naive_bayes(train_data_vec, test_data_vec, train_labels, test_labels):
    """Fit a Gaussian naive Bayes classifier and print its test-set evaluation.

    Prints the confusion matrix (actual vs. predicted labels) followed by
    accuracy, precision and F1. The metrics are computed on labels binarised
    with a ``LabelBinarizer`` fitted on ``['NEGATIVE', 'POSITIVE']``.

    Returns:
        None. Results are only printed.
    """
    classifier = GaussianNB()
    print("Fitting a naive bayes to labeled training data...")
    predictions = classifier.fit(train_data_vec, train_labels).predict(test_data_vec)

    print(pd.crosstab(test_labels, predictions, rownames=["Actual"], colnames=["Predicted"]))

    # Encode the string labels as 0/1 so the binary metrics can be computed.
    encoder = preprocessing.LabelBinarizer().fit(['NEGATIVE', 'POSITIVE'])
    y_true = encoder.transform(test_labels)
    y_pred = encoder.transform(predictions)
    scores = (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred),
        f1_score(y_true, y_pred),
    )
    print("Accuracy = %s \nPrecision = %s \nF1score = %s \n" % scores)


In [None]:
# Run the full experiment: every classifier on plain-averaged vectors, then
# again on TF-IDF-weighted vectors. All cells above must have been executed.
main()