In [1]:
import numpy as np
import re
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegressionCV
import os

In [2]:
veryNegative = []
Negative = []
Positive = []
veryPositive = []
data_X = ""
data_Y = ""

In [3]:
def generateStopWordList():

    #Fetch stopwords.txt file path
    stopWords_dataset = dirPath+"/Data/stopwords.txt"

    #stopwords List
    stopWords = []

    #Open the stopwords txt file read the data and store in a list
    try:
        fp = open(stopWords_dataset, 'r')
        line = fp.readline()
        while line:
            word = line.strip()
            stopWords.append(word)
            line = fp.readline()
        fp.close()
    except:
        print("ERROR: Opening File")

    return stopWords

In [4]:
def generate_AffinityList(datasetLink):

    affin_dataset = datasetLink
    try:
        affin_list = open(affin_dataset).readlines()
    except:
        print("", affin_dataset)
        exit(0)
    
    return affin_list


In [5]:
def createDictionaryFromPolarity(affin_list):

    # Create list to store the words and its polarity score
    words = []
    score = []

    # Iterate and assign words and thier polarities
    for word in affin_list:
        words.append(word.split("\t")[0].lower())
        score.append(int(word.split("\t")[1].split("\n")[0]))

    #Categorize words into different categories based on polarities
    for elem in range(len(words)):
        if score[elem] == -4 or score[elem] == -5:
            veryNegative.append(words[elem])
        elif score[elem] == -3 or score[elem] == -2 or score[elem] == -1:
            Negative.append(words[elem])
        elif score[elem] == 3 or score[elem] == 2 or score[elem] == 1:
            Positive.append(words[elem])
        elif score[elem] == 4 or score[elem] == 5:
            veryPositive.append(words[elem])

In [6]:
def preprocessing(dataSet):

    processed_data = []

    #Make a list of all the Stopwords to be removed
    stopWords = generateStopWordList()

    #For every TWEET in the dataset do,
    for tweet in dataSet:

        temp_tweet = tweet

        #Convert @username to USER_MENTION
        tweet = re.sub('@[^\s]+','USER_MENTION',tweet).lower()
        tweet.replace(temp_tweet, tweet)

        #Remove the unnecessary white spaces
        tweet = re.sub('[\s]+',' ', tweet)
        tweet.replace(temp_tweet,tweet)

        #Replace HASH (#) symbol in hastag
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)

        #Replace all the numeric terms
        tweet = re.sub('[0-9]+', "",tweet)
        tweet.replace(temp_tweet,tweet)

        #Remove all the STOP WORDS
        for sw in stopWords:
            if sw in tweet:
                tweet = re.sub(r'\b' + sw + r'\b'+" ","",tweet)

        tweet.replace(temp_tweet, tweet)

        #Replace all Punctuations
        tweet = re.sub('[^a-zA-z ]',"",tweet)
        tweet.replace(temp_tweet,tweet)

        #Remove additional white spaces
        tweet = re.sub('[\s]+',' ', tweet)
        tweet.replace(temp_tweet,tweet)

        #Save the Processed Tweet after data cleansing
        processed_data.append(tweet)

    return processed_data

In [7]:
def FeaturizeTrainingData(dataset, type_class):

    neutral_list = []
    i=0

    # For each Tweet split the Tweet by " " (space) i.e. split every word of the Tweet
    data = [tweet.strip().split(" ") for tweet in dataset]
    #print(data)

    # Feature Vector is to store the feature of the TWEETs
    feature_vector = []

    # for every sentence i.e. TWEET find the words and their category
    for sentence in data:
        # Category count for every Sentence or TWEET
        veryNegative_count = 0
        Negative_count = 0
        Positive_count = 0
        veryPositive_count = 0

        # for every word in sentence, categorize
        # and increment the count by 1 if found
        for word in sentence:
            if word in veryPositive:
                veryPositive_count = veryPositive_count + 1
            elif word in Positive:
                Positive_count = Positive_count + 1
            elif word in veryNegative:
                veryNegative_count = veryNegative_count + 1
            elif word in Negative:
                Negative_count = Negative_count + 1
        i+=1

        #Assign Class Label
        if veryPositive_count == veryNegative_count == Positive_count == Negative_count:
            feature_vector.append([veryPositive_count, Positive_count, Negative_count, veryNegative_count, "neutral"])
            neutral_list.append(i)
        else:
            feature_vector.append([veryPositive_count, Positive_count, Negative_count, veryNegative_count, type_class])

    #print(neutral_list)
    return feature_vector

In [8]:
def FeatureizeTestData(dataset):

    data = [tweet.strip().split(" ") for tweet in dataset]
    #print(data)
    count_Matrix = []
    feature_vector = []

    for sentence in data:
        veryNegative_count = 0
        Negative_count = 0
        Positive_count = 0
        veryPositive_count = 0

        # for every word in sentence, categorize
        # and increment the count by 1 if found
        for word in sentence:
            if word in veryPositive:
                veryPositive_count = veryPositive_count + 1
            elif word in Positive:
                Positive_count = Positive_count + 1
            elif word in veryNegative:
                veryNegative_count = veryNegative_count + 1
            elif word in Negative:
                Negative_count = Negative_count + 1

        if (veryPositive_count + Positive_count) > (veryNegative_count + Negative_count):
            feature_vector.append([veryPositive_count, Positive_count, Negative_count, veryNegative_count, "positive"])
            #neutral_list.append(i)
        elif (veryPositive_count + Positive_count) < (veryNegative_count + Negative_count):
            feature_vector.append([veryPositive_count, Positive_count, Negative_count, veryNegative_count, "negative"])
        else:
            feature_vector.append([veryPositive_count, Positive_count, Negative_count, veryNegative_count, "neutral"])

    return feature_vector

In [9]:
def classify_naive_bayes(train_X, train_Y, test_X):

    print("Classifying using Gaussian Naive Bayes ...")

    gnb = GaussianNB()
    yPred = gnb.fit(train_X,train_Y).predict(test_X)

    return yPred

def classify_svm(train_X, train_Y, test_X):

    print("Classifying using Support Vector Machine ...")

    clf = SVC()
    clf.fit(train_X,train_Y)
    yPred = clf.predict(test_X)

    return yPred

def classify_maxEnt(train_X, train_Y, test_X):

    print("Classifying using Maximum Entropy ...")
    maxEnt = LogisticRegressionCV()
    maxEnt.fit(train_X, train_Y)
    yPred = maxEnt.predict(test_X)

    return yPred


#########FOR TEST DATA CLASSIFICATION########


# For Twitter test data classification

In [10]:
def classify_naive_bayes_twitter(train_X, train_Y, test_X, test_Y):

    print("Classifying using Gaussian Naive Bayes Algorithm...")
    gnb = GaussianNB()
    yPred = gnb.fit(train_X,train_Y).predict(test_X)
    import pandas as pd
    pd.DataFrame(yPred).to_csv("/Users/vineethkumarrenukuntla/Desktop/vinks/Data/PredictionsforMortalKombat/gnbpredfile.csv")
    conf_mat = confusion_matrix(test_Y,yPred)
    print(conf_mat)
    Accuracy = (sum(conf_mat.diagonal())) / np.sum(conf_mat)
    print("Accuray: ", Accuracy)
    evaluate_classifier(conf_mat)


def classify_svm_twitter(train_X, train_Y, test_X, test_Y):

    print("Classifying using Support Vector Machine Algorithm...")
    clf = SVC()
    clf.fit(train_X,train_Y)
    yPred = clf.predict(test_X)
    import pandas as pd
    pd.DataFrame(yPred).to_csv("/Users/vineethkumarrenukuntla/Desktop/vinks/Data/PredictionsforMortalKombat/svmpredfile.csv")
    conf_mat = confusion_matrix(test_Y,yPred)
    print(conf_mat)
    Accuracy = (sum(conf_mat.diagonal())) / np.sum(conf_mat)
    print("Accuracy: ", Accuracy)
    evaluate_classifier(conf_mat)

def classify_maxEnt_twitter(train_X, train_Y, test_X, test_Y):

    print("Classifying using Maximum Entropy Algorithm...")
    maxEnt = LogisticRegressionCV()
    maxEnt.fit(train_X, train_Y)
    yPred = maxEnt.predict(test_X)
    import pandas as pd
    #Downloading the prediction to CSV file
    pd.DataFrame(yPred).to_csv("/Users/vineethkumarrenukuntla/Desktop/vinks/Data/PredictionsforMortalKombat/maxEntpredfile.csv")
    conf_mat = confusion_matrix(test_Y,yPred)
    print(conf_mat)
    Accuracy = (sum(conf_mat.diagonal())) / np.sum(conf_mat)
    print("Accuracy: ", Accuracy)
    evaluate_classifier(conf_mat)


In [11]:
def classify_twitter_data(file_name):

    test_data = open(dirPath+"/vinks/Data/"+file_name, encoding="utf8").readlines()
    test_data = preprocessing(test_data)
    test_data = FeatureizeTestData(test_data)
    test_data = np.reshape(np.asarray(test_data),newshape=(len(test_data),5))
    print(len(test_data))

    #Split Data into Features and Classes
    data_X_test = test_data[:,:4].astype(int)
    data_Y_test = test_data[:,4]

    print("Classifying",)
    classify_naive_bayes_twitter(data_X, data_Y, data_X_test, data_Y_test)
    classify_svm_twitter(data_X, data_Y, data_X_test, data_Y_test)
    classify_maxEnt_twitter(data_X, data_Y, data_X_test, data_Y_test)


In [12]:
def evaluate_classifier(conf_mat):
    Precision = conf_mat[0,0]/(sum(conf_mat[0]))
    Recall = conf_mat[0,0] / (sum(conf_mat[:,0]))
    F_Measure = (2 * (Precision * Recall))/ (Precision + Recall)

    print("Precision: ",Precision)
    print("Recall: ", Recall)
    print("F-Measure: ", F_Measure)

In [13]:
os.chdir('../')        #!!!!!IMPORTANT UNCOMMENT
dirPath = os.getcwd()

In [14]:
print("Please wait while we Classify your data ...")
affin_list = generate_AffinityList(dirPath+"/vinks/Data/Affin_Data.txt")
createDictionaryFromPolarity(affin_list)

Please wait while we Classify your data ...


In [15]:
print("Preprocessing in progress !")
negative_data = open(dirPath+"/vinks/Data/Rt-polarity-neg.txt").readlines()
positive_data = open(dirPath+"/vinks/Data/Rt-polarity-pos.txt").readlines()
positive_data = preprocessing(positive_data)
negative_data = preprocessing(negative_data)

Preprocessing in progress !
ERROR: Opening File
ERROR: Opening File


In [16]:
print("Generating the Feature Vectors ...")
positive_sentiment = FeaturizeTrainingData(positive_data, "positive")
negative_sentiment = FeaturizeTrainingData(negative_data,"negative")

Generating the Feature Vectors ...


In [17]:
final_data = positive_sentiment + negative_sentiment
final_data = np.reshape(np.asarray(final_data),newshape=(len(final_data),5))

In [18]:
data_X = final_data[:,:4].astype(int)
data_Y = final_data[:,4]

In [19]:
print("Training the Classifer according to the data provided ...")
print("Classifying the Test Data ...")
print("Evaluation Results will be displayed Shortly ...")

yPred = classify_naive_bayes(data_X, data_Y, data_X)
conf_mat = confusion_matrix(data_Y, yPred)
print(conf_mat)
Accuracy = (sum(conf_mat.diagonal())) / np.sum(conf_mat)
print("Accuracy: ", Accuracy)
evaluate_classifier(conf_mat)

yPred = classify_svm(data_X, data_Y, data_X)
conf_mat = confusion_matrix(data_Y, yPred)
print(conf_mat)
Accuracy = (sum(conf_mat.diagonal())) / np.sum(conf_mat)
print("Accuracy: ", Accuracy)
evaluate_classifier(conf_mat)

yPred = classify_maxEnt(data_X, data_Y, data_X)
conf_mat = confusion_matrix(data_Y, yPred)
print(conf_mat)
Accuracy = (sum(conf_mat.diagonal())) / np.sum(conf_mat)
print("Accuracy: ", Accuracy)
evaluate_classifier(conf_mat)

Training the Classifer according to the data provided ...
Classifying the Test Data ...
Evaluation Results will be displayed Shortly ...
Classifying using Gaussian Naive Bayes ...
[[2297    0 2239]
 [   1 1488    0]
 [ 843    0 3792]]
Accuracy:  0.7107879924953096
Precision:  0.5063932980599647
Recall:  0.7312957656797199
F-Measure:  0.5984108375667578
Classifying using Support Vector Machine ...
[[3013    0 1523]
 [   0 1488    1]
 [1362    0 3273]]
Accuracy:  0.7292682926829268
Precision:  0.6642416225749559
Recall:  0.6886857142857142
F-Measure:  0.676242845920772
Classifying using Maximum Entropy ...
[[2992    0 1544]
 [   1 1488    0]
 [1353    0 3282]]
Accuracy:  0.7281425891181988
Precision:  0.6596119929453262
Recall:  0.6884491486424298
F-Measure:  0.673722134654357


# Using trained models to depict the sentiment in Movie Tweets (#MortalKombatmovie) 

In [20]:
 classify_twitter_data(file_name="MortalKombattweets40K.txt")

ERROR: Opening File
40007
Classifying
Classifying using Gaussian Naive Bayes Algorithm...
[[ 6614     0   131]
 [  933 16449  2084]
 [  382     0 13414]]
Accuray:  0.9117654410478166
Precision:  0.9805782060785767
Recall:  0.8341531088409635
F-Measure:  0.9014583617282267
Classifying using Support Vector Machine Algorithm...
[[ 6639     0   106]
 [ 2547 16449   470]
 [  271     0 13525]]
Accuracy:  0.9151648461519234
Precision:  0.9842846553002224
Recall:  0.7020196679708153
F-Measure:  0.8195284532773731
Classifying using Maximum Entropy Algorithm...
[[ 6744     0     1]
 [ 2587 16449   430]
 [   71     0 13725]]
Accuracy:  0.9227885120103981
Precision:  0.9998517420311341
Recall:  0.7172941927249521
F-Measure:  0.8353254474515389
