In [4]:
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import malaya
import re
from statistics import mean
import pickle

mly = malaya.sentiment.multinomial()
sia = SentimentIntensityAnalyzer()

malay_stopwords = np.load('data/malay_stopwords.npy')
positive_malay_words = np.load('data/positive_malay_words.npy')
negative_malay_words = np.load('data/negative_malay_words.npy')
positive_word_top_100 = np.load('data/positive_word_top_100.npy')
negative_word_top_100 = np.load('data/negative_word_top_100.npy')
learned_positive_words = np.load('data/learned_positive_words.npy')
learned_negative_words = np.load('data/learned_negative_words.npy')

def get_features(tweet):
    features = dict()
    top_positive_words_frequency = 0
    top_negative_words_frequency = 0
    positive_malay_words_frequency = 0
    negative_malay_words_frequency = 0
    learned_positive_words_frequency = 0
    learned_negative_words_frequency = 0
    english_compound_scores = list()
    english_positive_scores = list()
    english_negative_scores = list()
    malaya_positive_scores = list()
    malaya_negative_scores = list()

    for sentence in nltk.sent_tokenize(tweet):
        #removes hyperlinks and twitter mentions
        sentence = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','', sentence)
        sentence = re.sub('(@[A-Za-z0-9_]+)','', sentence)

        for word in nltk.word_tokenize(sentence):
            if word.lower() in positive_word_top_100:
                top_positive_words_frequency += 1
            if word.lower() in negative_word_top_100:
                top_negative_words_frequency += 1

            if word.lower() in positive_malay_words:
                positive_malay_words_frequency += 1
            if word.lower() in negative_malay_words:
                negative_malay_words_frequency += 1

            if word.lower() in learned_positive_words:
                learned_positive_words_frequency += 1
            if word.lower() in learned_negative_words:
                learned_negative_words_frequency += 1

        malaya_positive_scores.append(mly.predict_proba([sentence])[0]["positive"])
        malaya_negative_scores.append(mly.predict_proba([sentence])[0]["negative"])

        english_compound_scores.append(sia.polarity_scores(sentence)["compound"])
        english_positive_scores.append(sia.polarity_scores(sentence)["pos"])
        english_negative_scores.append(sia.polarity_scores(sentence)["neg"])

    features["malaya_mean_positive"] = mean(malaya_positive_scores)
    features["malaya_mean_negative"] = mean(malaya_negative_scores)
    
    features["english_mean_compound"] = mean(english_compound_scores) + 1
    features["english_mean_positive"] = mean(english_positive_scores)
    features["english_mean_negative"] = mean(english_negative_scores)
    
    features["top_positive_words_frequency"] = top_positive_words_frequency
    features["top_negative_words_frequency"] = top_negative_words_frequency

    features["positive_malay_words_frequency"] = positive_malay_words_frequency
    features["negative_malay_words_frequency"] = negative_malay_words_frequency

    features["learned_positive_words_frequency"] = learned_positive_words_frequency
    features["learned_negative_words_frequency"] = learned_negative_words_frequency

    return features

In [11]:
loaded_classifier = pickle.load(open('models/KNeighborsClassifier.pickle', 'rb'))

df = pd.read_csv('datasets/comments.csv')
df = df.sample(frac=1).reset_index(drop=True) #shuffles the dataframe

sentiment = np.array(df['Sentiment'])
comments = np.array(df['Comment'])

scorelist = []
results = []
for comment in comments:
    #scores = get_features(tweet)
    prediction = loaded_classifier.classify(get_features(comment))
    
    #scorelist.append(scores)
    results.append(prediction)

correct = 0
for i in range(0, len(results)):
    print(F"\nTweet: {comments[i]}")
    #print(F"Scores: {scorelist[i]}")
    print(F"Prediction: {results[i]} | Sentiment: {sentiment[i]}")

    if results[i] == sentiment[i]:
        correct += 1

total = len(results)
accuracy = correct / total

print(F"\nTotal: {total} | Correct: {correct}")
print(F"Accuracy: {accuracy:.2%}")


Tweet: fak!! aku dh beli phase 1
Prediction: negative | Sentiment: negative

Tweet: The website previously mentioned no single dah VIP. Now what is this mannn???
Prediction: negative | Sentiment: negative

Tweet: DAMN!!!!
Prediction: negative | Sentiment: negative

Tweet: Anyone know when will dpr perform ?! 😭
Prediction: positive | Sentiment: positive

Tweet: GOODVIBES IS SUCH A JOKE NEXT TIME FOR SURE I BOIKOT WEH
Prediction: positive | Sentiment: negative

Tweet: Nak tgnk the strokes je
Prediction: negative | Sentiment: positive

Tweet: I wanna see rini so bad bruhhhh the ticket I couldn't
Prediction: negative | Sentiment: negative

Tweet: hi guys, im selling my 3 days pass for RM1500 (can pay in installments). lemme know if ure interested!
Prediction: negative | Sentiment: positive

Tweet: I thought there wasn’t gonna be a VIP single day pass which was why I bought a 3 day VIP. Lmao got scammed 💀 why tho :(
Prediction: negative | Sentiment: negative

Tweet: Hahahaha tiberr ni ada 

In [12]:
loaded_classifier = pickle.load(open('models/KNeighborsClassifier.pickle', 'rb'))

df = pd.read_csv('datasets/comments.csv')
df = df.sample(frac=1).reset_index(drop=True) #shuffles the dataframe

sentiment = np.array(df['Sentiment'])
comments = np.array(df['Comment'])

results = []
for comment in comments:
    prediction = loaded_classifier.classify(get_features(comment))
    results.append(prediction)

wrong = 0
for i in range(0, len(results)):
    if results[i] != sentiment[i]:
        print(F"\nTweet: {comments[i]}")
        print(F"Prediction: {results[i]} | Sentiment: {sentiment[i]}")
        wrong += 1

total = len(results)
print(F"\nTotal Predictions: {total} | Wrong Predictions: {wrong}")


Tweet: Anyone needs Ga tickets? I’m selling 3 tickets for a reduced price ❤️❤️👏
Prediction: negative | Sentiment: positive

Tweet: The strokes & The 1975 ❤️
Prediction: negative | Sentiment: positive

Tweet: Dah ada single day pass ni it’s up to you nak beli ke tak. Tak perlu nak royanan intelektual sangat lah. Dah ada options kan?😁🤷🏽‍♀️
Prediction: negative | Sentiment: positive

Tweet: IM GOING ON SATURDAY
Prediction: negative | Sentiment: positive

Tweet: yall better add in frank ocean
Prediction: negative | Sentiment: positive

Tweet: @ddrew.13 Saturday dpr hahhahah
Prediction: negative | Sentiment: positive

Tweet: Hi @goodvibesfest , may I know if we have to choose a specified date upon purchase tomorrow? Or the ticket allows just any one of the days that we can decide later?
Prediction: negative | Sentiment: positive

Tweet: Die die die die
Prediction: positive | Sentiment: negative

Tweet: @goodvibesfest can i upgrade my GA ticket to VIP (3day pass)? if yes, how do i make the 