In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import malaya
import re
from statistics import mean
import pickle

# Sentiment back-ends used by get_features: malaya's multinomial sentiment
# model and NLTK's SentimentIntensityAnalyzer.
mly = malaya.sentiment.multinomial()
sia = SentimentIntensityAnalyzer()

# Arrays stored as .npy files under data/. get_features tests token membership
# against the four positive/negative arrays and iterates learned_words to build
# its contains(...) features; malay_stopwords is not referenced by the cells
# shown here. The files are loaded in the order listed.
(
    malay_stopwords,
    positive_malay_words,
    negative_malay_words,
    positive_word_top_100,
    negative_word_top_100,
    learned_words,
) = (
    np.load(npy_path)
    for npy_path in (
        'data/malay_stopwords.npy',
        'data/positive_malay_words.npy',
        'data/negative_malay_words.npy',
        'data/positive_word_top_100.npy',
        'data/negative_word_top_100.npy',
        'data/learned_words.npy',
    )
)

# Raw strings keep the backslashes in the pattern literal without relying on
# Python passing unknown escape sequences such as "\(" through unchanged.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_MENTION_PATTERN = re.compile(r'(@[A-Za-z0-9_]+)')


def _mean_or_zero(scores):
    """Return the mean of ``scores``, or 0.0 when the list is empty."""
    # statistics.mean raises StatisticsError on empty data, which happens when
    # the tweet contains no sentences at all (e.g. an empty string).
    return mean(scores) if scores else 0.0


def get_features(tweet):
    """Build the feature dictionary for a single tweet.

    The tweet is split into sentences; hyperlinks and @mentions are stripped
    from each sentence before it is scored by the malaya model (``mly``) and
    by NLTK's SentimentIntensityAnalyzer (``sia``), and before its tokens are
    counted against the positive/negative word arrays.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    dict
        Keys, in insertion order:

        * ``malaya_mean_positive`` / ``malaya_mean_negative`` - per-sentence
          mean of the "positive" / "negative" entries returned by
          ``mly.predict_proba``.
        * ``english_mean_compound`` - per-sentence mean of the "compound"
          entry of ``sia.polarity_scores``, plus 1.
        * ``english_mean_positive`` / ``english_mean_negative`` - per-sentence
          mean of the "pos" / "neg" entries.
        * ``top_positive_words_frequency`` / ``top_negative_words_frequency``
          and ``positive_malay_words_frequency`` /
          ``negative_malay_words_frequency`` - number of lower-cased tokens
          found in the corresponding array.
        * ``contains(<word>)`` for every entry of ``learned_words`` - whether
          that word is among the tokens of the raw tweet.

    Notes
    -----
    A tweet that yields no sentences produces 0.0 for every mean (so
    ``english_mean_compound`` is 1) instead of raising ``StatisticsError``.
    """
    features = dict()
    top_positive_words_frequency = 0
    top_negative_words_frequency = 0
    positive_malay_words_frequency = 0
    negative_malay_words_frequency = 0
    english_compound_scores = list()
    english_positive_scores = list()
    english_negative_scores = list()
    malaya_positive_scores = list()
    malaya_negative_scores = list()

    # NOTE(review): the contains(...) features use the tokens of the *raw*
    # tweet (links and mentions included, original casing). This is kept as-is
    # because the pickled classifier was presumably trained on exactly these
    # features - confirm before changing.
    word_tokens = set(nltk.word_tokenize(tweet))

    for sentence in nltk.sent_tokenize(tweet):
        # Remove hyperlinks and twitter mentions before scoring.
        sentence = _URL_PATTERN.sub('', sentence)
        sentence = _MENTION_PATTERN.sub('', sentence)

        for word in nltk.word_tokenize(sentence):
            lowered = word.lower()  # computed once, used for all four lookups
            if lowered in positive_word_top_100:
                top_positive_words_frequency += 1
            if lowered in negative_word_top_100:
                top_negative_words_frequency += 1
            if lowered in positive_malay_words:
                positive_malay_words_frequency += 1
            if lowered in negative_malay_words:
                negative_malay_words_frequency += 1

        # Score each sentence once per model and reuse the result, instead of
        # re-running the model for every field that is read from it.
        malaya_scores = mly.predict_proba([sentence])[0]
        malaya_positive_scores.append(malaya_scores["positive"])
        malaya_negative_scores.append(malaya_scores["negative"])

        english_scores = sia.polarity_scores(sentence)
        english_compound_scores.append(english_scores["compound"])
        english_positive_scores.append(english_scores["pos"])
        english_negative_scores.append(english_scores["neg"])

    features["malaya_mean_positive"] = _mean_or_zero(malaya_positive_scores)
    features["malaya_mean_negative"] = _mean_or_zero(malaya_negative_scores)

    # The +1 offset is part of the feature definition the classifier expects
    # (presumably to keep the value non-negative - TODO confirm).
    features["english_mean_compound"] = _mean_or_zero(english_compound_scores) + 1
    features["english_mean_positive"] = _mean_or_zero(english_positive_scores)
    features["english_mean_negative"] = _mean_or_zero(english_negative_scores)

    features["top_positive_words_frequency"] = top_positive_words_frequency
    features["top_negative_words_frequency"] = top_negative_words_frequency

    features["positive_malay_words_frequency"] = positive_malay_words_frequency
    features["negative_malay_words_frequency"] = negative_malay_words_frequency

    for word in learned_words:
        features[F"contains({word})"] = (word in word_tokens)

    return features

  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))


In [4]:
# Text of the tweet to classify.
new_tweet = ""

# NOTE: unpickling can execute arbitrary code - only load model files that
# come from a trusted source. The `with` block closes the file handle, which
# a bare pickle.load(open(...)) leaves open.
with open('models/MLPClassifier.pickle', 'rb') as model_file:
    loaded_classifier = pickle.load(model_file)

# Last expression of the cell: the predicted label is shown as the cell output.
loaded_classifier.classify(get_features(new_tweet))

'negative'

In [None]:
# Evaluate the pickled classifier on labelled tweets and report its accuracy.

# NOTE: unpickling can execute arbitrary code - only load model files that
# come from a trusted source. The `with` block closes the file handle, which
# a bare pickle.load(open(...)) leaves open.
with open('models/MLPClassifier.pickle', 'rb') as model_file:
    loaded_classifier = pickle.load(model_file)

# Candidate datasets. Only the frames listed in `dfs` below are evaluated;
# `df` (numeric labels mapped to strings) and `df2` are loaded so they can be
# added to that list.
df = pd.read_csv('datasets/duck_training.csv')
df = df.replace(-1, 'negative')
df = df.replace(0, 'neutral')
df = df.replace(1, 'positive')
df = df[df['TweetSentiment'] != 'neutral']

df2 = pd.read_csv('datasets/student_tweet_training.csv')

df3 = pd.read_csv('datasets/malay_dataset_twitter_training.csv')
df3 = df3[df3['TweetSentiment'] != 'neutral']

dfs = [df3]
df = pd.concat(dfs, ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)  # shuffles the dataframe

# Drop neutral rows from whatever ended up in `dfs` (df2 is not filtered above).
no_neutrals = df[df['TweetSentiment'] != 'neutral']
sentiment = np.array(no_neutrals['TweetSentiment'])
tweets = np.array(no_neutrals['TweetText'])

results = [loaded_classifier.classify(get_features(tweet)) for tweet in tweets]

correct = 0
for tweet, prediction, label in zip(tweets, results, sentiment):
    print(F"\nTweet: {tweet}")
    print(F"Prediction: {prediction} | Sentiment: {label}")

    if prediction == label:
        correct += 1

total = len(results)
# With no tweets left after filtering there is nothing to divide by; report
# NaN instead of raising ZeroDivisionError.
accuracy = correct / total if total else float('nan')

print(F"\nTotal: {total} | Correct: {correct}")
print(F"Accuracy: {accuracy:.2%}")

In [None]:
# List every tweet the pickled classifier gets wrong and count them.

# NOTE: unpickling can execute arbitrary code - only load model files that
# come from a trusted source. The `with` block closes the file handle, which
# a bare pickle.load(open(...)) leaves open.
with open('models/MLPClassifier.pickle', 'rb') as model_file:
    loaded_classifier = pickle.load(model_file)

# Candidate datasets. Only the frames listed in `dfs` below are evaluated;
# `df` (numeric labels mapped to strings) and `df2` are loaded so they can be
# added to that list.
df = pd.read_csv('datasets/duck_training.csv')
df = df.replace(-1, 'negative')
df = df.replace(0, 'neutral')
df = df.replace(1, 'positive')
df = df[df['TweetSentiment'] != 'neutral']

df2 = pd.read_csv('datasets/student_tweet_training.csv')

df3 = pd.read_csv('datasets/malay_dataset_twitter_training.csv')
df3 = df3[df3['TweetSentiment'] != 'neutral']

dfs = [df3]
df = pd.concat(dfs, ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)  # shuffles the dataframe

# Drop neutral rows from whatever ended up in `dfs` (df2 is not filtered above).
no_neutrals = df[df['TweetSentiment'] != 'neutral']
sentiment = np.array(no_neutrals['TweetSentiment'])
tweets = np.array(no_neutrals['TweetText'])

results = [loaded_classifier.classify(get_features(tweet)) for tweet in tweets]

wrong = 0
for tweet, prediction, label in zip(tweets, results, sentiment):
    # Only misclassified tweets are printed.
    if prediction != label:
        print(F"\nTweet: {tweet}")
        print(F"Prediction: {prediction} | Sentiment: {label}")
        wrong += 1

total = len(results)
print(F"\nTotal Predictions: {total} | Wrong Predictions: {wrong}")