<a href="https://colab.research.google.com/github/MahdiRahbar/NLP_SLU/blob/main/sentiment/WelshSentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import random
import math
from collections import defaultdict
import math
from typing import List
import os
import re

In [2]:
fn = os.path.join("..","data","train-v2.tsv")
handle = open(fn, "r")
res = list()
tweets = list()
for i in handle:
    entry = i.strip().split("\t")
    res.append(int(entry[0]))
    tweets.append(entry[1])

In [3]:
# i tried
def reformat_tweet(tweet):
    #get rid of mentions, links, hashtags and numbers
    tweet2 = re.sub("(@USER|\{URL\}|#\w*|[0-9]+)", "", tweet).strip()
    #lowercase everything
    tweet2 =  tweet2.lower()
    #tokenize
    tweet2 = tweet2.split(" ")
    
    clean_tweet = []
    for word in tweet2:
        #keep only face emojis and alphas
        relevant_piece = re.match("[\w\U00010000-\U0010ffff]+", word)
        if relevant_piece:
            clean_tweet.append(relevant_piece.group(0))
    return clean_tweet

def make_bag_of_words(tweets: List[list]): # list of list here
    bag_of_words = defaultdict(int)  # key-word, val-count
    for tweet in tweets:
        for word in tweet:
            bag_of_words[word] += 1
    return bag_of_words


In [4]:
#reference: pseudocode from the book
class NaiveBayes():
    def __init__(self, xVec: List[list], yVec: list, vocabs: defaultdict(int)):
        self.xVec = xVec #assuming xVec = yVec no missing data
        self.yVec = yVec
        self.classes = set(yVec)

        self.vocabs = vocabs #frequency list of all words appear in all docs
        self.prior = defaultdict(float) #dictionary holds prior probability for all classes
        self.log_like = defaultdict(lambda: defaultdict(float)) #dictionary holds log likelihood of a word, separated by class

    def _sep_by_class(self):
        big_doc = defaultdict(lambda: defaultdict(int))
        for i in range( len(self.xVec)) :
            c = self.yVec[i]
            for w in self.xVec[i]:
                big_doc[c][w] += 1
        return big_doc #big doc contains word count separated by class {c_1: {word1: count, word2:count,etc}, c_2: {word1: count, etc},...}

    def train(self, transform_func=math.log, smoothing=lambda x, y: (x/y) ):
        N_all = len(self.xVec)
        big_doc = self._sep_by_class()

        for c in self.classes:
            N_c = sum(1 for i in self.yVec if i == c)
            self.prior[c] = transform_func(N_c / N_all) #calculate prior

            words_in_c = big_doc[c]
            count_all_words_in_c = sum(words_in_c[w] for w in self.vocabs) #all words in a class

            for word in self.vocabs:
                count_word_in_c = words_in_c[word]
                
                # calculate likelihood

                self.log_like[word][c] = transform_func(smoothing(
                    count_word_in_c, count_all_words_in_c))
                    
    def test(self,tweet, impute_for_never_seen=0):
        bestClass = None
        bestLogPrior = None
        for c in self.classes:
            probability = self.prior[c]
            for w in tweet:
                if w in self.vocabs:
                    probability += self.log_like[w][c]
                else:
                    if impute_for_never_seen:
                        probability = impute_for_never_seen
            if bestLogPrior == None:
                bestLogPrior = probability
                bestClass = c
            elif bestLogPrior < probability:
                bestLogPrior = probability
                bestClass = c
        return bestClass


In [5]:


#divide train-test
import random
tweets_reformatted = list(map(lambda x: reformat_tweet(x), tweets))
random.seed(42)
train_indexes = random.sample(range(80000), 64000)
test_indexes = [i for i in range(80000) if i not in train_indexes]
trainX = [tweets_reformatted[i] for i in train_indexes]
trainY = [res[i] for i in train_indexes]
vocabs = make_bag_of_words(trainX)
testX = [tweets_reformatted[i] for i in test_indexes]
testY = [res[i] for i in test_indexes]



In [6]:
#smoothing add 1. im just experimenting some stuff here.
def smoothing_add_alpha_prob(c_w, count_all_words, alpha = 1): 
    return (c_w+alpha) * (count_all_words+len(vocabs))
nb = NaiveBayes(trainX, trainY, vocabs)
nb.train(smoothing = smoothing_add_alpha_prob)
#sum([nb.test(trainX[i]) == trainY[i] for i in range(len(trainX))])/len(trainX)
sum([nb.test(testX[i]) == testY[i] for i in range(len(testX))])/len(testX)

0.725875

In [7]:
def smoothing_add_alpha_prob(c_w, count_all_words, alpha=1):
    return (c_w+alpha) * (count_all_words+len(vocabs))


nb3 = NaiveBayes(trainX, trainY, vocabs)
nb3.train(smoothing=lambda x, y: smoothing_add_alpha_prob(x, y, alpha=0.4))
#sum([nb.test(trainX[i]) == trainY[i] for i in range(len(trainX))])/len(trainX)
sum([nb3.test(testX[i]) == testY[i] for i in range(len(testX))])/len(testX)


0.723375

In [8]:
#smoothing add alpha. im just experimenting some stuff here.
nb2 = NaiveBayes(trainX, trainY, vocabs)
nb2.train(smoothing=lambda x, y: smoothing_add_alpha_prob(x, y, alpha=0.5) )
# i am imputing the likelihood for words/ characters that were never seen before with N_1/N_v (ref: lecture)
all_count_ones = sum(nb2.vocabs[i] for i in nb2.vocabs if nb2.vocabs[i] == 1) #count all vocabs that appear once
impute_for_unknown = all_count_ones/len(vocabs)

sum([nb2.test(testX[i], impute_for_never_seen=impute_for_unknown) == testY[i]
    for i in range(len(testX))]) /len(testX)


0.7049375

In [11]:
# this should return either 0 (negative sentiment) or 1 (positive sentiment)
TRAIN_X = list(
    map(lambda x: reformat_tweet(x), tweets))
TRAIN_Y = res
PROVIDED_VOCABS = make_bag_of_words( TRAIN_X ) #this builds a vocabulary size based on 80k tweets
TRAINED_NB = NaiveBayes(TRAIN_X, TRAIN_Y, vocabs)
TRAINED_NB.train(smoothing=lambda x,
                 y: smoothing_add_alpha_prob(x, y, alpha=0.5))


def predict_from_scratch(tweet, trained_model= TRAINED_NB,vocabs=PROVIDED_VOCABS):
    return trained_model.test(tweet)


In [12]:
# this should return either 0 (negative sentiment) or 1 (positive sentiment)
def predict_anything_goes(tweet):
  # do something complicated here
  return random.randint(0,1)

In [47]:
def evaluate():
    total = 0
    correct_from_scratch = 0
    correct_anything_goes = 0
    testfile = open(os.path.join(".", "data", 'test.tsv'), 'r')
    testfile = testfile.readlines()
    
    for line in testfile:
        total += 1
        pieces = line.rstrip("\n").split("\t")
        test_sample = list(map(lambda x: reformat_tweet(x), [pieces[1]]))
        if predict_from_scratch(test_sample[0]) == int(pieces[0]):
            correct_from_scratch += 1

    return (correct_from_scratch/total)
  

In [48]:
evaluate()

0.76