## Dataset3 Sentiment Analysis YouTube Comments (Step2)
### Import Required Packages

In [1]:
from math import log
from bs4 import BeautifulSoup
import string
import re

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

#nltk.download('all')

In [2]:
def load_comments(fileName):
    text =  open(fileName,  encoding="utf8")
    line_count = sum(1 for line in open(fileName, encoding="utf8"))

    if "train" in fileName: 
        text = text.read() # read the text in the file
        checking = BeautifulSoup(text, 'lxml')  # pulling data out of HTML decoding
        text = checking.get_text()
    print ("File Type: ", type(text)) # return the type of content in the file str
    return text, line_count # return the whole content and number of lines

In [3]:
def emojis(comment):
    # Smile emojis
    comment = re.sub(r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' emojipositive ', comment)
    # Laugh emojis
    comment = re.sub(r'(:\s?D|:-D|x-?D|X-?D)', ' emojipositive ', comment)
    # Love emojis
    comment = re.sub(r'(<3|:\*)', ' emojipositive ', comment)
    # Wink emojis
    comment = re.sub(r'(;-?\)|;-?D|\(-?;)', ' emojipositive ', comment)
    # Sad emojis
    comment = re.sub(r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' emojinegative ', comment)
    # Cry emojis
    comment = re.sub(r'(:,\(|:\'\(|:"\()', ' emojinegative ', comment)
    
    return comment


def clean_words(stringL):
    
    comment = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', '', stringL)# urls removed
    comment = re.sub(r'@(\S+)', '', comment)      # @ removed
    comment = re.sub(r'#(\S+)', '', comment)      # hashtags removed
    comment = re.sub('\n', ' ', comment)            # more preprocessing
    comment = re.sub("\d+", " ", comment)           # replace numbers
    comment = re.sub(r'(.)\1+', r'\1\1', comment)   # more than two repeating charaters removed
    comment = re.sub(r'\s+', ' ', comment)          # replace additional whitespace
    comment = emojis(comment)                       # calls the emojis function
    
    tr = str.maketrans("", "", string.punctuation)
    comment = comment.translate(tr)   # remove punctuation from string
    
    text = comment.lower().split() # convert all tweets to lower-case
   
    """
    stop_words = stopwords.words("english")
    words = [word for word in text if word not in stop_words]   #removes stopwords that exist
    """
    # lemmatise: grouping together the different inflected forms of a word
    # so they can be analyzed as a single item.
    lmtzr = WordNetLemmatizer() 
    wordlist = [lmtzr.lemmatize(x) for x in text] # lemmatization
    """
    porter = PorterStemmer()
    wordlist2 = [porter.stem(x) for x in text]    # stemmer can be added if needed
    
    snowball = SnowballStemmer('english')
    wordlist3 = [snowball.stem(x) for x in words]
    """
    return wordlist

In [4]:
print (clean_words("It is very exciting okkkkkkkk www.in.ie @oo  #opoi"))
print (clean_words("It is toooooo bad\!        ll"))
print (clean_words("I am going to colleges :-)  "))

['it', 'is', 'very', 'exciting', 'okk']
['it', 'is', 'too', 'bad', 'll']
['i', 'am', 'going', 'to', 'college', 'emojipositive']


In [5]:
def dictionary_probabilities(vocabulary, text):
    wordcount = len(text)
    # creates a dictionary from the given sequence of keys and values.
    # initialze vocabulary to be the keys of the dictionary
    dictionary = dict.fromkeys(vocabulary,0) 
    
    for words in text:
        dictionary[words]=dictionary[words]+1 # calculate frequency of words in list
    
    for key in dictionary:
        #calculate the conditional log propbabilities using laplace smoothing
        dictionary[key]= log((dictionary[key] + 1) / (wordcount + len(vocabulary))) 
        
    return dictionary, wordcount

In [6]:
def test_accuracy(fileName, vocabulary, textNeg, textPos, priorNeg, priorPos): 
    
    neg_D, wordcount_Neg = dictionary_probabilities(vocabulary, textNeg)
    pos_D, wordcount_Pos = dictionary_probabilities(vocabulary, textPos)

    # load the testing file in
    fileObj =  open(fileName, encoding="utf8")
    # counts how many comments there are in file
    total_comments = sum(1 for line in open(fileName, encoding="utf8"))     
    text = fileObj.readlines()
    print ("Num test comment = ", len(text))
    
    correct = 0
    name = 0
    
    for line in text:
        checking = BeautifulSoup(line, 'lxml')    # HTML decoding    
        line = checking.get_text().lower().split()

        text = " ".join(line)
        comment = clean_words(text) # clean the testing data

        sumNeg = 0
        sumPos = 0
        for word in comment:
            if word in vocabulary:
                sumNeg += neg_D[word]    # add the log conditional probability when found
                sumPos += pos_D[word]

           
                
        sumNeg += priorNeg   # add the prior probability
        sumPos += priorPos
    
        if "Neg" in fileName:
            if sumNeg > sumPos:     
                correct +=1         # counting the no of correct predictions 
                name = "negative" 
        else:
            if sumPos > sumNeg:
                correct +=1
                name = "positive"

    print("The accuracy for", name, "testing comments is:", round(((correct/total_comments)*100),2), "%") # print the accuracy


In [7]:
def main():
    
    # Load the positive and negative training documents 
    textNeg, total_neg_comments = load_comments("train_neg.txt")
    print ("Finished with neg train ", total_neg_comments)
    
    textPos, total_pos_comments = load_comments("train_pos.txt")
    print ("Finished with pos train", total_pos_comments)
    
       
    # Apply cleaning techniques and return positive and negative lists of words 
    contentsNeg = clean_words(textNeg)
    contentsPos = clean_words(textPos)
    
    print("contentsNeg", len(contentsNeg))
    print("contentsPos", len(contentsPos))

    
    # Create a vocabulary from combining sets of positive and negative words
    vocab = set(contentsNeg) | set(contentsPos)
    print("Vocab", len(vocab))
    
    # Calculate the prior probability for both classes
    priorNeg = log(total_neg_comments/(total_neg_comments+total_pos_comments))
    priorPos = log(total_pos_comments/(total_neg_comments+total_pos_comments))
    
    print ("Starting test")
    # Pass pieces of information needed to classify unseen documents
    test_accuracy("test_Neg.txt", vocab, contentsNeg, contentsPos, priorNeg, priorPos)
    test_accuracy("test2_Neg.txt", vocab, contentsNeg, contentsPos, priorNeg, priorPos)
    print ("Neg test finished")
    
    test_accuracy("test_Pos.txt", vocab, contentsNeg, contentsPos, priorNeg, priorPos)
    test_accuracy("test2_Pos.txt", vocab, contentsNeg, contentsPos, priorNeg, priorPos)
    print ("Pos test finished")
    
main()


File Type:  <class 'str'>
Finished with neg train  419545
File Type:  <class 'str'>
Finished with pos train 1309125
contentsNeg 913481
contentsPos 175136
Vocab 35949
Starting test
Num test comment =  208
The accuracy for negative testing comments is: 92.31 %
Num test comment =  158
The accuracy for negative testing comments is: 92.41 %
Neg test finished
Num test comment =  882
The accuracy for positive testing comments is: 71.88 %
Num test comment =  658
The accuracy for positive testing comments is: 71.58 %
Pos test finished
