# <center><b>CS5062 (Machine Learning) Assignment II: Rule-based system for Sentiment Classification</b></center>

#### Mount google drive

In [0]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/Colab Notebooks/CS5062ML: Sentiment Analysis

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/Colab Notebooks/CS5062ML: Sentiment Analysis


#### Import libraries

In [0]:
import re, random, math, collections, itertools
import numpy as np

#### Read the datasets

In [0]:
def readFiles():
    '''
    Reads the samples within each dataset, splits them and saves them to a list. Populates
    the sentiment dictionary with positive and negative sentiments.
    
        Args:
            
        Returns:
            sentimentDictionary(dict): the lookup dictionary
            pos_rt(list): the positive samples from Rotten Tomatoes
            neg_rt(list): the negative samples from Rotten Tomatoes
            pos_nokia(list): the positive samples from the Nokia dataset 
            neg_nokia(list): the negative samples from the Nokia dataset
    '''
    sentimentDictionary = dict()
    #reading pre-labeled input and splitting into lines
    posSentences = open('rt-polarity.pos', 'r', encoding="ISO-8859-1")
    pos_rt = re.split(r'\n', posSentences.read())

    negSentences = open('rt-polarity.neg', 'r', encoding="ISO-8859-1")
    neg_rt = re.split(r'\n', negSentences.read())

    posSentencesNokia = open('nokia-pos.txt', 'r')
    pos_nokia = re.split(r'\n', posSentencesNokia.read())

    negSentencesNokia = open('nokia-neg.txt', 'r', encoding="ISO-8859-1")
    neg_nokia = re.split(r'\n', negSentencesNokia.read())
 
    posDictionary = open('positive-words.txt', 'r', encoding="ISO-8859-1")
    posWordList = posDictionary.readlines()
    posWordList = [line.strip() for line in posWordList if not line.startswith(";") and not line == '\n']
    #posWordList = re.findall(r"[a-z\-]+", posDictionary.read())

    negDictionary = open('negative-words.txt', 'r', encoding="ISO-8859-1")
    negWordList = negDictionary.readlines()
    negWordList = [line.strip() for line in negWordList if not line.startswith(";") and not line == '\n']
    #negWordList = re.findall(r"[a-z\-]+", negDictionary.read())

    for i in posWordList:
        sentimentDictionary[i] = 1
    for i in negWordList:
        sentimentDictionary[i] = -1

    return sentimentDictionary, pos_rt, neg_rt, pos_nokia, neg_nokia

In [0]:
sent_dict, pos_rt, neg_rt, pos_nokia, neg_nokia = readFiles()

In [0]:
print('Print total positive words in sentiment dictionary:', sum([v == 1 for v in sent_dict.values()]))
print('Print total negative words in sentiment dictionary:', sum([v == -1 for v in sent_dict.values()]))

Print total positive words in sentiment dictionary: 2003
Print total negative words in sentiment dictionary: 4783


In [0]:
print(len(pos_rt), len(neg_rt))

5332 5332


In [0]:
print(len(pos_nokia), len(neg_nokia))

194 80


In [0]:
def create_dataset(pos_rt, neg_rt):
  '''
    Concatenates the positive and negative samples into one dataset and creates a list of labels.
    
        Args:
            pos_rt(list): the positive samples from the dataset
            neg_rt(list): the negative samples from the dataset
        Returns:
            rt_data(list): the list of concatenated samples
            rt_labels(list): the list of annotations for each sample
  '''
  rt_labels = list()
  rt_data = pos_rt + neg_rt

  for i in range(len(pos_rt)):
    rt_labels.append(1)
  
  for i in range(len(neg_rt)):
    rt_labels.append(0)

  return rt_data, rt_labels

#### Prepare the Rotten Tomatoes dataset

In [0]:
rt_data, rt_labels = create_dataset(pos_rt, neg_rt)

In [0]:
print(sum(v == 1 for v in rt_labels))
print(sum(v == 0 for v in rt_labels))

5332
5332


In [0]:
print(rt_data[0])

the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 


In [0]:
#90/10 split
test_size = int(0.1 * len(rt_data))
rt_data_np = np.vstack((rt_data, rt_labels)).T
np.random.shuffle(rt_data_np)
rt_features_train = list(rt_data_np[:, 0][:-test_size])
rt_labels_train = list(rt_data_np[:, 1][:-test_size])
rt_features_test = list(rt_data_np[:, 0][-test_size:])
rt_labels_test = list(rt_data_np[:, 1][-test_size:])

#### Train the rule-based model

In [0]:
def classify_rule_based(data, labels, sent_dict, threshold=0, sigma=1, trigram_ps=1, trigram_ns=1):
    '''
      The modified rule-based classification function using trigrams, scoring functions and a 
      weighed coefficient for the threshold values.
      
          Args:
              data(list): the input sentences
              labels(list): the list of annotations
              sent_dict(dict): the lookup dictionary
              threshold(float): the decision boundary for the sentiments
              sigma(float): the weighed coefficient for the threshold
              trigram_ps(int): the scoring weight for the positive samples
              trigram_ns(int): the scoring weight for the negative samples
          Returns:
              mean_f_measure(float): the optimization target - Mean F1-Score
              np.mean(scores)(float): the mean value of all scored samples
    '''
    total=0
    correct=0
    totalpos=0
    totalneg=0
    totalpospred=0
    totalnegpred=0
    correctpos=0
    correctneg=0
    scores = list()
    for i in range(len(data)):
      wordList = re.findall(r"[\w']+", data[i])
      trigramList = wordList.copy()  # initialise trigramList
      for x in range(len(wordList) - 2):
          trigramList.append(wordList[x] + "_" + wordList[x + 1] + "_" + wordList[x + 2])
      
      score = 0
      for trigram in trigramList:
        words = trigram.split("_")
        for word in words:
          cur_score = 0
          if word in sent_dict:
            cur_score+=sent_dict[word]
        if cur_score > 0:
          score+=trigram_ps
        else:
          score-=trigram_ns

      #print('Total score:', score)
      
      scores.append(score)

      total+=1

      if labels[i]=='1':
        totalpos+=1
        if score>threshold*sigma:
          correct+=1
          correctpos+=1
          totalpospred+=1
        else:
          totalnegpred+=1
          if PRINT_ERRORS:
            print ("ERROR (pos classed as neg %0.2f):", data[i])
      else:
        totalneg+=1
        if score<=threshold*sigma:
            correct+=1
            correctneg+=1
            totalnegpred+=1
        else:
            totalpospred+=1
            if PRINT_ERRORS:
              print ("ERROR (neg classed as pos %0.2f):", data[i])

    acc=correct/float(total)
    print('Sigma:', sigma)
    print('Threshold:', threshold)
    print('Mean score:', np.mean(scores))
    print ("Accuracy (All)=%0.2f" % acc + " (%d" % correct + "/%d" % total + ")\n")


    # smoothing to avoid division by zero errors
    precision_pos=correctpos/round(float(totalpospred) + 0.01, 2)
    recall_pos=correctpos/round(float(totalpos) + 0.01, 2)
    precision_neg=correctneg/round(float(totalnegpred) + 0.01, 2)
    recall_neg=correctneg/round(float(totalneg) + 0.01, 2)
    f_pos=2*precision_pos*recall_pos/round(float(precision_pos+recall_pos) + 0.01, 2);
    f_neg=2*precision_neg*recall_neg/round(float(precision_neg+recall_neg) + 0.01, 2);

    print('Positive scores..')
    print("Precision (Pos)=%0.2f" % precision_pos + " (%d" % correctpos + "/%d" % totalpospred + ")")
    print("Recall (Pos)=%0.2f" % recall_pos + " (%d" % correctpos + "/%d" % totalpos + ")")
    print("F-measure (Pos)=%0.2f" % f_pos)

    print('Negative scores..')
    print("Precision (Neg)=%0.2f" % precision_neg + " (%d" % correctneg + "/%d" % totalnegpred + ")")
    print("Recall (Neg)=%0.2f" % recall_neg + " (%d" % correctneg + "/%d" % totalneg + ")")
    print("F-measure (Neg)=%0.2f" % f_neg + "\n")

    print('Mean scores..')
    print("Precision (Mean)={:.2f}".format((precision_pos + precision_neg) / 2))
    print("Recall (Mean)={:.2f}".format((recall_pos + recall_neg) / 2))
    print("F-measure (Mean)={:.2f}".format((f_pos + f_neg) / 2))
    print()

    mean_f_measure = (f_pos + f_neg) / 2
    return mean_f_measure, np.mean(scores)

In [0]:
PRINT_ERRORS = 0
classify_rule_based(rt_features_test, rt_labels_test, sent_dict, trigram_ps=2, trigram_ns=1)

Sigma: 1
Threshold: 0
Mean score: -29.5422138836773
Accuracy (All)=0.51 (548/1066)

Positive scores..
Precision (Pos)=0.91 (10/11)
Recall (Pos)=0.02 (10/527)
F-measure (Pos)=0.04
Negative scores..
Precision (Neg)=0.51 (538/1055)
Recall (Neg)=1.00 (538/539)
F-measure (Neg)=0.67

Mean scores..
Precision (Mean)=0.71
Recall (Mean)=0.51
F-measure (Mean)=0.35



(0.35319767834640153, -29.5422138836773)

#### Optimize the model by tuning the $\sigma$(weight coefficient for the threshold) and the scoring functions for the trigrams

In [0]:
def optimize_rule_based(features_test, labels_test, sent_dict, threshold=0, sigma_range=[-4, 4], trigram_ps=1, trigram_ns=1):
  '''
      The main function for optimizing the rule-based method. Uses the mean sentiment score as the initial threshold
      and does a parameter search for the best sigma.
      
          Args:
              features_test(list): the input sentences
              labels_test(list): the list of annotations
              sent_dict(dict): the lookup dictionary
              threshold(float): the decision boundary for the sentiments
              sigma_range(list): the valid range of sigma for the parameter search
              trigram_ps(int): the scoring weight for the positive samples
              trigram_ns(int): the scoring weight for the negative samples
          Returns:
          
  '''
  #set the threshold as the mean value of the scores and optimize sigma
  sigmas = list(np.linspace(sigma_range[0], sigma_range[1], num=100))
  print('Train with default threshold..')
  fm, mean_score = classify_rule_based(features_test, labels_test, sent_dict)
  threshold = mean_score
  print('Train with optimized threshold, default sigma..')
  fm, _ = classify_rule_based(features_test, labels_test, sent_dict, threshold=threshold)
  print()
  print('Optimizing sigma..')
  best_fm = fm
  best_sigma = 1
  for sigma in sigmas:
    print()
    print('Sigma:', sigma)
    print()
    sig_fm, _ = classify_rule_based(features_test, labels_test, sent_dict, threshold=threshold, 
                                    sigma=sigma, trigram_ps=trigram_ps, trigram_ns=trigram_ns)
    if sig_fm > best_fm:
      print('Found better sigma:', sigma)
      best_fm = sig_fm
      best_sigma = sigma
  
  print()
  print('Final results')
  print('Best mean f-measure:', best_fm)
  print('Best sigma:', best_sigma)
  print('Best threshold:', threshold)

In [0]:
optimize_rule_based(rt_features_test, rt_labels_test, sent_dict, trigram_ps=8, trigram_ns=1)

Train with default threshold..
Sigma: 1
Threshold: 0
Mean score: -31.79924953095685
Accuracy (All)=0.51 (540/1066)

Positive scores..
Precision (Pos)=0.99 (1/1)
Recall (Pos)=0.00 (1/527)
F-measure (Pos)=0.00
Negative scores..
Precision (Neg)=0.51 (539/1065)
Recall (Neg)=1.00 (539/539)
F-measure (Neg)=0.67

Mean scores..
Precision (Mean)=0.75
Recall (Mean)=0.50
F-measure (Mean)=0.33

Train with optimized threshold, default sigma..
Sigma: 1
Threshold: -31.79924953095685
Mean score: -31.79924953095685
Accuracy (All)=0.52 (559/1066)

Positive scores..
Precision (Pos)=0.52 (281/542)
Recall (Pos)=0.53 (281/527)
F-measure (Pos)=0.52
Negative scores..
Precision (Neg)=0.53 (278/524)
Recall (Neg)=0.52 (278/539)
F-measure (Neg)=0.52

Mean scores..
Precision (Mean)=0.52
Recall (Mean)=0.52
F-measure (Mean)=0.52


Optimizing sigma..

Sigma: -4.0

Sigma: -4.0
Threshold: -31.79924953095685
Mean score: -16.0
Accuracy (All)=0.51 (539/1066)

Positive scores..
Precision (Pos)=0.00 (0/0)
Recall (Pos)=0.00 

In [0]:
classify_rule_based(rt_features_test, rt_labels_test, sent_dict, threshold=-31.44, sigma=0.28, trigram_ps=8, trigram_ns=1)

Sigma: 0.28
Threshold: -31.44
Mean score: -16.0
Accuracy (All)=0.61 (653/1066)

Positive scores..
Precision (Pos)=0.64 (255/396)
Recall (Pos)=0.48 (255/527)
F-measure (Pos)=0.55
Negative scores..
Precision (Neg)=0.59 (398/670)
Recall (Neg)=0.74 (398/539)
F-measure (Neg)=0.65

Mean scores..
Precision (Mean)=0.62
Recall (Mean)=0.61
F-measure (Mean)=0.60



(0.6006349014666212, -16.0)

In [0]:
PRINT_ERRORS=1
classify_rule_based(rt_features_test, rt_labels_test, sent_dict, threshold=-31.44, sigma=0.28, trigram_ps=8, trigram_ns=1)
PRINT_ERRORS=0

ERROR (pos classed as neg %0.2f): this enthralling documentary . . . is at once playful and haunting , an in-depth portrait of an iconoclastic artist who was fundamentally unknowable even to his closest friends . 
ERROR (pos classed as neg %0.2f): some people march to the beat of a different drum , and if you ever wondered what kind of houses those people live in , this documentary takes a look at 5 alternative housing options . 
ERROR (neg classed as pos %0.2f): the leads we are given here are simply too bland to be interesting . 
ERROR (pos classed as neg %0.2f): the trinity assembly approaches the endeavor with a shocking lack of irony , and george ratliff's documentary , hell house , reflects their earnestness  which makes for a terrifying film . 
ERROR (pos classed as neg %0.2f): despite the long running time , the pace never feels slack -- there's no scene that screams " bathroom break ! " 
ERROR (pos classed as neg %0.2f): although it lacks the detail of the book , the film doe

### Nokia dataset

In [0]:
nk_data, nk_labels = create_dataset(pos_nokia, neg_nokia)

In [0]:
nk_data_np = np.vstack((nk_data, nk_labels)).T
np.random.shuffle(nk_data_np)
nk_features = list(nk_data_np[:, 0])
nk_labels = list(nk_data_np[:, 1])

#### Baseline

In [0]:
PRINT_ERRORS = 0
classify_rule_based(nk_features, nk_labels, sent_dict, trigram_ps=1, trigram_ns=1)

Sigma: 1
Threshold: 0
Mean score: -25.14963503649635
Accuracy (All)=0.29 (80/274)

Positive scores..
Precision (Pos)=0.00 (0/0)
Recall (Pos)=0.00 (0/194)
F-measure (Pos)=0.00
Negative scores..
Precision (Neg)=0.29 (80/274)
Recall (Neg)=1.00 (80/80)
F-measure (Neg)=0.45

Mean scores..
Precision (Mean)=0.15
Recall (Mean)=0.50
F-measure (Mean)=0.22



(0.2245566592175012, -25.14963503649635)

#### Tuned

In [0]:
optimize_rule_based(nk_data, nk_labels, sent_dict, sigma_range=[-4, 4], trigram_ps=-7, trigram_ns=1)

Train with default threshold..
Sigma: 1
Threshold: 0
Mean score: -25.14963503649635
Accuracy (All)=0.29 (80/274)

Positive scores..
Precision (Pos)=0.00 (0/0)
Recall (Pos)=0.00 (0/194)
F-measure (Pos)=0.00
Negative scores..
Precision (Neg)=0.29 (80/274)
Recall (Neg)=1.00 (80/80)
F-measure (Neg)=0.45

Mean scores..
Precision (Mean)=0.15
Recall (Mean)=0.50
F-measure (Mean)=0.22

Train with optimized threshold, default sigma..
Sigma: 1
Threshold: -25.14963503649635
Mean score: -25.14963503649635
Accuracy (All)=0.54 (148/274)

Positive scores..
Precision (Pos)=0.71 (115/162)
Recall (Pos)=0.59 (115/194)
F-measure (Pos)=0.64
Negative scores..
Precision (Neg)=0.29 (33/112)
Recall (Neg)=0.41 (33/80)
F-measure (Neg)=0.34

Mean scores..
Precision (Mean)=0.50
Recall (Mean)=0.50
F-measure (Mean)=0.49


Optimizing sigma..

Sigma: -4.0

Sigma: -4.0
Threshold: -25.14963503649635
Mean score: -43.222627737226276
Accuracy (All)=0.29 (80/274)

Positive scores..
Precision (Pos)=0.00 (0/0)
Recall (Pos)=0.0

In [0]:
classify_rule_based(nk_data, nk_labels, sent_dict, threshold=-25.15, sigma=2.22, trigram_ps=-7, trigram_ns=1)

Sigma: 2.22
Threshold: -25.15
Mean score: -43.222627737226276
Accuracy (All)=0.62 (170/274)

Positive scores..
Precision (Pos)=0.73 (144/198)
Recall (Pos)=0.74 (144/194)
F-measure (Pos)=0.73
Negative scores..
Precision (Neg)=0.34 (26/76)
Recall (Neg)=0.32 (26/80)
F-measure (Neg)=0.33

Mean scores..
Precision (Mean)=0.53
Recall (Mean)=0.53
F-measure (Mean)=0.53



(0.5281779074876155, -43.222627737226276)

In [0]:
PRINT_ERRORS=1
classify_rule_based(nk_data, nk_labels, sent_dict, threshold=-25.15, sigma=2.22, trigram_ps=-7, trigram_ns=1)
PRINT_ERRORS=0

ERROR (pos classed as neg %0.2f): there is much which has been said in other reviews about the features of this phone , it is a great phone , mine worked without any problems right out of the box . 
ERROR (pos classed as neg %0.2f): here 's the brief synopsis : the phone is tiny , cute , feels kind of " plastic-like " ( as if it might break ) , but seems pretty sturdy . 
ERROR (pos classed as neg %0.2f): it has lots of little cute features , my favorite being the games and the pim ( personal information manager -- i.e. organizer ) , and the radio ! 
ERROR (pos classed as neg %0.2f): the phone comes with okay ringtones , some decent backgrounds / screensavers , but the phone has very little memory ( mine had 230kb as it arrived from amazon , so you do n't have too many options on what you can put on there ) . 
ERROR (pos classed as neg %0.2f): in all fairness , customer reps are very nice , and they ( most of them ) try hard to answer your questions . 
ERROR (pos classed as neg %0.2f): 