<a href="https://colab.research.google.com/github/JayLeoK/NLP-Sentiment-Analysis/blob/master/Sentiment-Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import sys, argparse
from scipy import sparse
from sklearn import linear_model
from collections import Counter
import numpy as np
import re
from collections import Counter
from collections import defaultdict

In [0]:
######################################################################
## This defines the dumb features the model starts with.
######################################################################


def dumb_featurize(text):
	feats = {}
	words = text.split(" ")

	for word in words:
		if word == "love" or word == "like" or word == "best":
			feats["contains_positive_word"] = 1
		if word == "hate" or word == "dislike" or word == "worst" or word == "awful":
			feats["contains_negative_word"] = 1

	return feats



In [0]:
######################################################################
## This defines the sentiment classification class which
## loads the data and sets up the model.
######################################################################

class SentimentClassifier:

	def __init__(self, feature_method):
		self.feature_vocab = {}
		self.feature_method = feature_method


	# Read data from file
	def load_data(self, filename):
		data = []
		with open(filename, encoding="utf8") as file:
			for line in file:
				cols = line.split("\t")
				label = cols[0]
				text = cols[1].rstrip()

				data.append((label, text))
		return data

	# Featurize entire dataset
	def featurize(self, data):
		featurized_data = []
		for label, text in data:
			feats = self.feature_method(text)
			featurized_data.append((label, feats))
		return featurized_data

	# Read dataset and returned featurized representation as sparse matrix + label array
	def process(self, dataFile, training = False):
		data = self.load_data(dataFile)
		data = self.featurize(data)

		if training:			
			fid = 0
			feature_doc_count = Counter()
			for label, feats in data:
				for feat in feats:
					feature_doc_count[feat]+= 1

			for feat in feature_doc_count:
				if feature_doc_count[feat] >= MIN_FEATURE_COUNT[self.feature_method.__name__]:
					self.feature_vocab[feat] = fid
					fid += 1

		F = len(self.feature_vocab)
		D = len(data)
		X = sparse.dok_matrix((D, F))
		Y = np.zeros(D)
		for idx, (label, feats) in enumerate(data):
			for feat in feats:
				if feat in self.feature_vocab:
					X[idx, self.feature_vocab[feat]] = feats[feat]
			Y[idx] = 1 if label == "pos" else 0

		return X, Y

	def load_test(self, dataFile):
		data = self.load_data(dataFile)
		data = self.featurize(data)

		F = len(self.feature_vocab)
		D = len(data)
		X = sparse.dok_matrix((D, F))
		Y = np.zeros(D, dtype = int)
		for idx, (data_id, feats) in enumerate(data):
			# print (data_id)
			for feat in feats:
				if feat in self.feature_vocab:
					X[idx, self.feature_vocab[feat]] = feats[feat]
			Y[idx] = data_id

		return X, Y

	# Train model and evaluate on held-out data
	def evaluate(self, trainX, trainY, devX, devY):
		(D,F) = trainX.shape
		self.log_reg = linear_model.LogisticRegression(C = L2_REGULARIZATION_STRENGTH[self.feature_method.__name__])	
		self.log_reg.fit(trainX, trainY)
		training_accuracy = self.log_reg.score(trainX, trainY)
		development_accuracy = self.log_reg.score(devX, devY)
		print("Method: %s, Features: %s, Train accuracy: %.3f, Dev accuracy: %.3f" % (self.feature_method.__name__, F, training_accuracy, development_accuracy))
		

	# Predict labels for new data
	def predict(self, testX, idsX):
		predX = self.log_reg.predict(testX)

		out = open("%s_%s" % (self.feature_method.__name__, "predictions.csv"), "w", encoding="utf8")
		out.write("Id,Expected\n")
		for idx, data_id in enumerate(testX):
			out.write("%s,%s\n" % (idsX[idx], int(predX[idx])))
		out.close()

	# Write learned parameters to file
	def printWeights(self):
		out = open("%s_%s" % (self.feature_method.__name__, "weights.txt"), "w", encoding="utf8")
		reverseVocab = [None]*len(self.feature_vocab)
		for feat in self.feature_vocab:
			reverseVocab[self.feature_vocab[feat]] = feat

		out.write("%.5f\t__BIAS__\n" % self.log_reg.intercept_)
		for (weight, feat) in sorted(zip(self.log_reg.coef_[0], reverseVocab)):
			out.write("%.5f\t%s\n" % (weight, feat))
		out.close()



In [0]:
######################################################################
##change these parameters to prevent the model from overfitting 
##and achieve higher performance
######################################################################

# regularization strength to control overfitting (values closer to 0  = stronger regularization)
L2_REGULARIZATION_STRENGTH = {"dumb_featurize": 1, "fancy_featurize": 0.15 }

# must observe feature at least this many times in training data to include in model
MIN_FEATURE_COUNT = {"dumb_featurize": 10,  "fancy_featurize": 7}

In [0]:
# fancy_featurization implementation
def fancy_featurize(text):
  features = {}
  # adds bag of word representation to features

  features.update(bag_of_words(text))
  # Your code goes here
  words = text.split(' ')
  features.update({'number of words':len(words)})

  features.update(VADER_ngram(words))

  features.update(MPQA_counts(words))

  #try has positive, has negative, and count positive, count negative
  return features

In [0]:
# Adds the bag of words representation of the text to feats
def bag_of_words(text):
  word_bag = {}
	# Your code goes here
  used = set()
  for word in text.split(' '):
    if word in used:
      word_bag[word]+=1
    else:
      word_bag[word]=1
      used.add(word)
  return word_bag

In [0]:
def MPQA_counts(words):    #adds MPQA negative and positive counts
  #mounted google drive before accessing the .tff file
  features = {}
  with open('/content/drive/My Drive/subjclueslen1-HLTEMNLP05.tff', 'r') as file:
    pos_lexicon = set()
    neg_lexicon = set()
    for line in file:
      if "priorpolarity=positive" in line:
        word = line.split()[2].split("=")[1]
        pos_lexicon.add(word)
      if "priorpolarity=negative" in line:
        word = line.split()[2].split("=")[1]
        neg_lexicon.add(word)
  
  sentiments = []
  for word in words:
    if word in pos_lexicon:
      sentiments.append(1)
    elif word in neg_lexicon:
      sentiments.append(-1)
    else:
      sentiments.append(0)
    
  hasPos,hasNeg = 0,0
  posCount = sum([i>0 for i in sentiments])
  negCount = sum([i<0 for i in sentiments])
  if posCount>1: hasPos = 1
  if negCount>1: hasNeg = 1
  features['MPQA Has Positive']= hasPos
  features['MPQA Has Negative']= hasNeg
  features['MPQA count difference'] = posCount-negCount
  features['MPQA Positive Count']= posCount
  features['MPQA Negative Count']= negCount
  features['MPQA pos/neg Ratio']= (posCount+1)/(negCount+1)
  
  return features


In [0]:
def count_threshold(value,pos,neg,threshold):
  if value<(-threshold/2):
    neg = neg+1
  elif value>(threshold/2):
    pos = pos+1
  return pos,neg

In [0]:
def VADER_ngram(words):   #implements bigram, trigram, and negation
  with open('./vader_lexicon.txt') as file:
    lexicon = {}
    tokens = set()
    for line in file:
      items = line.split('\t')
      token,rating = items[0], float(items[1])
      tokens.add(token)
      lexicon[token]=rating

  features = {}
  sentiments = []
  for i in range(0,len(words),3):
    first = words[i]
    if first in tokens:
      sentiments.append(lexicon[first])
    
    negation = first == ('not' or 'n\'t')
    used = set()
    if i+1<len(words):
      second = words[i+1]
      if not negation: 
        bigram = first +' '+ second
        if second in tokens: sentiments.append(lexicon[second])
        else: sentiments.append(0)
      else:   #negation reverses effect on count, sentiment
        bigram = first+' NEG_'+second
        if second in tokens: sentiments.append(-lexicon[second]) #flips sign
        else: sentiments.append(0)
      if bigram in used:
        features[bigram] += 1 
      else:
        features[bigram] = 1
        used.add(bigram)

    if i+2<len(words):
      third = words[i+2]
      if not negation: 
        trigram = bigram +' '+ third
        if third in tokens: sentiments.append(lexicon[third])
        else: sentiments.append(0)
      else:   #negation reverses effect on count, sentiment
        trigram = bigram+' NEG_'+third
        if third in lexicon:sentiments.append(-lexicon[third]) #flips sign
        else: sentiments.append(0)
      if trigram in used:
        features[trigram] += 1 
      else:
        features[trigram] = 1
        used.add(trigram)
  threshold = 2
  posCount, negCount = sum([i>threshold/2 for i in sentiments]), sum([i<-threshold/2 for i in sentiments])
  features['VADER positive count'] = posCount
  features['VADER negative count'] = negCount
  features['VADER count difference'] = posCount-negCount
  features['VADER total sentiment'] = sum(sentiments)
  features['VADER pos/neg ratio'] = (posCount+1)/(negCount+1)

  return features

In [0]:
#Code retreives VADER
!wget https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt

--2020-02-04 05:54:44--  https://raw.githubusercontent.com/cjhutto/vaderSentiment/master/vaderSentiment/vader_lexicon.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 426687 (417K) [text/plain]
Saving to: ‘vader_lexicon.txt.2’


2020-02-04 05:54:46 (5.74 MB/s) - ‘vader_lexicon.txt.2’ saved [426687/426687]



In [0]:
#This code gets the train/dev/test files from github and imports them into Colab
!wget https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/train.txt
!wget https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/dev.txt
!wget https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/test.txt.zip
!unzip test.txt.zip

--2020-02-04 05:54:47--  https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1427184 (1.4M) [text/plain]
Saving to: ‘train.txt.1’


2020-02-04 05:54:47 (15.7 MB/s) - ‘train.txt.1’ saved [1427184/1427184]

--2020-02-04 05:54:48--  https://raw.githubusercontent.com/dbamman/nlp20/master/HW_1/dev.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1474560 (1.4M) [text/plain]
Saving to: ‘dev.txt.1’


2020-02-04 05:54:48 (14.5 MB/s) - ‘dev.txt.1’ saved [1474560/1474560]

--2020-02-

In [0]:
#This cell trains two models: one on the dumb features and one on your fancy
#features.  It will store the test set predictions in a csv.
#The weights will be stored in a text file. 
#To access the files, click on the folder icon in the left sidebar.
#You can preview the files in Colab by double clicking or download the files by 
#right clicking and selecting Download.
if __name__ == "__main__":
  trainingFile = "./train.txt"
  evaluationFile = "./dev.txt"
  testFile = "./test.txt"

  for feature_method in [dumb_featurize, fancy_featurize]:
    sentiment_classifier = SentimentClassifier(feature_method)
    trainX, trainY = sentiment_classifier.process(trainingFile, training=True)
    devX, devY = sentiment_classifier.process(evaluationFile, training=False)
    testX, idsX = sentiment_classifier.load_test(testFile)
    sentiment_classifier.evaluate(trainX, trainY, devX, devY)
    sentiment_classifier.printWeights()
    sentiment_classifier.predict(testX, idsX)


Method: dumb_featurize, Features: 2, Train accuracy: 0.604, Dev accuracy: 0.611


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Method: fancy_featurize, Features: 4597, Train accuracy: 0.963, Dev accuracy: 0.809


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
