In [33]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
import numpy as np

In [34]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [35]:
def data_reader(name):
  data = pd.read_csv(name, sep=";", header=None)
  data.columns = ["text", "emotion"]
  return data

In [36]:
train = data_reader('train.txt')

In [37]:
train.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [38]:
classes = train.emotion.unique()

In [39]:
classes_token = {classes[i]:i for i in range(len(classes))}

In [40]:
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

sentences = []
for item in train.values:
  sentence = item[0].lower()
  word_tokens = tokenizer.tokenize(sentence)
  filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
  stemmed = [ps.stem(w) for w in word_tokens]
  sentences.append([stemmed, item[1]])

In [41]:
vocabulary = []

for item in sentences:
    for word in item[0]:
      vocabulary.append(word)

vocabulary = set(vocabulary)

In [42]:
len(vocabulary)

11123

In [43]:
vocab_classes = {word:[0, 0, 0, 0, 0, 0] for word in vocabulary}

In [44]:
classes_len = {i:0 for i in classes}

In [45]:
classes_len

{'anger': 0, 'fear': 0, 'joy': 0, 'love': 0, 'sadness': 0, 'surprise': 0}

In [46]:
for c, v in classes_token.items():
  for item in sentences:
    if item[1] == c:
      for word in item[0]:
        vocab_classes[word][v] += 1
        classes_len[c] += 1

In [47]:
def sum_other_lens(cv, classes_len):
  sum = 0
  for c, v in classes_len.items():
    if cv != v:
      sum += classes_len[c]
  return sum

In [48]:
other_lens = {c:sum_other_lens(v, classes_len) for c, v in classes_len.items()}

In [49]:
def sum_word_occ_other_classes(word, cv):
  sum = 0
  for c, v in classes_len.items():
    if cv != v:
      sum += vocab_classes[word][classes_token[c]]
  return sum

In [50]:
probs = {word:[0, 0, 0, 0, 0, 0] for word in vocabulary}

In [51]:
v_len = len(vocabulary)

In [52]:
for word in vocabulary:
  for c, v in classes_token.items():
    cc = (vocab_classes[word][v] + 1) / (classes_len[c] + v_len) 
    oc = (sum_word_occ_other_classes(word, v) + 1) / (other_lens[c] + v_len)
    probs[word][v] = cc / oc

In [53]:
def prob_calculator_for_one_class(c_name, sentence):
  prior = classes_len[c_name] / other_lens[c_name]
  mul = 1
  for word in sentence:
    if word in vocabulary:
      mul *= probs[word][classes_token[c_name]]  
  return mul*prior

In [54]:
def preprocessing(sentence):
  sentence = sentence.lower()
  word_tokens = tokenizer.tokenize(sentence)
  filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
  stemmed = [ps.stem(w) for w in word_tokens]
  return stemmed

In [55]:
def argmax(current_probs):
  max_v = -1
  arg = ''
  for c, v in current_probs.items():
    if max_v < v:
      max_v = v
      arg = c
  return arg

In [56]:
def classifier(sentence):
  current_probs = {classes[i]:0 for i in range(len(classes))}
  sentence = preprocessing(sentence)
  for c_name, c_value in classes_token.items():
    c_p = prob_calculator_for_one_class(c_name, sentence)
    current_probs[c_name] = c_p
  return argmax(current_probs)

In [57]:
test = data_reader('test.txt')

In [58]:
test

Unnamed: 0,text,emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,sadness
3,i left with my bouquet of red and yellow tulip...,joy
4,i was feeling a little vain when i did this one,sadness
...,...,...
1995,i just keep feeling like someone is being unki...,anger
1996,im feeling a little cranky negative after this...,anger
1997,i feel that i am useful to my people and that ...,joy
1998,im feeling more comfortable with derby i feel ...,joy


In [62]:
for item in test.values:
  classified.append(classifier(item[0]))

In [64]:
classified = []
all_corrects = 0
for item in test.values:
  p_c = classifier(item[0])
  classified.append([item[0], p_c])
  if p_c == item[1]:
    all_corrects +=1

accuracy = all_corrects / len(test.values)

In [65]:
print(accuracy)

0.658


In [66]:
output = pd.DataFrame(classified)
output.columns = ["text", "predicted_emotion"]

In [73]:
output.head()

Unnamed: 0,text,predicted_emotion
0,im feeling rather rotten so im not very ambiti...,sadness
1,im updating my blog because i feel shitty,sadness
2,i never make her separate from me because i do...,anger
3,i left with my bouquet of red and yellow tulip...,fear
4,i was feeling a little vain when i did this one,sadness


In [75]:
output.to_csv('output.csv', index=False)