In [34]:
import pandas as pd

# Cleaning and preparing the data to classify

In [35]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 
import re

stemmer = SnowballStemmer("english")


def format_sentence(sent):
    """Convert a raw sentence into a bag-of-words feature dict.

    Processing order: lowercase, collapse whitespace runs into one space,
    Snowball-stem every whitespace-separated token, delete every character
    that is not an ASCII letter, digit or space, then tokenize the result
    with ``word_tokenize``.

    Parameters
    ----------
    sent : str
        The raw sentence.

    Returns
    -------
    dict
        ``{token: True}`` for each distinct token that survives cleaning.
    """
    sent = sent.lower()
    sent = re.sub(r'\s+', ' ', sent)  # collapse extra whitespace
    # NOTE(review): stemming runs before punctuation is stripped, so a token
    # with punctuation attached (e.g. "services.") reaches the stemmer with
    # that punctuation still on it -- confirm this ordering is intended.
    sent = ' '.join(stemmer.stem(word) for word in re.split(r'\s+', sent))
    sent = re.sub(r"[^0-9A-Za-z ]", "", sent)  # drop punctuation / symbols
    # Tokenize once; every token that is present maps to True.
    return {word: True for word in word_tokenize(sent)}


format_sentence('this’’s is an interesting boy   @#!$%^&')  # smoke test

{'this': True, 'is': True, 'an': True, 'interest': True, 'boy': True}

In [36]:
# Read data.csv and flatten the frame into a plain list of row lists.
financial_frame = pd.read_csv('data.csv', encoding='unicode_escape')
data = financial_frame.values.tolist()

## Reading the Financial data

In [37]:
# One bucket per sentiment class; each entry is [feature_dict, short_label].
pos_data, neg_data, neu_data = [], [], []

# Category string found in the row -> (destination bucket, classifier label).
buckets = {
    'positive': (pos_data, 'pos'),
    'negative': (neg_data, 'neg'),
    'neutral': (neu_data, 'neut'),
}

for row in data:
    text, sentiment = str(row[0]), str(row[1])
    if sentiment in buckets:  # rows with any other category are skipped
        bucket, label = buckets[sentiment]
        bucket.append([format_sentence(text), label])
    

In [38]:
# Class sizes in the order: positive, negative, neutral.
print(*map(len, (pos_data, neg_data, neu_data)))

1852 860 3130


## Reading the question data to add more neutral examples

In [39]:
# Read questions.csv into a list of row lists.
# Note: this rebinds `data`, replacing the rows loaded from data.csv.
questions_frame = pd.read_csv('questions.csv')
data = questions_frame.values.tolist()

In [40]:
N_QUESTIONS = 2500  # maximum number of rows to take as neutral examples
QUESTION_COL = 3    # presumably the question-text column -- confirm against the CSV header

# Slicing, rather than indexing over range(2500), avoids an IndexError when
# the file holds fewer than N_QUESTIONS rows; with enough rows the result
# is the same as before.
neu_questions = [
    [format_sentence(str(record[QUESTION_COL])), 'neut']
    for record in data[:N_QUESTIONS]
]

### Reading the extra positive and negative data

In [41]:
# Every line of extra.pos.txt becomes one [feature_dict, 'pos'] example.
with open('extra.pos.txt', encoding='latin-1') as pos_file:
    posit_data = [[format_sentence(raw_line), 'pos'] for raw_line in pos_file]

In [42]:
# Every line of extra.neg.txt becomes one [feature_dict, 'neg'] example.
with open('extra.neg.txt', encoding='latin-1') as neg_file:
    negat_data = [[format_sentence(raw_line), 'neg'] for raw_line in neg_file]

In [43]:
# Append each supplementary list to its class bucket. The buckets are
# mutated in place, so re-running only this cell appends the extras again.
for bucket, extras in (
    (pos_data, posit_data),
    (neg_data, negat_data),
    (neu_data, neu_questions),
):
    bucket.extend(extras)

# Keep only the first 6000 positive examples.
pos_data = pos_data[:6000]

In [44]:
# Class sizes after merging, in the order: positive, negative, neutral.
print(*map(len, (pos_data, neg_data, neu_data)))

6000 6191 5630


In [45]:
# Positional (unshuffled) split. For pos/neg the head of each list trains and
# the tail tests; for neutral it is the other way round (head tests).
pos_cut, neg_cut, neu_cut = 5500, 4500, 1100

training_data = [*pos_data[:pos_cut], *neg_data[:neg_cut], *neu_data[neu_cut:]]
testing_data = [*pos_data[pos_cut:], *neg_data[neg_cut:], *neu_data[:neu_cut]]

In [46]:
# Number of training examples, then number of testing examples.
print(*map(len, (training_data, testing_data)))

14530 3291


In [47]:
from nltk.classify import NaiveBayesClassifier
# Fit Naive Bayes on the [feature_dict, label] pairs, then list the
# 10 most informative features (10 is the method's default count).
model = NaiveBayesClassifier.train(training_data)
model.show_most_informative_features(n=10)

Most Informative Features
                    film = True              pos : neut   =     84.0 : 1.0
                   india = True             neut : neg    =     65.9 : 1.0
               entertain = True              pos : neut   =     62.9 : 1.0
                  comedi = True              pos : neut   =     49.6 : 1.0
                    lack = True              neg : neut   =     49.3 : 1.0
                 audienc = True              neg : neut   =     48.0 : 1.0
                    isnt = True              neg : neut   =     47.3 : 1.0
                    seem = True              neg : neut   =     44.5 : 1.0
                   drama = True              pos : neut   =     43.7 : 1.0
                   enjoy = True              pos : neut   =     42.6 : 1.0


In [48]:
# Spot-check the classifier on a few hand-written sentences, one label per line.
sample_sentences = (
    'this is a nice boy!',
    'this is a bad boy!',
    'what is ur name?',
    'Viking Line has canceled some services.',
    'my name is Mohamed Khalid.',
)
for sentence in sample_sentences:
    print(model.classify(format_sentence(sentence)))

pos
neg
neut
neut
neut


In [49]:
from nltk.classify.util import accuracy
# Fraction of held-out examples whose predicted label equals the gold label.
accuracy(classifier=model, gold=testing_data)

0.7687632938316621