In [885]:
import pandas as pd

# Cleaning &  Preparing the data to classify

In [1002]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize 
import re

stemmer = SnowballStemmer("english")
def format_sentence(sent):
    sent = sent.lower()
    sent = re.sub('\s+', ' ', sent) #exstra space
    sent = ' '.join([stemmer.stem(word) for word in re.split('\s+', sent)]) #Stemming
    sent = re.sub("[^0-9A-Za-z ]", "", sent) #Punctuations 
    return {word: (word in word_tokenize(sent)) for word in word_tokenize(sent)}

format_sentence('this’’s is an interesting boy   @#!$%^&')  #testing sentence

{'this': True, 'is': True, 'an': True, 'interest': True, 'boy': True}

In [967]:
data = pd.read_csv('data.csv', encoding= 'unicode_escape').values.tolist()

## Reading the Financial data

In [968]:
pos_data = []
neg_data = []
neu_data = []

for rew in range(len(data)):
    record = data[rew]
    line = str(record[0])   
    category = str(record[1])
    if(category == 'positive'):
        pos_data.append([format_sentence(line), 'pos'])
    elif(category == 'negative'):
        neg_data.append([format_sentence(line), 'neg'])
    elif(category == 'neutral'):
        neu_data.append([format_sentence(line), 'neut'])
    

In [969]:
print(len(pos_data), len(neg_data), len(neu_data))

1852 860 3130


## Reading the question data to enhance neutrality

In [970]:
data = pd.read_csv('questions.csv', ).values.tolist()

In [971]:
neu_questions = []

for rew in range(1500):
    record = data[rew]
    line = str(record[3])   
    neu_questions.append([format_sentence(line), 'neut'])

### Reading extra data

In [973]:
posit_data = []
with open('extra.pos.txt', encoding='latin-1') as f:
    for line in f:
        posit_data.append([format_sentence(line), 'pos'])

In [974]:
negat_data = []
with open('extra.neg.txt', encoding='latin-1') as f:
    for line in f:
        negat_data.append([format_sentence(line), 'neg'])

In [975]:
pos_data.extend(posit_data)
neg_data.extend(negat_data)
neu_data.extend(neu_questions)

In [976]:
print(len(pos_data), len(neg_data), len(neu_data))

7183 6191 4630


In [977]:
training_data = pos_data[:6800] + neg_data[:4900] + neu_data[1000:] 
testing_data  = pos_data[6800:] + neg_data[4000:] + neu_data[:1000]

In [978]:
print(len(training_data), len(testing_data))

15330 3574


In [979]:
from nltk.classify import NaiveBayesClassifier
model = NaiveBayesClassifier.train(training_data)
#model.show_most_informative_features()

In [997]:
print(model.classify(format_sentence('this is a nice boy!')))
print(model.classify(format_sentence('this is a bad boy!')))
print(model.classify(format_sentence('what is ur name?')))
print(model.classify(format_sentence('Viking Line has canceled some services.')))
print(model.classify(format_sentence('my name is Mohamed Khalid.')))

pos
neg
neut
neut
neut


In [984]:
from nltk.classify.util import accuracy
accuracy(model, testing_data)

0.8640179071068831