In [51]:
import bz2
from collections import Counter
import re
import nltk
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline

In [13]:
train_file = bz2.BZ2File('train.ft.txt.bz2')
test_file = bz2.BZ2File('test.ft.txt.bz2')

train_file = train_file.readlines()
test_file = test_file.readlines()

In [5]:
train_file = [x.decode('utf-8') for x in train_file[:5000]]
test_file = [x.decode('utf-8') for x in test_file[:5000]]

In [6]:
train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file]
train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file]

test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_file]
test_sentences = [x.split(' ', 1)[1][:-1].lower() for x in test_file]


In [7]:
for i in range(len(train_sentences)):
    train_sentences[i] = re.sub('\d','0',train_sentences[i])
    

for i in range(len(test_sentences)):
    test_sentences[i] = re.sub('\d','0',test_sentences[i])

In [8]:
for i in range(len(train_sentences)):
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", train_sentences[i])
        
for i in range(len(test_sentences)):
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", test_sentences[i])

In [9]:
train_sentences[0]

'stuning even for the non-gamer: this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen! ^_^'

In [39]:
Na_train = {'Sentence': train_sentences, 'Label': train_labels}
Nav_train = pd.DataFrame(Na_train)

Na_test = {'Sentence': test_sentences, 'Label': test_labels}
Nav_test = pd.DataFrame(Na_test)

Nav_train.head()

Nav_train = Nav_train.head(2000)
Nav_test = Nav_test.head(1000)

In [41]:
sents = []
alll = []
stopwords_set = set(stopwords.words("english")) #words which add no meaning to sentence articles, prepositions, conjuctions

for index, row in Nav_train.iterrows():
    words_filtered = [e.lower() for e in row.Sentence.split() if len(e) >= 3]
    words_cleaned = [word for word in words_filtered
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')
        and word != 'RT']
    words_without_stopwords = [word for word in words_cleaned if not word in stopwords_set]
    sents.append((words_without_stopwords, row.Label))
    alll.extend(words_without_stopwords )

In [46]:
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    features = wordlist.keys()
    return features

In [47]:
w_features = get_word_features(alll)

In [50]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in w_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [18]:
# Training the Naive Bayes classifier
training_set = nltk.classify.apply_features(extract_features,sents)
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [20]:
train_pos = Nav_train[Nav_train['Label'] == 1]
train_pos = train_pos['Sentence']
train_neg = Nav_train[Nav_train['Label'] == 0]
train_neg = train_neg['Sentence']
test_pos = Nav_test[Nav_test['Label'] == 1]
test_pos = test_pos['Sentence']
test_neg = Nav_test[Nav_test['Label'] == 0]
test_neg = test_neg['Sentence']

In [38]:
print(test_neg.head(5))
print(test_pos.head(5))


2    batteries died within a year ...: i bought thi...
5    dvd player crapped out after one year: i also ...
6    incorrect disc: i love the style of this, but ...
7    dvd menu select problems: i cannot scroll thro...
9    not an "ultimate guide": firstly,i enjoyed the...
Name: Sentence, dtype: object
0    great cd: my lovely pat has one of the great v...
1    one of the best game music soundtracks - for a...
3    works fine, but maha energy is better: check o...
4    great for the non-audiophile: reviewed quite a...
8    unique weird orientalia from the 0000's: exoti...
Name: Sentence, dtype: object


In [22]:
neg_cnt = 0
pos_cnt = 0
for obj in test_neg: 
    res =  classifier.classify(extract_features(obj.split()))
    if(res == 0): 
        neg_cnt = neg_cnt + 1
for obj in test_pos: 
    res =  classifier.classify(extract_features(obj.split()))
    if(res == 1): 
        pos_cnt = pos_cnt + 1
        
print('[Negative]: %s/%s '  % (len(test_neg),neg_cnt))        
print('[Positive]: %s/%s '  % (len(test_pos),pos_cnt))  

[Negative]: 47/42 
[Positive]: 53/40 


In [23]:
acccc= ((neg_cnt+pos_cnt)/(len(test_neg)+len(test_pos))) * 100
print("Accuracy by nltk classifier is", acccc)

Accuracy by nltk classifier is 82.0


In [29]:
print(test_neg.loc[90])
classifier.classify(extract_features(test_neg.loc[90].split()))


painful: this book has to be one of the most tedious works of literature ever written. hawthorne is a great writer, but i don't know how this book made it into that sacred list we call "classics". perhaps on the merit of his name alone?


0

In [36]:
print(test_pos.loc[66])
sentence=test_pos.loc[66]
classifier.classify(extract_features(sentence.split()))

great camera: i recently purchased this camera and i'm very satisfied with it. i find the quality of the pictures with this camera as good if not better than anything else without going to digital. it is very comfortable to hold and the controls are easy to use. i highly reccomend this product to anyone looking for a top quality camera at an affordable price. nice going sony!


1