# Import the Packages

In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
import nltk

In [4]:
import bz2

In [5]:
import chardet

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
from keras.models import Sequential

Using TensorFlow backend.


In [8]:
from keras.layers import Dense, Activation, Dropout
from keras import optimizers
from keras.utils import np_utils

In [16]:
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
import re

# Reading the Dataset

In [14]:
# Load every line of the training file into memory. The context manager closes the
# file handle as soon as the lines have been read (the original left it open).
# NOTE(review): no encoding is passed, so the platform default is used — confirm the
# file's encoding and pass it explicitly (e.g. encoding="utf-8") if known.
with open("./train.ft.txt") as train:
    train_lines = train.readlines()

In [15]:
# Inspect the first raw line: a "__label__N" tag, a space, the review text, then a newline.
train_lines[0]

'__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n'

# Parsing the Labels and Normalizing the Text

In [20]:
# Split each raw line into a binary label and a normalised review text.
#   label: 0 when the line starts with "__label__1", otherwise 1.
#   text : everything after the first space, lower-cased, every digit replaced by "0",
#          and (for lines that look like they contain a link) every space-delimited
#          token ending in ".xxx" (three lowercase letters) replaced by "<url>".
_DIGIT_RE = re.compile(r'\d')  # raw string: '\d' in a plain literal is an invalid escape
_URL_RE = re.compile(r"([^ ]+(?<=\.[a-z]{3}))")
_URL_HINTS = ('www.', 'http:', 'https:', '.com')

train_labels = [0 if line.split(' ', 1)[0] == '__label__1' else 1 for line in train_lines]

train_sentences = []
for line in train_lines:
    # rstrip('\n') instead of [:-1]: a final line without a trailing newline
    # must not lose its last real character.
    sentence = line.split(' ', 1)[1].rstrip('\n').lower()
    # Digits are masked before the URL check, matching the original order of operations.
    sentence = _DIGIT_RE.sub('0', sentence)
    if any(hint in sentence for hint in _URL_HINTS):
        sentence = _URL_RE.sub("<url>", sentence)
    train_sentences.append(sentence)


In [21]:
# Number of parsed labels — one per line read from the training file.
len(train_labels)

3600000

In [22]:
# Spot-check one review after lower-casing, digit masking and URL replacement.
train_sentences[192]

'great resource for chinese/eastern medincine: this is an excellent book to have as a reference fo eastern methodologies in medicine and healing. unfortunately it is long out of print, but if you can get your hands on a copy of this text, you will not be dissappointed. in fact, if you are unhappy with it, i will be happy to purchase it from you as i have many peers that would happily use it.'

# Removing Punctuation from the Sentences

In [23]:
import string

# Delete every ASCII punctuation character from each review.
# The translation table is built once instead of once per review.
# Note: string.punctuation contains "<" and ">", so a "<url>" placeholder
# inserted earlier is reduced to the bare token "url" by this step.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
train_sentences = [sentence.translate(_PUNCT_TABLE) for sentence in train_sentences]

In [24]:
# Re-display review 192 to confirm that punctuation has been stripped.
train_sentences[192]

'great resource for chineseeastern medincine this is an excellent book to have as a reference fo eastern methodologies in medicine and healing unfortunately it is long out of print but if you can get your hands on a copy of this text you will not be dissappointed in fact if you are unhappy with it i will be happy to purchase it from you as i have many peers that would happily use it'

# Looking at Other Lines

In [25]:
# Spot-check a second cleaned review.
train_sentences[205]

'smoothing serum the product is wonderful for my hair which is very curly and tends to be frizzy smooths it and gets rid of allof the friz use it once a day even in damp weather'

# Building the DataFrame

In [26]:
# Assemble the cleaned reviews and their 0/1 labels into a two-column frame,
# one row per review. (This rebinds `train`, previously the file handle.)
train = pd.DataFrame({
    'review_text': train_sentences,
    'sentiment_class_label': train_labels,
})

In [27]:
# Preview the first rows of the freshly built DataFrame.
train.head()

Unnamed: 0,review_text,sentiment_class_label
0,stuning even for the nongamer this sound track...,1
1,the best soundtrack ever to anything im readin...,1
2,amazing this soundtrack is my favorite music o...,1
3,excellent soundtrack i truly like this soundtr...,1
4,remember pull your jaw off the floor after hea...,1


In [38]:
# Whitespace-separated token count per review, stored as a helper column.
train['word_count'] = train['review_text'].str.split().str.len()

# Checking the Data Again

In [39]:
# Preview the frame with the new word_count column.
# NOTE(review): the saved output below shows non-contiguous index labels (100, 669, ...)
# and the next cell reports 29,876 rows, far fewer than the 3,600,000 labels parsed above,
# so it was produced after a filtering step that is not present in this notebook —
# confirm with Restart Kernel -> Run All.
train.head()

Unnamed: 0,review_text,sentiment_class_label,word_count
100,textbook book shipped quickly and was in excel...,1,16
669,janes all the worlds aircraft 00000 great to d...,1,18
686,edge of danger 0 star only because thats the ...,0,17
725,needs upgrade only has limited access to obdi...,0,19
881,good read good read ...,1,4


In [40]:
# (rows, columns) of the DataFrame at this point.
train.shape

(29876, 3)

# Keeping Only Reviews Shorter than 30 Words

In [41]:
# Retain only the reviews with fewer than 30 words, then show the resulting size.
is_short_review = train['word_count'] < 30
train = train[is_short_review]
train.shape

(29876, 3)

In [42]:
# The helper column has served its purpose; remove it.
# `columns=` already names the axis, so the extra `axis=1` was redundant, and
# errors='ignore' keeps the cell re-runnable once the column is already gone.
train = train.drop(columns=['word_count'], errors='ignore')
train.head()

Unnamed: 0,review_text,sentiment_class_label
100,textbook book shipped quickly and was in excel...,1
669,janes all the worlds aircraft 00000 great to d...,1
686,edge of danger 0 star only because thats the ...,0
725,needs upgrade only has limited access to obdi...,0
881,good read good read ...,1


In [45]:
# Filtering left gaps in the index labels; renumber the rows 0..len(train)-1
# and discard the old labels.
train = train.reset_index(drop=True)

# Counting Word Frequencies in the Reviews

In [44]:
from collections import Counter

# Map each distinct token to the number of times it occurs across all reviews.
# Counter does the tallying; converting back to dict keeps `mp` a plain dict
# in first-seen order, exactly as the hand-written loop produced.
mp = dict(Counter(word for review in train.review_text for word in review.split()))

In [46]:
# Vocabulary: every token that occurs more than 5 times, in first-seen order.
word_list = [word for word, count in mp.items() if count > 5]

In [47]:
# Display the token -> count mapping.
# NOTE(review): this prints the entire dict; the saved output below is cut off
# mid-entry. Consider showing only a handful of entries instead.
mp

{'textbook': 45,
 'book': 5281,
 'shipped': 241,
 'quickly': 465,
 'and': 12773,
 'was': 4848,
 'in': 5136,
 'excellent': 2812,
 'condition': 1029,
 'as': 2541,
 'stated': 83,
 'easy': 995,
 'transaction': 181,
 'would': 1361,
 'buy': 1353,
 'again': 877,
 'janes': 1,
 'all': 1777,
 'the': 19105,
 'worlds': 16,
 'aircraft': 7,
 '00000': 78,
 'great': 7478,
 'to': 8100,
 'deal': 166,
 'with': 3836,
 'very': 5321,
 'quick': 316,
 'delivery': 497,
 'highly': 696,
 'recommended': 457,
 'thank': 298,
 'you': 2879,
 'edge': 29,
 'of': 7534,
 'danger': 8,
 '0': 1325,
 'star': 129,
 'only': 802,
 'because': 289,
 'thats': 175,
 'minimumthis': 1,
 'proves': 22,
 'famous': 26,
 'can': 767,
 'publish': 9,
 'anything': 236,
 'needs': 169,
 'upgrade': 31,
 'has': 1090,
 'limited': 60,
 'access': 22,
 'obdii': 1,
 'data': 27,
 'stream': 8,
 'unless': 61,
 'pay': 64,
 'more': 1228,
 'money': 1062,
 'software': 92,
 'good': 4690,
 'read': 1194,
 'a': 10781,
 'disappointment': 206,
 'maeves': 1,
 'fort

In [49]:
# Vocabulary size after applying the frequency cutoff.
len(word_list)

5906

# Building Binary Bag-of-Words Features

In [None]:
# Build one binary feature vector per review: position k is 1 when word_list[k]
# occurs in the review and 0 otherwise. dat_inform is a list of such lists, in row order.
# NOTE(review): going by the saved outputs above (29,876 rows x 5,906 words) this
# holds roughly 176M Python ints — consider a compact array type if memory is tight.
dat_inform = []
for review in train.review_text:
    # A set gives constant-time membership tests; the original scanned the
    # token list once for every vocabulary word.
    tokens = set(review.split())
    dat_inform.append([1 if word in tokens else 0 for word in word_list])