In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from textblob import TextBlob
import glob
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# Reading the stopwords file from the NLTK package

stop_words = stopwords.words('english')
stop_words.extend(['\'nt','\'s','n\'t'])

In [3]:
# Getting the file names for the positive and negative labelled text data

# Negative File Names
files_neg = glob.glob('C:/Users/Mitesh/Desktop/IPython/TEXT DATASETS/3/review_polarity/txt_sentoken/neg/*.txt')

# Positive File Names
files_pos = glob.glob('C:/Users/Mitesh/Desktop/IPython/TEXT DATASETS/3/review_polarity/txt_sentoken/pos/*.txt')


In [4]:
# Function for reading the actual textual data from the data corpus
def in_text(file_names):
    data = []
    for i in file_names:
        with open(i,'r') as fn:
            text = fn.read()
            data.append(text)
    return data

In [5]:
# Reading in textual data for the positive and negative files respectively.

# Positive
pos = in_text(files_pos)
print len(pos)

# Negative
neg = in_text(files_neg)
print len(neg)

1000
1000


In [6]:
# List of numerical & special characters to be removed from the corpus
nos = list(range(10))
nos = [str(x) for x in nos]

l = '!"#$%&()*+,-/:;<=>?@[\]^_`{|}~'
ls = []

for i in l:
    ls.append(i)

sp = nos + ls
print sp

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '"', '#', '$', '%', '&', '(', ')', '*', '+', ',', '-', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


In [7]:
# Function to Remove Special Characters and Store the obtained processed data in List format

def rmv(lst):
    test = []
    for txt in lst:
        for letter in txt:
            if (letter in sp):
                txt = txt.replace(letter,'')
        test.append(txt)
    return test


def rm_spch(s):
    s = rmv(s)
    one = []
    x = ''
    
    for num in s:
        x = str(num)
        test = word_tokenize(x)
        
        res = test[:]

        for xn in test:          
            if (xn in  stop_words):
                    res.remove(xn)
                
        sen = ' '.join(res)
        one.append(sen)
    return one

In [8]:
# Pre-Precessing the data set and storing it in as semi - processed text file

refined_pos = rm_spch(pos)
print 'Number of Positive Labelled Texts:',len(refined_pos)

refined_neg = rm_spch(neg)
print 'Number of Negative Labelled Texts:',len(refined_neg)

one = refined_pos + refined_neg
print len(one)

with open('data.txt','w') as dr:
    for i in one:
        dr.write(i+'|')
        

Number of Positive Labelled Texts: 1000
Number of Negative Labelled Texts: 1000
2000


In [9]:
# Converting the corpus to 'textblob' objects 
from textblob.sentiments import NaiveBayesAnalyzer

# Positive
txt_pos = TextBlob(str(refined_pos), analyzer = NaiveBayesAnalyzer())

# Negative

txt_neg = TextBlob(str(refined_neg), analyzer = NaiveBayesAnalyzer())

In [10]:
# A measure of the total number of sentences contained in both the positive and negative tagged files

print 'Number of Sentences in Positive Labelled Texts:',len(txt_pos.sentences)
print 'Number of Sentences in Negative Labelled Texts:',len(txt_neg.sentences)
print len(txt_pos)
print len(txt_neg)

Number of Sentences in Positive Labelled Texts: 33714
Number of Sentences in Negative Labelled Texts: 32162
2713353
2387331


In [11]:
# In order to remove any text that may be of lesser or no value due to earlier pre-processing, we set a minimum length for 
# the text; this value is not absolute and can be varied.

#Positive
sen_pos = txt_pos.sentences 

for i in sen_pos:      
    if len(i) <= 15:
        sen_pos.remove(i)
                  
print 'Individual Sentences:',len(sen_pos)

# Negative
sen_neg = txt_neg.sentences

for i in sen_neg:
    if len(i) <= 15:
        sen_neg.remove(i)
    
print 'Individual Sentences:',len(sen_neg)


Individual Sentences: 31818
Individual Sentences: 29533


In [39]:
# Summing up all the texts in one single List object
total = [x for x in sen_pos] + [y for y in sen_neg]
print len(total)


61351


In [43]:
# Creating a Dictionay comprising of each individual Text and its respective Polarity and saving it as a DataFarme

di = {'Text':total, 'Polarity':[z.sentiment[0] for z in total]}
df = DataFrame(di)

df = df[df.Text != '.']   # Some more processing to get rid of the '.' sentences

print df.Polarity.value_counts()

print len(df)

df.to_csv('out.csv')

pos    37484
neg    22352
Name: Polarity, dtype: int64
59836
