In [2]:
import nltk

In [3]:
#Step 1 Load the data
positiveReviewsFileName = "rt-polaritydata/rt-polarity.pos"
negativeReviewsFileName = "rt-polaritydata/rt-polarity.neg"

with open(positiveReviewsFileName,'r') as f:
    positiveReviews = f.readlines()

with open(negativeReviewsFileName,'r') as f:
    negativeReviews = f.readlines()

In [4]:
#Step 2 Train - Test : Split the Data
testTrainingSplitIndex = 2500

testNegativeReviews = negativeReviews[testTrainingSplitIndex+1:]
testPositiveReviews = positiveReviews[testTrainingSplitIndex+1:]

trainingNegativeReviews = negativeReviews[:testTrainingSplitIndex]
trainingPositiveReviews = positiveReviews[:testTrainingSplitIndex]

In [5]:
#Step 3 Create a Vocabulary for Creating Feature Vectors
def getVocabulary():
  positiveWordList = [word for line in trainingPositiveReviews for word in line.split()]
  negativeWordList = [word for line in trainingNegativeReviews for word in line.split()]
  allWordList = [item for sublist in [positiveWordList,negativeWordList] for item in sublist]
  allWordSet = list(set(allWordList))
  vocabulary = allWordSet
  return vocabulary

In [8]:
#print and see the item and the number of such items
vocabulary=getVocabulary()
vocabulary[2]

'foul'

In [9]:
len(vocabulary)

14102

In [10]:
#Step 4 Setting Up Data in Right Format - Words in the Review and the Label

def getTrainingData():
  negTaggedTrainingReviewList = [{'review':oneReview.split(),'label':'negative'} for oneReview in trainingNegativeReviews] 
  posTaggedTrainingReviewList = [{'review':oneReview.split(),'label':'positive'} for oneReview in trainingPositiveReviews] 
  fullTaggedTrainingData = [item for sublist in [negTaggedTrainingReviewList,posTaggedTrainingReviewList] for item in sublist]
  trainingData = [(review['review'],review['label']) for review in fullTaggedTrainingData]
  return trainingData

In [11]:
#print and see the tuple structure and the number of such tuples
trainingData= getTrainingData()
trainingData[0]

(['simplistic', ',', 'silly', 'and', 'tedious', '.'], 'negative')

In [12]:
len(trainingData)

5000

In [13]:
#can also access the first part of the tuple separately like this 
trainingData[0][0]

['simplistic', ',', 'silly', 'and', 'tedious', '.']

In [14]:
#can also access the second part of the tuple separately like this 
trainingData[0][1]

'negative'

In [15]:
#Step 5 Convert the tuple structure into a feature vector: A dictionary with 1 only in those places 
#where the words in the review are present and 0 for all other words in the vocabulary
def extract_features(review):
  review_words=set(review)
  features={}
  for word in vocabulary:
      features[word]=(word in review_words)
  return features 

In [16]:
#Step 6 Train the Naive Bayes Classifier
#Converts training data into feature vectors and then trains on those features

def getTrainedNaiveBayesClassifier(extract_features, trainingData):
  trainingFeatures=nltk.classify.apply_features(extract_features, trainingData)
  trainedNBClassifier=nltk.NaiveBayesClassifier.train(trainingFeatures)
  return trainedNBClassifier

In [17]:
trainingFeatures=nltk.classify.apply_features(extract_features, trainingData)
trainingFeatures[0]

({'meeropol': False,
  'writings': False,
  'ryoko': False,
  'foul': False,
  'four': False,
  'woods': False,
  'clotted': False,
  'spiders': False,
  "friend's": False,
  'railing': False,
  'woody': False,
  'comically': False,
  'marching': False,
  'crooned': False,
  'unanswered': False,
  'originality': False,
  'superficially': False,
  'xtc': False,
  "johnson's": False,
  'lord': False,
  'immature': False,
  'digit': False,
  'callie': False,
  'tantalizing': False,
  'leisurely': False,
  'screaming': False,
  'picaresque': False,
  'boogaloo': False,
  'prize': False,
  'wooden': False,
  'haber': False,
  "couple's": False,
  'succession': False,
  'stereotypical': False,
  'eye-popping': False,
  'sturm': False,
  'esos': False,
  'tired': False,
  'miller': False,
  'bacon': False,
  'pulse': False,
  '270': False,
  'elegant': False,
  'second': False,
  'crisply': False,
  'sterile': False,
  'loathing': False,
  'hilariously': False,
  'hit-man': False,
  'ruthless

In [18]:
vocabulary = getVocabulary()

In [19]:
trainingData = getTrainingData()

In [None]:
trainedNBClassifier = getTrainedNaiveBayesClassifier(extract_features,trainingData)

In [None]:
#Step 7 Wrapper function for Applying Naive Bayes Classifier on a Test Instance

def naiveBayesSentimentCalculator(review):
  problemInstance = review.split()
  problemFeatures = extract_features(problemInstance)
  return trainedNBClassifier.classify(problemFeatures)

In [19]:
naiveBayesSentimentCalculator("What an awesome movie")

'positive'

In [20]:
naiveBayesSentimentCalculator("What a terrible movie")

'negative'

In [21]:
#Step 8 Apply it on the Test Data and get the results for positive and negative reviews in the test set

def getTestReviewSentiments(naiveBayesSentimentCalculator):
  testNegResults = [naiveBayesSentimentCalculator(review) for review in testNegativeReviews]
  testPosResults = [naiveBayesSentimentCalculator(review) for review in testPositiveReviews]
  labelToNum = {'positive':1,'negative':-1}
  numericNegResults = [labelToNum[x] for x in testNegResults]
  numericPosResults = [labelToNum[x] for x in testPosResults]
  return {'results-on-positive':numericPosResults, 'results-on-negative':numericNegResults}

In [22]:
#Step 9 Calculate and print the Positive, Negative and Overall Classification Accuracy

def runDiagnostics(reviewResult):
  positiveReviewsResult = reviewResult['results-on-positive']
  negativeReviewsResult = reviewResult['results-on-negative']
  numTruePositive = sum(x > 0 for x in positiveReviewsResult)
  numTrueNegative = sum(x < 0 for x in negativeReviewsResult)
  pctTruePositive = float(numTruePositive)/len(positiveReviewsResult)
  pctTrueNegative = float(numTrueNegative)/len(negativeReviewsResult)  
  totalAccurate = numTruePositive + numTrueNegative
  total = len(positiveReviewsResult) + len(negativeReviewsResult)
  print "Accuracy on positive reviews = " +"%.2f" % (pctTruePositive*100) + "%"
  print "Accurance on negative reviews = " +"%.2f" % (pctTrueNegative*100) + "%"
  print "Overall accuracy = " + "%.2f" % (totalAccurate*100/total) + "%"

In [23]:
#Step 10 Run and Compare the Results
runDiagnostics(getTestReviewSentiments(naiveBayesSentimentCalculator))

Accuracy on positive reviews = 73.39%
Accurance on negative reviews = 77.07%
Overall accuracy = 75.00%
