In [11]:
# Required Python Packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from collections import Counter
import re

In [12]:
# Headers 
headers = ["Sentence", "Sentiment"]

In [13]:
# First we read in the dataset
dataset = pd.read_csv("C:/Users/haneesha/Anaconda3/textTrainData.csv")

In [14]:
# The above command returned a dataframe (with column and row names)
print(dataset.iloc[0:20])

                                             Sentence  Sentiment
0   So there is no way for me to plug it in here i...        0.0
1                         Good case, Excellent value.        1.0
2   Tied to charger for conversations lasting more...        0.0
3                                   The mic is great.        1.0
4   I have to jiggle the plug to get it to line up...        0.0
5   If you have several dozen or several hundred c...        0.0
6         If you are Razr owner...you must have this!        1.0
7                 Needless to say, I wasted my money.        0.0
8                    What a waste of money and time!.        0.0
9   He was very impressed when going from the orig...        1.0
10  If the two were seperated by a mere 5+ ft I st...        0.0
11                           Very good quality though        1.0
12  The design is very odd, as the ear clip is not...        0.0
13  Highly recommend for any one who has a blue to...        1.0
14                I advis

In [15]:
# Add the headers to the loaded dataset
dataset.columns = headers

In [18]:
 # Split dataset into train and test dataset
train, test = train_test_split(dataset[headers[ : ]], train_size=0.7)



In [19]:
print(train [0:5])

                                               Sentence  Sentiment
1267  I didn't know pulled pork could be soooo delic...        1.0
757                       The kids are very cool too.          1.0
86    The price was very good and with the free ship...        1.0
1254  The goat taco didn't skimp on the meat and wow...        1.0
861   Star Trek V The final Frontier is the worst in...        0.0


In [28]:
# We need a function that will split the text based upon sentiment
def get_text(sent, score):
  # Join together the text in the reviews for a particular sentiment.
  # We lowercase to avoid "Not" and "not" being seen as different words, for example.
   
    s = ""
    for index,row in sent.iterrows():
        if row['Sentiment'] == score:
            s = s + row['Sentence'].lower()
    
    return s

In [29]:
# We also need a function that will count word frequency for each sample
def count_text(text):
  # Split text into words based on whitespace.  Simple but effective.
  words = re.split("\s+", text)
  # Count up the occurence of each word.
  return Counter(words)

In [30]:
# Now we will capture the negative and positive samples in the training set.
# We will create two large strings, one of all text from positive reviews and one from the negatives
# We will then use these to create the word counts
# This will make the computations of the probabilities easier

# This will take a few minutes and use up some memory!

negative_train_text = get_text(train, 0)
positive_train_text = get_text(train, 1)

In [31]:
print(positive_train_text[0:100])

i didn't know pulled pork could be soooo delicious.the kids are very cool too.  the price was very g


In [32]:
# Here we generate the word counts for each sentiment
negative_counts = count_text(negative_train_text)
# Generate word counts for positive tone.
positive_counts = count_text(positive_train_text)

In [33]:
print(negative_train_text[0:100])

star trek v the final frontier is the worst in the series.  the food was terrible.the burger had abs


In [34]:
# We need this function to calculate a count of a given classification
def get_y_count(score):
  # Compute the count of each classification occuring in the data.
  # return len([r for r in reviews if r[1] == str(score)])
    c = 0
    for index,row in train.iterrows():
        if row['Sentiment'] == score:
            c = c + 1
    
    return c

In [35]:
# We need these counts to use for smoothing when computing the prediction.
positive_sentence_count = get_y_count(1)
negative_sentence_count = get_y_count(0)

In [36]:
# These are the class probabilities (we saw them in the formula as P(y)).
prob_positive = positive_sentence_count / len(train)
prob_negative = negative_sentence_count / len(train)

In [37]:
print(prob_positive)

0.5260461144321094


In [38]:
# Finallt, we create a function that will, given a text example, allow us to calculate the probability
# of a positive or negative review

def make_class_prediction(text, counts, class_prob, class_count):
  prediction = 1
  text_counts = Counter(re.split("\s+", text))
  for word in text_counts:
      # For every word in the text, we get the number of times that word occured in the reviews for a given class, add 1 to smooth the value, and divide by the total number of words in the class (plus the class_count to also smooth the denominator).
      # Smoothing ensures that we don't multiply the prediction by 0 if the word didn't exist in the training data.
      # We also smooth the denominator counts to keep things even.
      prediction *=  text_counts.get(word) * ((counts.get(word, 0) + 1) / (sum(counts.values()) + class_count))
  # Now we multiply by the probability of the class existing in the documents.
  return prediction * class_prob

In [53]:
print("Negative prediction: {0}".format(make_class_prediction(train.iloc[0,0], negative_counts, prob_negative, negative_sentence_count)))
print("Positive prediction: {0}".format(make_class_prediction(train.iloc[0,0], positive_counts, prob_positive, positive_sentence_count)))

Negative prediction: 1.3394702708964042e-30
Positive prediction: 1.6878267532391027e-32


In [59]:
# Here we will create a function that will actually make the prediction
def make_decision(text, make_class_prediction):
    # Compute the negative and positive probabilities.
    negative_prediction = make_class_prediction(text, negative_counts, prob_negative, negative_sentence_count)
    positive_prediction = make_class_prediction(text, positive_counts, prob_positive, positive_sentence_count)

    # We assign a classification based on which probability is greater.
    if negative_prediction > positive_prediction:
      return 0
    return 1

In [64]:
print(make_decision(train.iloc[0,0], make_class_prediction))

0


In [68]:
# Now we make predictions on the test data. Since it is a large set, we will simply select 200 movies.
predictions = [make_decision(row['Sentence'], make_class_prediction) for index,row in test[200:600].iterrows()]

In [69]:
# We check the accuracy. Note that when we pull the column out of the data frame, we need to convert it to a list
# to compare with the predictions

actual = test['Sentiment'].tolist()

actual = actual[200:600]


In [70]:
accuracy = sum(1 for i in range(len(predictions)) if predictions[i] == actual[i]) / float(len(predictions))
print("{0:.4f}".format(accuracy))

0.5033
