[View in Colaboratory](https://colab.research.google.com/github/KushalVenkatesh/Sentiment_Analysis_Customer_Reviews_TestPilot/blob/master/sentiment_analysis_model_updated.ipynb)

# SENTIMENT ANALYSIS USING REVIEW'S STARS

In [0]:
#text-classification-sentiment-analysis-naive-bayes-classifier
#referenced from the following blog
#http://streamhacker.com/2010/05/10/text-classification-sentiment-analysis-naive-bayes-classifier/
import nltk.classify.util
import collections, itertools
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import stopwords
from nltk import scores, sent_tokenize, word_tokenize, pos_tag, MaxentClassifier
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist

import random
import itertools
import pandas as pd
import json

from numpy import mean
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef
from string import punctuation

In [0]:
random.seed(88)

## TO EVALUATE MODELS ##

In [0]:
# Function to evaluate features
def evaluate_classifier(featx,                         
                        negtrain,
                        negtest,
                        postrain, 
                        postest):     
    
    negtrain_feats = [(featx(w), 'neg') for w in negtrain]
    negtest_feats  = [(featx(w), 'neg') for w in negtest]
    postrain_feats = [(featx(w), 'pos') for w in postrain]
    postest_feats  = [(featx(w), 'pos') for w in postest]
    
    trainfeats = negtrain_feats + postrain_feats
    testfeats = negtest_feats + postest_feats    
    trainfeats = random.sample(trainfeats, len(trainfeats))
    testfeats = random.sample(testfeats, len(testfeats))    
        
    classifier = NaiveBayesClassifier.train(trainfeats)
    # classifier = MaxentClassifier.train(trainfeats)
    
    actual = []
    predict = []

    for i, (feats, label) in enumerate(testfeats):        
            observed = classifier.classify(feats)
            actual.append(label)
            predict.append(observed)
    try:
        accuracy = accuracy_score(actual, predict)
    except:
        accuracy = None
        print('Accuracy: Division by zero')

    try:
        precision = precision_score(actual, predict)
    except:
        precision = None
        print('Precision: Division by zero')

    try:
        recall = recall_score(actual, predict)
    except:
        recall  = None
        print('Recall: Division by zero')

    try:
        matthew = matthews_corrcoef(actual, predict)
    except:
        matthew = None
        print('Matthews corr coeff: Division by zero')

    return accuracy, precision, recall, matthew, classifier
    
    

In [0]:
# Function to evaluate features (include bestwords)
def evaluate_classifier2(featx,                         
                        negtrain,
                        negtest,
                        postrain, 
                        postest,
                        bestwords):    
    
    negtrain_feats = [(featx(w, bestwords), 'neg') for w in negtrain]
    negtest_feats  = [(featx(w, bestwords), 'neg') for w in negtest]
    postrain_feats = [(featx(w, bestwords), 'pos') for w in postrain]
    postest_feats  = [(featx(w, bestwords), 'pos') for w in postest]
    
    trainfeats = negtrain_feats + postrain_feats
    testfeats = negtest_feats + postest_feats    
    trainfeats = random.sample(trainfeats, len(trainfeats))
    testfeats = random.sample(testfeats, len(testfeats))
        
    classifier = NaiveBayesClassifier.train(trainfeats)
    # classifier = MaxentClassifier.train(trainfeats)
    
    actual = []
    predict = []

    for i, (feats, label) in enumerate(testfeats):        
            observed = classifier.classify(feats)
            actual.append(label)
            predict.append(observed)
    try:
        accuracy = accuracy_score(actual, predict)
    except:
        accuracy = None
        print('Accuracy: Division by zero')

    try:
        precision = precision_score(actual, predict)
    except:
        precision = None
        print('Precision: Division by zero')

    try:
        recall = recall_score(actual, predict)
    except:
        recall  = None
        print('Recall: Division by zero')

    try:
        matthew = matthews_corrcoef(actual, predict)
    except:
        matthew = None
        print('Matthews corr coeff: Division by zero')

    return accuracy, precision, recall, matthew, classifier

## FEATURES ##

In [0]:
# Bag of words: All words
def word_feats(words):
    return dict([(word, True) for word in words])

In [0]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Stopword filtering
stop_set = set(stopwords.words('english')) 

def stopword_filtered_word_feats(words, stopset = stop_set):
    return dict([(word, True) for word in words if word not in stopset])

In [0]:
# Bigram Collocations
def bigram_word_feats(words, score_fn = BigramAssocMeasures.chi_sq, n = 200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    return dict([(ngram, True) for ngram in itertools.chain(words, bigrams)])

In [0]:
# Eliminate Low Information Features
def get_best_words(neg_train, pos_train, best_n):
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for w in [word for review in neg_train for word in review]:
        word_fd[w.lower()] += 1
        label_word_fd['neg'][w.lower()] += 1

    for w in [word for review in pos_train for word in review]:
        word_fd[w.lower()] += 1
        label_word_fd['pos'][w.lower()] += 1

    neg_word_count = label_word_fd['neg'].N()
    pos_word_count = label_word_fd['pos'].N()
    total_word_count = pos_word_count + neg_word_count
    
    # Compute scores
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq( \
            label_word_fd['pos'][word], \
            (freq, pos_word_count), \
            total_word_count)
        neg_score = BigramAssocMeasures.chi_sq( \
            label_word_fd['neg'][word], \
            (freq, neg_word_count), \
            total_word_count)
        word_scores[word] = pos_score + neg_score
    
    # Choose best score
    best = sorted(word_scores.items(), \
                  key=lambda s: s[1], \
                  reverse=True)[:best_n]    
    return set([w for w, s in best])

# Features (words) based on best words
def best_word_feats(words, bestwords):
    return dict([(word, True) for word in words if word in bestwords])

In [0]:
# Best words + bigram
def best_bigram_word_feats(words, 
                           bestwords,
                           score_fn = BigramAssocMeasures.chi_sq, 
                           n = 200):
    bigram_finder = BigramCollocationFinder.from_words(words)
    bigrams = bigram_finder.nbest(score_fn, n)
    d = dict([(bigram, True) for bigram in bigrams])
    d.update(best_word_feats(words, bestwords))
    return d

## DEVELOP MODEL ##

In [0]:
!ls rebtel_sentiment_analysis/data

rawdata_20170620.json  rawdata_20180717.json  reviews_influxDB.txt


### Prepare data 

In [0]:
# Load raw data
file_name = 'rebtel_sentiment_analysis/data/rawdata_20180717.json'
with open(file_name) as json_data:
    data = json.load(json_data)

In [0]:
# Convert to dataframe
df = pd.DataFrame(data)
df = df.drop_duplicates()

In [0]:
# Remove punctuation
import re
import string

def remove_punctuation(s):
    s = ''.join([i for i in s if i not in frozenset(punctuation)])
    return s

rem = string.punctuation
pattern = r"[{}]".format(rem)
df['text_nopunct'] = df['text'].str.replace(pattern, ' ')

In [0]:
# Classify reviews: negative vs positive vs neutral
# Assumptions:
# Reviews with 1,2 stars -> negative
# Reviews with 4,5 stars -> positive
# Revjiews with 3 stars -> neutral

df_neg = df[(df.stars == 1) | (df.stars == 2)]
df_pos = df[(df.stars == 4) | (df.stars == 5)]
df_neu = df[(df.stars == 3)]
df_neg.shape, df_pos.shape, df_neu.shape

((721, 9), (2128, 9), (275, 9))

Assumption: We focused on positive and negative reviews (discard neutral)

In [0]:
# Split reviews: positive vs negative
import nltk
nltk.download('stopwords')
nltk.download('punkt')
neg_words = [word_tokenize(f) for f in df_neg.text_nopunct]
pos_words = [word_tokenize(f) for f in df_pos.text_nopunct]

[nltk_data] Downloading package stopwords to /content/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
# Split data set: developing vs validation
neg_develop, neg_val = train_test_split(neg_words, test_size=0.25)
pos_develop, pos_val = train_test_split(pos_words, test_size=0.25)

In [0]:
len(neg_develop),len(neg_val), len(pos_develop),len(pos_val)

(540, 181, 1596, 532)

## 5-CROSS VALIDATION ##

In [0]:
# k-fold cross validation
k = 5

results = collections.defaultdict(dict)
models = ['bag_of_words', 'stop_words', 'bigram', \
          'best_words', 'bigram_best_words']
for model_name in models:
    results[model_name]['accuracy'] = []
    results[model_name]['precision'] = []
    results[model_name]['recall'] = []
    results[model_name]['matthew'] = []

In [0]:
for i in range(0, k):
    # Split developing dataset: training vs testing
    neg_train, neg_test = train_test_split(neg_develop, test_size=1/k)
    pos_train, pos_test = train_test_split(pos_develop, test_size=1/k)    
        
    num_train = len(neg_train) + len(pos_train)
    num_test = len(neg_test) + len(pos_test)    
    print(str(i) + '. Train on %d instances, test on %d instances' % \
          (num_train, num_test)) 
    
    model_name = 'bag_of_words'
    print(model_name.upper())    
    accuracy, precision, recall, matthew, _ = evaluate_classifier(
        word_feats,                         
        negtrain = neg_train,
        negtest = neg_test,
        postrain = pos_train, 
        postest = pos_test) 
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['precision'].append(precision)
    results[model_name]['recall'].append(recall)
    results[model_name]['matthew'].append(matthew)
    
        
    model_name = 'stop_words'
    print(model_name.upper())     
    accuracy, precision, recall, matthew, _ = evaluate_classifier(
        stopword_filtered_word_feats,                         
        negtrain = neg_train,
        negtest = neg_test,
        postrain = pos_train, 
        postest = pos_test)
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['precision'].append(precision)
    results[model_name]['recall'].append(recall)
    results[model_name]['matthew'].append(matthew)
    
#     model_name = 'bigram'
#     print(model_name.upper())     
#     accuracy, precision, recall, matthew, _ = evaluate_classifier(
#         bigram_word_feats,                         
#         negtrain = neg_train,
#         negtest = neg_test,
#         postrain = pos_train, 
#         postest = pos_test) 
#     results[model_name]['accuracy'].append(accuracy)
#     results[model_name]['precision'].append(precision)
#     results[model_name]['recall'].append(recall)
#     results[model_name]['matthew'].append(matthew)
    
    best_words = get_best_words(neg_train, pos_train, 5000)                         
    model_name = 'best_words'    
    print(model_name.upper())            
    accuracy, precision, recall, matthew, _ = evaluate_classifier2(
        best_word_feats,                        
        negtrain = neg_train,
        negtest = neg_test,
        postrain = pos_train, 
        postest = pos_test,
        bestwords = best_words) 
    results[model_name]['accuracy'].append(accuracy)
    results[model_name]['precision'].append(precision)
    results[model_name]['recall'].append(recall)
    results[model_name]['matthew'].append(matthew)
                     
#     model_name = 'bigram_best_words'
#     print(model_name.upper())         
#     accuracy, precision, recall, matthew, _ = evaluate_classifier2(
#         best_bigram_word_feats,                         
#         negtrain = neg_train,
#         negtest = neg_test,
#         postrain = pos_train, 
#         postest = pos_test,
#         bestwords = best_words)
#     results[model_name]['accuracy'].append(accuracy)
#     results[model_name]['precision'].append(precision)
#     results[model_name]['recall'].append(recall)
#     results[model_name]['matthew'].append(matthew)
          

0. Train on 2136 instances, test on 0 instances
BAG_OF_WORDS


  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


STOP_WORDS
BEST_WORDS
1. Train on 2136 instances, test on 0 instances
BAG_OF_WORDS
STOP_WORDS
BEST_WORDS
2. Train on 2136 instances, test on 0 instances
BAG_OF_WORDS
STOP_WORDS
BEST_WORDS
3. Train on 2136 instances, test on 0 instances
BAG_OF_WORDS
STOP_WORDS
BEST_WORDS
4. Train on 2136 instances, test on 0 instances
BAG_OF_WORDS
STOP_WORDS
BEST_WORDS


In [0]:
# Choose Best Features
best_features = collections.defaultdict(dict)
for model_name in models:
    a = [x for x in results[model_name]['accuracy'] if x is not None]
    if len(a)>0:
        best_features[model_name]['accuracy'] = mean(a)
        
    p = [x for x in results[model_name]['precision'] if x is not None]
    if len(p)>0:
        best_features[model_name]['precision'] = mean(p)
        
    r = [x for x in results[model_name]['recall'] if x is not None]
    if len(r)>0:
        best_features[model_name]['recall'] = mean(r)
        
    m = [x for x in results[model_name]['matthew'] if x is not None]
    if len(m)>0:
        best_features[model_name]['matthew'] = mean(m)     
    
# Highest accuracy, precision, recall, Matthews correlation coefficient
pd.DataFrame.from_dict({(i): best_features[i]  
                        for i in best_features.keys()}, 
                       orient = 'index')

Unnamed: 0,recall,matthew,precision,accuracy
bag_of_words,0.0,0.0,0.0,
best_words,0.0,0.0,0.0,
stop_words,0.0,0.0,0.0,


# BEST MODEL: TRAIN ON FULL DEVELOPING DATA SET, EVALUATE ON VALIDATION SET

In [0]:
num_train = len(neg_develop) + len(pos_develop)
num_test = len(neg_val) + len(pos_val)    
print('Train on %d instances, test on %d instances' % (num_train, num_test)) 

Train on 2136 instances, test on 713 instances


In [0]:
# Get best words
best_words = get_best_words(neg_develop, pos_develop, 5000)

In [0]:
# Train model
model_name = 'stop_words'
print(model_name.upper()) 

accuracy, precision, recall, matthew, best_classifier = evaluate_classifier(
    stopword_filtered_word_feats,                         
    negtrain = neg_develop,
    negtest = neg_val,
    postrain = pos_develop, 
    postest = pos_val)

STOP_WORDS
Precision: Division by zero
Recall: Division by zero


  if pos_label not in present_labels:


In [0]:
# Evaluate model
if accuracy is not None: print('Accuracy: %f' % accuracy)
if precision is not None: print('Precision: %f' % precision)
if recall is not None: print('Recall: %f' % recall)
if matthew is not None: print('Matthew corr coeff: %f' % matthew)

Accuracy: 0.566620
Matthew corr coeff: 0.380511


In [0]:
# Most relevant features
best_classifier.show_most_informative_features()

Most Informative Features
                 emailed = True              neg : pos    =     46.2 : 1.0
                  ticket = True              neg : pos    =     44.3 : 1.0
                  refund = True              neg : pos    =     39.2 : 1.0
                  unable = True              neg : pos    =     38.4 : 1.0
              restricted = True              neg : pos    =     38.4 : 1.0
                      20 = True              neg : pos    =     36.4 : 1.0
                   worst = True              neg : pos    =     36.0 : 1.0
                   today = True              neg : pos    =     30.5 : 1.0
                      00 = True              neg : pos    =     30.5 : 1.0
               contacted = True              neg : pos    =     30.1 : 1.0
