In [29]:
import random
import sys
import pandas as pd
import numpy as np
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.corpus import stopwords
from collections import Counter
from nltk.sentiment.util import mark_negation

random.seed(1234)
file_names = ["hotelNegT-train.txt", "hotelPosT-train.txt"]
class_type_mappings = {"hotelNegT-train.txt": "NEG", "hotelPosT-train.txt": "POS"}
class_list = ["NEG", "POS"]

def get_reviews_raw(file_name, class_type):
    with open(file_name) as review_file:
        reviews = review_file.readlines()
    reviews = map(lambda x : tuple((x.strip() + '\t' + class_type).split('\t')), reviews)
    return reviews

def get_all_reviews(file_names, class_type_mapping):
    reviews_all = []
    for file_name in file_names:
        reviews_all.extend(get_reviews_raw(file_name, class_type_mapping[file_name]))
        
    random.shuffle(reviews_all)
    return reviews_all

def get_class(review):
    return review[2]

def get_class_probability(reviews_list, class_list):
    result = pd.Series({x:0 for x in class_list})
    for review in reviews_list:
        result[get_class(review)] += 1.0
    
    total_reviews = len(reviews_list)
    result = np.log(result/total_reviews)
    return result

def get_sent_tokenize(reviews):
    return pd.DataFrame(map(lambda x : (x[0], tuple(sent_tokenize(x[1])), x[2]), reviews), columns = ['ID', 'Sentences', 'Class'])

def word_tokenize_driver(sentence):
    tokens = [ token.lower() for token in  mark_negation(word_tokenize(sentence), double_neg_flip= True) if token.isalpha()]
    return tokens

def get_word_tokenize(review):
    tokenizer = RegexpTokenizer(r'\s+', gaps = True)
    review['Word_Token'] = review['Sentences'].apply(lambda x : '\n'.join(x))
    review['Word_Token'] = review['Word_Token'].apply(lambda x : word_tokenize_driver(x))
#     review_all['Word_Token'] = review_all['Word_Token'].apply(lambda x : tokenizer.tokenize(x))
    
def get_remove_stop_words(reviews):
    stop_words = set(stopwords.words('english'))
    reviews['Word_Token_SW'] = reviews['Word_Token'].apply(lambda tokens :[token for token in tokens if token not in stop_words] )
                
def generate_bag_of_words(reviews):
    reviews['bag_of_words'] = reviews['Word_Token_SW'].apply(lambda tokens : Counter(set(tokens)))
    
def get_bag_of_words(reviews):
    result = Counter()
    for bag_of_words in reviews.bag_of_words:
        result.update(bag_of_words)
    return result

def get_vocab(reviews):
    vocab = set()
    for counter_bow in reviews['bag_of_words']:
        vocab.update(counter_bow.keys())
    return vocab

def train_naive_bayes(train_reviews, class_list):
    logprior = {}
    loglikelihood = {}
    vocab = get_vocab(train_reviews)
    vocab_size = len(vocab)
    for c in class_list:
        n_doc = train_reviews.shape[0]
        n_c = train_reviews[train_reviews.Class == c].shape[0]
        logprior[c] = np.log(n_c*1.0/n_doc)
        bag_of_words = get_bag_of_words(train_reviews[train_reviews.Class == c])
        total_word_count_for_class = sum(bag_of_words.values())
        
        for word in vocab:
            word_count_for_class = bag_of_words[word]
            loglikelihood[(word, c)] = np.log((word_count_for_class + 1.0)/(total_word_count_for_class + vocab_size))
    return logprior, loglikelihood, vocab

def test_naive_bayes(test_review, logprior, loglikelihood, class_list, vocab):
    pred_class_sum = {}
    for c in class_list:
        pred_class_sum[c] = logprior[c]
        words = set(test_review['bag_of_words'].keys())
        for word in words:
            if word in vocab:
                pred_class_sum[c] += loglikelihood[(word, c)]
    
#     for key, value in pred_class_sum.items():
    return max(pred_class_sum, key = pred_class_sum.get)

def k_fold_test(k = 10):
    reviews_all = get_all_reviews(file_names, class_type_mappings)
    reviews_all = get_sent_tokenize(reviews_all)
    get_word_tokenize(reviews_all)
    get_remove_stop_words(reviews_all)
    generate_bag_of_words(reviews_all)
    
    count = reviews_all.shape[0]
    print "Total Count", count
    test_count = int (count * 1.0/ k) + 1
    accuracy_list = []
    
    for i in xrange(k):
        mask = np.zeros(count, dtype=bool)
        if i == k - 1:
            mask[i * test_count:] = True
        else:
            mask[i * test_count: (i + 1) * test_count] = True
        
        print "\nFor Fold %s as dev " % i
    
        train_reviews = reviews_all[~mask]
        test_reviews = reviews_all[mask]
        print "POS: %d, NEG: %d" % (np.sum(train_reviews.Class == 'POS'), np.sum(train_reviews.Class == 'NEG'))
        logprior, loglikelihood, vocab = train_naive_bayes(train_reviews, class_list)
        test_reviews['Pred_Class'] = test_reviews.apply(lambda x : test_naive_bayes(x, logprior, loglikelihood, class_list, vocab), axis = 1)
        correct_pred = np.sum(test_reviews.Class == test_reviews.Pred_Class)
        total_test_reviews = test_reviews.shape[0]
        accuracy = correct_pred * 100.0/total_test_reviews
        print "Correct: %d, Total: %d, Accuracy: %f" % (correct_pred, total_test_reviews, accuracy)
        accuracy_list.append(accuracy)
        
        print test_reviews[test_reviews.Class != test_reviews.Pred_Class][['ID', 'Class', 'Pred_Class']]
        
    return accuracy_list, reviews_all
            
accuracy_list,df  = k_fold_test()

print np.average(accuracy_list)

print accuracy_list

Total Count 189

For Fold 0 as dev 
POS: 85, NEG: 85
Correct: 17, Total: 19, Accuracy: 89.473684
         ID Class Pred_Class
8   ID-0869   NEG        POS
11  ID-0885   NEG        POS

For Fold 1 as dev 
POS: 87, NEG: 83
Correct: 18, Total: 19, Accuracy: 94.736842
         ID Class Pred_Class
37  ID-0961   NEG        POS

For Fold 2 as dev 
POS: 86, NEG: 84
Correct: 18, Total: 19, Accuracy: 94.736842
         ID Class Pred_Class
51  ID-0993   NEG        POS

For Fold 3 as dev 
POS: 84, NEG: 86
Correct: 18, Total: 19, Accuracy: 94.736842
         ID Class Pred_Class
71  ID-1196   POS        NEG

For Fold 4 as dev 
POS: 84, NEG: 86
Correct: 17, Total: 19, Accuracy: 89.473684
         ID Class Pred_Class
76  ID-0997   NEG        POS
77  ID-0971   NEG        POS

For Fold 5 as dev 
POS: 87, NEG: 83
Correct: 17, Total: 19, Accuracy: 89.473684
          ID Class Pred_Class
104  ID-0827   NEG        POS
111  ID-0941   NEG        POS

For Fold 6 as dev 
POS: 83, NEG: 87
Correct: 15, Total: 19,