In [1359]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import spatial

import matplotlib.pyplot as plt # side-stepping mpl backend

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_selection import SelectPercentile, f_classif
# from sklearn import cross_validation
import heapq
import string
import re

from nltk.corpus import stopwords
from nltk import word_tokenize
from collections import defaultdict
from collections import Counter

trainingData  = pd.read_csv('data/train.dat.txt', sep="\t", encoding='utf-8', header=None, names=["rating","review"])[:2];
testData = pd.read_csv('data/test.dat.txt', sep="\t", encoding='utf-8', header=None, names=["review"])[:2];


In [1360]:
# 
print('Before Cleaning');
print (trainingData[0:1])

def preProcess(reviews):
#     print(reviews);
    processedReviews = [];
    for review in reviews:
        tokens = word_tokenize(review);
        filteredTokens = [];
        for token in list(tokens):
#             print(token);
            # if it is a stopword then eliminate
            if token.lower() in stopwords.words('english'):
#                 print('removing stopword ' + token);
                continue;
            # if it is punctuation then eliminate
            if token.lower() in set(string.punctuation):
#                 print('removing punct ' + token);
                continue;
            if len(token)<=3:
#                 print('removing small ' + token);
                continue;
            token = token.lower();
            filteredTokens.append(token);
#         print(filteredTokens);
        processedReviews.append(filteredTokens);
#     print(len(processedReviews));
    return processedReviews;
print('After Cleaning')
XTrain = preProcess(trainingData['review']);
print(XTrain);
print (len(XTrain));
XTest = preProcess(testData['review']);
print (len(XTest));

Before Cleaning
   rating                                             review
0      -1  Although a film with Bruce Willis is always wo...
After Cleaning
[[u'although', u'film', u'bruce', u'willis', u'always', u'worth', u'watching', u'better', u'skip', u'watched', u'television', u'plunk', u'cash', u'lucky', u'plot', u'develops', u'slowly', u'slowly', u'although', u'first', u'minutes', u'quite', u'believable', u'gets', u'unbelievable', u'towards', u'highly', u'questionable', u'seasoned', u'soldier', u'like', u'waters', u'would', u'disobey', u'direct', u'orders', u'even', u'would', u'rest', u'platoon', u'would', u'know', u'puts', u'direct', u'danger', u'know', u'certainly', u'follow', u'heck', u'says', u'despite', u'direct', u'orders', u'remember', u'still', u'nice', u'scenes', u'movie', u'somewhat', u'save', u'village', u'total', u'population', u'massacred', u'rebels', u'well', u'save', u'dozen', u'villagers', u'rest', u'already', u'killed', u'strange', u'part', u'take', u'trucks', u'reb

In [1361]:
# count frequencies for all words in the Training data
def countFrequency(data):
    wordCountInData = Counter()
    for d in data:
    #     print(d);
        for w in d:
    #         print(w);
            if w not in wordCountInData:
                wordCountInData[w]=1
            else:
                wordCountInData[w] += 1
    print("Number of unique words: %d." % len(wordCountInData));
    return wordCountInData;

wordCountInTrainingData = countFrequency(XTrain);
wordCountInTestData = countFrequency(XTest);
print(len(wordCountInTrainingData));
print(len(wordCountInTestData));

Number of unique words: 191.
Number of unique words: 152.
191
152


In [1362]:
#Top percent of training data
topPct = 1;
topPctOfTraining = wordCountInTrainingData.most_common(int(round(len(wordCountInTrainingData)*topPct)));
topPctOfTest = wordCountInTestData.most_common(int(round(len(wordCountInTestData)*topPct)));

# Get a set of topPctOfTraining
topPctOfTrainingSet = set()
for word in topPctOfTraining:
    topPctOfTrainingSet.add(word[0]);

# Get a set of topPctOfTest
topPctOfTestSet = set()
for word in topPctOfTest:
    topPctOfTestSet.add(word[0]);
    
# print(topPctOfTrainingSet);
# print(topPctOfTestSet);
print(len(topPctOfTrainingSet));
print(len(topPctOfTestSet));

191
152


In [1363]:
testTrainSet =  set.intersection(topPctOfTrainingSet,topPctOfTestSet);
print("Number of unique words common in both test and train: %d." % len(testTrainSet));
# print(testTrainSet);

Number of unique words common in both test and train: 17.


In [1364]:
cnt = Counter(testTrainSet);
top_percentile = 1.0
features = cnt.most_common(int(round(len(cnt)*top_percentile)))
featuresCount = len(features)
# print(features);
print(featuresCount)

17


In [1365]:
featureList= [];
for feature in features:
    featureList.append(feature[0])
print(len(featureList));
print(featureList);

17
[u'even', u'made', u'point', u'movie', u'loose', u'scenes', u'first', u'times', u'every', u'take', u'characters', u'time', u'nothing', u'still', u'think', u'film', u'nice']


In [1366]:
# #Buid CSR matrix

from collections import Counter
from scipy.sparse import csr_matrix
def build_matrix(docs):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  
    """
    dim = len(featureList)
    feature_set = set(featureList[:dim])
    nrows = len(docs)
    idx = {}
    tid = 0
    nnz = 0
    for d in docs:
#         print('d---')
#         print(d)
#         set_d = set(d)
        set_d = set()
        for word in d:
#             print(word)
            set_d.add(word);
        print('set d--')
        print(set_d);
        print('features--')
        print(feature_set);
        d = list(set.intersection(feature_set,set_d))
        print('common elemets %d',len(d));
        print('com')
        nnz += len(set(d))
        for w in d:
            print('adding word ', w) ;
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
    print(idx);
    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)
    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # transfer values
    for d in docs:
        set_d = set(d)
        d = list(set.intersection(feature_set,set_d))
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1
            
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    
    return mat


def csr_info(mat, name="", non_empy=False):
    r""" Print out info about this CSR matrix. If non_empy, 
    report number of non-empty rows and cols as well
    """
    if non_empy:
        print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
                name, mat.shape[0], 
                sum(1 if mat.indptr[i+1] > mat.indptr[i] else 0 
                for i in range(mat.shape[0])), 
                mat.shape[1], len(np.unique(mat.indices)), 
                len(mat.data)))
    else:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, 
                mat.shape[0], mat.shape[1], len(mat.data)) )

In [1367]:
combinedData = np.concatenate((trainingData['review'], testData['review']))
print(combinedData);
docsCombineData = [word_tokenize(l) for l in combinedData]
matCombineData  = build_matrix(docsCombineData);
csr_info(matCombineData);

# docsTestData = [word_tokenize(l) for l in testData['review']]
# matTestData  = build_matrix(docsTestData)
# csr_info(matTestData)

[ u"Although a film with Bruce Willis is always worth watching, you better skip this one. I watched this one on television, so I didn't have to plunk down cash for it. Lucky me.<br /><br />The plot develops slowly, very slowly. Although the first 30 minutes or so are quite believable, it gets more and more unbelievable towards the end. It is highly questionable, if a seasoned soldier like Lt. Waters would disobey direct orders. And even if he would, if the rest of his platoon would. They know he puts them in direct danger, and they know they will certainly die if they follow him, but what the heck, he is our Lt. so let's do what he says (despite the direct orders, remember).<br /><br />Still, there are some nice scenes in this movie. They somewhat save a village, where the total population is being massacred by the rebels. Well, they save a dozen villagers or so, the rest was already killed. The strange part of it, that they did take the trucks which the rebels left behind. They rather

In [1368]:
scaledData = csr_idf(matCombineData, copy=True)
normalizedData = csr_l2normalize(scaledData, copy=True)
# print("csr matrix:", matTrainData.todense(), "\n")
# print("scaled csr matrix:", scaledTrainData.todense(), "\n")
# print("normalized csr matrix:", normalizedTrainData.todense())

In [1369]:
# scaledTestData = csr_idf(matTestData, copy=True)
# normalizedTestData = csr_l2normalize(scaledTestData, copy=True)
# print("csr matrix:", matTestData.todense(), "\n")
# print("scaled csr matrix:", scaledTestData.todense(), "\n")
# print("normalized csr matrix:", normalizedTestData.todense())

In [1370]:
# Calculate consine similarity between training and test normalized data
from sklearn.metrics.pairwise import cosine_similarity
def calculate_cosine_sim(train,test):
    cosineSimilarity = cosine_similarity(train,test)
    return cosineSimilarity
csr_info(normalizedData)
train = normalizedData[0:(len(trainingData))];
csr_info(train)
print(normalizedData[0:1]);
print('----')
print(normalizedData[1:2]);
print('----')
print(normalizedData[2:3]);
print('----')
print(normalizedData[3:4]);
print('----')
print(len(trainingData));
print(len(testData));
test = normalizedData[len(trainingData):len(trainingData)+len(testData)];
csr_info(test)
cosineSimilarity = calculate_cosine_sim(train, test);
print(len(cosineSimilarity));

 [nrows 4, ncols 17, nnz 40]
 [nrows 2, ncols 17, nnz 20]
  (0, 0)	0.364740858496
  (0, 1)	0.151381133795
  (0, 2)	0.364740858496
  (0, 3)	0.0
  (0, 4)	0.151381133795
  (0, 5)	0.364740858496
  (0, 6)	0.364740858496
  (0, 7)	0.151381133795
  (0, 8)	0.364740858496
  (0, 9)	0.364740858496
  (0, 10)	0.0
  (0, 11)	0.364740858496
----
  (0, 3)	0.0
  (0, 4)	0.199120889896
  (0, 7)	0.199120889896
  (0, 10)	0.0
  (0, 12)	0.479766021727
  (0, 13)	0.479766021727
  (0, 14)	0.479766021727
  (0, 15)	0.479766021727
----
  (0, 1)	0.164773822763
  (0, 2)	0.397009482394
  (0, 3)	0.0
  (0, 5)	0.397009482394
  (0, 6)	0.397009482394
  (0, 7)	0.164773822763
  (0, 10)	0.0
  (0, 11)	0.397009482394
  (0, 12)	0.397009482394
  (0, 13)	0.397009482394
----
  (0, 0)	0.327130984845
  (0, 1)	0.135771625887
  (0, 3)	0.0
  (0, 4)	0.135771625887
  (0, 8)	0.327130984845
  (0, 9)	0.327130984845
  (0, 10)	0.0
  (0, 14)	0.327130984845
  (0, 15)	0.327130984845
  (0, 16)	0.654261969689
----
2
2
 [nrows 2, ncols 17, nnz 20]
2


In [1371]:
# Find K nearest neighbors and write to test file
f = open('data/test_out.dat.txt', 'w')
count = 0
for row in cosineSimilarity:
    k=1
    kLargestSimilarities = np.argpartition(-row, k)
    print(kLargestSimilarities)
    neighbors = kLargestSimilarities[:k]
    print(neighbors)
    neighbourReviewClassList = []
    neighbourReviewClassNegative = 0
    neighbourReviewClassPositive = 0

    for review in neighbors:
        print('review' , trainingData['rating'][review])
        if int(trainingData['rating'][review]) == -1:
            neighbourReviewClassNegative+=1
        elif int(trainingData['rating'][review]) == 1:
            neighbourReviewClassPositive+=1
    print(neighbourReviewClassNegative)
    print(neighbourReviewClassPositive)
    if neighbourReviewClassNegative > neighbourReviewClassPositive:
        f.write('-1\n')
        count+=1
    else:
        f.write('+1\n')
        count+=1

print("count : ",count)

[0 1]
[0]
('review', -1)
1
0
[0 1]
[0]
('review', -1)
1
0
('count : ', 2)


In [1372]:
with open("data/test_out.dat.txt", "r") as fh:
    linesOfFormat = fh.readlines()
print(len(linesOfFormat))

0
