In [348]:
import numpy as np
import pandas as pd
import scipy as sp
from scipy import spatial

import matplotlib.pyplot as plt # side-stepping mpl backend

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import pairwise
from sklearn.feature_selection import SelectPercentile, f_classif
# from sklearn import cross_validation
import heapq
import string
import re

from nltk.corpus import stopwords
from nltk import word_tokenize
from collections import defaultdict
from collections import Counter

# Load the tab-separated review files (they have no header row, so column
# names are supplied here). Only the first five rows of each file are kept.
trainingData = pd.read_csv(
    'data/train.dat.txt',
    sep="\t",
    encoding='utf-8',
    header=None,
    names=["rating", "review"],
).iloc[:5]
testData = pd.read_csv(
    'data/test.dat.txt',
    sep="\t",
    encoding='utf-8',
    header=None,
    names=["review"],
).iloc[:5]


In [349]:
# 
# Show the first training row exactly as it was loaded, prior to any cleaning.
print('Before Cleaning')
print(trainingData[0:1])

def preProcess(reviews):
    """Tokenize each review and drop uninformative tokens.

    For every review the text is split with ``word_tokenize``; a token is
    discarded when its lower-cased form is an English stop word or a single
    punctuation character, or when the token is three characters or shorter.
    Surviving tokens are stored lower-cased.

    Parameters
    ----------
    reviews : iterable of str
        The review texts to clean.

    Returns
    -------
    list of list of str
        One list of cleaned tokens per input review, in input order.
    """
    # Build the lookup sets once per call: previously the stop-word list was
    # re-read and the punctuation set re-created for every single token, and
    # the stop-word test was a linear scan of a list.
    stopWordSet = set(stopwords.words('english'))
    punctuationSet = set(string.punctuation)

    processedReviews = []
    for review in reviews:
        filteredTokens = []
        for token in word_tokenize(review):
            lowered = token.lower()
            if lowered in stopWordSet or lowered in punctuationSet:
                continue
            # The length test is applied to the token as tokenized (before
            # lower-casing), matching the original filter.
            if len(token) <= 3:
                continue
            filteredTokens.append(lowered)
        processedReviews.append(filteredTokens)
    return processedReviews
# Clean the training reviews and show the resulting tokens and review count,
# then clean the test reviews and show their count.
print('After Cleaning')
XTrain = preProcess(trainingData['review'])
print(XTrain)
print(len(XTrain))
XTest = preProcess(testData['review'])
print(len(XTest))

Before Cleaning
   rating                                             review
0      -1  Although a film with Bruce Willis is always wo...
After Cleaning
[[u'although', u'film', u'bruce', u'willis', u'always', u'worth', u'watching', u'better', u'skip', u'watched', u'television', u'plunk', u'cash', u'lucky', u'plot', u'develops', u'slowly', u'slowly', u'although', u'first', u'minutes', u'quite', u'believable', u'gets', u'unbelievable', u'towards', u'highly', u'questionable', u'seasoned', u'soldier', u'like', u'waters', u'would', u'disobey', u'direct', u'orders', u'even', u'would', u'rest', u'platoon', u'would', u'know', u'puts', u'direct', u'danger', u'know', u'certainly', u'follow', u'heck', u'says', u'despite', u'direct', u'orders', u'remember', u'still', u'nice', u'scenes', u'movie', u'somewhat', u'save', u'village', u'total', u'population', u'massacred', u'rebels', u'well', u'save', u'dozen', u'villagers', u'rest', u'already', u'killed', u'strange', u'part', u'take', u'trucks', u'reb

In [350]:
# count frequencies for all words in the Training data
def countFrequency(data):
    """Tally how often each word occurs across all documents in ``data``.

    ``data`` is iterated as a sequence of documents, and each document is
    iterated word by word. The number of distinct words is printed, and the
    tallies are returned as a ``collections.Counter`` keyed by word.
    """
    tally = Counter(word for document in data for word in document)
    print("Number of unique words: %d." % len(tally))
    return tally

# Word tallies for the cleaned training and test reviews; countFrequency
# prints the vocabulary size itself, and the sizes are echoed again below.
wordCountInTrainingData = countFrequency(XTrain)
wordCountInTestData = countFrequency(XTest)
print(len(wordCountInTrainingData))
print(len(wordCountInTestData))

Number of unique words: 375.
Number of unique words: 277.
375
277


In [351]:
# Fraction of each corpus' distinct words to keep, most frequent first.
topPct = 0.5
topPctOfTraining = wordCountInTrainingData.most_common(
    int(round(len(wordCountInTrainingData) * topPct)))
topPctOfTest = wordCountInTestData.most_common(
    int(round(len(wordCountInTestData) * topPct)))

# most_common() yields (word, count) pairs; only the words are kept.
topPctOfTrainingSet = {pair[0] for pair in topPctOfTraining}
topPctOfTestSet = {pair[0] for pair in topPctOfTest}

print(len(topPctOfTrainingSet))
print(len(topPctOfTestSet))

188
139


In [352]:
# Words that are among the most frequent in BOTH the training and test sets.
testTrainSet = topPctOfTrainingSet & topPctOfTestSet
print("Number of unique words common in both test and train: %d." % len(testTrainSet))

Number of unique words common in both test and train: 21.


In [353]:
# Wrap the shared words in a Counter (each word gets a count of 1 because the
# input is a set) and keep the leading `top_percentile` fraction of them as
# (word, count) pairs.
cnt = Counter(testTrainSet)
top_percentile = 1.0
features = cnt.most_common(
    int(round(len(cnt) * top_percentile))
)
featuresCount = len(features)
print(featuresCount)

21


In [354]:
# Strip the counts from the (word, count) pairs, leaving just the feature words.
featureList = [pair[0] for pair in features]
print(len(featureList))
print(featureList)

21
[u'enjoy', u'even', u'great', u'made', u'like', u'supporting', u'brilliant', u'movie', u'never', u'watch', u'actors', u'worth', u'role', u'know', u'still', u'time', u'well', u'every', u'work', u'think', u'film']


In [359]:
# Build CSR matrix

from collections import Counter
from scipy.sparse import csr_matrix
def build_matrix(docs):
    r""" Build a sparse term-presence matrix from a list of documents,
    each of which is a list of word/terms in the document.

    Only terms contained in the module-level ``featureList`` are kept. Every
    kept term contributes a single 1.0 to its document's row, regardless of
    how often it occurs in that document.

    Column ids are handed out in order of first appearance while scanning
    ``docs``, so the layout is reproducible for a given input.
    NOTE(review): because the columns are derived from ``docs`` itself, two
    calls on different document lists (e.g. train vs. test) are not
    guaranteed to share the same column order or column count.

    Parameters
    ----------
    docs : iterable of iterable of str
        The tokenized documents, one row per document.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of dtype double with one row per document and one column per
        distinct feature term found in ``docs``; column indices are sorted
        within each row.
    """
    feature_set = set(featureList)
    idx = {}        # term -> column id, assigned on first appearance
    indices = []    # column ids of all stored entries, row after row
    indptr = [0]    # indptr[i]:indptr[i+1] delimits row i inside `indices`

    for d in docs:
        seen = set()
        cols = []
        for w in d:
            if w in feature_set and w not in seen:
                seen.add(w)
                if w not in idx:
                    idx[w] = len(idx)
                cols.append(idx[w])
        indices.extend(sorted(cols))
        indptr.append(len(indices))

    nrows = len(indptr) - 1
    ncols = len(idx)

    # `np.int` was removed in NumPy 1.24; the builtin `int` is the dtype it
    # used to alias.
    ind = np.asarray(indices, dtype=int)
    val = np.ones(len(indices), dtype=np.double)
    ptr = np.asarray(indptr, dtype=int)

    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()

    return mat


def csr_info(mat, name="", non_empy=False):
    r""" Print a one-line summary of a CSR matrix.

    The line always shows ``name``, the row and column counts, and the number
    of stored values. When ``non_empy`` is true it additionally shows how
    many rows hold at least one stored entry and how many distinct column
    indices are in use.
    """
    nrows, ncols = mat.shape[0], mat.shape[1]
    nnz = len(mat.data)

    if not non_empy:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, nrows, ncols, nnz) )
        return

    # A row is non-empty when its slice of the index pointer has positive width.
    rowsInUse = sum(1 for i in range(nrows) if mat.indptr[i+1] > mat.indptr[i])
    colsInUse = len(np.unique(mat.indices))
    print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
            name, nrows, rowsInUse, ncols, colsInUse, nnz))

In [360]:
# Whitespace-split the training reviews and build their sparse matrix.
# NOTE(review): this tokenizes the raw review text rather than reusing the
# cleaned XTrain / XTest tokens produced by preProcess — confirm that is
# intended.
docsTrainData = [review.split() for review in trainingData['review']]
matTrainData = build_matrix(docsTrainData)
csr_info(matTrainData)

print(matTrainData[:1])

# Same steps for the test reviews.
print(testData[:1])
docsTestData = [review.split() for review in testData['review']]
matTestData = build_matrix(docsTestData)
csr_info(matTestData)
print(matTestData[:1])

 [nrows 5, ncols 21, nnz 34]
  (0, 0)	1.0
  (0, 1)	1.0
  (0, 2)	1.0
  (0, 3)	1.0
  (0, 4)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 7)	1.0
