In [59]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
import nltk
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [60]:
trainData = pd.read_csv(filepath_or_buffer='data/train.dat.txt', header=None, sep='\t',names=["rating","review"])[0:2]
testData =  pd.read_csv(filepath_or_buffer='data/test.dat.txt', header=None, sep='\n',names=["review"])[0:2]

trainData.columns = ["rating","review"]
testData.columns = ["review"]


In [61]:
# df["names"] = [BeautifulSoup(text).get_text() for text in df["names"] ]
# df["names"] = df["names"].map(lambda x: re.sub(r'\W+', " ", x))
# df["names"] = df["names"].map(lambda x: re.sub(r'\d+', "", x))
# df["names"] = df["names"].map(lambda x: WordNetLemmatizer().lemmatize(x))
# df["names"] = df["names"].map(lambda x: PorterStemmer().stem(x))
# df["names"] = df["names"].str.lower().str.split(" ")

# stop = stopwords.words('english')
# df["names"] = df["names"].apply(lambda x: [item for item in x if item not in stop])

# df1["test"] = [BeautifulSoup(text).get_text() for text in df1["test"] ]
# df1["test"] = df1["test"].map(lambda x: re.sub(r'\W+', " ", x))
# df1["test"] = df1["test"].map(lambda x: re.sub(r'\d+', "", x))
# df1["test"] = df1["test"].map(lambda x: WordNetLemmatizer().lemmatize(x))
# df1["test"] = df1["test"].map(lambda x: PorterStemmer().stem(x))
# df1["test"] = df1["test"].str.lower().str.split(" ")
# df1["test"] = df1["test"].apply(lambda x: [item for item in x if item not in stop])
# print('done')

In [62]:
def preProcess(data):
    print(data)
    data["review"] = [BeautifulSoup(text).get_text() for text in data["review"] ]
    data["review"] = data["review"].map(lambda x: re.sub(r'\W+', " ", x))
    data["review"] = data["review"].map(lambda x: re.sub(r'\d+', "", x))
    data["review"] = data["review"].map(lambda x: WordNetLemmatizer().lemmatize(x))
    data["review"] = data["review"].map(lambda x: PorterStemmer().stem(x))
    data["review"] = data["review"].str.lower().str.split(" ")
    
    stop_words = stopwords.words('english')
    data["review"] = data["review"].apply(lambda x: [item for item in x if item not in stop_words])
    return data;
    
trainData = preProcess(trainData)
testData = preProcess(testData)
print('pre-processing done')

   rating                                             review
0      -1  Although a film with Bruce Willis is always wo...
1      -1  This movie was slower then Molasses in January...
                                              review
0  I am so glad when i watch in every time the mo...
1  when I first heard about this movie, I noticed...
pre-processing done


In [63]:

def build_matrix(docs):
    r""" Build sparse matrix from a list of documents, 
    each of which is a list of word/terms in the document.  
    """
    idx = {}
    tid = 0
    nnz = 0
    nrows = len(docs)
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)

    # set up memory

    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)

    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # transfer values
    for d in docs:
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1
    
    mat = csr_matrix((val, ind, ptr), shape=(nrows, nnz), dtype=np.double)
    mat.sort_indices()
    return mat

combinedMat = trainData["review"].append(testData["review"]);
combinedCSRMat = build_matrix(combinedMat);
trainMat = combinedMat[0:len(trainData)];
testMat = combinedMat[len(trainData):len(trainData)+len(testData)]


def csr_info(mat, name="", non_empy=False):
    r""" Print out info about this CSR matrix. If non_empy, 
    report number of non-empty rows and cols as well
    """
    if non_empy:
        print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
                name, mat.shape[0], 
                sum(1 if mat.indptr[i+1] > mat.indptr[i] else 0 
                for i in range(mat.shape[0])), 
                mat.shape[1], len(np.unique(mat.indices)), 
                len(mat.data)))
    else:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, 
                mat.shape[0], mat.shape[1], len(mat.data)) )

In [64]:
def csr_idf(mat, copy=False, **kargs):
    r""" Scale a CSR matrix by idf. 
    Returns scaling factors as dict. If copy is True, 
    returns scaled matrix and scaling factors.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # document frequency
    df = defaultdict(int)
    for i in ind:
        df[i] += 1
    # inverse document frequency
    for k,v in df.items():
        df[k] = np.log(nrows / float(v))  ## df turns to idf - reusing memory
    # scale by idf
    for i in range(0, nnz):
        val[i] *= df[ind[i]]
        
    return df if copy is False else mat

def csr_l2normalize(mat,copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm. 
    If copy is True, returns a copy of the normalized matrix.
    """
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # normalize
    for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum
    return csr_matrix((val, ind, ptr), shape=(nrows, nnz), dtype=np.double)

scaledTrain = csr_idf(trainMat, copy=True)
scaledTest = csr_idf(testMat, copy=True)

normTrain = csr_l2normalize(scaledTrain, copy=True)
normTest = csr_l2normalize(scaledTest, copy=True)

csr_info(normTrain)
csr_info(normTest)

AttributeError: 'Series' object has no attribute 'nnz'

In [None]:
def classify(x, train, clstr, k=17):
        r""" Classify vector x using kNN and majority vote rule given training data and associated classes
        """
        # find nearest neighbors for x
        dots = x.dot(train.T)
        sims = list(zip(dots.indices, dots.data))
        if len(sims) == 0:
            # could not find any neighbors
            return '+' if np.random.rand() > 0.5 else '-'
        sims.sort(key=lambda x: x[1], reverse=True)
        tc = Counter(clstr[s[0]] for s in sims[:k]).most_common(k)
        if len(tc) < 2 or tc[0][1] > tc[1][1]:
            # majority vote
            return tc[0][0]
        # tie break
        tc = defaultdict(float)
        for s in sims[:k]:
            tc[clstr[s[0]]] += s[1]
        return sorted(tc.items(), key=lambda x: x[1], reverse=True)[0][0]

In [None]:
try: 
    outFile = open("data/output.dat", 'w')
    count=0
    l = [ classify(normTest[i], normTrain, trainData["review"]) for i in range(normTest.shape[0]) ]
    for i in range(len(l)):
        if str(l[i])=="+1":
            outFile.write("+1\n")
        elif str(l[i])=="-1":
            outFile.write("-1\n")
    outFile.close();
except IOError as (errno, strerror):
     print "I/O error({0}): {1}".format(errno, strerror)
finally:
    outFile.close()