### Load Data

In [1]:
import numpy as np
import pandas as pd
import pickle as pkl
from itertools import chain
from collections import Counter

In [2]:
# Load the train / dev / test splits, indexed by example id.
train = pd.read_csv('../data/train.csv', index_col='ex_id')
val = pd.read_csv('../data/dev.csv', index_col='ex_id')
test = pd.read_csv('../data/test_no_label.csv', index_col='ex_id')


def _load_pickle(path):
    """Unpickle and return the object stored at ``path``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    loading finishes, instead of being left open for the kernel's lifetime.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this on
    token files produced by this project.
    """
    with open(path, "rb") as handle:
        return pkl.load(handle)


# Load tokenized data
train_data_tokens = _load_pickle("../data/tokens/train_data_tokens.pkl")
val_data_tokens = _load_pickle("../data/tokens/val_data_tokens.pkl")
test_data_tokens = _load_pickle("../data/tokens/test_data_tokens.pkl")

In [3]:
# Combine train and val for final
# NOTE(review): the next two assignments rebind their own inputs, so running
# this cell a second time without reloading appends the validation rows and
# tokens again. Re-run the loading cell first if this cell must be repeated.
train = pd.concat([train, val])
train_data_tokens = train_data_tokens + val_data_tokens
# Flatten one level of nesting into a single token list (input to build_vocab).
all_train_tokens = list(chain.from_iterable(train_data_tokens))

In [4]:
# Vocab
def build_vocab(all_tokens, threshold):
    """Build a vocabulary from the tokens that occur often enough.

    Parameters
    ----------
    all_tokens : iterable of hashable
        Flat stream of tokens to count.
    threshold : int
        Minimum number of occurrences for a token to be kept.

    Returns
    -------
    token2id : dict
        Maps each kept token to its position in ``id2token``.
    id2token : list
        Kept tokens, in first-occurrence order of ``all_tokens``.
    """
    # Count the tokens that were passed in (not a notebook-level global), so
    # the function behaves the same for any caller.
    counts = Counter(all_tokens)
    id2token = [word for word, count in counts.items() if count >= threshold]
    token2id = {word: idx for idx, word in enumerate(id2token)}
    return token2id, id2token

# Keep only tokens that occur at least 20 times in the combined train+val tokens.
token2id, id2token = build_vocab(all_train_tokens, 20)

In [5]:
# TFIDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

def dummy(doc):
    """Return ``doc`` unchanged.

    Passed to TfidfVectorizer as both preprocessor and tokenizer so the
    vectorizer consumes its input documents as they are.
    """
    return doc

# The identity function replaces both preprocessing and tokenizing, and the
# vocabulary is fixed to token2id (presumably because the inputs are already
# token lists — confirm against the tokenization step).
tfidf_vec = TfidfVectorizer(lowercase=False, preprocessor=dummy, tokenizer=dummy, vocabulary=token2id)  

# Fit on the combined training tokens, then apply the fitted transform to test.
X_train_tfidf = tfidf_vec.fit_transform(train_data_tokens)
X_test_tfidf = tfidf_vec.transform(test_data_tokens)

In [6]:
# Get labels
# Target vector for the combined train+val frame.
y_train = train.label.values
# NOTE(review): the test frame displayed at the end of the notebook shows an
# empty label column, so y_test presumably holds only missing values; it is
# not used by any later cell shown here.
y_test = test.label.values

### Train Latent Factor Model

##### MyAlgo

In [7]:
from surprise import AlgoBase
from surprise import SVD
from surprise import PredictionImpossible

class MyAlgo(SVD):
    """SVD variant that only scores (user, item) pairs seen during training.

    ``estimate`` raises ``PredictionImpossible`` unless both the user and the
    item are known to the trainset, and ``test`` defaults to ``clip=False``.
    """

    def __init__(self, n_factors=25, n_epochs=20, biased=False,
                 lr_all=.005, reg_all=.1, random_state=None, verbose=False):
        """Forward every hyper-parameter unchanged to ``SVD``."""
        super().__init__(
            n_factors=n_factors,
            n_epochs=n_epochs,
            biased=biased,
            lr_all=lr_all,
            reg_all=reg_all,
            random_state=random_state,
            verbose=verbose,
        )

    def fit(self, trainset):
        """Train with the parent implementation and return ``self``."""
        super().fit(trainset)
        return self

    def estimate(self, u, i):
        """Return the estimate for inner user id ``u`` and inner item id ``i``.

        Raises
        ------
        PredictionImpossible
            If the user or the item is not known to the trainset.
        """
        user_is_known = self.trainset.knows_user(u)
        item_is_known = self.trainset.knows_item(i)

        # Guard clause: no partial (bias-only) estimate is produced.
        if not (user_is_known and item_is_known):
            raise PredictionImpossible('User and item are unknown.')

        if not self.biased:
            return np.dot(self.qi[i], self.pu[u])

        # Biased model: global mean, then user bias, item bias and the
        # factor interaction, accumulated in that order.
        score = self.trainset.global_mean
        score += self.bu[u]
        score += self.bi[i]
        score += np.dot(self.qi[i], self.pu[u])
        return score

    def test(self, testset, clip=False, verbose=False):
        """Predict every ``(uid, iid, rating)`` triple in ``testset``.

        Returns the list of results of ``self.predict`` in input order.
        """
        results = []
        for uid, iid, r_ui_trans in testset:
            results.append(
                self.predict(uid, iid, r_ui_trans, clip=clip, verbose=verbose)
            )
        return results

##### Construct ALS Dataset

In [8]:
from surprise import Dataset, Reader, accuracy

# Training ratings for the latent factor model: rows with label == 0 only.
# SQL equivalent: SELECT user_id, prod_id, rating FROM train WHERE label = 0
train_als = train.loc[train['label'] == 0, ['user_id', 'prod_id', 'rating']]

# Ratings are on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))

# Column order matters: user id, item id, rating.
data = Dataset.load_from_df(
    train_als[['user_id', 'prod_id', 'rating']],
    reader,
)

##### Train Model

In [9]:
# Hyper-parameters for the latent factor model.
rank = 25
reg = 0.1

# biased=True overrides MyAlgo's default of biased=False.
# NOTE(review): random_state=None presumably leaves training unseeded, so
# repeated runs may give different factors — consider fixing a seed.
algo = MyAlgo(n_factors=rank, reg_all=reg, biased=True,
              lr_all=0.005, n_epochs=30, verbose=False, random_state=None)

# Train on every rating in `data` (no hold-out split).
algo.fit(data.build_full_trainset())

# Learned factor matrices, kept for reference:
# user_factors = algo.pu
# prod_factors = algo.qi

<__main__.MyAlgo at 0x127a07b50>

#### Generate rating features & Combine rating features and review features

In [10]:
from scipy.sparse import csr_matrix, coo_matrix, hstack

def combine_features(user_features, rating_features, review_features):
    """Stack the three feature groups side by side into one CSR matrix.

    Column order of the result: user features, rating features, review
    features. ``rating_features`` is converted to a CSR matrix first; the
    other two blocks are passed to ``hstack`` as given.
    """
    rating_block = csr_matrix(rating_features)
    blocks = [user_features, rating_block, review_features]
    return hstack(blocks, format='csr')

def get_rating_features(val, algo):
    """Build per-row rating features from ``algo``'s predictions.

    Parameters
    ----------
    val : DataFrame
        Must provide ``user_id``, ``prod_id`` and ``rating`` columns.
    algo : object
        Must provide ``test(testset)`` returning predictions that expose
        ``est``, ``r_ui`` and ``details['was_impossible']``.

    Returns
    -------
    list of tuple
        One ``(predicted, was_impossible_flag, actual, actual - predicted)``
        tuple per row, in row order.
    """
    triples = list(zip(val['user_id'].values,
                       val['prod_id'].values,
                       val['rating'].values))

    return [
        (
            p.est,
            int(p.details['was_impossible']),
            p.r_ui,
            p.r_ui - p.est,
        )
        for p in algo.test(triples)
    ]

def get_user_features(dataset, labeled=None):
    """Count, for each row's user, how many label == 1 rows that user has.

    Parameters
    ----------
    dataset : DataFrame
        Rows to featurize; must provide a ``user_id`` column.
    labeled : DataFrame, optional
        Frame with ``user_id`` and ``label`` columns used to compute the
        per-user counts. Defaults to the notebook-level ``train`` frame, which
        keeps existing one-argument calls working.

    Returns
    -------
    numpy.ndarray
        Column vector of shape ``(len(dataset), 1)``; users with no
        label == 1 rows in ``labeled`` get 0.

    NOTE(review): when ``dataset`` is the same frame as ``labeled``, each
    row's own label contributes to its user's count, so this feature
    presumably leaks the target on training rows — confirm this is intended.
    """
    if labeled is None:
        labeled = train  # notebook-level training frame

    fake_counts = Counter(labeled.loc[labeled['label'] == 1, 'user_id'])
    # Counter returns 0 for unseen keys, so no membership test is needed.
    counts = [fake_counts[user] for user in dataset['user_id']]
    return np.array(counts).reshape([len(counts), 1])

In [11]:
# Rating features come from the fitted latent factor model.
train_rating_features = get_rating_features(train, algo)
test_rating_features = get_rating_features(test, algo)

# Per-user counts of label == 1 rows (computed from `train` in both cases).
train_users_features = get_user_features(train)
test_users_features = get_user_features(test)

# Final design matrices: user features, rating features, then TF-IDF features.
X_train = combine_features(train_users_features, train_rating_features, X_train_tfidf)
X_test = combine_features(test_users_features, test_rating_features, X_test_tfidf)

### Logistic Regression

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
# C=0.4 and max_iter=1000 are set explicitly; other settings are library defaults.
lr = LogisticRegression(C=0.4, max_iter=1000)
lr.fit(X_train, y_train)

LogisticRegression(C=0.4, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
# Keep only the second probability column (presumably the label == 1 class — confirm via lr.classes_).
pred = lr.predict_proba(X_test)[:, 1]

In [15]:
# Predict
# Adds a `pred` column to the test frame in place.
test['pred'] = pred

In [16]:
# Save result
# Write only the prediction values: no index column and no header row.
test[['pred']].to_csv('predictions_v2.csv', index=False, header=False)

In [17]:
# Display the test frame, which now includes the `pred` column.
test

Unnamed: 0_level_0,user_id,prod_id,rating,label,date,review,pred
ex_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6,929,0,4.0,,2009-08-25,Let me start with a shout-out to everyone who ...,0.997499
9,932,0,5.0,,2014-05-09,Stopped in for lunch today and couldn't believ...,0.998199
14,937,0,4.0,,2014-10-15,"Tiny little place, but very good food. Pastits...",0.002167
22,945,0,5.0,,2014-04-10,Food was delicious and service was great. Good...,0.998846
23,946,0,5.0,,2014-03-29,Awesome hole in the wall place to grab a quick...,0.001729
...,...,...,...,...,...,...,...
358923,3429,349,3.0,,2014-04-08,Meh. I guess this might have been a little ov...,0.001542
358924,161137,349,5.0,,2014-04-05,Thank god we got in when we did. There was but...,0.997644
358936,41730,349,5.0,,2014-03-03,when tasting table sent me an email about this...,0.003661
358952,161146,349,5.0,,2014-02-06,"I'm very spoiled with Pizza. Really, I have tr...",0.998315
