In [76]:
import numpy as np
import pandas as pd
from scipy import sparse

from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn import metrics
from sklearn.metrics import roc_auc_score

%matplotlib inline

In [151]:
train = pd.read_csv('/home/adam/R/Yelp/dataset/model_train.csv', usecols=['text', 'stars'])
train = pd.get_dummies(train, columns=['stars'])
test = pd.read_csv('/home/adam/R/Yelp/dataset/model_test.csv', usecols=['text', 'stars'])
test = pd.get_dummies(test, columns = ['stars'])

In [78]:
train.shape, test.shape

((28000, 6), (7000, 6))

In [140]:
train = train.sample(frac = .50)
test = test.sample(frac = .50)

In [152]:
scoring = 'roc_auc'
cv = 3
n_jobs = -1
max_features = 2500

In [153]:
class NBFeaturer(BaseEstimator):
    def __init__(self, alpha):
        self.alpha = alpha
        
    def preprocess_x(self, x, r):
        return x.multiply(r)
    
    
    def pr(self, x, y_i, y):
        p = x[y == y_i].sum(0)
        return (p + self.alpha)/((y==y_i).sum()+self.alpha)
    
    def fit(self, x, y = None):
        self._r = sparse.csr_matrix(np.log(self.pr(x, 1, y) /self.pr(x, 0, y)))
        return self
    
    def transform(self, x):
        x_nb = self.preprocess_x(x, self._r)
        return x_nb

In [154]:
tfidf = TfidfVectorizer(max_features = max_features)
lr = LogisticRegression()
nb = NBFeaturer(1)
p = Pipeline([
    ('tfidf', tfidf),
    ('nb', nb),
    ('lr', lr)
])

In [155]:
class_names = ['stars_1', 'stars_2', 'stars_3', 'stars_4', 'stars_5']

In [156]:
scores = []
preds = np.zeros((len(test), len(class_names)))

In [157]:
for i, class_name in enumerate(class_names):
    train_target = train[class_name].values
    cv_score = np.mean(cross_val_score(estimator=p, X = train['text'].values,
                                       y = train_target,
                                      cv = cv, scoring = scoring, n_jobs = n_jobs))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))
    p.fit(train['text'].values, train_target)
    preds[:,i] = p.predict_proba(test['text'].values)[:,1]

CV score for class stars_1 is 0.964846774635839
CV score for class stars_2 is 0.8856696298354785
CV score for class stars_3 is 0.8353248280978337
CV score for class stars_4 is 0.7625265896038552
CV score for class stars_5 is 0.8974893679390711


In [158]:
np.mean(scores)

0.8691714380224156

In [86]:
import re, string

In [87]:
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ',s).split()

In [148]:
tfidf = TfidfVectorizer(max_features = max_features, ngram_range = (1,2))
lr = LogisticRegression()
nb = NBFeaturer(1)
p = Pipeline([
    ('tfidf', tfidf),
    ('nb', nb),
    ('lr', lr)
])

In [149]:
for i, class_name in enumerate(class_names):
    train_target = train[class_name].values
    cv_score = np.mean(cross_val_score(estimator=p, X = train['text'].values,
                                       y = train_target,
                                      cv = cv, scoring = scoring, n_jobs = n_jobs))
    scores.append(cv_score)
    print('CV score for class {} is {}'.format(class_name, cv_score))
    p.fit(train['text'].values, train_target)
    preds[:,i] = p.predict_proba(test['text'].values)[:,1]

CV score for class stars_1 is 0.9013675213675213
CV score for class stars_2 is 0.669199113643558
CV score for class stars_3 is 0.6673333333333332
CV score for class stars_4 is 0.61002886002886
CV score for class stars_5 is 0.7358231437178805


In [150]:
np.mean(scores)

0.7231778336673074