In [71]:
import numpy as np
import pandas as pd
from scipy import sparse

from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

In [8]:
# Read the training set and one-hot encode the `stars` column into
# `stars_<value>` indicator columns (`stars_5` is used as the target below).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
train = pd.get_dummies(
    pd.read_csv('/home/adam/R/Yelp/dataset/model_train.csv'),
    columns=['stars'],
)

In [109]:
# Binary target: the `stars_5` indicator column, restricted to the first 5000 rows.
# NOTE(review): the input `x` passed to cross_val_score further down is not
# defined in any cell visible here; it needs the same 5000-row slice as `y`.
y = train['stars_5'].iloc[:5000].values

In [43]:
# Sanity check: display the shape of the label vector built from the 5000-row slice.
y.shape

(5000,)

In [44]:
# Shared experiment settings, reused by the pipelines and cross-validation calls.
max_features = 2500   # vocabulary cap handed to each TfidfVectorizer
n_jobs = -1           # parallelism setting forwarded to cross_val_score
cv = 3                # cross-validation setting forwarded to cross_val_score
scoring = 'roc_auc'   # metric name forwarded to cross_val_score

In [110]:
# TF-IDF vectorizer limited to `max_features` terms.
tfidf = TfidfVectorizer(max_features=max_features)

In [111]:
# Classifier with scikit-learn's default hyperparameters (no arguments passed).
lr = LogisticRegression()

In [112]:
# Baseline model: TF-IDF features fed straight into logistic regression.
p = Pipeline(steps=[('tfidf', tfidf), ('lr', lr)])

In [113]:
# Cross-validate the baseline pipeline with the shared `scoring` / `cv` settings.
cross_val_score(p, x, y, scoring=scoring, cv=cv)

array([0.85819854, 0.87763112, 0.88590467])

In [67]:
class NBFeaturer(BaseEstimator):
    """Rescale each feature by a smoothed log-ratio between class 1 and class 0.

    ``fit`` computes, for every column of ``x``::

        r = log(pr(x, 1, y) / pr(x, 0, y))

    where ``pr(x, c, y)`` is
    ``(column sum over rows labelled c + alpha) / (number of rows labelled c + alpha)``.
    ``transform`` multiplies its input element-wise by ``r``.

    Parameters
    ----------
    alpha : number
        Smoothing constant added to both the numerator and the denominator
        of ``pr``.

    Notes
    -----
    Labels are compared against the literals ``1`` and ``0``, so a binary
    0/1 target is expected.  ``x`` must support boolean row indexing,
    ``.sum(0)`` and ``.multiply`` — presumably a SciPy sparse matrix such as
    the output of ``TfidfVectorizer``; confirm before feeding dense arrays.
    """

    def __init__(self, alpha):
        self.alpha = alpha

    def preprocess_x(self, x, r):
        """Return ``x`` multiplied element-wise by the ratio row ``r``."""
        return x.multiply(r)

    def pr(self, x, y_i, y):
        """Return the smoothed per-column sum of ``x`` over rows where ``y == y_i``.

        ``y`` is coerced to a NumPy array so that lists and pandas Series
        produce an element-wise mask instead of a single boolean.
        """
        mask = np.asarray(y) == y_i
        p = x[mask].sum(0)
        return (p + self.alpha) / (mask.sum() + self.alpha)

    def fit(self, x, y=None):
        """Learn the per-column log-ratio from ``x`` and binary labels ``y``.

        ``y`` keeps its ``None`` default only for signature compatibility with
        the scikit-learn transformer convention; the ratio cannot be computed
        without labels.

        Raises
        ------
        ValueError
            If ``y`` is ``None``.
        """
        if y is None:
            raise ValueError("NBFeaturer.fit requires the binary labels y")
        y = np.asarray(y)
        # Stored sparse so that `x.multiply` receives a sparse 1-row operand.
        self._r = sparse.csr_matrix(np.log(self.pr(x, 1, y) / self.pr(x, 0, y)))
        return self

    def transform(self, x):
        """Return ``x`` rescaled by the ratio learned in ``fit``."""
        return self.preprocess_x(x, self._r)

In [114]:
# NB-weighted variant: TF-IDF -> NBFeaturer -> logistic regression.
# The vectorizer is bound to `tfidf` (the name the pipeline actually uses) so
# this cell builds its own fresh components instead of silently reusing the
# `tfidf` object left over from an earlier cell.
tfidf = TfidfVectorizer(max_features=max_features)
lr = LogisticRegression()
nb = NBFeaturer(1)
p = Pipeline([
    ('tfidf', tfidf),
    ('nb', nb),
    ('lr', lr)
])

In [115]:
# Cross-validate the TF-IDF + NB-feature pipeline with the shared settings.
cross_val_score(p, x, y, scoring=scoring, cv=cv, n_jobs=n_jobs)

array([0.86229373, 0.88329142, 0.88749547])

In [73]:
class Lemmatizer(BaseEstimator):
    """Text transformer that lower-cases and lemmatizes every token.

    Each document is split on whitespace; every token is lower-cased, passed
    through ``WordNetLemmatizer.lemmatize`` and the results are re-joined
    with single spaces.
    """

    def __init__(self):
        self.l = WordNetLemmatizer()

    def fit(self, x, y=None):
        """Learn nothing; return ``self`` so the object can sit in a pipeline."""
        return self

    def transform(self, x):
        """Return a NumPy array with the lemmatized version of each document in ``x``."""
        lemmatize = self.l.lemmatize
        cleaned = [
            " ".join(lemmatize(token.lower()) for token in doc.split())
            for doc in x
        ]
        return np.array(cleaned)

In [116]:
# Same NB-weighted model, with the input lemmatized before TF-IDF.
lm = Lemmatizer()
tfidf = TfidfVectorizer(max_features=max_features)
nb = NBFeaturer(1)
lr = LogisticRegression()
p = Pipeline(steps=[
    ('lm', lm),
    ('tfidf', tfidf),
    ('nb', nb),
    ('lr', lr),
])

In [117]:
# Cross-validate the lemmatized pipeline with the shared settings.
cross_val_score(p, x, y, scoring=scoring, cv=cv, n_jobs=n_jobs)

array([0.86216858, 0.88286772, 0.88619597])

In [118]:
# Variant with two TF-IDF views: lemmatize, then concatenate word-level and
# character-level TF-IDF features (each capped at `max_features`) before the
# NB weighting and logistic regression.
max_features = 2500

lm = Lemmatizer()
tfidf_w = TfidfVectorizer(analyzer='word', max_features=max_features)
tfidf_c = TfidfVectorizer(analyzer='char', max_features=max_features)
nb = NBFeaturer(1)
lr = LogisticRegression()

p = Pipeline(steps=[
    ('lm', lm),
    ('wc_tfidfs', FeatureUnion(transformer_list=[
        ('tfidf_w', tfidf_w),
        ('tfidf_c', tfidf_c),
    ])),
    ('nb', nb),
    ('lr', lr),
])

cross_val_score(p, x, y, scoring=scoring, cv=cv, n_jobs=n_jobs)

array([0.86745947, 0.88320999, 0.88887044])