### Standard imports

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score

%matplotlib inline

In [2]:
# Read the review data set from the CSV file next to the notebook.
review_all = pd.read_csv(filepath_or_buffer="sampledata_clean.csv")

In [3]:
# Shuffle every row with a fixed seed, then cut the shuffled frame 80/20.
n_examples = len(review_all.index)
rev_samp = review_all.sample(n=n_examples, random_state=43)

# Number of leading (shuffled) rows that go to the training split.
x = int(n_examples * 0.8)

# Positional slices: the first x shuffled rows train, the remainder test.
train, test = rev_samp.iloc[:x], rev_samp.iloc[x:]

In [4]:
# Sanity check: row/column counts of the two splits.
(train.shape, test.shape)

((25950, 4), (6488, 4))

In [5]:
# Keep only the review text and its star rating.
train = train.loc[:, ['text', 'review_stars']]

In [6]:
# Expand the star rating into one indicator column per rating value.
train = pd.get_dummies(data=train, columns=['review_stars'])
train.head(n=5)

Unnamed: 0,text,review_stars_1,review_stars_2,review_stars_3,review_stars_4,review_stars_5
13115,young persian woman cash register lot attitude...,1,0,0,0,0
6525,kinds dietary restrictions accommodated perfec...,0,0,0,0,1
12053,fan unlimited salad place guess ok jack prices...,1,0,0,0,0
15745,best chinese food great atmosphere service imp...,0,0,0,0,1
31220,went talk seriously disappointed ll start blue...,0,1,0,0,0


In [7]:
# Apply the same column selection and one-hot encoding to the test split.
test = pd.get_dummies(
    data=test.loc[:, ['text', 'review_stars']],
    columns=['review_stars'],
)
(train.shape, test.shape)

((25950, 6), (6488, 6))

In [8]:
# Draw a seeded 10% subsample of each split.
train_samp, test_samp = (
    frame.sample(frac=0.1, random_state=43) for frame in (train, test)
)
(train_samp.shape, test_samp.shape)

((2595, 6), (649, 6))

### Naive Bayes Linear Model

In [9]:
# Value handed to the vectorizer's ``max_features`` setting.
max_features = 2_000
tfidf = TfidfVectorizer(max_features=max_features)

In [10]:
class Naive_Bayes(BaseEstimator):
    """Pipeline step that rescales features by a Naive Bayes log-ratio.

    ``fit`` computes, for every feature column, the log of the ratio between
    the smoothed feature sum over rows labelled 1 and the smoothed feature sum
    over rows labelled 0. ``transform`` multiplies each column of its input by
    that ratio.

    Parameters
    ----------
    alpha : number, default=1
        Smoothing term added to both the per-class feature sums and the
        per-class row counts.

    Notes
    -----
    ``x`` is used through ``x[mask]``, ``.sum(0)`` and ``.multiply`` -- this
    presumably means a scipy sparse matrix (e.g. TF-IDF output); confirm
    before feeding other array types.
    """

    def __init__(self, alpha=1):
        self.alpha = alpha

    def preprocess_x(self, x, r):
        """Return ``x`` multiplied element-wise (with broadcasting) by ``r``."""
        return x.multiply(r)

    def pr(self, x, y_i, y):
        """Return the smoothed per-column sum of the rows of ``x`` labelled ``y_i``.

        The column sums of the selected rows plus ``alpha`` are divided by the
        number of selected rows plus ``alpha``.
        """
        # Coerce to an array so the comparison is element-wise for lists and
        # pandas objects alike (``[0, 1] == 1`` would be the scalar False).
        mask = np.asarray(y) == y_i
        p = x[mask].sum(0)
        return (p + self.alpha) / (mask.sum() + self.alpha)

    def fit(self, x, y=None):
        """Learn the per-feature log-ratio from binary labels ``y``.

        Parameters
        ----------
        x : feature matrix, one row per sample.
        y : array-like of 0/1 labels, one per row of ``x``. Required; the
            ``None`` default only exists for signature compatibility.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is not provided.
        """
        if y is None:
            raise ValueError("Naive_Bayes.fit requires binary labels y (0/1).")
        y = np.asarray(y)
        self._r = sparse.csr_matrix(np.log(self.pr(x, 1, y) / self.pr(x, 0, y)))
        return self

    def transform(self, x):
        """Scale ``x`` by the log-ratio learned in ``fit``."""
        return self.preprocess_x(x, self._r)

In [11]:
# Individual stages: Naive Bayes feature scaling followed by logistic regression.
nb = Naive_Bayes(alpha=1)
lr = LogisticRegression()

# Text -> TF-IDF -> Naive Bayes scaling -> classifier.
p = Pipeline(steps=[
    ('tfidf', tfidf),
    ('nb', nb),
    ('lr', lr),
])

In [12]:
# Indicator columns, one per star rating.
class_names = [
    'review_stars_1',
    'review_stars_2',
    'review_stars_3',
    'review_stars_4',
    'review_stars_5',
]
scores = []

# One row per test sample, one column per entry of class_names.
preds = np.zeros(shape=(test_samp.shape[0], len(class_names)))

In [13]:
# Empty the list in place so that re-running this cell does not append a
# second set of cross-validation scores to it.
scores.clear()

# The review texts are identical for every class, so extract them once.
train_texts = train_samp['text'].values
test_texts = test_samp['text'].values

for i, class_name in enumerate(class_names):
    # Indicator column for the current rating: the target of this binary fit.
    train_target = train_samp[class_name]

    # Mean 3-fold cross-validated accuracy of the pipeline on this target.
    cv_score = np.mean(cross_val_score(estimator=p, X=train_texts,
                                       y=train_target, cv=3, scoring='accuracy'))
    scores.append(cv_score)

    # Refit on the whole training sample and keep the second predict_proba
    # column as this class's score for every test review.
    p.fit(train_texts, train_target)
    preds[:, i] = p.predict_proba(test_texts)[:, 1]

In [14]:
# Compare the arg-max column of the true indicators with the arg-max column
# of the per-class scores; labels in the report are column positions.
t = metrics.classification_report(
    y_true=test_samp[class_names].values.argmax(axis=1),
    y_pred=preds.argmax(axis=1),
)
print(t)

              precision    recall  f1-score   support

           0       0.74      0.61      0.67        80
           1       0.53      0.12      0.20        73
           2       0.32      0.11      0.16        73
           3       0.40      0.34      0.36       163
           4       0.60      0.93      0.73       260

    accuracy                           0.56       649
   macro avg       0.52      0.42      0.43       649
weighted avg       0.53      0.56      0.51       649

