In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics

In [2]:
# Directory holding the train/test CSV splits. Kept in a single constant so the
# notebook can be pointed at another location by editing one line.
DATA_DIR = '/home/adam/R/Yelp/dataset'
# Only the review text and its star rating are read from each file.
USE_COLS = ['text', 'stars']

train = pd.read_csv(f'{DATA_DIR}/model_train.csv', usecols=USE_COLS)
test = pd.read_csv(f'{DATA_DIR}/model_test.csv', usecols=USE_COLS)

In [3]:
# One-hot encode the star rating into one indicator column per rating value.
train = pd.get_dummies(data=train, columns=['stars'])

In [4]:
# Preview the first rows to confirm the indicator columns were created.
train.head(n=5)

Unnamed: 0,text,stars_1,stars_2,stars_3,stars_4,stars_5
0,I'm always looking for tasty vegetarian restau...,0,0,0,1,0
1,Ribs were very dry and overlooked not fall of ...,0,1,0,0,0
2,Love the lunch specials - have tried the mac&c...,0,0,0,1,0
3,"I am not a meat person, so not too familiar wi...",1,0,0,0,0
4,Not sure what people are raving about. The lin...,0,0,1,0,0


In [5]:
# Apply the same one-hot encoding of the star rating to the test split.
test = pd.get_dummies(data=test, columns=['stars'])

In [6]:
# Show the full text of one training review (row label 10923).
train.loc[10923, 'text']

'The appetizer was bruschetta.  Essentially three pieces of toast with various toppings.  Two of the three were good, but the brocolinni topped one was not my taste.  Certainly, this was NOT worth the $12 price.  Our dinner was pizza.  The crust was very good - almost like a soft, warm, flat breadstick.  Sadly, the pizza was sparsely topped with cheese and sausage.\n\nOur server was attentive, almost to a fault.  The restaurant was excessively loud - we couldn\'t hear the waitress rattle off the soup, appetizer or entree specials (seven specials in total).\n\nWhile the food was not bad, I can\'t justify the prices or quantities.  In fact, I really didn\'t pull up Yelp to rate this restaurant...I\'m actually looking for a place to go eat since I\'m still famished after "eating" here.'

In [7]:
# Character length of every training review, followed by summary statistics
# in the order (mean, std, max, min).
lens = train['text'].str.len()
(
    lens.mean(),
    lens.std(),
    lens.max(),
    lens.min(),
)

(584.1313142857143, 550.9753826938884, 5000, 2)

In [8]:
# Distribution of review lengths; the trailing semicolon hides the Axes repr.
lens.hist(bins=10);

In [9]:
import re, string

In [10]:
# Matches any single punctuation character (ASCII punctuation plus a few
# typographic symbols) and captures it so it can be split off as its own token.
# string.punctuation is run through re.escape because it contains characters
# that are special inside a character class (`\`, `]`, `^`, `-`). Left
# unescaped, the `\` is consumed as an escape for the following `]`, so a
# literal backslash would never be matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

In [11]:
def tokenize(s):
    """Split `s` into tokens, isolating each character matched by `re_tok`.

    Every match of `re_tok` is padded with a space on both sides, then the
    string is split on whitespace, so each match becomes a separate token.
    """
    spaced = re_tok.sub(r' \1 ', s)
    return spaced.split()

In [12]:
# Number of training rows (not used by the vectorizer below).
n = train.shape[0]
# TF-IDF over unigrams and bigrams using the custom punctuation-aware tokenizer.
# Terms must appear in at least 3 documents and in at most 90% of documents.
# The flags are real booleans: scikit-learn validates parameter types, and
# newer releases reject the int 1 where a bool is expected.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

In [13]:
# Learn the vocabulary and IDF weights on the training text and vectorize it.
trn_term_doc = vec.fit_transform(train.text)

In [14]:
# Vectorize the test text with the vocabulary fitted on the training split.
test_term_doc = vec.transform(test.text)

In [15]:
# Display both term-document matrices to compare their shapes and sparsity.
(trn_term_doc, test_term_doc)

(<280000x645701 sparse matrix of type '<class 'numpy.float64'>'
 	with 52180327 stored elements in Compressed Sparse Row format>,
 <70000x645701 sparse matrix of type '<class 'numpy.float64'>'
 	with 12982675 stored elements in Compressed Sparse Row format>)

In [57]:
# Shape of the training term-document matrix.
trn_term_doc.shape

(280000, 645701)

In [16]:
def pr(y_i, y, features=None):
    """Smoothed per-feature sum for the rows whose label equals `y_i`.

    Parameters
    ----------
    y_i : label value to select (the notebook calls this with 0 or 1).
    y : array of labels, one per row of the feature matrix.
    features : optional feature matrix. When omitted, the notebook-level
        training matrix `x` is used, as in the original code. Passing it
        explicitly avoids relying on a global defined in a later cell.

    Returns
    -------
    (column sums over the selected rows + 1) / (number of selected rows + 1)
    """
    if features is None:
        # Backward-compatible fallback to the global training matrix.
        features = x
    selected = (y == y_i)
    p = features[selected].sum(0)
    # The +1 in numerator and denominator avoids zero counts and division by zero.
    return (p + 1) / (selected.sum() + 1)

In [17]:
# Short aliases for the train/test matrices used by pr(), get_mdl() and the prediction loop.
x, test_x = trn_term_doc, test_term_doc

In [18]:
def get_mdl(y):
    """Fit a logistic regression on ratio-scaled features for one binary label.

    Parameters
    ----------
    y : object exposing `.values` (the notebook passes one indicator column
        of `train`), with one label per row of the global matrix `x`.

    Returns
    -------
    (fitted LogisticRegression, r), where `r` is the per-feature log ratio of
    pr(1, y) to pr(0, y). The same `r` must be applied to any matrix passed
    to the model at prediction time.
    """
    y = y.values
    # Log of the ratio between the smoothed feature sums of the two classes.
    r = np.log(pr(1, y) / pr(0, y))
    # dual=True is only implemented by the liblinear solver. Pin it explicitly
    # so the call also works where scikit-learn's default solver is lbfgs.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    # Scale every feature column by its log ratio before fitting.
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

In [19]:
# Names of the one-hot indicator columns, one per star rating from 1 to 5.
label_cols = [f'stars_{rating}' for rating in range(1, 6)]

In [20]:
# One column per star label, one row per test review. Each column holds the
# positive-class probability from that label's own binary model.
preds = np.zeros((test.shape[0], len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(train[j])
    # The test features must be scaled by the same ratio used for training.
    proba = m.predict_proba(test_x.multiply(r))
    preds[:, i] = proba[:, 1]

fit stars_1
fit stars_2
fit stars_3
fit stars_4
fit stars_5


In [21]:
# Wrap the prediction matrix in a DataFrame with one column per star label.
# The flat `label_cols` list is passed directly: a nested list ([[...]]) makes
# pandas build MultiIndex columns instead of plain string column names.
p = pd.DataFrame(preds, columns=label_cols)

In [90]:
# Persist the per-label probabilities without the row index.
p.to_csv(path_or_buf='nb_preds.csv', index=False)

In [53]:
# Accuracy of the arg-max predicted label against the arg-max of the one-hot
# ground truth. Both are positions within `label_cols`.
metrics.accuracy_score(
    y_true=test[label_cols].values.argmax(axis=1),
    y_pred=preds.argmax(axis=1),
)

0.6704714285714286