In [1]:
import numpy as np 
import pandas as pd 

In [15]:
data = pd.read_csv('yelp_review_full_csv/train.csv', header=None)
print(data[:10])

   0                                                  1
0  5  dr. goldberg offers everything i look for in a...
1  2  Unfortunately, the frustration of being Dr. Go...
2  4  Been going to Dr. Goldberg for over 10 years. ...
3  4  Got a letter in the mail last week that said D...
4  1  I don't know what Dr. Goldberg was like before...
5  5  Top notch doctor in a top notch practice. Can'...
6  5  Dr. Eric Goldberg is a fantastic doctor who ha...
7  1  I'm writing this review to give you a heads up...
8  2  Wing sauce is like water. Pretty much a lot of...
9  3  Decent range somewhat close to the city.  The ...


In [25]:
print(data[0][1])
print(type(data))
print(data.index.values)
print(data.columns.values)

2
<class 'pandas.core.frame.DataFrame'>
[     0      1      2 ..., 649997 649998 649999]
[0 1]


In [23]:
print(len(data))
print(type(data[1][0]))
print(type(data[1][1]))

650000
<class 'str'>
<class 'str'>


In [26]:
texts = []
labels = []

for i in range(len(data)):
    labels.append(data[0][i])
    texts.append(data[1][i])

In [30]:
print(texts[0])
print(labels[0])

print(texts[1])
print(labels[1])

dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.
5
Unfortunately, the frustration of being Dr. Goldberg's patient is a repeat of the experience I've had with so many other doctors in NYC -- good doctor, terrible staff.  It seems that his staff simply never answers the phone.  It usually takes 2 hours of repeated calling to get an answer.  Who has time for that or wants to deal with it?  I have run into this problem with many other doctors and I just don't get it.  You have office workers, you have patients wi

In [39]:
# filtering all . ? ! into <END> token 

END_TK = ' <end>'
INTER_PUNS = [',', ';', '\\n']
END_PUNS = ['.', '?', '!']


def filter_text(text):
    temp = text.replace('Dr.', 'Dr')
    temp = temp.replace('dr.', 'dr')
    temp = temp.replace('DR.', 'DR')
    
    for pun in INTER_PUNS:
        temp = temp.replace(pun, '')
    
    for pun in END_PUNS:
        temp = temp.replace(pun, END_TK)
    return temp.lower() 

text_filtered = [filter_text(text) for text in texts]

In [40]:
for i in range(6):
    print(text_filtered[i])

dr goldberg offers everything i look for in a general practitioner <end>  he's nice and easy to talk to without being patronizing he's always on time in seeing his patients he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery and you can get referrals to see specialists without having to see him first <end>  really what more do you need <end>  i'm sitting here trying to think of any complaints i have about him but i'm really drawing a blank <end>
unfortunately the frustration of being dr goldberg's patient is a repeat of the experience i've had with so many other doctors in nyc -- good doctor terrible staff <end>  it seems that his staff simply never answers the phone <end>  it usually takes 2 hours of repeated calling to get an answer <end>  who has time for that or wants to deal with it <end>  i have run into this problem with many other doctors and i just don't get it <end>  you have of

In [47]:
all_words = [word for text in text_filtered for word in text.split()]

In [48]:
from collections import Counter 

counts = Counter(all_words)

In [49]:
# vocabulary for all words 

vocab_words = list(set(all_words))

print(len(vocab_words))  # 640772

640772


In [50]:
# get n most common word and the vocabulary as a list 

common_word_list = counts.most_common(20000)

vocab_list = [word for word, count in common_word_list]

In [52]:
print(len(vocab_list))  # correct 

20000


In [53]:
# mapping dicts 

idx2word = {(idx+1):word for idx, word in enumerate(vocab_list)}
word2idx = {word:(idx+1) for idx, word in enumerate(vocab_list)}

In [54]:
print(len(idx2word))
print(len(word2idx))   # correct 

20000
20000


In [56]:
# reform texts with vocab and the unknown token 

UNK_TK = '<UNK>'

def filter_word(word, vocab):
    return word if word in vocab else UNK_TK

text_filtered_word_splitted = [[filter_word(word, vocab_list) for word in text.split()] for text in text_filtered]

KeyboardInterrupt: 

In [None]:
print(len(text_filtered_word_splitted))
print(text_filtered_word_splitted[0])

In [None]:
# optional, depends on the vocab size, augment the dicts with the unknown token 

idx2word[len(vocab_list)+1] = UNK_TK
word2idx[UNK_TK] = len(vocab_list) + 1

In [60]:
# get sentence maxlen 

maxlen = max([len(text) for text in text_filtered])

In [61]:
print(maxlen)   # 6526 

6256


In [None]:
# Optional, cut down sentence length by truncating sentences 

maxlen_ideal = 1000  # ? how should i determine this 

In [58]:
# get number of labels 

label_list = list(set(labels))
print(label_list)  # 1, 2, 3, 4, 5

num_labels = len(label_list)    
print(num_labels)    # 5 

[1, 2, 3, 4, 5]
5


In [None]:
# encoding API

def encode_word(word, word2idx):
    return word2idx[word]

def encode_text(text, word2idx):
    return [encode_word(word, word2idx) for word in text]

def encode_label(label, dim):
    temp = [0]*dim
    temp[label] = 1
    return temp