In [1]:
import pandas as pd
import collections
import numpy as np
import re

## Preprocessing

In [2]:
def preprocess_text(text):
    """Normalise a raw review string before tokenisation.

    The text is lowercased, every ``.``, ``,``, ``!`` or ``?`` is padded with
    a space on each side, and every run of characters that is neither an
    ASCII letter nor one of those four punctuation marks is replaced by a
    single space.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces are not stripped.
    """
    text = text.lower()
    # Surround the kept punctuation marks with spaces so each becomes a token.
    text = re.sub(r'([.,!?])', r' \1 ', text)
    # The range must be `A-Z`: `A-z` also spans the ASCII characters between
    # 'Z' and 'a' ([ \ ] ^ _ `), which would then survive the cleaning.
    text = re.sub(r'[^a-zA-Z.,!?]+', r' ', text)
    return text


In [3]:
# Sanity check: run preprocess_text on a short review snippet.
sample_text = 'First visit...Had lunch here today - used my G'
aa = preprocess_text(sample_text)

In [4]:
# Display the cleaned sample string.
aa

'first visit . . . had lunch here today used my g'

## Split Data

##### We want to split the data into three partitions, and we are going to keep the same proportion of samples of each class in all three partitions, with special attention to the training partition.

In [5]:
# Load the raw reviews; the relative path expects yelp.csv in the working directory.
data = pd.read_csv('yelp.csv')

In [6]:
# Inspect the column names of the loaded frame.
data.columns

Index(['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id',
       'cool', 'useful', 'funny'],
      dtype='object')

In [7]:
# We don't care about the rest of the columns, just the review_id, stars and text columns.
data = data.loc[:,['review_id','stars','text']]
# The set_index result is only displayed, not assigned: `data` itself keeps
# review_id as a regular column (position 0), followed by stars (1) and text (2).
data.set_index(['review_id'])

Unnamed: 0_level_0,stars,text
review_id,Unnamed: 1_level_1,Unnamed: 2_level_1
fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...
IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...
IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...
G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!..."
1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...
...,...,...
Ubyfp2RSDYW0g7Mbr8N3iA,3,First visit...Had lunch here today - used my G...
2XyIOQKbVFb6uXQdJ0RzlQ,4,Should be called house of deliciousness!\n\nI ...
jyznYkIbpqVmlsZxSDSypA,4,I recently visited Olive and Ivy for business ...
5UKq9WQE1qQbJ0DJbc-B6Q,2,My nephew just moved to Scottsdale recently so...


In [8]:
# We preprocess the text once, before we start splitting, so we don't have to
# do it again for each sub-dataset.
# Mapping the whole column at once replaces the row-by-row `.iloc` assignment,
# which performed one scalar read and one scalar write per review.
data['text'] = data['text'].map(preprocess_text)

In [9]:
# Display the frame to check the cleaned text column.
data

Unnamed: 0,review_id,stars,text
0,fWKvX83p0-ka4JS3dc6E5A,5,my wife took me here on my birthday for breakf...
1,IjZ33sJrzXqU-0X6U8NwyA,5,i have no idea why some people give bad review...
2,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate . rice is so good and i al...
3,G-WvGaISbqqaMHlNnByodA,5,"rosie , dakota , and i love chaparral dog park..."
4,1uJFq2r5QfJG_6ExMRCaGw,5,general manager scott petello is a good egg ! ...
...,...,...,...
9995,Ubyfp2RSDYW0g7Mbr8N3iA,3,first visit . . . had lunch here today used my...
9996,2XyIOQKbVFb6uXQdJ0RzlQ,4,should be called house of deliciousness ! i co...
9997,jyznYkIbpqVmlsZxSDSypA,4,i recently visited olive and ivy for business ...
9998,5UKq9WQE1qQbJ0DJbc-B6Q,2,my nephew just moved to scottsdale recently so...


In [10]:
# We are interested in the stars given and the comment; next we are going to group the samples of each class so we can count them

In [11]:
# Create a function that returns a list with all the samples (rows) of one class from the whole dataset;
# the list is wrapped in a DataFrame afterwards. We pass the class label and the dataset to this function.

def clustering(label, data):
    """Collect every row of ``data`` whose second column equals ``label``.

    Parameters
    ----------
    label : object
        Class value to look for (compared with ``==``).
    data : pandas.DataFrame
        Frame whose column at position 1 holds the class label.

    Returns
    -------
    list of pandas.Series
        One Series per matching row, in the original row order. The list is
        empty when nothing matches or when ``data`` has no rows.
    """
    # Nothing to scan; also avoids indexing column 1 of a column-less frame.
    if len(data) == 0:
        return []

    # Compare the whole label column in one vectorised operation instead of
    # doing one scalar `.iloc` lookup per row.
    is_match = (data.iloc[:, 1] == label).tolist()
    return [data.iloc[position, :] for position, hit in enumerate(is_match) if hit]

# Build one frame per star rating (5 down to 1), each indexed by review_id.
stars_5, stars_4, stars_3, stars_2, stars_1 = [
    pd.DataFrame(clustering(rating, data)).set_index(['review_id'])
    for rating in (5, 4, 3, 2, 1)
]

#### We are going to keep the proportion of the classes in the three partitions. One way to proceed is to use 70% for training, 10% for validation and 20% for test, and to apply this criterion to each of the 5 groups of samples that we got in the previous statement.

In [12]:
def partition(data_class_label, train_frac=0.7, test_frac=0.2):
    """Split a frame into consecutive train, test and validation slices.

    Rows are taken in their existing order: the first ``train_frac`` share
    goes to train, the next ``test_frac`` share to test, and whatever is left
    goes to validation. Slice sizes are computed with ``round``.

    Parameters
    ----------
    data_class_label : pandas.DataFrame
        Samples to split (here: all the samples of a single class).
    train_frac : float, optional
        Share of rows for the training slice. Defaults to 0.7.
    test_frac : float, optional
        Share of rows for the test slice. Defaults to 0.2.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, validation)``, in that order.

    Raises
    ------
    ValueError
        If a fraction is outside [0, 1] or the two fractions sum to more
        than 1.
    """
    # Small tolerance so sums such as 0.7 + 0.3 are not rejected by float error.
    fractions_ok = (
        0 <= train_frac <= 1
        and 0 <= test_frac <= 1
        and train_frac + test_frac <= 1 + 1e-9
    )
    if not fractions_ok:
        raise ValueError(
            'train_frac and test_frac must be in [0, 1] and sum to at most 1'
        )

    num_samples = len(data_class_label)
    n_samples_train = round(train_frac * num_samples)
    n_samples_test = round(test_frac * num_samples)
    # Validation is the remainder, so no explicit size is needed for it.
    test_end = n_samples_train + n_samples_test

    train = data_class_label.iloc[:n_samples_train, :]
    test = data_class_label.iloc[n_samples_train:test_end, :]
    validation = data_class_label.iloc[test_end:, :]

    return train, test, validation

In [13]:
# We are going to create three pandas objects with the data corresponding to train, test and validation.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, so
# the per-class pieces are collected in lists and concatenated once with
# `pd.concat` (this also avoids re-copying the growing frames on every iteration).

total = [stars_5,stars_4,stars_3,stars_2,stars_1]

train_parts = []
test_parts = []
validation_parts = []

for dataset_class in total:
    train_subset,test_subset,validation_subset = partition(dataset_class)

    train_parts.append(train_subset)
    test_parts.append(test_subset)
    validation_parts.append(validation_subset)

# ignore_index=True renumbers the rows from 0 and discards the index of the pieces.
# NOTE(review): the stars_* frames are indexed by review_id, so review_id is not
# carried into train/test/validation; this matches the previous behaviour, but
# confirm that it is intended.
train = pd.concat(train_parts, ignore_index=True)
test = pd.concat(test_parts, ignore_index=True)
validation = pd.concat(validation_parts, ignore_index=True)

In [17]:
# Now we shuffle the rows with df.sample(frac=1).
# A fixed random_state makes the shuffle, and therefore the saved files,
# reproducible across "Restart & Run All" executions.
SHUFFLE_SEED = 42
train = train.sample(frac=1, random_state=SHUFFLE_SEED)
test = test.sample(frac=1, random_state=SHUFFLE_SEED)
validation = validation.sample(frac=1, random_state=SHUFFLE_SEED)

# And now we save them to csv files (index=False: the shuffled row index is not written).
train.to_csv('train.csv',index=False)
test.to_csv('test.csv',index=False)
validation.to_csv('validation.csv',index=False)