In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

#Data pre-processing
import re
import copy
import json
import pandas as pd
import numpy as np

#nltk
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

#Data visualization
import matplotlib as plt

import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

Mounted at /content/gdrive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [2]:
## Loading the dataframe
with open('/content/gdrive/My Drive/yelp/yelp_academic_dataset_business.json') as json_file:      
    data = json_file.readlines()
    # this line below may take at least 8-10 minutes of processing for 4-5 million rows. It converts all strings in list to actual json objects. 
    data = list(map(json.loads, data)) 
business = pd.DataFrame(data)

In [3]:
chunks = pd.read_json('/content/gdrive/My Drive/yelp/yelp_academic_dataset_review.json', lines=True, chunksize = 10000)
reviews = pd.DataFrame()
for chunk in chunks:
  reviews = pd.concat([reviews, chunk])

In [4]:
print("Shape of business:", business.shape)
print("Shape of reviews:", reviews.shape)

Shape of business: (150346, 14)
Shape of reviews: (6990280, 9)


#### Filtering only the Restaurant category

In [5]:
def select_restaurant(business, reviews):
  restaurants = business[business['categories'].str.contains('Restaurant') == True]
  reviews = reviews[reviews.business_id.isin(restaurants['business_id']) == True]
  return reviews

In [6]:
reviews = select_restaurant(business, reviews)

In [7]:
reviews = reviews[['text','stars']]
reviews.head()

Unnamed: 0,text,stars
0,"If you decide to eat here, just be aware it is...",3
2,Family diner. Had the buffet. Eclectic assortm...,3
3,"Wow! Yummy, different, delicious. Our favo...",5
4,Cute interior and owner (?) gave us tour of up...,4
5,I am a long term frequent customer of this est...,1


In [8]:
reviews.shape

(4724684, 2)

#### Labelling the targets

In [9]:
def target_labelling(data):
    target = {"sentiment":[]}
    for i in data["stars"]:
        if i > 3:
            target["sentiment"].append(1)
        else:
            target["sentiment"].append(0)
            
    data = data.join(pd.DataFrame(target, index = data.index))
    data = data.drop('stars', axis = 1)
    return data

In [10]:
reviews = target_labelling(reviews)

#### Data cleaning

In [11]:
def contractions(text):
    text = re.sub(r"ain't", "am not", text)
    text = re.sub(r"aren't", "are not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"can't've", "can not have", text)
    text = re.sub(r"'cause", "because", text)
    text = re.sub(r"could've", "could have", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"couldn't've", "could not have", text)
    text = re.sub(r"doesn't", "does not", text)
    text = re.sub(r"hadn't", "had not", text)
    text = re.sub(r"hadn't've", "had not have", text)
    text = re.sub(r"hasn't", "has not", text)
    text = re.sub(r"haven't", "have not", text)
    text = re.sub(r"he'd", "he had", text)
    text = re.sub(r"he'd've", "he would have", text)
    text = re.sub(r"he'll", "he will", text)
    text = re.sub(r"he'll've", "he will have", text)
    text = re.sub(r"he's", "he has", text)
    text = re.sub(r"how'd", "how did", text)
    text = re.sub(r"how'd'y", "how do you", text)
    text = re.sub(r"how'll", "how will", text)
    text = re.sub(r"how's", "how has", text)
    text = re.sub(r"i'd", "i had", text)
    text = re.sub(r"i'd've", "i would have", text)
    text = re.sub(r"i'll", "i shall", text)
    text = re.sub(r"i'll've", "i shall have", text)
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"i've", "i have", text)
    text = re.sub(r"isn't", "is not", text)
    text = re.sub(r"it'd", "it had", text)
    text = re.sub(r"it'd've", "it would have", text)
    text = re.sub(r"it'll", "it shall", text)
    text = re.sub(r"it'll've", "it shall have", text)
    text = re.sub(r"it's", "it has", text)
    text = re.sub(r"let's", "let us", text)
    text = re.sub(r"ma'am", "madam", text)
    text = re.sub(r"mayn't", "may not", text)
    text = re.sub(r"might've", "might have", text)
    text = re.sub(r"mightn't", "might not", text)
    text = re.sub(r"mightn't've", "might not have", text)
    text = re.sub(r"must've", "must have", text)
    text = re.sub(r"mustn't", "must not", text)
    text = re.sub(r"mustn't've", "must not have", text)
    text = re.sub(r"needn't", "need not", text)
    text = re.sub(r"needn't've", "need not have", text)
    text = re.sub(r"o'clock", "of the clock", text)
    text = re.sub(r"oughtn't", "ought not", text)
    text = re.sub(r"oughtn't've", "ought not have", text)
    text = re.sub(r"shan't", "shall not", text)
    text = re.sub(r"sha'n't", "shall not", text)
    text = re.sub(r"shan't've", "shall not have", text)
    text = re.sub(r"she'd", "she had", text)
    text = re.sub(r"she'd've", "she would have", text)
    text = re.sub(r"she'll", "she shall", text)
    text = re.sub(r"she'll've", "she shall have", text)
    text = re.sub(r"she's", "she has", text)
    text = re.sub(r"should've", "should have", text)
    text = re.sub(r"shouldn't", "should not", text)
    text = re.sub(r"shouldn't've", "should not have", text)
    text = re.sub(r"so've", "so have", text)
    text = re.sub(r"so's", "so as", text)
    text = re.sub(r"that'd", "that would", text)
    text = re.sub(r"that'd've", "that would have", text)
    text = re.sub(r"that's", "that has", text)
    text = re.sub(r"there'd", "there had", text)
    text = re.sub(r"there'd've", "there would have", text)
    text = re.sub(r"there's", "there has", text)
    text = re.sub(r"they'd", "they had", text)
    text = re.sub(r"they'd've", "they would have", text)
    text = re.sub(r"they'll", "they shall", text)
    text = re.sub(r"they'll've", "they shall have", text)
    text = re.sub(r"they're", "they are", text)
    text = re.sub(r"they've", "they have", text)
    text = re.sub(r"to've", "to have", text)
    text = re.sub(r"wasn't", "was not", text)
    text = re.sub(r"we'd", "we had", text)
    text = re.sub(r"we'd've", "we would have", text)
    text = re.sub(r"we'll", "we will", text)
    text = re.sub(r"we'll've", "we will have", text)
    text = re.sub(r"we're", "we are", text)
    text = re.sub(r"we've", "we have", text)
    text = re.sub(r"weren't", "were not", text)
    text = re.sub(r"what'll", "what shall", text)
    text = re.sub(r"what'll've", "what shall have", text)
    text = re.sub(r"what're", "what are", text)
    text = re.sub(r"what's", "what has", text)
    text = re.sub(r"what've", "what have", text)
    text = re.sub(r"when's", "when has", text)
    text = re.sub(r"when've", "when have", text)
    text = re.sub(r"where'd", "where did", text)
    text = re.sub(r"where's", "where has", text)
    text = re.sub(r"where've", "where have", text)
    text = re.sub(r"who'll", "who shall", text)
    text = re.sub(r"who'll've", "who shall have", text)
    text = re.sub(r"who's", "who has", text)
    text = re.sub(r"who've", "who have", text)
    text = re.sub(r"why's", "why has", text)
    text = re.sub(r"why've", "why have", text)
    text = re.sub(r"will've", "will have", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"won't've", "will not have", text)
    text = re.sub(r"would've", "would have", text)
    text = re.sub(r"wouldn't", "would not", text)
    text = re.sub(r"wouldn't've", "would not have", text)
    text = re.sub(r"y'all", "you all", text)
    text = re.sub(r"y'all'd", "you all would", text)
    text = re.sub(r"y'all'd've", "you all would have", text)
    text = re.sub(r"y'all're", "you all are", text)
    text = re.sub(r"y'all've", "you all have", text)
    text = re.sub(r"you'd", "you had", text)
    text = re.sub(r"you'd've", "you would have", text)
    text = re.sub(r"you'll", "you shall", text)
    text = re.sub(r"you'll've", "you shall have", text)
    text = re.sub(r"how's", "how has", text)
    text = re.sub(r"you're", "you are", text)
    text = re.sub(r"you've", "you have", text)
    text = re.sub(r"didn't", "did not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"'","",text)
    text = re.sub(r". . .","",text)
    return text

In [12]:
def cleaning(data):
     for index, row in tqdm(data.iterrows()):
        strings = ""
        ## remove digit with word pattern
        clean = re.sub(r'([\d]+[a-zA-Z]+)|([a-zA-Z]+[\d]+)', "", row["text"])
        ## remove only digit pattern
        clean = re.sub(r"(^|\s)(\-?\d+(?:\.\d)*|\d+|[\d]+[A-Za-z]+)"," ", clean.lower())
        ## remove every symbols except characters
        clean = re.sub('[^A-Za-z\']+', " ", clean)
        strings = strings + clean
        strings = contractions(strings)
        data["text"][index] = strings
     return data

In [13]:
reviews.to_csv("/content/gdrive/My Drive/yelp/tobecleanedreviews.csv",index = False)

In [14]:
chunks = pd.read_csv("/content/gdrive/My Drive/yelp/tobecleanedreviews.csv", chunksize = 10000)
reviews = pd.DataFrame()
for chunk in chunks:
  chunk = cleaning(chunk)
  reviews = pd.concat([reviews, chunk])

10000it [00:07, 1383.50it/s]
10000it [00:07, 1349.99it/s]
10000it [00:07, 1341.62it/s]
10000it [00:07, 1358.73it/s]
10000it [00:07, 1352.79it/s]
10000it [00:07, 1381.27it/s]
10000it [00:07, 1386.83it/s]
10000it [00:07, 1408.00it/s]
10000it [00:07, 1355.74it/s]
10000it [00:07, 1359.05it/s]
10000it [00:07, 1334.61it/s]
10000it [00:07, 1345.19it/s]
10000it [00:07, 1350.45it/s]
10000it [00:07, 1349.49it/s]
10000it [00:07, 1341.58it/s]
10000it [00:07, 1351.45it/s]
10000it [00:07, 1383.20it/s]
10000it [00:07, 1341.60it/s]
10000it [00:07, 1359.20it/s]
10000it [00:07, 1397.90it/s]
10000it [00:07, 1393.80it/s]
10000it [00:06, 1440.18it/s]
10000it [00:07, 1307.26it/s]
10000it [00:07, 1318.63it/s]
10000it [00:07, 1313.46it/s]
10000it [00:07, 1351.61it/s]
10000it [00:07, 1360.08it/s]
10000it [00:07, 1354.70it/s]
10000it [00:07, 1338.14it/s]
10000it [00:07, 1348.05it/s]
10000it [00:07, 1342.73it/s]
10000it [00:07, 1367.83it/s]
10000it [00:07, 1349.63it/s]
10000it [00:07, 1353.42it/s]
10000it [00:07

In [15]:
reviews.shape

(4724686, 2)

In [16]:
reviews.to_csv("/content/gdrive/My Drive/yelp/cleanedreviews.csv",index = False)

In [17]:
def further_cleaning(data):
    data_copy = copy.deepcopy(data)
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english')) - set(['no', 'not'])
    for index, row in tqdm(data_copy.iterrows()):
        sent = ''
        for e in row["text"].split():
            if e not in stop_words:
                e = lemmatizer.lemmatize(e, pos ="a")
                sent = ' '.join([sent,e])
        data_copy["text"][index] = sent
    return data_copy

In [18]:
chunks = pd.read_csv("/content/gdrive/My Drive/yelp/cleanedreviews.csv", chunksize = 10000)
reviews = pd.DataFrame()
for chunk in chunks:
  chunk = further_cleaning(chunk)
  reviews = pd.concat([reviews, chunk])

10000it [00:13, 756.88it/s]
10000it [00:09, 1064.84it/s]
10000it [00:09, 1082.29it/s]
10000it [00:06, 1554.36it/s]
10000it [00:06, 1558.14it/s]
10000it [00:06, 1537.26it/s]
10000it [00:06, 1516.86it/s]
10000it [00:06, 1507.19it/s]
10000it [00:06, 1519.37it/s]
10000it [00:06, 1558.98it/s]
10000it [00:06, 1562.95it/s]
10000it [00:06, 1537.99it/s]
10000it [00:06, 1561.02it/s]
10000it [00:06, 1559.20it/s]
10000it [00:06, 1531.06it/s]
10000it [00:06, 1608.36it/s]
10000it [00:06, 1529.94it/s]
10000it [00:06, 1541.85it/s]
10000it [00:06, 1530.72it/s]
10000it [00:06, 1526.59it/s]
10000it [00:06, 1543.65it/s]
10000it [00:06, 1548.61it/s]
10000it [00:06, 1586.27it/s]
10000it [00:06, 1563.15it/s]
10000it [00:06, 1580.22it/s]
10000it [00:06, 1602.46it/s]
10000it [00:06, 1579.48it/s]
10000it [00:06, 1555.45it/s]
10000it [00:06, 1573.22it/s]
10000it [00:06, 1548.98it/s]
10000it [00:06, 1596.53it/s]
10000it [00:05, 1686.23it/s]
10000it [00:06, 1628.90it/s]
10000it [00:06, 1532.35it/s]
10000it [00:06,

In [None]:
reviews.to_csv("/content/gdrive/My Drive/yelp/finalcleanedreviews.csv",index = False)

In [None]:
reviews.head()

Unnamed: 0,text,sentiment
0,decide eat aware going take hours beginning e...,0.0
1,family diner buffet eclectic assortmenarge ch...,0.0
2,wow yummy different delicious favorite lamb c...,1.0
3,cute interior owner gave us tour upcoming pat...,1.0
4,aong term frequent customer establishmenust w...,0.0


In [2]:
reviews = pd.read_csv('/content/gdrive/My Drive/yelp/finalcleanedreviews.csv')

In [3]:
reviews.head()

Unnamed: 0,text,sentiment
0,decide eat aware going take hours beginning e...,0.0
1,family diner buffet eclectic assortmenarge ch...,0.0
2,wow yummy different delicious favorite lamb c...,1.0
3,cute interior owner gave us tour upcoming pat...,1.0
4,aong term frequent customer establishmenust w...,0.0
