In [1]:
# Import python packages
import spacy
import numpy as np
import pandas as pd

# Clean Business Data

In [2]:
# Load the raw business table from disk; the trailing expression displays
# its (rows, columns) shape.
# NOTE(review): postal_code is displayed as a float (e.g. 85022.0) in the
# preview of this table — confirm that is acceptable if it is used as an
# identifier downstream.
business = pd.read_csv(filepath_or_buffer='yelp_business.csv')
business.shape

(7183, 17)

In [3]:
# Preview the first five rows of the raw business data
business.head(5)

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open_2017,latitude,longitude,name,neighborhood,postal_code,review_count,stars,state,is_open_2018,isBankrupt
0,719 E Thunderbird Rd,"{'RestaurantsTableService': False, 'GoodForMea...",rDMptJYWtnMhpQu_rRXHng,"['Fast Food', 'Burgers', 'Restaurants']",Phoenix,{},1,33.60707,-112.064382,McDonald's,,85022.0,10,1.0,AZ,1,0
1,"777 E Thunderbird Rd, Ste 107","{'RestaurantsTableService': True, 'GoodForMeal...",1WBkAuQg81kokZIPMpn9Zg,"['Burgers', 'Restaurants']",Phoenix,"{'Monday': '11:00-22:00', 'Tuesday': '11:00-22...",1,33.60731,-112.063404,Charr An American Burger Bar,,85022.0,232,3.0,AZ,1,0
2,1635 E Camelback Rd,"{'RestaurantsTableService': False, 'GoodForMea...",iPa__LOhse-hobC2Xmp-Kw,"['Restaurants', 'Burgers', 'Fast Food']",Phoenix,"{'Monday': '5:00-23:00', 'Tuesday': '5:00-23:0...",1,33.508765,-112.04624,McDonald's,,85016.0,34,3.0,AZ,1,0
3,,"{'BusinessAcceptsCreditCards': True, 'Business...",YhV93k9uiMdr3FlV4FHjwA,"['Marketing', ""Men's Clothing"", 'Restaurants',...",Phoenix,"{'Monday': '8:00-17:00', 'Tuesday': '8:00-17:0...",1,33.449967,-112.070223,Caviness Studio,,85001.0,4,5.0,AZ,1,0
4,"8140 N Hayden Rd, Ste H115","{'Alcohol': 'full_bar', 'HasTV': True, 'NoiseL...",VdlPZg2NAu8t8GkdbPLecg,"['Restaurants', 'Gluten-Free', 'Indian', 'Seaf...",Scottsdale,"{'Tuesday': '17:00-22:00', 'Friday': '17:00-22...",1,33.555212,-111.900456,Tandoori Times Indian Bistro,,85258.0,263,3.5,AZ,1,0


In [4]:
# Work on an independent (deep) copy so the raw `business` frame stays untouched
business_df = business.copy(deep=True)

## Check Duplicates

In [5]:
# Check row duplicates: print the shape before and after dropping fully
# duplicated rows (identical shapes mean there are none)
print(business_df.shape, business_df.drop_duplicates().shape, sep='\n')

(7183, 17)
(7183, 17)


In [6]:
# Check business_id duplicates: total number of ids vs. number of distinct ids
# (dropna=False so a missing id would still be counted as a distinct value)
print(
    business_df['business_id'].size,
    business_df['business_id'].nunique(dropna=False),
    sep='\n',
)

7183
7183


## Check Missing Values

In [7]:
# Mark invisible missing values: cells holding the literal strings '{}' or
# '[]' are turned into NaN so that isna() can detect them
business_df = business_df.replace(to_replace=['{}', '[]'], value=np.nan)

In [8]:
# Count missing values per column before cleaning, largest counts first
(
    business_df
    .isna()
    .sum()
    .sort_values(ascending=False)
)

neighborhood    7183
hours           1397
address           75
attributes        56
postal_code        8
is_open_2017       0
business_id        0
categories         0
city               0
isBankrupt         0
latitude           0
is_open_2018       0
name               0
review_count       0
stars              0
state              0
longitude          0
dtype: int64

In [9]:
# Drop the three columns with the most missing values
# (neighborhood: 7183, hours: 1397, address: 75) and show the new shape
business_df = business_df.drop(columns=['neighborhood', 'hours', 'address'])
business_df.shape

(7183, 14)

In [10]:
# Drop every remaining observation (row) that has at least one missing value
business_df = business_df.dropna(axis='index', how='any')
business_df.shape

(7122, 14)

In [11]:
# Confirm that no column has missing values left after cleaning
business_df.isna().sum(axis=0)

attributes      0
business_id     0
categories      0
city            0
is_open_2017    0
latitude        0
longitude       0
name            0
postal_code     0
review_count    0
stars           0
state           0
is_open_2018    0
isBankrupt      0
dtype: int64

# Clean Review Data

In [12]:
# Load the raw review table from disk; the trailing expression displays
# its (rows, columns) shape
review = pd.read_csv(filepath_or_buffer='yelp_review.csv')
review.shape

(781473, 9)

In [13]:
# Preview the first five rows of the raw review data
review.head(5)

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,JlNeaOymdVbE6_bubqjohg,0,2014-08-09,0,BF0ANB54sc_f-3_howQBCg,1,We always go to the chevo's in chandler which ...,3,ssuXFjkH4neiBgwv-oN4IA
1,0Rni7ocMC_Lg2UH0lDeKMQ,0,2014-08-09,0,DbLUpPT61ykLTakknCF9CQ,1,"This place is always so dirty and grimy, been ...",6,ssuXFjkH4neiBgwv-oN4IA
2,S-oLPRdhlyL5HAknBKTUcQ,0,2017-11-30,0,z_mVLygzPn8uHp63SSCErw,4,Holy portion sizes! You get a lot of bang for ...,0,MzEnYCyZlRYQRISNMXTWIg
3,iIjVO7cLD1UEmIO7G05Ujw,0,2016-06-11,0,xatycgntu_F_Ioyny3iflw,4,Flavor was actually pretty good. Not used to e...,0,vaXJ7-xLrnD6FAEhUqYKwQ
4,1JF9TbJ2d5hH8xsQvvklHg,0,2016-06-18,0,Z7U7MMef6Tbj_ZbSFzLRUw,5,This is place very great flavor. Server was on...,1,vaXJ7-xLrnD6FAEhUqYKwQ


In [14]:
# Work on an independent (deep) copy so the raw `review` frame stays untouched
review_df = review.copy(deep=True)

## Check Duplicates

In [15]:
# Check row duplicates: print the shape before and after dropping fully
# duplicated rows (identical shapes mean there are none)
print(review_df.shape, review_df.drop_duplicates().shape, sep='\n')

(781473, 9)
(781473, 9)


## Check Missing Values

In [16]:
# Count missing values in each review column
review_df.isna().sum(axis=0)

business_id    0
cool           0
date           0
funny          0
review_id      0
stars          0
text           0
useful         0
user_id        0
dtype: int64

## Clean Text 

In [17]:
def keep_token(x):
    """Decide whether a token should survive text cleaning.

    A token is kept only when all of the following hold:
      * its part of speech (x.pos_) is an adjective, adverb, noun or verb;
      * it consists of alphabetical characters only (x.is_alpha);
      * it is not a stopword (x.is_stop).

    Returns True when the token should be kept, False when it is filtered out.
    """
    wanted_pos = ('ADJ', 'ADV', 'NOUN', 'VERB')
    # Short-circuit evaluation checks the conditions in order: part of speech
    # first, then alphabetical, then stopword.
    return bool(x.pos_ in wanted_pos and x.is_alpha and not x.is_stop)

In [18]:
# Load spacy nlp model.
# The named-entity recognizer is disabled: the cleaning step only reads
# pos_, lemma_, is_alpha and is_stop from each token and never uses entity
# annotations, so running NER on every review is wasted work.
# The parser is deliberately kept enabled — NOTE(review): spaCy documents that
# the English tag-to-POS mapping can use dependency parses, so disabling the
# parser could change pos_ and therefore which tokens are kept.
nlp = spacy.load('en_core_web_sm', disable=['ner'])


# Define function that lemmatizes the text and removes unwanted tokens
def clean_text(x):
    """Return the lemmas of the tokens in `x` that pass keep_token.

    Parameters
    ----------
    x : object
        Raw text to clean. It is converted with str() before being parsed,
        so a non-string value is processed as its string form.

    Returns
    -------
    str
        Lemmas of the kept tokens joined by single spaces; an empty string
        when no token passes the filter.
    """
    return ' '.join(token.lemma_ for token in nlp(str(x)) if keep_token(token))

In [19]:
%%time
# Clean text
review_df['clean_text'] = review_df['text'].apply(clean_text)

Wall time: 5h 39min 5s


In [20]:
# Remove the identifier columns and the raw text, which has been replaced by
# the clean_text column
col_to_drop = ['review_id', 'text', 'user_id']
review_df = review_df.drop(labels=col_to_drop, axis=1)

In [21]:
# Preview the first five rows of the finalized review data
review_df.head(5)

Unnamed: 0,business_id,cool,date,funny,stars,useful,clean_text
0,JlNeaOymdVbE6_bubqjohg,0,2014-08-09,0,1,3,chevo chandler delicious ahwatukee different r...
1,0Rni7ocMC_Lg2UH0lDeKMQ,0,2014-08-09,0,1,6,place dirty grimy twice service horrible
2,S-oLPRdhlyL5HAknBKTUcQ,0,2017-11-30,0,4,0,holy portion size lot bang buck service super ...
3,iIjVO7cLD1UEmIO7G05Ujw,0,2016-06-11,0,4,0,flavor actually pretty good eat menudo tortill...
4,1JF9TbJ2d5hH8xsQvvklHg,0,2016-06-18,0,5,1,place great flavor thing ask bring chip salsa ...


In [22]:
# Write both cleaned tables to CSV; index=False keeps the row index out of
# the files
business_df.to_csv(path_or_buf='clean_business.csv', index=False)
review_df.to_csv(path_or_buf='clean_review.csv', index=False)