In [245]:
import pandas as pd
import re
import string
from datetime import date, timedelta
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
import pickle
import os
from pathlib import Path

In [246]:
stored_folder = Path(os.path.abspath('')).parent.parent / "data" / "raw" / "McDonald_s_Reviews.csv"
data = pd.read_csv(stored_folder,encoding="latin-1")

In [247]:
data.head(10)

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star
5,6,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 weeks ago,I work for door dash and they locked us all ou...,1 star
6,7,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,If I could give this location a zero on custo...,1 star
7,8,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a year ago,Came in and ordered a Large coffee w/no ice. T...,1 star
8,9,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Went thru drive thru. Ordered. Getting home no...,1 star
9,10,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,"I'm not really a huge fan of fast food, but I ...",4 stars


In [248]:
# check data column
data.columns

Index(['reviewer_id', 'store_name', 'category', 'store_address', 'latitude ',
       'longitude', 'rating_count', 'review_time', 'review', 'rating'],
      dtype='object')

In [249]:
missing_value = data.isnull().sum()
print('missing value :\n',missing_value)

missing value :
 reviewer_id        0
store_name         0
category           0
store_address      0
latitude         660
longitude        660
rating_count       0
review_time        0
review             0
rating             0
dtype: int64


In [250]:
#check distinct value
selected_columns = ['store_name','category','store_address','rating','review_time']
for columns in data.columns:
    if columns in selected_columns:
        num_distinct_value = len(data[columns].unique())
        distinct_value = data[columns].unique()
        print(f"{columns}: {num_distinct_value} : {distinct_value}")

store_name: 2 : ["McDonald's" "ýýýMcDonald's"]
category: 1 : ['Fast food restaurant']
store_address: 40 : ['13749 US-183 Hwy, Austin, TX 78750, United States'
 '1698 US-209, Brodheadsville, PA 18322, United States'
 '72-69 Kissena Blvd, Queens, NY 11367, United States'
 '429 7th Ave, New York, NY 10001, United States'
 '724 Broadway, New York, NY 10003, United States'
 '160 Broadway, New York, NY 10038, United States'
 '555 13th St NW, Washington, DC 20004, United States'
 '10451 Santa Monica Blvd, Los Angeles, CA 90025, United States'
 '114 Delancey St, New York, NY 10002, United States'
 '5920 Balboa Ave, San Diego, CA 92111, United States'
 '262 Canal St, New York, NY 10013, United States'
 '490 8th Ave, New York, NY 10001, United States'
 '550 Lawrence Expy, Sunnyvale, CA 94086, United States'
 '11382 US-441, Orlando, FL 32837, United States'
 '210 5th S, Salt Lake City, UT 84106, United States'
 '1916 M St NW, Washington, DC 20036, United States'
 "151 West 34th Street (Macy's 7th

In [251]:
# Replace "ýýýMcDonald's" with "McDonald's"
data['store_name'] = data['store_name'].replace("ýýýMcDonald's", "McDonald's")
print(len(data['store_name'].unique())," : ",data['store_name'].unique())

1  :  ["McDonald's"]


In [252]:
data['store_name']

0        McDonald's
1        McDonald's
2        McDonald's
3        McDonald's
4        McDonald's
            ...    
33391    McDonald's
33392    McDonald's
33393    McDonald's
33394    McDonald's
33395    McDonald's
Name: store_name, Length: 33396, dtype: object

In [253]:
# clean store address
store_address_data = data['store_address']
#removed all non alphabet character
cleaned_data = store_address_data.apply(lambda x: re.sub(r'[^a-zA-Z0-9,]+', " ", x) if pd.notnull(x) else x)
cleaned_data = cleaned_data.str.split(',', expand=True)
cleaned_data.columns = ['Street Address','City', 'State', 'Country','None']
cleaned_data = cleaned_data.drop(columns=['None'])
cleaned_data

Unnamed: 0,Street Address,City,State,Country
0,13749 US 183 Hwy,Austin,TX 78750,United States
1,13749 US 183 Hwy,Austin,TX 78750,United States
2,13749 US 183 Hwy,Austin,TX 78750,United States
3,13749 US 183 Hwy,Austin,TX 78750,United States
4,13749 US 183 Hwy,Austin,TX 78750,United States
...,...,...,...,...
33391,3501 Biscayne Blvd,Miami,FL 33137,United States
33392,3501 Biscayne Blvd,Miami,FL 33137,United States
33393,3501 Biscayne Blvd,Miami,FL 33137,United States
33394,3501 Biscayne Blvd,Miami,FL 33137,United States


In [254]:
data['rating'] = data['rating'].str.extract('(\d+)').astype(int)
data['rating']

0        1
1        4
2        1
3        5
4        1
        ..
33391    1
33392    5
33393    4
33394    5
33395    5
Name: rating, Length: 33396, dtype: int64

In [255]:
unique_review = data['review'].unique()
unique_review

array(['Why does it look like someone spit on my food?\nI had a normal transaction,  everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.',
       "It'd McDonalds. It is what it is as far as the food and atmosphere go. The staff here does make a difference. They are all friendly, accommodating and always smiling. Makes for a more pleasant experience than many other fast food places.",
       'Made a mobile order got to the speaker and checked it in.\nLine was not moving so I had to leave otherwise Iï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½ï¿½d be late for work.\nNever got the refund in the app.\nI called them and they said I could only get my money back in person because it was stuck in the system.\nWent there in person the next day  and the manager told me she wasnï¿',
       ..., 'To remove hunger is enough',
       "I

In [256]:
def clean_review(data):
    clean_review = data.apply(lambda s: s.lower())
    clean_review = clean_review.apply(lambda x: re.sub(r'[^a-zA-Z0-9,]+', " ", x) if pd.notnull(x) else x)
    clean_review = clean_review.apply(lambda s: s.translate(str.maketrans(' ', ' ', string.punctuation + u'\xa0')))
    clean_review = clean_review.apply(lambda s: s.translate(str.maketrans(string.whitespace, ' ' * len(string.whitespace), '')))
    clean_review = clean_review.apply(lambda s: ' '.join(word for word in s.split() if len(word) > 2))
    clean_review = clean_review.drop_duplicates()
    return clean_review

In [257]:
review = clean_review(data['review'])
data['review'] = review

In [258]:
today = date(2024, 3, 5)

def convert_to_date(expression):
    num, unit = expression.split(' ')[0], expression.split(' ')[1]
    if num == 'a':
        num = 1
    else:
        num = int(num)
    
    if 'hour' in unit:
        return today - timedelta(hours=num)
    elif 'day' in unit:
        return today - timedelta(days=num)
    elif 'week' in unit:
        return today - timedelta(weeks=num)
    elif 'month' in unit:
        return today - timedelta(days=num * 30)  
    elif 'year' in unit:
        return today - timedelta(days=num * 365) 
    else:
        return today

In [259]:
data['review_time'] = data['review_time'].apply(convert_to_date)
data['review_time']

0        2023-12-06
1        2024-02-29
2        2024-02-29
3        2024-02-04
4        2024-01-05
            ...    
33391    2020-03-06
33392    2023-03-06
33393    2023-03-06
33394    2019-03-07
33395    2022-03-06
Name: review_time, Length: 33396, dtype: object

In [260]:
def preprocess(texts):
    texts = texts.apply(word_tokenize)
    stop_words = set(stopwords.words('english'))
    texts = texts.apply(lambda s : [word for word in s if word not in stop_words])
    stemmer = SnowballStemmer('english')
    texts = texts.apply(lambda s: [stemmer.stem(word) for word in s])
    texts = texts.apply(lambda s: ' '.join(s))
    return texts

In [261]:
processed_review = preprocess(data['review'].astype(str))
data['processed_review'] = processed_review

In [262]:
data

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,processed_review
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2023-12-06,why does look like someone spit food had norma...,1,look like someon spit food normal transact eve...
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2024-02-29,mcdonalds what far the food and atmosphere the...,4,mcdonald far food atmospher staff make differ ...
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2024-02-29,made mobile order got the speaker and checked ...,1,made mobil order got speaker check line move l...
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2024-02-04,crispy chicken sandwich was customer service w...,5,crispi chicken sandwich custom servic quick
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2024-01-05,repeat order times the drive thru and she stil...,1,repeat order time drive thru still manag mess ...
...,...,...,...,...,...,...,...,...,...,...,...
33391,33392,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,2020-03-06,they treated very badly,1,treat bad
33392,33393,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,2023-03-06,the service very good,5,servic good
33393,33394,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,2023-03-06,remove hunger enough,4,remov hunger enough
33394,33395,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,2019-03-07,good but lately has become very expensive,5,good late becom expens


In [263]:
def sentiment_text(rating):
    # Determine sentiment based on the rating
    if rating >= 4:
        return 'Positive'
    elif rating <= 2:
        return 'Negative'
    else:
        return 'Neutral'

In [264]:
data['sentiment'] = data.apply(lambda row: sentiment_text(row['rating']), axis=1)

In [265]:
data

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating,processed_review,sentiment
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2023-12-06,why does look like someone spit food had norma...,1,look like someon spit food normal transact eve...,Negative
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2024-02-29,mcdonalds what far the food and atmosphere the...,4,mcdonald far food atmospher staff make differ ...,Positive
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2024-02-29,made mobile order got the speaker and checked ...,1,made mobil order got speaker check line move l...,Negative
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2024-02-04,crispy chicken sandwich was customer service w...,5,crispi chicken sandwich custom servic quick,Positive
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2024-01-05,repeat order times the drive thru and she stil...,1,repeat order time drive thru still manag mess ...,Negative
...,...,...,...,...,...,...,...,...,...,...,...,...
33391,33392,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,2020-03-06,they treated very badly,1,treat bad,Negative
33392,33393,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,2023-03-06,the service very good,5,servic good,Positive
33393,33394,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,2023-03-06,remove hunger enough,4,remov hunger enough,Positive
33394,33395,McDonald's,Fast food restaurant,"3501 Biscayne Blvd, Miami, FL 33137, United St...",25.810000,-80.189098,2810,2019-03-07,good but lately has become very expensive,5,good late becom expens,Positive


In [267]:
output_dir = Path(os.path.abspath('')).parent.parent / "data" / "processed"
output_file = open(str(output_dir) + "/cleaned_df.pkl", "wb")
pickle.dump(data, output_file)
output_file.close()