In [1]:
import pandas as pd
import re
import nltk
nltk.download("vader_lexicon")
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/hemanthnarlasubramanyam/nltk_data...


In [4]:
data = "/Users/hemanthnarlasubramanyam/Documents/Projects/Sentiment analysis/McDonald_s_Reviews.csv"

df = pd.read_csv(data , encoding= 'unicode_escape')

df.head()

Unnamed: 0,reviewer_id,store_name,category,store_address,latitude,longitude,rating_count,review_time,review,rating
0,1,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,5,McDonald's,Fast food restaurant,"13749 US-183 Hwy, Austin, TX 78750, United States",30.460718,-97.792874,1240,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [5]:
#create a new data frame using only the needed column

rev = df[['review', 'review_time','rating']].copy()

rev.head()

Unnamed: 0,review,review_time,rating
0,Why does it look like someone spit on my food?...,3 months ago,1 star
1,It'd McDonalds. It is what it is as far as the...,5 days ago,4 stars
2,Made a mobile order got to the speaker and che...,5 days ago,1 star
3,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,a month ago,5 stars
4,"I repeat my order 3 times in the drive thru, a...",2 months ago,1 star


In [7]:
#create a function to identify and remove special characters
def special_char(text):
    #convert to string to handle potential non string data types
    
    text = str(text)
    cleaned_text = re.sub('[^a-zA-Z0-9\s\'.]' , '', text)
    return cleaned_text
rev['cleaned_review'] = rev['review'].apply(special_char)
rev[['review' , 'cleaned_review']].head(5)

Unnamed: 0,review,cleaned_review
0,Why does it look like someone spit on my food?...,Why does it look like someone spit on my food\...
1,It'd McDonalds. It is what it is as far as the...,It'd McDonalds. It is what it is as far as the...
2,Made a mobile order got to the speaker and che...,Made a mobile order got to the speaker and che...
3,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,My mc. Crispy chicken sandwich was customer s...
4,"I repeat my order 3 times in the drive thru, a...",I repeat my order 3 times in the drive thru an...


In [8]:
del rev["review"]

In [9]:
rev["rating"]

0         1 star
1        4 stars
2         1 star
3        5 stars
4         1 star
          ...   
33391     1 star
33392    5 stars
33393    4 stars
33394    5 stars
33395    5 stars
Name: rating, Length: 33396, dtype: object

In [14]:
rev["numeric_rating"] = rev["rating"].str.extract('(\d+)').astype(float)

In [15]:
rev["numeric_rating"]

0        1.0
1        4.0
2        1.0
3        5.0
4        1.0
        ... 
33391    1.0
33392    5.0
33393    4.0
33394    5.0
33395    5.0
Name: numeric_rating, Length: 33396, dtype: float64

In [16]:
del rev["rating"]

In [18]:
rev.head()

Unnamed: 0,review_time,cleaned_review,numeric_rating
0,3 months ago,Why does it look like someone spit on my food\...,1.0
1,5 days ago,It'd McDonalds. It is what it is as far as the...,4.0
2,5 days ago,Made a mobile order got to the speaker and che...,1.0
3,a month ago,My mc. Crispy chicken sandwich was customer s...,5.0
4,2 months ago,I repeat my order 3 times in the drive thru an...,1.0


In [24]:
from dateutil.relativedelta import relativedelta
from datetime import datetime

# Function to convert time periods to datetime objects
def convert_to_datetime(period):
    if 'year' in period:
        if period.split()[0].isdigit():
            num_years = int(period.split()[0])
            return datetime.now() - relativedelta(years=num_years)
    elif 'month' in period:
        if period.split()[0].isdigit():
            num_months = int(period.split()[0])
            return datetime.now() - relativedelta(months=num_months)
    elif 'week' in period:
        if period.split()[0].isdigit():
            num_weeks = int(period.split()[0])
            return datetime.now() - relativedelta(weeks=num_weeks)
    elif 'day' in period:
        if period.split()[0].isdigit():
            num_days = int(period.split()[0])
            return datetime.now() - relativedelta(days=num_days)
    return None  # Return None for unmatched patterns

# Apply the function to the 'review_time' column and create a new 'modified_review_time' column with only the dates
rev['modified_review_time'] = rev['review_time'].apply(lambda x: convert_to_datetime(x).date() if convert_to_datetime(x) else "Unknown Date")
rev[["review_time", "modified_review_time"]]


Unnamed: 0,review_time,modified_review_time
0,3 months ago,2024-07-08
1,5 days ago,2024-10-03
2,5 days ago,2024-10-03
3,a month ago,Unknown Date
4,2 months ago,2024-08-08
...,...,...
33391,4 years ago,2020-10-08
33392,a year ago,Unknown Date
33393,a year ago,Unknown Date
33394,5 years ago,2019-10-08


In [26]:
# convert the new review_time column and Group the dates by year and count occurrences
rev['modified_review_time'] = pd.to_datetime(rev['modified_review_time'], errors='coerce')
grouped_dates = rev.groupby(rev['modified_review_time'].dt.year)['modified_review_time'].count()

In [27]:
print(grouped_dates)

modified_review_time
2012.0       4
2013.0      10
2014.0      38
2015.0      52
2016.0      91
2017.0     387
2018.0    1679
2019.0    4306
2020.0    6740
2021.0    5522
2022.0    3892
2023.0     960
2024.0    4264
Name: modified_review_time, dtype: int64


In [28]:
random_rev = rev['cleaned_review'][100]

In [29]:
random_rev

"I love going to McDonald's in that area because that's where I get most of my food for my family and my grandchildren and they love me when I bring stuff to him like every couple days so it's great to go to that McDonald's McDonald's is a good spot for taking my grandkids and buy him food there are the greatest thing that they're right off of lake Creek thank you for putting it there"

In [30]:
#apply VADER

analyzer = SentimentIntensityAnalyzer()

analyzer.polarity_scores(random_rev)

{'neg': 0.0, 'neu': 0.723, 'pos': 0.277, 'compound': 0.9773}

In [31]:
#apply VANDER to all reviews

analyzer = SentimentIntensityAnalyzer()

body = rev.cleaned_review
neg, neu, pos, compound, tag = [], [], [], [], []

for review in body:
    res = analyzer.polarity_scores(str(review))
    neg.append(res['neg'])
    neu.append(res['neu'])
    pos.append(res['pos'])
    compound.append(res['compound'])

    # Assigning sentiment tags based on VADER scores for each review
    if res['compound'] >= 0.05:
        tag.append("Positive")
    elif res['compound'] <= -0.05:
        tag.append("Negative")
    else:
        tag.append("Neutral")

# Append results to the dataframe
rev["Negative"] = neg
rev["Neutral"] = neu
rev["Positive"] = pos
rev["Compound"] = compound
rev["Sentiment_Tag"] = tag

In [32]:
rev.tail()

Unnamed: 0,review_time,cleaned_review,numeric_rating,review_time_converted,modified_review_time,Negative,Neutral,Positive,Compound,Sentiment_Tag
33391,4 years ago,They treated me very badly.,1.0,2024-10-08 14:11:03.831762,2020-10-08,0.459,0.541,0.0,-0.5256,Negative
33392,a year ago,The service is very good,5.0,2024-10-08 14:11:03.831763,NaT,0.0,0.556,0.444,0.4927,Positive
33393,a year ago,To remove hunger is enough,4.0,2024-10-08 14:11:03.831763,NaT,0.333,0.667,0.0,-0.25,Negative
33394,5 years ago,It's good but lately it has become very expens...,5.0,2024-10-08 14:11:03.831764,2019-10-08,0.0,0.804,0.196,0.2382,Positive
33395,2 years ago,they took good care of me,5.0,2024-10-08 14:11:03.831764,2022-10-08,0.0,0.396,0.604,0.7269,Positive


In [33]:
rev['Sentiment_Tag'].value_counts()

Sentiment_Tag
Positive    17414
Negative     9608
Neutral      6374
Name: count, dtype: int64