# Extract Data From Google Reviews

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import time

# Build the Chrome options from a single list of command-line flags
chrome_options = Options()
for chrome_flag in ("--no-sandbox", "--disable-dev-shm-usage", "--headless"):
    chrome_options.add_argument(chrome_flag)

# ChromeDriverManager().install() supplies the value handed to Service
chrome_service = Service(ChromeDriverManager().install())

# Browser session used by scrape_google_reviews and closed after the scrape loop
driver = webdriver.Chrome(options=chrome_options, service=chrome_service)

def scrape_google_reviews(url, restaurant_name, max_scrolls=1000):
    """Scrape the reviews listed on a Google Maps reviews page.

    Loads ``url`` in the module-level Selenium ``driver``, pages down through
    the reviews panel, expands truncated reviews through their "More" buttons
    and parses the resulting page source with BeautifulSoup.

    Args:
        url: Address of the Google Maps page to open.
        restaurant_name: Label written to the "Restaurant" column of every row.
        max_scrolls: Number of scroll iterations. Every iteration sends two
            PAGE_DOWN keys to the reviews panel, and all iterations are always
            performed.

    Returns:
        pandas.DataFrame with the columns "Restaurant", "Author", "Rating" and
        "Review", one row per review element found. "Author" and "Review" are
        empty strings when their element is missing; "Rating" is 0 when no
        rating could be read. The columns are present even when no review
        element was found.

    Raises:
        Whatever ``driver.get`` / ``driver.find_element`` raise (for example
        when the reviews panel cannot be located) propagates to the caller.
    """
    SCROLL_PAUSE_TIME = 0.2
    REVIEWS_PANEL_XPATH = '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]'
    REVIEW_COLUMNS = ["Restaurant", "Author", "Rating", "Review"]

    driver.get(url)
    time.sleep(5)  # Let the page load completely

    # Scroll through the reviews section to load all reviews
    reviews_panel = driver.find_element(By.XPATH, REVIEWS_PANEL_XPATH)
    for _ in range(max_scrolls):
        reviews_panel.send_keys(Keys.PAGE_DOWN)
        time.sleep(SCROLL_PAUSE_TIME)
        reviews_panel.send_keys(Keys.PAGE_DOWN)
        time.sleep(SCROLL_PAUSE_TIME)

    # Click "More" buttons to expand reviews (best effort)
    try:
        next_items = driver.find_elements(By.XPATH, REVIEWS_PANEL_XPATH + '/div[9]')
        buttons = [
            button
            for item in next_items
            for button in item.find_elements(By.TAG_NAME, 'button')
        ]
    except Exception as e:
        print(f"Error clicking 'More' buttons: {e}")
        buttons = []

    for button in buttons:
        # Handle each button on its own: one button that fails (e.g. it went
        # stale after an earlier click) must not stop the remaining reviews
        # from being expanded.
        try:
            if button.text == "More":
                button.click()
                time.sleep(0.5)
        except Exception as e:
            print(f"Error clicking 'More' buttons: {e}")

    response = BeautifulSoup(driver.page_source, 'html.parser')

    reviews = []
    for element in response.find_all('div', class_='jftiEf'):
        # Look each element up once and reuse the result
        author_element = element.find('div', class_='d4r55')
        text_element = element.find('span', class_='wiI7pd')
        rating_element = element.find('span', class_='kvMYJc')

        rating = 0
        if rating_element is not None and 'aria-label' in rating_element.attrs:
            stars = rating_element['aria-label'].strip()
            # Only convert when the label really starts with a digit; any other
            # label keeps the 0 placeholder instead of raising ValueError and
            # discarding every review of this restaurant.
            if stars[:1].isdecimal():
                rating = int(stars[0])

        reviews.append({
            "Restaurant": restaurant_name,
            "Author": author_element.text if author_element is not None else '',
            "Rating": rating,
            "Review": text_element.text if text_element is not None else '',
        })

    # Passing the columns keeps the schema stable when no review was found
    return pd.DataFrame(reviews, columns=REVIEW_COLUMNS)

# List of restaurant review URLs
# Each entry is a (restaurant name, Google Maps URL) pair. The scrape loop below
# unpacks the pair and passes the name to scrape_google_reviews as the label
# for that restaurant's rows.
restaurant_urls = [
    ("Mr Dakgalbi @ IOI City Mall", "https://www.google.com/maps/place/Mr.+Dakgalbi+@+IOI+City+Mall/@2.9695377,101.7117839,17z/data=!3m1!5s0x31cdca0c047a47ed:0x41f6f4dcfa725c11!4m8!3m7!1s0x31cdca0eaae5554f:0x2f178fd5a73381dc!8m2!3d2.9695377!4d101.7143642!9m1!1b1!16s%2Fg%2F11c2r09z9w?entry=ttu"),
    ("Ombak Kitchen @ IOI City Mall", "https://www.google.com/maps/place/Ombak+Kitchen+IOI+City+Mall,+Putrajaya/@2.969361,101.7099821,17z/data=!3m1!5s0x31cdca0c047a47ed:0x41f6f4dcfa725c11!4m8!3m7!1s0x31cdcb214da9b545:0xfc788fd97b812b03!8m2!3d2.9693556!4d101.712557!9m1!1b1!16s%2Fg%2F11rhvvfh8x?entry=ttu"),
    ("Nando's @ IOI City Mall", "https://www.google.com/maps/place/Nando's+IOI+City+Mall/@2.9695664,101.7107493,17z/data=!4m8!3m7!1s0x31cdca0c1ac2a0f1:0xcee5ecc7ba0e8312!8m2!3d2.9695664!4d101.7133296!9m1!1b1!16s%2Fg%2F11btwttwr2?entry=ttu"),
    ("KyoChon 1991 @ IOI City Mall", "https://www.google.com/maps/place/KyoChon+1991+@+IOI+City+Mall/@2.969697,101.7123299,17z/data=!3m1!5s0x31cdca0c047a47ed:0xaed1d10d9ef3a66f!4m8!3m7!1s0x31cdcba1a52ccea5:0x7da9990023eff55!8m2!3d2.969697!4d101.7149102!9m1!1b1!16s%2Fg%2F11h424z2y2?entry=ttu"),
    ("The Manhattan Fish Market @ IOI City Mall", "https://www.google.com/maps/place/The+Manhattan+Fish+Market+@City+Mall+IOI/@2.9695853,101.7111811,17z/data=!4m8!3m7!1s0x31cdca0ea716881f:0x6d90d2ddfb9b1294!8m2!3d2.9695853!4d101.7137614!9m1!1b1!16s%2Fg%2F11b7t45sjg?entry=ttu"),
]

# Extract reviews from URLs
# Collect one frame per restaurant and concatenate once at the end instead of
# growing a DataFrame inside the loop.
review_frames = []

try:
    for restaurant_name, url in restaurant_urls:
        try:
            reviews = scrape_google_reviews(url, restaurant_name)
            review_frames.append(reviews)
        except Exception as e:
            # A failure for one restaurant must not stop the others
            print(f"Error scraping {restaurant_name}: {e}")
finally:
    # Close the browser even when the run is interrupted (KeyboardInterrupt is
    # not caught by `except Exception`), so no Chrome process is left behind.
    driver.quit()

# pd.concat raises on an empty list, so fall back to an empty frame when every
# restaurant failed.
all_reviews = pd.concat(review_frames, ignore_index=True) if review_frames else pd.DataFrame()

# Save the reviews to a CSV file
all_reviews.to_csv("restaurant_reviews.csv", index=False)

print("Reviews have been output into CSV files.")

Reviews have been output into CSV files.


# Data Preprocessing

In [6]:
# Import the necessary libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Download NLTK resources
# - stopwords: English stopword list used to filter tokens
# - punkt / punkt_tab: tokenizer data for word_tokenize. Recent NLTK releases
#   load 'punkt_tab' instead of 'punkt'; fetching both keeps the notebook
#   runnable on old and new NLTK versions.
# - wordnet: data for WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
# Load the scraped reviews and keep only the rows whose 'Review' value is not null
df = pd.read_csv('restaurant_reviews.csv').dropna(subset=['Review'])

In [8]:
# Perform text pre-processing
# Both objects are built once here and read by preprocess_text on every call.
stop_words = set(stopwords.words('english'))  # English stopwords as a set; preprocess_text tests membership per token
lemmatizer = WordNetLemmatizer()  # WordNet lemmatizer reused for every token

def preprocess_text(text):
    """Normalise one review into a space-separated string of lemmas.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphanumeric or that appear in ``stop_words`` are dropped,
    and every remaining token is passed through ``lemmatizer.lemmatize``.

    Args:
        text: Raw review text (a string).

    Returns:
        The kept, lemmatized tokens joined by single spaces; an empty string
        when no token survives the filters.
    """
    kept = []
    for word in word_tokenize(text.lower()):
        # Skip punctuation/symbol tokens and stopwords in a single pass
        if not word.isalnum() or word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)

# Store the cleaned text in a 'Processed Review' column next to the raw review
df = df.assign(**{'Processed Review': df['Review'].apply(preprocess_text)})

# Report how many reviews each restaurant has after the null rows were dropped
restaurant_counts = df['Restaurant'].value_counts()
for restaurant, count in restaurant_counts.items():
    summary_line = f"{restaurant}: {count} reviews"
    print(summary_line)

# Overwrite the scraped CSV so it now also carries the processed column
df.to_csv('restaurant_reviews.csv', index=False)

print("Preprocessed data has been saved to 'restaurant_reviews.csv'.")

KyoChon 1991 @ IOI City Mall: 620 reviews
Ombak Kitchen @ IOI City Mall: 550 reviews
Mr Dakgalbi @ IOI City Mall: 528 reviews
Nando's @ IOI City Mall: 466 reviews
The Manhattan Fish Market @ IOI City Mall: 413 reviews
Preprocessed data has been saved to 'restaurant_reviews.csv'.


# Sentiment Analysis Using Lexicon-Based Models

In [9]:
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import classification_report, accuracy_score

In [10]:
# Function to analyze sentiment using TextBlob
def analyze_sentiment_textblob(text):
    """Label ``text`` by the sign of its TextBlob polarity.

    Args:
        text: Text to score.

    Returns:
        'positive' when the polarity is above 0, 'neutral' when it is exactly
        0, and 'negative' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'

# Shared VADER analyzer instance, created once and used by
# analyze_sentiment_vader for every text it scores
analyzer = SentimentIntensityAnalyzer()

def analyze_sentiment_vader(text):
    """Label ``text`` from the 'compound' score of the shared VADER ``analyzer``.

    Args:
        text: Text to score.

    Returns:
        'positive' when compound >= 0.05, 'neutral' when compound lies strictly
        between -0.05 and 0.05, and 'negative' otherwise.
    """
    compound = analyzer.polarity_scores(text)['compound']
    if compound >= 0.05:
        return 'positive'
    if -0.05 < compound < 0.05:
        return 'neutral'
    return 'negative'

# Score every processed review with both lexicon-based models
# NOTE(review): the scorers receive 'Processed Review' (lower-cased, punctuation
# and stopwords removed). NLTK's English stopword list presumably contains
# negations such as "not" - confirm that stripping them before sentiment
# scoring is intended, or score the raw 'Review' text instead.
for sentiment_column, scorer in (
    ('sentiment_textblob', analyze_sentiment_textblob),
    ('sentiment_vader', analyze_sentiment_vader),
):
    df[sentiment_column] = df['Processed Review'].apply(scorer)

# Convert ratings to sentiment categories
def score_to_sentiment(score):
    """Map a numeric star rating to a sentiment label.

    Args:
        score: Numeric rating.

    Returns:
        'positive' for a score of 4 or more, 'neutral' for exactly 3, and
        'negative' for every other value.
    """
    if score == 3:
        return 'neutral'
    return 'positive' if score >= 4 else 'negative'

# Derive the reference label for every review from its star rating
df['ActualSentiment'] = df['Rating'].apply(score_to_sentiment)

# Evaluate the models
# scrape_google_reviews stores Rating 0 when it finds no star rating, and
# score_to_sentiment maps 0 to 'negative'. Those rows have no real ground
# truth, so they are left out of the evaluation (the column itself is kept
# unchanged for any later cell that reads it).
rated = df[df['Rating'] > 0]
unrated_count = len(df) - len(rated)
if unrated_count:
    print(f"Excluded {unrated_count} reviews without a star rating from the evaluation.")

# Same report for each model: name shown in the output -> prediction column
for model_name, prediction_column in (
    ("TextBlob", 'sentiment_textblob'),
    ("VADER", 'sentiment_vader'),
):
    print(f"Evaluation for {model_name}:")
    print(classification_report(rated['ActualSentiment'], rated[prediction_column]))
    print(f"Accuracy for {model_name}:", accuracy_score(rated['ActualSentiment'], rated[prediction_column]))

# Save the processed data to the same CSV file
df.to_csv('restaurant_reviews.csv', index=False)

print("Sentiment analysis results have been saved to 'restaurant_reviews.csv'.")

Evaluation for TextBlob:
              precision    recall  f1-score   support

    negative       0.73      0.65      0.69       465
     neutral       0.05      0.05      0.05       175
    positive       0.88      0.91      0.89      1937

    accuracy                           0.80      2577
   macro avg       0.56      0.54      0.55      2577
weighted avg       0.80      0.80      0.80      2577

Accuracy for TextBlob: 0.8040357004268529
Evaluation for VADER:
              precision    recall  f1-score   support

    negative       0.84      0.51      0.64       465
     neutral       0.12      0.14      0.13       175
    positive       0.86      0.93      0.89      1937

    accuracy                           0.80      2577
   macro avg       0.61      0.53      0.55      2577
weighted avg       0.81      0.80      0.80      2577

Accuracy for VADER: 0.8005432673651532
Sentiment analysis results have been saved to 'restaurant_reviews.csv'.
