# **Project Title**
#### *Project Subtitle*

## Hypothesis

Project thesis

In [None]:
# Importing dependencies
import pandas as pd

# **Data**

## Dataset

Dataset details

In [None]:
# Reading in data
# (potentially?)

### EDA

In [None]:
# Beginning EDA

# **Ramona's Code Space**

*End Code Space*

# **Christian's Code Space**

### Dependencies

In [None]:
# Installing necessary libraries (uncomment if needed)
# %pip install gdown --quiet
# ! pip install evaluate --quiet

In [None]:
# Imports and dependencies
import os
import re
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# json
import json

# gdown
import gdown

from tqdm import tqdm
import unicodedata

import torch
from datasets import load_metric

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

import transformers
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments,Trainer
from transformers import pipeline

from huggingface_hub import notebook_login

import accelerate

from datasets import Dataset

from evaluate import load

#### Resources path

In [None]:
# Defining a function to access datasets through `gdown`
def fetch_data(set):
    """Download one of the project datasets from Google Drive and load it.

    Parameters
    ----------
    set : str
        Dataset key: 'business', 'checkin', 'reviews', 'tip' or 'user'.
        (The name shadows the built-in ``set``; it is kept unchanged so
        existing callers keep working.)

    Returns
    -------
    pandas.DataFrame or None
        The downloaded CSV as a DataFrame, or ``None`` (after printing a
        message) when the key is not one of the known datasets.
    """
    # Share link and local CSV path for every known dataset key
    sources = {
        'business': (
            'https://drive.google.com/file/d/1t-_rOjZ8oMqPcMJunVaMgY3OEbhnuSCv/view?usp=sharing',
            'Resources/business_dataset.csv',
        ),
        'checkin': (
            'https://drive.google.com/file/d/1_AVWp31ymfvf4QgTiMN_WLAeapfr0omf/view?usp=sharing',
            'Resources/checkin_dataset.csv',
        ),
        'reviews': (
            'https://drive.google.com/file/d/1L8rFjhOQyU90Ycr9t_OLA70vCYM0e7ck/view?usp=sharing',
            'Resources/reviews_dataset.csv',
        ),
        'tip': (
            'https://drive.google.com/file/d/1LMkCi5AFC_58_m7ELmn1hR8YDykuXwqq/view?usp=sharing',
            'Resources/tip_dataset.csv',
        ),
        'user': (
            'https://drive.google.com/file/d/1kQ522qcod7AjD5DO9vj8qFcSKxwJCDrO/view?usp=sharing',
            'Resources/user_dataset.csv',
        ),
    }

    # Anything that is not a known string key is rejected, as before
    source = sources.get(set) if isinstance(set, str) else None
    if source is None:
        print('Invalid dataset selected, please try again')
        return None
    url, output = source

    # Make sure the target folder exists so the download can be written
    # on a fresh checkout
    os.makedirs(os.path.dirname(output), exist_ok=True)

    # Downloading dataset
    gdown.download(url, output, fuzzy=True, quiet=True)

    # Reading in the dataset and returning it
    return pd.read_csv(output, low_memory=False)

Fetching/reading in all datasets

In [None]:
# Loading all datasets: read the local CSV when it has already been fetched,
# otherwise download it first (no manual commenting/uncommenting needed).
def _load_dataset(name):
    """Return dataset `name` from ./Resources, downloading it with fetch_data if the CSV is missing."""
    local_csv = f'./Resources/{name}_dataset.csv'
    if os.path.exists(local_csv):
        return pd.read_csv(local_csv)
    return fetch_data(name)

business_df = _load_dataset('business')
checkin_df = _load_dataset('checkin')
reviews_df = _load_dataset('reviews')
tips_df = _load_dataset('tip')
user_df = _load_dataset('user')

---

#### Business dataset

#### <font color='blue'> Description:</font> 
**Contains business data including location data, attributes, and categories.**

#### Overview

In [None]:
business_df.head()

#### Info

In [None]:
business_df.info()

---

#### Checkin dataset

#### <font color='blue'> Description:</font>
**Checkins on a business.**

#### Overview

In [None]:
checkin_df.head()

#### Info

In [None]:
checkin_df.info()

#### **<font color='orange'> Notes:</font>**
**The team has determined this dataset would not add any value to our training data.**

---

#### Reviews dataset

#### <font color = 'blue'>Description:</font>
**Contains full review text data including the user_id that wrote the review and the business_id the review is written for.**

#### Overview

In [None]:
reviews_df.head()

#### Info

In [None]:
reviews_df.info()

#### Na count

In [None]:
reviews_df.isna().sum()

#### Dropping columns:
- **review_id**
- **useful**
- **funny**
- **cool**

In [None]:
# Remove the review columns that are not needed downstream (rationale in the notes below)
reviews_df.drop(['review_id', 'useful', 'funny', 'cool'], axis='columns', inplace=True)

#### Renaming the 'text' field to 'review'

In [None]:
# 'text' becomes 'review'; the preview confirms the new column name
reviews_df.rename({'text': 'review'}, axis='columns', inplace=True)
reviews_df.head()

#### **<font color='orange'> Notes:</font>**
- **review_id: Eliminated due to low informational value.**
- **useful: Eliminated due to low relevance.**
- **funny: Eliminated due to low relevance.**
- **cool: Eliminated due to low relevance.**

  **The *<font color='green'>'business_id'</font>* feature will be used as the identifier, *<font color='green'>'stars'</font>* is the rating metric and the *<font color='grey'>'review'</font>*  field encapsulates**<br>
  **the data to be processed. the *<font color='green'>'date'</font>* variable is in place if time series analysis is needed.**

---

#### Tips dataset

#### <font color='blue'>Description:</font>
**Tips written by a user on a business. Tips are shorter than reviews and tend to convey quick suggestions.**

#### Overview

In [None]:
tips_df.head()

#### Info

In [None]:
tips_df.info()

#### Dropping columns:
- **compliment_count**

In [None]:
# Remove the single tips column that is not needed downstream
tips_df.drop(['compliment_count'], axis='columns', inplace=True)

#### Renaming the 'text' column to 'recommendations'

In [None]:
# 'text' becomes 'recommendations'; the preview confirms the new column name
tips_df.rename({'text': 'recommendations'}, axis='columns', inplace=True)
tips_df.head()

#### **<font color='orange'> Notes:</font>**
- **compliment_count: Eliminated due to low informational value.**


 **Since this data set has recommendations from the user to improve customer experience the 'recommendations' field could be a useful target variable.**

---

#### User dataset

#### <font color = 'blue'>Note:</font>
**User data including the user's friend mapping and all the metadata associated with the user.**

#### Overview

In [None]:
user_df.head()

#### Info

In [None]:
user_df.info()

#### **<font color='orange'> Notes:</font>**
**This data set will not be included in the training data to preserve user anonymity.**

# //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

## **<font color = 'darkgrey'>Merging the reviews data set and the business data set</font>**

#### <font color = 'blue'>Description:</font>
**This data set contains the fields that will be used to train the model**

In [None]:
# Left join: every review is kept and gains the columns of its business
data_df = pd.merge(reviews_df, business_df, on='business_id', how='left')

In [None]:
data_df.info()

In [None]:
data_df.head()

#### Na count

In [None]:
data_df.isna().sum()

In [None]:
# Percentage of missing values in the three business columns under review,
# laid out as a one-row table (one column per feature)
na_prcnt = data_df[['attributes','categories','hours']].isna().sum() / len(data_df) * 100
nas_df = na_prcnt.to_frame(name='percentage').T
nas_df.round(4)

In [None]:
sns.barplot(data = nas_df).set_title('Na percentage')

#### **<font color='orange'> Notes:</font>**
**After consulting with the team we decided to drop all three columns.**

#### Dropping columns with na values

In [None]:
# Remove the three business columns that contain missing values
data_df.drop(['attributes', 'categories', 'hours'], axis='columns', inplace=True)

In [None]:
data_df.isna().sum()

#### *<font color='grey'>stars_x</font>* and *<font color='grey'>stars_y</font>* comparison

In [None]:
data_df.loc[data_df['stars_x'] != data_df['stars_y']][['stars_x','stars_y']].head()

#### *<font color='grey'>stars_x</font>* and *<font color='grey'>stars_y</font>* for the same business

In [None]:
data_df.loc[data_df['business_id']=='XQfwVwDr-v0ZS3_CbbE5Xw'][['stars_x','stars_y']].head()

#### *<font color='grey'>stars_x</font>* average

In [None]:
round(data_df.loc[data_df['business_id']=='XQfwVwDr-v0ZS3_CbbE5Xw']['stars_x'].mean(),2)

#### **<font color='orange'> Notes:</font>**
**Because** *<font color='grey'>stars_y</font>* **represents the average star rating of the business, it is renamed to** *<font color='grey'>stars_avg</font>* **(and** *<font color='grey'>stars_x</font>* **to** *<font color='grey'>stars</font>***).**

#### Renaming

In [None]:
# stars_x (per-review rating) -> stars, stars_y (business average) -> stars_avg
data_df.rename({'stars_x': 'stars', 'stars_y': 'stars_avg'}, axis='columns', inplace=True)

#### Dropping is_open feature

In [None]:
# Class balance of the is_open flag
fig, ax = plt.subplots()
sns.countplot(data=data_df, x='is_open', hue='is_open', ax=ax)
ax.set_title('is_open Feature')

#### Dropping is_open

In [None]:
# Remove the is_open flag (rationale in the notes below)
data_df.drop(['is_open'], axis='columns', inplace=True)

#### **<font color='orange'> Notes:</font>**
**After consulting with the team we decided to drop this feature due to its low informational value and feature imbalance.**

# //////////////////////////////////////////////////////////////////////////////////////////////////

## **<font color='darkgrey'>Merging with the tips data set exploration</font>**

#### <font color = 'blue'>Description:</font>
**Contains customer recommendations to improve the experience**

In [None]:
tips_df.head()

In [None]:
tips_df.info()

#### Quantity of unique business_id in the tips data set

In [None]:
display(tips_df['business_id'].unique().shape[0])

#### Quantity of unique business_id in  data_df

In [None]:
data_df['business_id'].unique().shape[0]

#### Subset of *<font color='grey'>business_id</font>* in *<font color='grey'>data_df</font>* not found in *<font color='grey'>tips_df</font>*.

In [None]:
no_tips_df = data_df[~data_df['business_id'].isin(tips_df['business_id'])]
no_tips_df.head()

#### Number of *<font color='grey'>business_id</font>* in *<font color='grey'>data_df</font>* not found in *<font color='grey'>tips_df</font>*.

In [None]:
# Reviews whose business never appears in tips_df
has_tips = data_df['business_id'].isin(tips_df['business_id'])
no_tips_df = data_df[~has_tips]

# Distinct businesses among those reviews
not_found = no_tips_df['business_id'].unique().shape[0]

# The message now states the direction that is actually computed:
# ids present in data_df but absent from tips_df
print(f'Number of business_ids in data_df not found in tips_df: {not_found}')

#### Evidence

In [None]:
tips_df.loc[tips_df['business_id'] == no_tips_df['business_id'].iloc[33]]

#### Merge

In [None]:
# Inner join: keep only (business, user) pairs that have both a tip and a review
test_df = tips_df.merge(data_df, how='inner', on=['business_id', 'user_id'])
                         

#### Overview

In [None]:
test_df.info()

In [None]:
test_df.head()

#### Comparison review vs. recommendations

In [None]:
test_df[['review','recommendations']].head()

##### **<font color='orange'> Notes:</font>**
**The <font color='grey'>data_df</font> has approximately <font color='green'>7 million</font> entries and <font color='grey'>tips_df</font> about <font color='green'>1 million</font>; after merging them we end up with a little under half a million**.<br>
**In the comparison above I don't see a difference between a review from the *reviews data set* and a recommendation from the *tips data set***.<br>
**As shown above, we stand to lose a significant amount of data if a merge is performed**.

# ///////////////////////////////////////////////////////////////////////////////////////////////////

## <font color='darkgrey'>Final Data Overview</font>

#### Dropping the user_id column to preserve user anonymity

In [None]:
# Remove the user identifier from the modelling data
data_df.drop(['user_id'], axis='columns', inplace=True)

#### Overview

In [None]:
data_df.head()

#### Info

In [None]:
data_df.info()

#### Na verification

In [None]:
data_df.isna().sum()

## Modeling

In [None]:
# Narrowing data to trainable scope
def sample_stars(df, val, random_state=42):
    """Draw a reproducible random sample of the rows with a given star rating.

    Ratings of 2 or lower and of 4 or higher are sampled at 100 rows each;
    any other rating (i.e. 3) is sampled at 200 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'stars' column.
    val : int or float
        Star rating to select.
    random_state : int or None, default 42
        Seed passed to ``DataFrame.sample`` so repeated runs return the same
        rows (42 matches the seed used for the train/test split). Pass
        ``None`` for a different sample on every call.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with a fresh 0..n-1 index; the original row labels
        are kept in an 'index' column. When fewer rows than requested exist
        for the rating, all of them are returned instead of raising.
    """
    matching = df[df['stars'] == val]

    # Target sample size per rating
    target = 100 if (val >= 4 or val <= 2) else 200

    # Never ask for more rows than exist: DataFrame.sample would raise ValueError
    n_rows = min(target, len(matching))

    sampled = matching.sample(n_rows, random_state=random_state)
    return sampled.reset_index()

In [None]:
# One sample per star rating, drawn in the order 5 -> 1
sample_5, sample_4, sample_3, sample_2, sample_1 = (
    sample_stars(data_df, star_rating) for star_rating in (5, 4, 3, 2, 1)
)

In [None]:
# Stack the per-rating samples (1 -> 5) into one frame with a fresh index
star_samples = [sample_1, sample_2, sample_3, sample_4, sample_5]
sample_data_df = pd.concat(star_samples, ignore_index=True)

sample_data_df.shape

In [None]:
sample_data_df['stars'].value_counts()

In [None]:
# Collapse star ratings into three classes: 1-2 -> 0, 3 -> 1, 4-5 -> 2.
# The replacements are applied one after another in this order.
# Run this cell only once: re-running it would remap the new labels 1 and 2 to 0.
sample_data_df['stars'] = (
    sample_data_df['stars']
    .replace(to_replace=[1, 2], value=0)
    .replace(to_replace=3, value=1)
    .replace(to_replace=[4, 5], value=2)
)

In [None]:
sample_data_df['stars'].value_counts()

In [None]:
sample_data_df.isna().sum()

In [None]:
# 'review' -> 'text' and 'stars' -> 'label', the column names used by the modelling cells
sample_data_df.rename({'review': 'text', 'stars': 'label'}, axis='columns', inplace=True)

Removing accented characters

In [None]:
def remove_accented_chars(text):
    """Return `text` with accents stripped and every non-ASCII character removed.

    The string is NFKD-normalised, then encoded to ASCII with unencodable
    characters dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

Cleaning text

In [None]:
def clean_text(text):
    """Strip URLs, @mentions and #hashtags from `text` and normalise whitespace.

    Returns the text with every whitespace run collapsed to a single space
    and without leading or trailing whitespace.
    """
    # Applied in order: URLs, mentions, hashtags
    for token_pattern in (r"http\S+", r"@\S+", r"#\S+"):
        text = re.sub(token_pattern, "", text)

    # Replace multiple spaces with a single space
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

Preprocessing reviews

In [None]:
def pre_process_reviews(reviews):
    """Normalise an iterable of review strings for modelling.

    Each review goes through, in order: clean_text, newline/tab replacement,
    lower-casing, accent removal, removal of everything except letters,
    digits and whitespace, whitespace collapsing, NLTK tokenisation and
    English stop-word removal. A tqdm progress bar tracks the iteration.

    Returns a list with one space-joined string per input review.
    """
    english_stop_words = set(stopwords.words('english'))
    # newline, tab and carriage return each map to a single space
    whitespace_table = str.maketrans("\n\t\r", "   ")

    def _normalise(raw_review):
        # Clean text
        text = clean_text(raw_review)
        # remove extra newlines and convert them to spaces
        text = text.translate(whitespace_table)
        # lower case, then remove accents
        text = remove_accented_chars(text.lower())
        # remove special characters
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text, flags=re.I|re.A)
        # remove extra whitespaces, then leading and trailing whitespace
        text = re.sub(' +', ' ', text).strip()

        # tokenise and drop English stop words
        kept_tokens = [token for token in word_tokenize(text)
                       if token not in english_stop_words]
        return ' '.join(kept_tokens)

    return [_normalise(review) for review in tqdm(reviews)]

Tokenizer function

In [None]:
def tokenizer_function(review):
    """Tokenize the 'text' entry of `review` with the notebook-level `tokenizer`.

    Sequences are truncated and padded to 512 tokens and returned as
    PyTorch tensors.
    """
    encoding_options = dict(
        # Truncate to max_length from the right by default
        truncation=True,
        # Pad to the maximum length
        padding="max_length",
        # Maximum sequence length for BERT models
        max_length=512,
        # Assuming you are using PyTorch; change to 'np' if necessary
        return_tensors='pt',
    )
    return tokenizer(review['text'], **encoding_options)

Metrics

In [None]:
# Load multiple metrics
# Load the four evaluation metrics once
accuracy_metric, precision_metric, recall_metric, f1_metric = (
    load(metric_name) for metric_name in ("accuracy", "precision", "recall", "f1")
)

def compute_metrics(pred):
    """Return accuracy plus weighted precision, recall and F1 for one evaluation pass.

    `pred` unpacks into (predictions, labels); the predicted class of each
    row is the argmax of the predictions along axis 1.
    """
    raw_scores, labels = pred
    predicted_classes = np.argmax(raw_scores, axis=1)

    # Accuracy takes no averaging mode
    results = {
        "accuracy": accuracy_metric.compute(
            predictions=predicted_classes, references=labels
        )["accuracy"],
    }

    # The remaining metrics are averaged with class-support weights
    weighted_metrics = (
        ("precision", precision_metric),
        ("recall", recall_metric),
        ("f1", f1_metric),
    )
    for key, metric in weighted_metrics:
        results[key] = metric.compute(
            predictions=predicted_classes, references=labels, average='weighted'
        )[key]

    return results

Train Test Split

In [None]:
# Features are the review texts, targets the three-class labels
X, y = sample_data_df['text'], sample_data_df['label']

# 70/30 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
)

Subset

In [None]:
# X_train = X_train[:100]
# y_train = y_train[:100]

# X_test = X_test[:30]
# y_test = y_test[:30]

Preprocessing

In [None]:
norm_train_reviews = pre_process_reviews(X_train)
norm_test_reviews = pre_process_reviews(X_test)

Datasets

In [None]:
# Wrap labels and normalised texts of each split in a Dataset
train_dataset = Dataset.from_dict(
    dict(label=y_train.to_list(), text=norm_train_reviews)
)
test_dataset = Dataset.from_dict(
    dict(label=y_test.to_list(), text=norm_test_reviews)
)

Generating Model

In [None]:
#Pretrained model
model_checkpoint = 'distilbert-base-uncased'

#Defining label classes
id_to_label = {0:'Negative', 1:'Neutral', 2:'Positive'}
label_to_id = {'Negative':0, 'Neutral': 1,'Positive':2}

#Model definiftion
model = DistilBertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels = 3,
    id2label = id_to_label,
    label2id =label_to_id 
)

Tokenizing

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

# Add a padding token only if the checkpoint's tokenizer has none, and grow
# the model's embedding matrix to match the enlarged vocabulary
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenized_train_dataset = train_dataset.map(tokenizer_function, batched=True)
# Fix: the evaluation set is built from test_dataset. It was previously mapped
# from train_dataset, so evaluation metrics were computed on training data.
tokenized_test_dataset = test_dataset.map(tokenizer_function, batched=True)

Collator

In [None]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

Training Arguments

In [None]:
# Hyperparameters for fine-tuning
output_dir = 'model_sentiment'
lr = 2e-5
batch_size = 32
EPOCHS = 3


training_args = TrainingArguments(
    output_dir = output_dir,
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    # Evaluation batches are twice the size of training batches
    per_device_eval_batch_size = batch_size*2,
    num_train_epochs = EPOCHS,
    weight_decay = 0.01,
    # Checkpoint and evaluate once per epoch; the two strategies must match
    # for load_best_model_at_end to work
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    logging_steps = 10,
    load_best_model_at_end = True,
    # Enable mixed precision only when a CUDA device is present; an
    # unconditional fp16=True makes this cell fail on CPU-only machines
    fp16 = torch.cuda.is_available()
)

Trainer

In [None]:
# Wire the model, training arguments, tokenized splits, tokenizer, metric
# function and padding collator into a single Trainer
trainer = Trainer(
                  model = model,
                  args = training_args,
                  train_dataset = tokenized_train_dataset,
                  # scored once per epoch (evaluation_strategy = 'epoch' above)
                  eval_dataset = tokenized_test_dataset,
                  tokenizer = tokenizer,
                  compute_metrics = compute_metrics,
                  data_collator = data_collator
)

Training

In [None]:
trained_model_results = trainer.train()

Initial fine-tuning of the pretrained model yielded accuracy values of ~40-50%.
Final accuracy values are around XXX%.

Steps taken to improve accuracy (roughly in order of application):
* Changed pre-trained model from `distilbert-base-uncased` to `MarieAngeA13/Sentiment-Analysis-BERT`
* Adjusted sample sizes of data (from ~100 records total to a balanced sample set with equal representation for all ratings)
* Updated text cleaning to cover more web-specific syntax (e.g., mentions, multiple spaces, hashtags, and web address elements)
* Adjusted syntax and arguments of the tokenizer function and how it is applied
* Adjusted training arguments to better align with our BERT-based model
* Added additional metrics for a better understanding of the necessary optimization
* Increased sample data size, again, and removed subset step entirely
* Adjusted batch size and epochs
* Moved back to `distilbert-base-uncased` and adjusted the tokenizer, learning rate, logging steps, and similar hyperparameters accordingly
* 

Paths

In [None]:
model_path = 'Sentiment_Analysis/model'
tokenizer_path =  'Sentiment_Analysis/tokenizer'

Saving Model & Tokenizer

In [None]:
trainer.save_model(model_path)
tokenizer.save_pretrained(tokenizer_path)

Loading and Testing model

In [None]:
# Reload the saved artefacts to check that they work on their own
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

roberto = pipeline('sentiment-analysis',model=model,tokenizer=tokenizer)

# Smoke test on one review. The text goes through the same normalisation the
# training data received, and truncation keeps long reviews within the
# model's maximum input length instead of raising at inference time.
smoke_test_text = pre_process_reviews([sample_data_df.iloc[0]['text']])[0]
roberto(smoke_test_text, truncation=True)

*End Code Space*

In [None]:
# Install necessary libraries (uncomment if needed)
# ! pip install selenium --quiet
# ! pip install webdriver-manager --quiet
# ! pip install beautifulsoup4 --quiet

# **Leigh's Code Space**

*End Code Space*

# **Angelica's Code Space**

In [None]:
#import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# # create a function to get the address of the each location and add it to a dataframe 
# def address_Addition(business,address1,new_df):
#     new_df['business_name'] = business
#     address_list = address1.split(',')
#     new_df['address'] = address_list[0]
#     new_df['city'] = address_list[1]
#     return new_df

# initiate driver
driver = webdriver.Chrome(service = ChromeService(ChromeDriverManager().install()))

# url list with business locations
#url = ['https://www.google.com/maps/place/Tim+Hortons/@43.7607366,-79.5321831,14z/data=!4m10!1m2!2m1!1stim+hortons!3m6!1s0x882b31d93eab2809:0xa9ea7bb65f9da6ec!8m2!3d43.7607366!4d-79.4992241!15sCgt0aW0gaG9ydG9ucyIDiAEBWg0iC3RpbSBob3J0b25zkgEKcmVzdGF1cmFudOABAA!16s%2Fg%2F1vyxk0xz','https://www.google.com/maps/place/Tim+Hortons/@43.7607366,-79.5321831,14z/data=!4m10!1m2!2m1!1stim+hortons!3m6!1s0x882b302d70a29891:0xc279061e4a5c71bc!8m2!3d43.756124!4d-79.5152637!15sCgt0aW0gaG9ydG9ucyIDiAEBWg0iC3RpbSBob3J0b25zkgEKcmVzdGF1cmFudOABAA!16s%2Fg%2F1td38wkb']
url = ["https://www.google.com/maps/place/McDonald's/@43.7607329,-79.5321831,14z/data=!4m10!1m2!2m1!1sMcDonald's!3m6!1s0x882b31e6d3859eb1:0xc92a9af2d1385093!8m2!3d43.7624131!4d-79.490243!15sCgpNY0RvbmFsZCdzIgOIAQFaDCIKbWNkb25hbGQnc5IBFGZhc3RfZm9vZF9yZXN0YXVyYW504AEA!16s%2Fg%2F1hc604hjv?entry=ttu"]


try:
    driver.get(url[0])
    # Give the page time to render before reading its source
    time.sleep(5)

    # Parse the rendered page
    response = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # Close this browser: the next scraping cell starts its own driver, so
    # leaving this one open would leak a Chrome instance
    driver.quit()

# Find the name of the business (None when the heading is not on the page)
name_tag = response.find('h1',class_='DUwDvf lfPIob')
business_name = name_tag.text if name_tag is not None else None
business_name



    # address = response.find('div',class_= 'rogA2c').text

In [None]:
business_name

In [None]:
#import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# create a function to get the address of the each location and add it to a dataframe 
def business_Overview(business,avg_rating,address1,new_df):
    """Annotate every row of `new_df` with the details of one business.

    Parameters
    ----------
    business : str
        Business name, written to 'business_name'.
    avg_rating : str
        Average rating, written to 'avg_rating' unchanged.
    address1 : str
        Comma-separated address. The first part is written to 'address' and
        the second to 'city', both with surrounding whitespace removed.
        When there is no second part, 'city' is set to None.
    new_df : pandas.DataFrame
        Frame to annotate; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same `new_df` object, with the four columns added.
    """
    address_parts = [part.strip() for part in address1.split(',')]

    new_df['business_name'] = business
    new_df['avg_rating'] = avg_rating
    new_df['address'] = address_parts[0]
    # An address without a comma has no city part; avoid the IndexError
    new_df['city'] = address_parts[1] if len(address_parts) > 1 else None
    return new_df

# initiate driver
driver = webdriver.Chrome(service = ChromeService(ChromeDriverManager().install()))

# url list with business locations
#url = ['https://www.google.com/maps/place/Tim+Hortons/@43.7607366,-79.5321831,14z/data=!4m10!1m2!2m1!1stim+hortons!3m6!1s0x882b31d93eab2809:0xa9ea7bb65f9da6ec!8m2!3d43.7607366!4d-79.4992241!15sCgt0aW0gaG9ydG9ucyIDiAEBWg0iC3RpbSBob3J0b25zkgEKcmVzdGF1cmFudOABAA!16s%2Fg%2F1vyxk0xz','https://www.google.com/maps/place/Tim+Hortons/@43.7607366,-79.5321831,14z/data=!4m10!1m2!2m1!1stim+hortons!3m6!1s0x882b302d70a29891:0xc279061e4a5c71bc!8m2!3d43.756124!4d-79.5152637!15sCgt0aW0gaG9ydG9ucyIDiAEBWg0iC3RpbSBob3J0b25zkgEKcmVzdGF1cmFudOABAA!16s%2Fg%2F1td38wkb']
url = ["https://www.google.com/maps/place/McDonald's/@43.7607329,-79.5321831,14z/data=!4m10!1m2!2m1!1sMcDonald's!3m6!1s0x882b31e6d3859eb1:0xc92a9af2d1385093!8m2!3d43.7624131!4d-79.490243!15sCgpNY0RvbmFsZCdzIgOIAQFaDCIKbWNkb25hbGQnc5IBFGZhc3RfZm9vZF9yZXN0YXVyYW504AEA!16s%2Fg%2F1hc604hjv?entry=ttu"]


#create for loop to parse through the different locations in the url list above
# Convert parsed review cards into a DataFrame (defined once, before the scraping loop)
def get_review_summary(result_set):
    """Collect reviewer name, review text and rating label from parsed review cards.

    Parameters
    ----------
    result_set : iterable of bs4 tags
        One element per review card.

    Returns
    -------
    pandas.DataFrame
        Columns 'Review Name', 'Review Text' and 'Review Rating'. A card that
        lacks one of the expected elements gets None (name, rating) or an
        empty string (text) instead of raising AttributeError/TypeError.
    """
    rev_dict = {
        'Review Name': [],
        'Review Text' : [],
        'Review Rating' : []}

    for result in result_set:
        name_tag = result.find(class_='d4r55')
        # presumably absent for rating-only reviews — TODO confirm
        text_tag = result.find('span',class_='wiI7pd')
        rating_tag = result.find(class_='kvMYJc')

        rev_dict['Review Name'].append(name_tag.text if name_tag is not None else None)
        rev_dict['Review Text'].append(text_tag.text if text_tag is not None else '')
        rev_dict['Review Rating'].append(
            rating_tag['aria-label'] if rating_tag is not None else None)

    return pd.DataFrame(rev_dict)


# Element that is scrolled to load more reviews
SCROLL_PANE_XPATH = '//*[@id="QA0Szd"]/div/div/div[1]/div[3]/div/div[1]/div/div/div[2]'
# Element clicked before scrolling (presumably the reviews tab — TODO confirm)
OPEN_REVIEWS_XPATH = '//*[@id="QA0Szd"]/div/div/div[1]/div[3]/div/div[1]/div/div/div[2]/div[2]/div[1]/div[1]/div[2]/div/div[1]/div[2]'
# Container searched for the "More" buttons
REVIEW_LIST_XPATH = SCROLL_PANE_XPATH + '/div[9]'

# By default, only 10 reviews can be extracted - to extract more reviews we have to scroll down the page
SCROLL_PAUSE_TIME = 5
MAX_SCROLLS = 5

# One annotated review frame per location. Collecting them in a list keeps
# every location; the previous pairwise concat kept only the first and the
# latest one once the url list had three or more entries.
location_frames = []

try:
    # parse through the different locations in the url list above
    for location_url in url:
        driver.get(location_url)
        time.sleep(5)

        # Find the name, average rating and address of the location
        response = BeautifulSoup(driver.page_source, 'html.parser')
        business_name = response.find('h1',class_='DUwDvf lfPIob').text
        avg_rating = response.find('div',class_='fontDisplayLarge').text
        address = response.find('div',class_= 'rogA2c').text
        driver.find_element('xpath', OPEN_REVIEWS_XPATH).click()
        time.sleep(3)

        # Get scroll height
        # NOTE(review): this first value is the document body's height while the
        # loop compares against the pane's height — confirm that is intended.
        last_height = driver.execute_script("return document.body.scrollHeight")

        # Scroll at most MAX_SCROLLS times, stopping early once the pane stops growing
        for _ in range(MAX_SCROLLS):
            # Scroll down to bottom
            ele = driver.find_element('xpath', SCROLL_PANE_XPATH)
            driver.execute_script('arguments[0].scrollBy(0, 5000);', ele)

            # Wait to load page
            time.sleep(SCROLL_PAUSE_TIME)

            # Calculate new scroll height and compare with last scroll height
            ele = driver.find_element('xpath', SCROLL_PANE_XPATH)
            new_height = driver.execute_script("return arguments[0].scrollHeight", ele)
            if new_height == last_height:
                break
            last_height = new_height

        next_item = driver.find_elements('xpath', REVIEW_LIST_XPATH)
        time.sleep(3)

        # expand reviews by clicking their 'More' button
        for container in next_item:
            for candidate in container.find_elements(By.TAG_NAME,'button'):
                if candidate.text == "More":
                    candidate.click()
            time.sleep(5)

        response = BeautifulSoup(driver.page_source, 'html.parser')
        next_2 = response.find_all('div',class_ = 'jftiEf')

        df = get_review_summary(next_2)
        location_frames.append(business_Overview(business_name,avg_rating,address,df))
finally:
    # Always close the browser, even when a lookup above fails
    driver.quit()

# All locations stacked; an empty frame when the url list is empty
final_df = pd.concat(location_frames,axis = 0) if location_frames else pd.DataFrame()

print(final_df)

In [None]:
# review df with reviews and locations
final_df


*End Code Space*

# **Odele's Code Space**

*End Code Space*

# **Vanessa's Code Space**

*End Code Space*

# **Train Test Splitting**

# **Scaling and Encoding**

# **Modeling**

# **Application (?)**

# **Findings**

# **Citations and Licenses**

## Citations

## Licenses