# **Project Title**
#### *Project Subtitle*

## Hypothesis;

Project thesis

In [None]:
# Importing dependencies
import pandas as pd

# **Data**

## Dataset

Dataset details

In [None]:
# Reading in data
# (potentially?)

### EDA

In [None]:
# Beginning EDA

# **Ramona's Code Space**

*End Code Space*

# **Christian's Code Space**

### Dependencies

In [None]:
# Installing necessary libraries (uncomment if needed)
# %pip install gdown --quiet
# ! pip install evaluate --quiet

In [None]:
# Imports and dependencies
import os
import re
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# json
import json

# gdown
import gdown

# zipfile
import zipfile

from tqdm import tqdm
import unicodedata

import torch
from datasets import load_metric

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

import transformers
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments,Trainer
from transformers import pipeline

from huggingface_hub import notebook_login

import accelerate

from datasets import Dataset

from evaluate import load

#### Resources path

In [None]:
# Defining a function to access datasets through `gdown`
def fetch_data(set):
    # Declaring `url` and `output` for dataset
    match set:
        case 'business':
            url = 'https://drive.google.com/file/d/1t-_rOjZ8oMqPcMJunVaMgY3OEbhnuSCv/view?usp=sharing'
            output = 'Resources/business_dataset.csv'
        case 'checkin':
            url = 'https://drive.google.com/file/d/1_AVWp31ymfvf4QgTiMN_WLAeapfr0omf/view?usp=sharing'
            output = 'Resources/checkin_dataset.csv'
        case 'reviews':
            url = 'https://drive.google.com/file/d/1L8rFjhOQyU90Ycr9t_OLA70vCYM0e7ck/view?usp=sharing'
            output = 'Resources/reviews_dataset.csv'
        case 'tip':
            url = 'https://drive.google.com/file/d/1LMkCi5AFC_58_m7ELmn1hR8YDykuXwqq/view?usp=sharing'
            output = 'Resources/tip_dataset.csv'
        case 'user':
            url = 'https://drive.google.com/file/d/1kQ522qcod7AjD5DO9vj8qFcSKxwJCDrO/view?usp=sharing'
            output = 'Resources/user_dataset.csv'
        case _:
            print('Invalid dataset selected, please try again')
            return None
    
    # Downloading dataset
    gdown.download(url, output, fuzzy=True, quiet=True)

    # Reading in the dataset
    df = pd.read_csv(output, low_memory=False)

    # Returning the dataset
    return df

Fetching/reading in all datasets

In [None]:
# Fetching all datasets (uncomment for first run of code)
# business_df = fetch_data('business')
# checkin_df = fetch_data('checkin')
# reviews_df = fetch_data('reviews')
# tips_df = fetch_data('tip')
# user_df = fetch_data('user')

# Reading in all datasets (uncomment if data already fetched)
business_df = pd.read_csv('./Resources/business_dataset.csv')
checkin_df = pd.read_csv('./Resources/checkin_dataset.csv')
reviews_df = pd.read_csv('./Resources/reviews_dataset.csv')
tips_df = pd.read_csv('./Resources/tip_dataset.csv')
user_df = pd.read_csv('./Resources/user_dataset.csv')

---

#### Business dataset

#### <font color='blue'> Description:</font> 
**Contains business data including location data, attributes, and categories.**

#### Overview

In [None]:
business_df.head()

#### Info

In [None]:
business_df.info()

---

#### Checkin dataset

#### <font color='blue'> Description:</font>
**Checkins on a business.**

#### Overview

In [None]:
checkin_df.head()

#### Info

In [None]:
checkin_df.info()

#### **<font color='orange'> Notes:</font>**
**The team has determined this dataset would not add any value to our training data.**

---

#### Reviews dataset

#### <font color = 'blue'>Description:</font>
**Contains full review text data including the user_id that wrote the review and the business_id the review is written for.**

#### Overview

In [None]:
reviews_df.head()

#### Info

In [None]:
reviews_df.info()

#### Na count

In [None]:
reviews_df.isna().sum()

#### Dropping columns:
- **review_id**
- **useful**
- **funny**
- **cool**

In [None]:
reviews_df.drop(columns = ['review_id','useful','funny','cool'],
                inplace = True)

#### Renaming the 'text' field to 'review'

In [None]:
reviews_df.rename(columns = {'text':'review'},inplace = True)
reviews_df.head()

#### **<font color='orange'> Notes:</font>**
- **review_id: Eliminated due to low informational value.**
- **useful: Eliminated due to low relevance.**
- **funny: Eliminated due to low relevance.**
- **cool: Eliminated due to low relevance.**

  **The *<font color='green'>'business_id'</font>* feature will be used as the identifier, *<font color='green'>'stars'</font>* is the rating metric and the *<font color='grey'>'review'</font>*  field encapsulates**<br>
  **the data to be processed. the *<font color='green'>'date'</font>* variable is in place if time series analysis is needed.**

---

#### Tips dataset

#### <font color='blue'>Description:</font>
**Tips written by a user on a business. Tips are shorter than reviews and tend to convey quick suggestions.**

#### Overview

In [None]:
tips_df.head()

#### Info

In [None]:
tips_df.info()

#### Dropping columns:
- **compliment_count**

In [None]:
tips_df.drop(columns = ['compliment_count'],
             inplace =True)

#### Renaming the 'text' column to 'recommendations'

In [None]:
tips_df.rename(columns = {'text':'recommendations'},inplace = True)
tips_df.head()

#### **<font color='orange'> Notes:</font>**
- **compliment_count: Eliminated due to low informational value.**


 **Since this data set has recommendations from the user to improve customer experience the 'recommendations' field could be a useful target variable.**

---

#### User dataset

#### <font color = 'blue'>Note:</font>
**User data including the user's friend mapping and all the metadata associated with the user.**

#### Overview

In [None]:
user_df.head()

#### Info

In [None]:
user_df.info()

#### **<font color='orange'> Notes:</font>**
**This data set will not be included in the training data to preserve user anonimity.**

# //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

## **<font color = 'darkgrey'>Merging the reviews data set and the business data set</font>**

#### <font color = 'blue'>Description:</font>
**This data set contains the fields that will be used to train the model**

In [None]:
data_df = reviews_df.merge(business_df,how='left',on = 'business_id')

In [None]:
data_df.info()

In [None]:
data_df.head()

#### Na count

In [None]:
data_df.isna().sum()

In [None]:
na_prcnt = data_df[['attributes','categories','hours']].isna().sum()/data_df.shape[0]*100
nas_df = pd.DataFrame(na_prcnt, columns=['percentage'])
nas_df = nas_df.transpose()
nas_df.round(4)

In [None]:
sns.barplot(data = nas_df).set_title('Na percentage')

#### **<font color='orange'> Notes:</font>**
**After consulting with the team we decided to drop all three columns.**

#### Dropping rows with na values

In [None]:
data_df.drop(columns = ['attributes','categories','hours'],inplace=True)

In [None]:
data_df.isna().sum()

#### *<font color='grey'>stars_x* and *<font color='grey'>stars_y* comparison

In [None]:
data_df.loc[data_df['stars_x'] != data_df['stars_y']][['stars_x','stars_y']].head()

#### *<font color='grey'>stars_x* and *<font color='grey'>stars_y*  for the same customer

In [None]:
data_df.loc[data_df['business_id']=='XQfwVwDr-v0ZS3_CbbE5Xw'][['stars_x','stars_y']].head()

#### *<font color='grey'>stars_x* average

In [None]:
round(data_df.loc[data_df['business_id']=='XQfwVwDr-v0ZS3_CbbE5Xw']['stars_x'].mean(),2)

#### **<font color='orange'> Notes:</font>**
**Because** *<font color='grey'> star_y</font>* **represents the average star rating, renaming** *<font color='grey'> star_y:</font>* **to:** *<font color='grey'> star_avg:</font>*

#### Renaming

In [None]:
data_df.rename(columns={'stars_y':'stars_avg','stars_x':'stars'},inplace = True)

#### Dropping is_open feature

In [None]:
fig,ax = plt.subplots()
sns.countplot(data_df,
             x='is_open',
             hue = 'is_open',
             ax = ax).set_title('is_open Feature')

#### droppin is_open

In [None]:
data_df.drop(columns = ['is_open'],inplace = True)

#### **<font color='orange'> Notes:</font>**
**After cosulting with the team we decided to drop this feature due low informational value and feature imbalance**

# //////////////////////////////////////////////////////////////////////////////////////////////////

## **<font color='darkgrey'>Merging with the tips data set exploration</font>**

#### <font color = 'blue'>Description:</font>
**Contains customer recommendatins to improve experience**

In [None]:
tips_df.head()

In [None]:
tips_df.info()

#### Quantity of unique business_id in the tips data set

In [None]:
display(tips_df['business_id'].unique().shape[0])

#### Quantity of unique business_id in  data_df

In [None]:
data_df['business_id'].unique().shape[0]

#### Subset of *<font color='grey'>business_id</font>* in *<font color='grey'>data_df</font>* not found in *<font color='grey'>tips_df</font>*.

In [None]:
no_tips_df = data_df[~data_df['business_id'].isin(tips_df['business_id'])]
no_tips_df.head()

#### Number of *<font color='grey'>business_id</font>* in *<font color='grey'>data_df</font>* not found in *<font color='grey'>tips_df</font>*.

In [None]:
no_tips_df = data_df[~data_df['business_id'].isin(tips_df['business_id'])]
not_found = no_tips_df['business_id'].unique().shape[0]
print(f'Number of business_ids in tips_df not found in data_df: {not_found}')

#### Evidence

In [None]:
tips_df.loc[tips_df['business_id'] == no_tips_df['business_id'].iloc[33]]

#### Merge

In [None]:
test_df = pd.merge(tips_df,data_df,
                   on = ['business_id','user_id'],
                   how = 'inner')
                         

#### Overview

In [None]:
test_df.info()

In [None]:
test_df.head()

#### Comparison review vs. recommendations

In [None]:
test_df[['review','recommendations']].head()

##### **<font color='orange'> Notes:</font>**
**The <font color='grey'>data_df</font> has approximately <font color='green'>7 million</font> entries and <font color='grey'>tips_df</font> about <font color='green'>1 million</font> after merging them we end up the a little under half a million**.<br>
**In the comparison above I don't see a difference between a review from the *reviews data set* and a recommendation from the *tips data set***.<br>
**As shown above we stand to loose a significant amount of data if a merge is performed**.

# ///////////////////////////////////////////////////////////////////////////////////////////////////

## <font color='darkgrey'>Final Data Overview</font>

#### Dropping the user_id column to preserv user anonimity

In [None]:
data_df.drop(columns = ['user_id'],inplace = True)

#### Overview

In [None]:
data_df.head()

#### Info

In [None]:
data_df.info()

#### Na verification

In [None]:
data_df.isna().sum()

## Modeling

In [None]:
# Narrowing data to trainable scope
def sample_stars(df, val):
    df = df[df['stars'] == val].copy()
    if val >= 4:
        df = df.sample(1000)
    elif val <= 2:
        df = df.sample(1000)
    else:
        df = df.sample(2000)
    df.reset_index(inplace=True)
    return df

In [None]:
sample_5 = sample_stars(data_df,5)
sample_4 = sample_stars(data_df,4)
sample_3 = sample_stars(data_df,3)
sample_2 = sample_stars(data_df,2)
sample_1 = sample_stars(data_df,1)

In [None]:
sample_data_df = pd.concat(
    [
        sample_1,
        sample_2,
        sample_3,
        sample_4,
        sample_5
    ], axis=0, ignore_index=True
)

sample_data_df.shape

In [None]:
sample_data_df['stars'].value_counts()

In [None]:
sample_data_df['stars']= sample_data_df['stars'].replace(to_replace=[1,2], value=0)
sample_data_df['stars']= sample_data_df['stars'].replace(to_replace=3, value=1)
sample_data_df['stars']= sample_data_df['stars'].replace(to_replace=[4,5], value=2)

In [None]:
sample_data_df['stars'].value_counts()

In [None]:
sample_data_df.isna().sum()

In [None]:
sample_data_df.rename(columns={'review':'text','stars':'label'},inplace = True)

Removing accented characters

In [None]:
def remove_accented_chars(text):
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  return text

Cleaning text

In [None]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove mentions
    text = re.sub(r"@\S+", "", text)
    # Remove hashtags
    text = re.sub(r"#\S+", "", text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

Preprocessing reviews

In [None]:
def pre_process_reviews(reviews):
  stop_words = set(stopwords.words('english'))  
  norm_reviews = []
  for review in tqdm(reviews):
    
    # Clean text
    review = clean_text(review)
    # remove extra newlines and convert them to spaces
    review = review.translate(review.maketrans("\n\t\r", "   "))
    # lower case
    review = review.lower()
    # remove accents
    review = remove_accented_chars(review)
    # remove special characters
    review = re.sub(r'[^a-zA-Z0-9\s]', '', review, flags=re.I|re.A)
    # remove extra whitespaces
    review = re.sub(' +', ' ', review)
    # remove leading and training whitespaces
    review = review.strip()

    review_tokens = word_tokenize(review)
    review = [w for w in review_tokens if not w in stop_words]
    review = ' '.join(review)
      
    norm_reviews.append(review)

  return norm_reviews

Tokenizer function

In [None]:
def tokenizer_function(review):
    # Extracting text
    text = review['text']

    # Tokenize text with truncation and padding
    tokenized_inputs = tokenizer(
        text,
        # Truncate to max_length from the right by default
        truncation=True,
        # Pad to the maximum length
        padding="max_length",
        # Maximum sequence length for BERT models
        max_length=512,
        # Assuming you are using PyTorch; change to 'np' if necessary
        return_tensors='pt'
    )

    return tokenized_inputs

Metrics

In [None]:
# Load multiple metrics
accuracy_metric = load("accuracy")
precision_metric = load("precision")
recall_metric = load("recall")
f1_metric = load("f1")

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='weighted')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='weighted')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    
    # Return a dictionary containing all metrics
    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }

Train Test Split

In [None]:
X = sample_data_df['text']
y = sample_data_df['label']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state=42)

Preprocessing

In [None]:
norm_train_reviews = pre_process_reviews(X_train)
norm_test_reviews = pre_process_reviews(X_test)

Datasets

In [None]:
train_dataset = Dataset.from_dict({'label':y_train.to_list(),'text':norm_train_reviews})
test_dataset = Dataset.from_dict({'label':y_test.to_list(),'text':norm_test_reviews})

Generating Model

In [None]:
#Pretrained model
model_checkpoint = 'distilbert-base-uncased'

#Defining label classes
id_to_label = {0:'Negative', 1:'Neutral', 2:'Positive'}
label_to_id = {'Negative':0, 'Neutral': 1,'Positive':2}

#Model definiftion
model = DistilBertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels = 3,
    id2label = id_to_label,
    label2id =label_to_id 
)

Tokenizing

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenized_train_dataset = train_dataset.map(tokenizer_function, batched=True)
tokenized_test_dataset = train_dataset.map(tokenizer_function, batched=True)

Collator

In [None]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

Training Arguments

In [None]:
output_dir = 'model_sentiment'
lr = 2e-5
batch_size = 32
EPOCHS = 3


training_args = TrainingArguments(
    output_dir = output_dir,
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size*2,
    num_train_epochs = EPOCHS,
    weight_decay = 0.01,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    logging_steps = 10,
    load_best_model_at_end = True,
    # Enable mixed precision
    fp16=True
)

Trainer

In [None]:
trainer = Trainer(
                  model = model,
                  args = training_args,
                  train_dataset = tokenized_train_dataset,
                  eval_dataset = tokenized_test_dataset,
                  tokenizer = tokenizer,
                  compute_metrics = compute_metrics,
                  data_collator = data_collator
)

Training

In [None]:
# Fine-tuning the model with sample set of balanced data
# (commented out to prevent re-training)

# trained_model_results = trainer.train()

Initial fine-tuning of pretrained model yielded accuracy values of ~40-50%.\
Final yielded accruacy values are around ~81%.

Steps taken to improve accuracy (in something close to resembling order of application);
* Changed pre-trained model from `distilbert-base-uncased` to `MarieAngeA13/Sentiment-Analysis-BERT`
* Adjusted sample sizes of data <br> (from ~100 records total to a balanced sample set of 1000 with equal representation for all ratings) <br> (it would be another iteration before that sample set would be a balanced represntaion of the *labels*, though)
* Updates to text cleaning to include more web-present syntax <br> (eg; mentions, multiple spaces, hashtags, and web address elements) <br> (because reviews aren't literary works, typically)
* Adjustted syntax and arguments of tokenizer function and the application of it
* Adjusted training arguments to better align with our BERT-based model <br> (*spoiler: this gets undone pretty soon after*)
* Added additional metrics for better understanding of neccessary optimization
* Increased sample data size, again, and removed subset step entirely <br> (started at 10,000 only to then decrease that sample size to 600 because of time, but it was still a larger sample than where it started)
* Adjusted batch size and epochs <br> (twice)
* Moved back to `distilbert-base-uncased` and adjusted learning tokenizers, learning rate, logging steps, and such hyperparameters accordingly <br> (because sometimes less Bert is better Bert)
* Bargained with Eldritch beings in the hopes of a single soul buying even just a 10% boost to accuracy <br> (which is to say the sample size was changed to 3,000) <br> (also added and evaluation step to get a better idea of performance)
* Exchanged soul because the deal was pretty tempting <br> (3,000 records had an accuracy of ~78%, so set the model to train overnight with 6,000 records in the hopes of an above 80% result)

Copied output from final evaluation;

> {'eval_loss': 0.4815390408039093, 'eval_accuracy': 0.8169047619047619, 'eval_precision': 0.8175331785953032, 'eval_recall': 0.8169047619047619, 'eval_f1': 0.8171466024398222, 'eval_runtime': 1743.5234, 'eval_samples_per_second': 2.409, 'eval_steps_per_second': 0.038, 'epoch': 3.0}


Evaluation

In [None]:
# Evaluate the model (commented out due to trainer already being trained)
# evaluation_metrics = trainer.evaluate()

# Print the final score (commented out due to trainer already being trained)
# print(evaluation_metrics)

Saving Model & Tokenizer

In [None]:
# Saving the model and toekenizer
# (commented out to prevent overwriting, # fetching handled through `gdown` and `zipfile`)
# trainer.save_model(model_path)

# tokenizer.save_pretrained(tokenizer_path)

Fetching and unzipping model

In [None]:
# Fetching model through `gdown`
url = 'https://drive.google.com/file/d/1tzYRkjv3wWpfg21pJ02SEYNXEcj-TVH3/view?usp=sharing'
output = 'Resources/Sentiment_Analysis.zip'
# Download model
gdown.download(url, output, fuzzy=True, quiet=False)

# Extracting model
with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall('./Sentiment_Analysis')

Paths

In [None]:
model_path = 'Sentiment_Analysis/model'
tokenizer_path =  'Sentiment_Analysis/tokenizer'

Loading and Testing model

In [None]:
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_path)

roberto = pipeline('sentiment-analysis',model=model,tokenizer=tokenizer)

In [None]:
# Function to apply `roberto` to any DF
def apply_roberto(df,review_col):
    # Iterate through `df`
    for index,row in df.iterrows():
        # Set review text as `text`
        text = row[review_col]
        # Generate results for a given review
        result = roberto(text, truncation=True)[0]
        # Append the sentiment label
        df.at[index, 'sent_label'] = result['label']
        #Append the sentiment score
        df.at[index, 'sent_score'] = result['score']
    # Return `df`
    return df

In [None]:
# Creating a susbet to test model
sample_set_df = sample_data_df[:100]

# Applying `apply_roberto` to `sample_set_df`
sample_set_df = apply_roberto(sample_data_df,'text')

# Confirming results
sample_set_df.head(10)

*End Code Space*

In [None]:
# Install necessary libraries (uncomment if needed)
# ! pip install selenium --quiet
# ! pip install webdriver-manager --quiet
# ! pip install beautifulsoup4 --quiet

# **Leigh's Code Space**

*End Code Space*

# **Angelica's Code Space**

In [None]:
#import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

In [None]:
# define a function to create df with the business name, avg rating and address of each location
def business_Overview(business_name,avg_rating,address1,lat,long,df):
    df['bus_id'] = business_name
    df['avg_rating'] = avg_rating
    address_list = address1.split(',')
    df['bus_add'] = address_list[0]
    df['bus_city'] = address_list[1]
    df['lat'] = lat
    df['lon'] = long
    return df

# import business df with Google Maps url for web scrapping
url_df = pd.read_csv('Resources/business_urls.csv')


# create list of urls and lat/long for web scrapping step 
url = url_df['url'].tolist()
lat = url_df['lat'].astype(str).tolist()
long = url_df['long'].astype(str).tolist()
#url = ['https://www.google.com/maps/place/Victoria+and+Albert+Museum/@51.4966392,-0.17218,15z/data=!4m5!3m4!1s0x0:0x9eb7094dfdcd651f!8m2!3d51.4966392!4d-0.17218']

# initiate driver
driver = webdriver.Chrome(service = ChromeService(ChromeDriverManager().install()))

#create for loop to parse through the different locations in the url list above
c = 0
df_list = []

for i in range(0,len(url)):
    c += 1
    driver.get(url[i])
    time.sleep(5)

    # get parameters needed for business overview function
    response = BeautifulSoup(driver.page_source, 'html.parser')
    business_name = response.find('h1',class_='DUwDvf lfPIob').text
    avg_rating = response.find('div',class_='fontDisplayLarge').text
    address = response.find('div',class_= 'rogA2c').text
    lat_ = lat[i]
    long_ = long[i]
    
    # navigate to Reviews tab
    driver.find_element(By.CLASS_NAME, "RWPxGd").click()
    time.sleep(3)

    #Find the total number of reviews
    total_number_of_reviews = driver.find_element('xpath','//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div[2]/div[3]').text.split(" ")[0]
    total_number_of_reviews = int(total_number_of_reviews.replace(',','')) if ',' in total_number_of_reviews else int(total_number_of_reviews)

    total_number_of_reviews = 10

    #Find scroll layout
    scrollable_div = driver.find_element('xpath','//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')

    #Scroll as many times as necessary to load all reviews - 10 reviews shown at a time
    for i in range(0,(round(total_number_of_reviews/10 - 1))):
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)
        time.sleep(1)

    # #parse HTML and Data Extraction
    # response = BeautifulSoup(driver.page_source, 'html.parser')
    # reviews = response.find_all('div', class_= 'm6QErb DxyBCb kA9KIf dS8AEf XiKgde')

    # loop over the number of reviews 
    next_item = driver.find_elements('xpath','//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[9]/div[1]/div/div')
    time.sleep(3)

    #expand review by click on 'more' button
    for i in next_item:
        button = i.find_elements(By.TAG_NAME,'button')
        for m in button:
            if m.text == "More":
                m.click()
        time.sleep(5)

    # parse through the HTML 
    response = BeautifulSoup(driver.page_source, 'html.parser')
    reviews = response.find_all('div',class_ = 'jftiEf')

    # define function to gather relevant data from the reviews result set obtained by parsing through HTML
    def get_review_summary(result_set):
            rev_dict = {
                'review' : [],
                'rating' : []}

            for result in result_set:
                #review_name = result.find(class_='d4r55').text
                review_text = result.find('span',class_='wiI7pd').text
                review_rating = result.find(class_='kvMYJc')['aria-label']
                review_rating = review_rating[0]
                rev_dict['review'].append(review_text)
                rev_dict['rating'].append(review_rating)
            
            import pandas as pd
            return(pd.DataFrame(rev_dict))
    
    # gather relevant data using newly created function above 
    summary_df = get_review_summary(reviews)

    # access the number of locations in the url list
    df = business_Overview(business_name,avg_rating,address,lat_,long_,summary_df)

    # append df to df list 
    df_list.append(df)


#concat list of data frames into one 
final_df = pd.concat(df_list, ignore_index=True)
final_df


In [None]:
# review df with reviews and locations
final_df


*End Code Space*

# **Odele's Code Space**

## Additional Libraries and Dependencies;

Application being developed with `Dash` by `Plotly`. Additional `pip install`s will be necessary.

In [None]:
# Install necessary packages
# Note: Uncomment if needed
# ! pip install dash
# ! pip install dash-bootstrap-components

In [1]:
# Import libraries and dependencies
import pandas as pd

# Dash
from dash import Dash, dcc, html, callback
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate

# Dash Boostrap Components
import dash_bootstrap_components as dbc

# Plotly
import plotly.express as px
import plotly.graph_objects as go

# Other
import math

## Components;

Existing variables and functions may necessistate refactoring. Temporary facsimiles used for development.

In [2]:
# Components
'''
Inputs and declearations necessary to make app function:

user_input: a URL or name of a business (tbd) for which to fetch reviews

sbmt_bttn: a submit button

bus_id: an ID str or name of a business (tbd) to display for a given business

bus_loc: an array or a str (tbd) representing the location(s) of a given business

avg_rating: an int representing the average rating for a given business

tot_ratings: an int representing the toal reviews submitted for a given business

reviews: a dictionary of reviews containing a given rating and associated comment

sentiment: a str representing the generated sentiment analysis based on all reviews

recommendation: a str representing the generated recommendations based on the generated sentiment
'''

'\nInputs and declearations necessary to make app function:\n\nuser_input: a URL or name of a business (tbd) for which to fetch reviews\n\nsbmt_bttn: a submit button\n\nbus_id: an ID str or name of a business (tbd) to display for a given business\n\nbus_loc: an array or a str (tbd) representing the location(s) of a given business\n\navg_rating: an int representing the average rating for a given business\n\ntot_ratings: an int representing the toal reviews submitted for a given business\n\nreviews: a dictionary of reviews containing a given rating and associated comment\n\nsentiment: a str representing the generated sentiment analysis based on all reviews\n\nrecommendation: a str representing the generated recommendations based on the generated sentiment\n'

#### Temporary Components;

Placeholders until merging with more finlized components possible.

In [3]:
# Example data: Replace with your real data from an API
locations = [
    {"lat": 37.7749, "lon": -122.4194, "name": "San Francisco"},
    {"lat": 34.0522, "lon": -118.2437, "name": "Los Angeles"}
]

lat_loc = [loc["lat"] for loc in locations]
lon_loc = [loc["lon"] for loc in locations]

lat_mean = sum(lat_loc)/len(lat_loc)
lon_mean = sum(lon_loc)/len(lon_loc)

lat_min, lat_max = min(lat_loc), max(lat_loc)
lon_min, lon_max = min(lon_loc), max(lon_loc)

# Function to calculate the zoom level
def calculate_zoom_level(lat_min, lat_max, lon_min, lon_max):
    lat_diff = lat_max - lat_min
    lon_diff = lon_max - lon_min

    # Using `loc()` to scale zoom based on distances at slower rates for larger geographic areas
    zoom = min(7 - math.log(lat_diff + 0.1), 7 - math.log(lon_diff + 0.1))
    return max(zoom, 0)  # Ensure the zoom level is not negative

zoom_level = calculate_zoom_level(lat_min, lat_max, lon_min, lon_max)

# Create the map figure
fig = go.Figure(go.Scattermapbox(
    lat=lat_loc,
    lon=lon_loc,
    mode='markers',
    hovertext = ['<br>' + loc["name"] for loc in locations],
    marker=dict(size=10)
))

# Update layout with map style and other properties
fig.update_layout(
    mapbox={
        'style': "open-street-map",
        'center': {'lon': lon_mean, 'lat': lat_mean},  # Center to cover all locations
        'zoom': zoom_level
    },
    margin={"r":0,"t":0,"l":0,"b":0},
    height=500
)

In [4]:
# Temporary default map

# Create a default map centered on the US
fig_placeholder = go.Figure(go.Scattermapbox())
fig_placeholder.update_layout(
    mapbox={
        'style': "open-street-map",
        'center': {'lon': -98.583, 'lat': 39.833},
        'zoom': 2.5
    },
    margin={"r":0,"t":0,"l":0,"b":0},
    height=400
)

## App Development;

Initialization, construction, and launch of app.

In [6]:
# Initialize app
app = Dash(external_stylesheets=[dbc.themes.QUARTZ])

# Layout
app.layout = html.Div([
    # Stack to establish negative space for whole of app
    dbc.Stack(
        [
            # Blank col for spacing
            dbc.Col('', width=1),
            # Col with all of GUI
            dbc.Col(
                [
                    # Row for header
                    dbc.Row(
                        html.H1(
                            'Review Sentiment Analysis and Recommendations',
                            style={'textAlign':'center'}
                        ), style={'margin-top': '20px', 'margin-bottom': '20px'}
                    ),
                    # Row for subheader
                    dbc.Row(
                        html.H3(
                            'An interactive application to leverage customer reviews into improved businesss',
                            style={'textAlign':'center'}
                        ), style={'margin-bottom': '20px'}
                    ),
                    # Row for user input
                    dbc.Stack(
                        [
                            # User input
                            dbc.Input(
                                id='user_input',
                                type='text',
                                placeholder='Input a Business Name'
                            ),
                            # Submit button
                            dbc.Button(
                                'Submit',
                                id='sbmt_bttn',
                                n_clicks=0
                            )
                        ],
                        style={'margin-bottom': '20px'},
                        direction='horizontal',
                        gap=1
                    ),
                    # Row for business name and ratings
                    dbc.Stack(
                        [
                            # Business name
                            dbc.Col(
                                dbc.Card(children='Business Name', id='bus_nm', body=True), width=8
                            ),
                            # Average rating
                            dbc.Col(
                                dbc.Card(children='Average Rating', id='avg_rtng', body=True), width=2
                            ),
                            # Total reviews
                            dbc.Col(
                                dbc.Card(children='Total Reviews', id='tot_rvws', body=True), width=2
                            )
                        ],
                        style={'margin-bottom': '20px'},
                        direction='horizontal',
                        gap=1
                    ),
                    # Row for map and accordion
                    dbc.Stack(
                        [
                            # Map
                            dbc.Col(
                                dcc.Graph(figure=fig_placeholder, id='bus_map'),
                                width=5
                            ),
                            # accordion
                            dbc.Col(
                                dbc.Accordion(
                                    [
                                        # Reviews
                                        dbc.AccordionItem(
                                            html.P(
                                                id='reviews',
                                                children='Select a business to see reviews.',
                                                style={'max-height': '300px', 'overflow-y': 'auto'}
                                            ),
                                            title='Reviews'
                                        ),
                                        # Sentiment analysis
                                        dbc.AccordionItem(
                                            html.P(
                                                id='sentiment',
                                                children='Select a business to generate sentiment analysis.',
                                                style={'max-height': '300px', 'overflow-y': 'auto'}
                                            ),
                                            title='Sentiment Analysis'
                                        ),
                                        # Recommendations
                                        dbc.AccordionItem(
                                            html.P(
                                                id='recommend',
                                                children='Select a business to generate recommendations.',
                                                style={'max-height': '300px', 'overflow-y': 'auto'}
                                            ),
                                            title='Recommendations'
                                        )
                                    ]
                                ),
                                align='start',
                                width=7
                            )
                        ],
                        direction='horizontal',
                        gap=1
                    )
                ], width=10
            ),
            # Blank col for spacing
            dbc.Col('', width=1)
        ],
        direction='horizontal',
        gap=1
    )
])

# Callback (in place for development only, will move to own cell once developed)
@callback(
    Output('bus_nm', 'children'),       # Name of the business == `user_input`
    Output('avg_rtng', 'children'),     # Average rating out of 5 stars
    Output('tot_rvws', 'children'),     # Total number of submitted reviews
    Output('bus_map', 'figure'),        # Lat and Lon of business location(s)
    Output('reviews', 'children'),      # Any review(s) left by users 
    Output('sentiment', 'children'),    # Generated seniment analysis based on reviews
    Output('recommend', 'children'),    # Generated recommendations based on sentiment analysis
    Input('sbmt_bttn', 'n_clicks'),     # Submit button to begin generation of content
    State('user_input', 'value')        # Business name to be entered by user
)

# Update content
def update_content(n_clicks, user_input):
    if n_clicks == 0 or not user_input:
        raise PreventUpdate
    
    business_name = user_input
    avg_rating = f'Average Rating: {4.5}'
    total_reviews = f'Total Reviews: {n_clicks}'
    fig_update = fig
    reviews = 'Collected reviews go here. Content will scroll if height of 300px exceeded.'
    sentiment = 'Generated sentiment analysis goes here. Content will scroll if height of 300px exceeded.'
    recommendation = 'Generated recommendations go here. Content will scroll if height of 300px exceeded.'
    return business_name, avg_rating, total_reviews, fig_update, reviews, sentiment, recommendation

# Launch app (in place for development only, will move to own cell once developed)
app.run(jupyter_mode='tab')
# app.run_server(debug=True)

Dash app running on http://127.0.0.1:8050/


<IPython.core.display.Javascript object>

*End Code Space*

# **Vanessa's Code Space**

*End Code Space*

# **Train Test Splitting**

# **Scaling and Encoding**

# **Modeling**

# **Application (?)**

# **Findings**

# **Citations and Licenses**

## Citations

## Licenses