# **Project Title**
#### *Project Subtitle*

## Hypothesis;

Project thesis

In [None]:
# Importing dependencies
import pandas as pd

# **Data**

## Dataset

Dataset details

In [None]:
# Reading in data
# (potentially?)

### EDA

In [None]:
# Beginning EDA

# **Ramona's Code Space**

*End Code Space*

# **Christian's Code Space**

### Dependencies

In [None]:
# Installing necessary libraries (uncomment if needed)
# %pip install gdown --quiet
# ! pip install evaluate --quiet

In [None]:
# Imports and dependencies
import os
import re
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# json
import json

# gdown
import gdown

# zipfile
import zipfile

from tqdm import tqdm
import unicodedata

import torch
from datasets import load_metric

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split

import transformers
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments,Trainer
from transformers import pipeline

from huggingface_hub import notebook_login

import accelerate

from datasets import Dataset

from evaluate import load

#### Resources path

In [None]:
# Defining a function to access datasets through `gdown`
def fetch_data(set):
    # Declaring `url` and `output` for dataset
    match set:
        case 'business':
            url = 'https://drive.google.com/file/d/1t-_rOjZ8oMqPcMJunVaMgY3OEbhnuSCv/view?usp=sharing'
            output = 'Resources/business_dataset.csv'
        case 'checkin':
            url = 'https://drive.google.com/file/d/1_AVWp31ymfvf4QgTiMN_WLAeapfr0omf/view?usp=sharing'
            output = 'Resources/checkin_dataset.csv'
        case 'reviews':
            url = 'https://drive.google.com/file/d/1L8rFjhOQyU90Ycr9t_OLA70vCYM0e7ck/view?usp=sharing'
            output = 'Resources/reviews_dataset.csv'
        case 'tip':
            url = 'https://drive.google.com/file/d/1LMkCi5AFC_58_m7ELmn1hR8YDykuXwqq/view?usp=sharing'
            output = 'Resources/tip_dataset.csv'
        case 'user':
            url = 'https://drive.google.com/file/d/1kQ522qcod7AjD5DO9vj8qFcSKxwJCDrO/view?usp=sharing'
            output = 'Resources/user_dataset.csv'
        case _:
            print('Invalid dataset selected, please try again')
            return None
    
    # Downloading dataset
    gdown.download(url, output, fuzzy=True, quiet=True)

    # Reading in the dataset
    df = pd.read_csv(output, low_memory=False)

    # Returning the dataset
    return df

Fetching/reading in all datasets

In [None]:
# Fetching all datasets (uncomment for first run of code)
# business_df = fetch_data('business')
# checkin_df = fetch_data('checkin')
# reviews_df = fetch_data('reviews')
# tips_df = fetch_data('tip')
# user_df = fetch_data('user')

# Reading in all datasets (uncomment if data already fetched)
business_df = pd.read_csv('./Resources/business_dataset.csv')
checkin_df = pd.read_csv('./Resources/checkin_dataset.csv')
reviews_df = pd.read_csv('./Resources/reviews_dataset.csv')
tips_df = pd.read_csv('./Resources/tip_dataset.csv')
user_df = pd.read_csv('./Resources/user_dataset.csv')

---

#### Business dataset

#### <font color='blue'> Description:</font> 
**Contains business data including location data, attributes, and categories.**

#### Overview

In [None]:
business_df.head()

#### Info

In [None]:
business_df.info()

---

#### Checkin dataset

#### <font color='blue'> Description:</font>
**Checkins on a business.**

#### Overview

In [None]:
checkin_df.head()

#### Info

In [None]:
checkin_df.info()

#### **<font color='orange'> Notes:</font>**
**The team has determined this dataset would not add any value to our training data.**

---

#### Reviews dataset

#### <font color = 'blue'>Description:</font>
**Contains full review text data including the user_id that wrote the review and the business_id the review is written for.**

#### Overview

In [None]:
reviews_df.head()

#### Info

In [None]:
reviews_df.info()

#### Na count

In [None]:
reviews_df.isna().sum()

#### Dropping columns:
- **review_id**
- **useful**
- **funny**
- **cool**

In [None]:
reviews_df.drop(columns = ['review_id','useful','funny','cool'],
                inplace = True)

#### Renaming the 'text' field to 'review'

In [None]:
reviews_df.rename(columns = {'text':'review'},inplace = True)
reviews_df.head()

#### **<font color='orange'> Notes:</font>**
- **review_id: Eliminated due to low informational value.**
- **useful: Eliminated due to low relevance.**
- **funny: Eliminated due to low relevance.**
- **cool: Eliminated due to low relevance.**

  **The *<font color='green'>'business_id'</font>* feature will be used as the identifier, *<font color='green'>'stars'</font>* is the rating metric and the *<font color='grey'>'review'</font>*  field encapsulates**<br>
  **the data to be processed. the *<font color='green'>'date'</font>* variable is in place if time series analysis is needed.**

---

#### Tips dataset

#### <font color='blue'>Description:</font>
**Tips written by a user on a business. Tips are shorter than reviews and tend to convey quick suggestions.**

#### Overview

In [None]:
tips_df.head()

#### Info

In [None]:
tips_df.info()

#### Dropping columns:
- **compliment_count**

In [None]:
tips_df.drop(columns = ['compliment_count'],
             inplace =True)

#### Renaming the 'text' column to 'recommendations'

In [None]:
tips_df.rename(columns = {'text':'recommendations'},inplace = True)
tips_df.head()

#### **<font color='orange'> Notes:</font>**
- **compliment_count: Eliminated due to low informational value.**


 **Since this data set has recommendations from the user to improve customer experience the 'recommendations' field could be a useful target variable.**

---

#### User dataset

#### <font color = 'blue'>Note:</font>
**User data including the user's friend mapping and all the metadata associated with the user.**

#### Overview

In [None]:
user_df.head()

#### Info

In [None]:
user_df.info()

#### **<font color='orange'> Notes:</font>**
**This data set will not be included in the training data to preserve user anonimity.**

# //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

## **<font color = 'darkgrey'>Merging the reviews data set and the business data set</font>**

#### <font color = 'blue'>Description:</font>
**This data set contains the fields that will be used to train the model**

In [None]:
data_df = reviews_df.merge(business_df,how='left',on = 'business_id')

In [None]:
data_df.info()

In [None]:
data_df.head()

#### Na count

In [None]:
data_df.isna().sum()

In [None]:
na_prcnt = data_df[['attributes','categories','hours']].isna().sum()/data_df.shape[0]*100
nas_df = pd.DataFrame(na_prcnt, columns=['percentage'])
nas_df = nas_df.transpose()
nas_df.round(4)

In [None]:
sns.barplot(data = nas_df).set_title('Na percentage')

#### **<font color='orange'> Notes:</font>**
**After consulting with the team we decided to drop all three columns.**

#### Dropping rows with na values

In [None]:
data_df.drop(columns = ['attributes','categories','hours'],inplace=True)

In [None]:
data_df.isna().sum()

#### *<font color='grey'>stars_x* and *<font color='grey'>stars_y* comparison

In [None]:
data_df.loc[data_df['stars_x'] != data_df['stars_y']][['stars_x','stars_y']].head()

#### *<font color='grey'>stars_x* and *<font color='grey'>stars_y*  for the same customer

In [None]:
data_df.loc[data_df['business_id']=='XQfwVwDr-v0ZS3_CbbE5Xw'][['stars_x','stars_y']].head()

#### *<font color='grey'>stars_x* average

In [None]:
round(data_df.loc[data_df['business_id']=='XQfwVwDr-v0ZS3_CbbE5Xw']['stars_x'].mean(),2)

#### **<font color='orange'> Notes:</font>**
**Because** *<font color='grey'> star_y</font>* **represents the average star rating, renaming** *<font color='grey'> star_y:</font>* **to:** *<font color='grey'> star_avg:</font>*

#### Renaming

In [None]:
data_df.rename(columns={'stars_y':'stars_avg','stars_x':'stars'},inplace = True)

#### Dropping is_open feature

In [None]:
fig,ax = plt.subplots()
sns.countplot(data_df,
             x='is_open',
             hue = 'is_open',
             ax = ax).set_title('is_open Feature')

#### droppin is_open

In [None]:
data_df.drop(columns = ['is_open'],inplace = True)

#### **<font color='orange'> Notes:</font>**
**After cosulting with the team we decided to drop this feature due low informational value and feature imbalance**

# //////////////////////////////////////////////////////////////////////////////////////////////////

## **<font color='darkgrey'>Merging with the tips data set exploration</font>**

#### <font color = 'blue'>Description:</font>
**Contains customer recommendatins to improve experience**

In [None]:
tips_df.head()

In [None]:
tips_df.info()

#### Quantity of unique business_id in the tips data set

In [None]:
display(tips_df['business_id'].unique().shape[0])

#### Quantity of unique business_id in  data_df

In [None]:
data_df['business_id'].unique().shape[0]

#### Subset of *<font color='grey'>business_id</font>* in *<font color='grey'>data_df</font>* not found in *<font color='grey'>tips_df</font>*.

In [None]:
no_tips_df = data_df[~data_df['business_id'].isin(tips_df['business_id'])]
no_tips_df.head()

#### Number of *<font color='grey'>business_id</font>* in *<font color='grey'>data_df</font>* not found in *<font color='grey'>tips_df</font>*.

In [None]:
no_tips_df = data_df[~data_df['business_id'].isin(tips_df['business_id'])]
not_found = no_tips_df['business_id'].unique().shape[0]
print(f'Number of business_ids in tips_df not found in data_df: {not_found}')

#### Evidence

In [None]:
tips_df.loc[tips_df['business_id'] == no_tips_df['business_id'].iloc[33]]

#### Merge

In [None]:
test_df = pd.merge(tips_df,data_df,
                   on = ['business_id','user_id'],
                   how = 'inner')
                         

#### Overview

In [None]:
test_df.info()

In [None]:
test_df.head()

#### Comparison review vs. recommendations

In [None]:
test_df[['review','recommendations']].head()

##### **<font color='orange'> Notes:</font>**
**The <font color='grey'>data_df</font> has approximately <font color='green'>7 million</font> entries and <font color='grey'>tips_df</font> about <font color='green'>1 million</font> after merging them we end up the a little under half a million**.<br>
**In the comparison above I don't see a difference between a review from the *reviews data set* and a recommendation from the *tips data set***.<br>
**As shown above we stand to loose a significant amount of data if a merge is performed**.

# ///////////////////////////////////////////////////////////////////////////////////////////////////

## <font color='darkgrey'>Final Data Overview</font>

#### Dropping the user_id column to preserv user anonimity

In [None]:
data_df.drop(columns = ['user_id'],inplace = True)

#### Overview

In [None]:
data_df.head()

#### Info

In [None]:
data_df.info()

#### Na verification

In [None]:
data_df.isna().sum()

## Modeling

In [None]:
# Narrowing data to trainable scope
def sample_stars(df, val):
    df = df[df['stars'] == val].copy()
    if val >= 4:
        df = df.sample(1000)
    elif val <= 2:
        df = df.sample(1000)
    else:
        df = df.sample(2000)
    df.reset_index(inplace=True)
    return df

In [None]:
sample_5 = sample_stars(data_df,5)
sample_4 = sample_stars(data_df,4)
sample_3 = sample_stars(data_df,3)
sample_2 = sample_stars(data_df,2)
sample_1 = sample_stars(data_df,1)

In [None]:
sample_data_df = pd.concat(
    [
        sample_1,
        sample_2,
        sample_3,
        sample_4,
        sample_5
    ], axis=0, ignore_index=True
)

sample_data_df.shape

In [None]:
sample_data_df['stars'].value_counts()

In [None]:
sample_data_df['stars']= sample_data_df['stars'].replace(to_replace=[1,2], value=0)
sample_data_df['stars']= sample_data_df['stars'].replace(to_replace=3, value=1)
sample_data_df['stars']= sample_data_df['stars'].replace(to_replace=[4,5], value=2)

In [None]:
sample_data_df['stars'].value_counts()

In [None]:
sample_data_df.isna().sum()

In [None]:
sample_data_df.rename(columns={'review':'text','stars':'label'},inplace = True)

Removing accented characters

In [None]:
def remove_accented_chars(text):
  text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
  return text

Cleaning text

In [None]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove mentions
    text = re.sub(r"@\S+", "", text)
    # Remove hashtags
    text = re.sub(r"#\S+", "", text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text

Preprocessing reviews

In [None]:
def pre_process_reviews(reviews):
  stop_words = set(stopwords.words('english'))  
  norm_reviews = []
  for review in tqdm(reviews):
    
    # Clean text
    review = clean_text(review)
    # remove extra newlines and convert them to spaces
    review = review.translate(review.maketrans("\n\t\r", "   "))
    # lower case
    review = review.lower()
    # remove accents
    review = remove_accented_chars(review)
    # remove special characters
    review = re.sub(r'[^a-zA-Z0-9\s]', '', review, flags=re.I|re.A)
    # remove extra whitespaces
    review = re.sub(' +', ' ', review)
    # remove leading and training whitespaces
    review = review.strip()

    review_tokens = word_tokenize(review)
    review = [w for w in review_tokens if not w in stop_words]
    review = ' '.join(review)
      
    norm_reviews.append(review)

  return norm_reviews

Tokenizer function

In [None]:
def tokenizer_function(review):
    # Extracting text
    text = review['text']

    # Tokenize text with truncation and padding
    tokenized_inputs = tokenizer(
        text,
        # Truncate to max_length from the right by default
        truncation=True,
        # Pad to the maximum length
        padding="max_length",
        # Maximum sequence length for BERT models
        max_length=512,
        # Assuming you are using PyTorch; change to 'np' if necessary
        return_tensors='pt'
    )

    return tokenized_inputs

Metrics

In [None]:
# Load multiple metrics
accuracy_metric = load("accuracy")
precision_metric = load("precision")
recall_metric = load("recall")
f1_metric = load("f1")

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    precision = precision_metric.compute(predictions=predictions, references=labels, average='weighted')
    recall = recall_metric.compute(predictions=predictions, references=labels, average='weighted')
    f1 = f1_metric.compute(predictions=predictions, references=labels, average='weighted')
    
    # Return a dictionary containing all metrics
    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }

Train Test Split

In [None]:
X = sample_data_df['text']
y = sample_data_df['label']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,random_state=42)

Preprocessing

In [None]:
norm_train_reviews = pre_process_reviews(X_train)
norm_test_reviews = pre_process_reviews(X_test)

Datasets

In [None]:
train_dataset = Dataset.from_dict({'label':y_train.to_list(),'text':norm_train_reviews})
test_dataset = Dataset.from_dict({'label':y_test.to_list(),'text':norm_test_reviews})

Generating Model

In [None]:
#Pretrained model
model_checkpoint = 'distilbert-base-uncased'

#Defining label classes
id_to_label = {0:'Negative', 1:'Neutral', 2:'Positive'}
label_to_id = {'Negative':0, 'Neutral': 1,'Positive':2}

#Model definiftion
model = DistilBertForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels = 3,
    id2label = id_to_label,
    label2id =label_to_id 
)

Tokenizing

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained(model_checkpoint)

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

tokenized_train_dataset = train_dataset.map(tokenizer_function, batched=True)
tokenized_test_dataset = train_dataset.map(tokenizer_function, batched=True)

Collator

In [None]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

Training Arguments

In [None]:
output_dir = 'model_sentiment'
lr = 2e-5
batch_size = 32
EPOCHS = 3


training_args = TrainingArguments(
    output_dir = output_dir,
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size*2,
    num_train_epochs = EPOCHS,
    weight_decay = 0.01,
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch',
    logging_steps = 10,
    load_best_model_at_end = True,
    # Enable mixed precision
    fp16=True
)

Trainer

In [None]:
trainer = Trainer(
                  model = model,
                  args = training_args,
                  train_dataset = tokenized_train_dataset,
                  eval_dataset = tokenized_test_dataset,
                  tokenizer = tokenizer,
                  compute_metrics = compute_metrics,
                  data_collator = data_collator
)

Training

In [None]:
# Fine-tuning the model with sample set of balanced data
# (commented out to prevent re-training)

# trained_model_results = trainer.train()

Initial fine-tuning of pretrained model yielded accuracy values of ~40-50%.\
Final yielded accruacy values are around ~81%.

Steps taken to improve accuracy (in something close to resembling order of application);
* Changed pre-trained model from `distilbert-base-uncased` to `MarieAngeA13/Sentiment-Analysis-BERT`
* Adjusted sample sizes of data <br> (from ~100 records total to a balanced sample set of 1000 with equal representation for all ratings) <br> (it would be another iteration before that sample set would be a balanced represntaion of the *labels*, though)
* Updates to text cleaning to include more web-present syntax <br> (eg; mentions, multiple spaces, hashtags, and web address elements) <br> (because reviews aren't literary works, typically)
* Adjustted syntax and arguments of tokenizer function and the application of it
* Adjusted training arguments to better align with our BERT-based model <br> (*spoiler: this gets undone pretty soon after*)
* Added additional metrics for better understanding of neccessary optimization
* Increased sample data size, again, and removed subset step entirely <br> (started at 10,000 only to then decrease that sample size to 600 because of time, but it was still a larger sample than where it started)
* Adjusted batch size and epochs <br> (twice)
* Moved back to `distilbert-base-uncased` and adjusted learning tokenizers, learning rate, logging steps, and such hyperparameters accordingly <br> (because sometimes less Bert is better Bert)
* Bargained with Eldritch beings in the hopes of a single soul buying even just a 10% boost to accuracy <br> (which is to say the sample size was changed to 3,000) <br> (also added and evaluation step to get a better idea of performance)
* Exchanged soul because the deal was pretty tempting <br> (3,000 records had an accuracy of ~78%, so set the model to train overnight with 6,000 records in the hopes of an above 80% result)

Copied output from final evaluation;

> {'eval_loss': 0.4815390408039093, 'eval_accuracy': 0.8169047619047619, 'eval_precision': 0.8175331785953032, 'eval_recall': 0.8169047619047619, 'eval_f1': 0.8171466024398222, 'eval_runtime': 1743.5234, 'eval_samples_per_second': 2.409, 'eval_steps_per_second': 0.038, 'epoch': 3.0}


Evaluation

In [None]:
# Evaluate the model (commented out due to trainer already being trained)
# evaluation_metrics = trainer.evaluate()

# Print the final score (commented out due to trainer already being trained)
# print(evaluation_metrics)

Saving Model & Tokenizer

In [None]:
# Saving the model and toekenizer
# (commented out to prevent overwriting, # fetching handled through `gdown` and `zipfile`)
# trainer.save_model(model_path)

# tokenizer.save_pretrained(tokenizer_path)

Fetching and unzipping model

In [22]:
# Fetching model through `gdown`
url = 'https://drive.google.com/file/d/1tzYRkjv3wWpfg21pJ02SEYNXEcj-TVH3/view?usp=sharing'
output = 'Resources/Sentiment_Analysis.zip'
# Download model
gdown.download(url, output, fuzzy=True, quiet=False)

# Extracting model
with zipfile.ZipFile(output, 'r') as zip_ref:
    zip_ref.extractall('./Sentiment_Analysis')

Downloading...
From (original): https://drive.google.com/uc?id=1tzYRkjv3wWpfg21pJ02SEYNXEcj-TVH3
From (redirected): https://drive.google.com/uc?id=1tzYRkjv3wWpfg21pJ02SEYNXEcj-TVH3&confirm=t&uuid=fbba920e-71af-4885-98ba-db048c2f7301
To: /Users/angelicacalderon/repos/AI_Project_3_Group_01/Resources/Sentiment_Analysis.zip
100%|██████████| 247M/247M [00:21<00:00, 11.7MB/s] 


Paths

In [23]:
model_path = 'Sentiment_Analysis/Sentiment_Analysis/model'
tokenizer_path =  'Sentiment_Analysis/Sentiment_Analysis/tokenizer'

Loading and Testing model

In [24]:
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(tokenizer_path)

roberto = pipeline('sentiment-analysis',model=model,tokenizer=tokenizer)

In [None]:
# Function to apply `roberto` to any DF
def apply_roberto(df,review_col):
    # Iterate through `df`
    for index,row in df.iterrows():
        # Set review text as `text`
        text = row[review_col]
        # Generate results for a given review
        result = roberto(text, truncation=True)[0]
        # Append the sentiment label
        df.at[index, 'sent_label'] = result['label']
        #Append the sentiment score
        df.at[index, 'sent_score'] = result['score']
    # Return `df`
    return df

In [None]:
# Creating a susbet to test model
sample_set_df = sample_data_df[:100]

# Applying `apply_roberto` to `sample_set_df` (commented out as a saved subset exists)
# sample_set_df = apply_roberto(sample_data_df,'text')

# Confirming results
sample_set_df.head(10)

*End Code Space*

In [None]:
# Exporting sample subset for later use (commented out because `.csv` in `Resources/`)
# sample_set_df.to_csv('./Resources/sample_set.csv')

In [None]:
# Reading in existing subset with `roberto` applied
sample_set_df = pd.read_csv('Resources/sample_set.csv')

# Confirming subset
sample_set_df.head(2)

# Functions (Pt 1)

While trained on Yelp! data, and developed for Google Reviews, the goal of the application is to be as univerally applicable to business reviews as possible - regardless of the source. The following functions were developed with their annotated purposes in mind:

| **Function** | **Notes** |
| :--- | :---|
| `apply_roberto()` | Generates sentiment analysis for reviews in a given dataset, and a confidence in that sentiment |
| `business_names_list()` | Generates a list of unique business names from a given dataset |
| `reviews_list()` | Generates a list of all reviews submitted to a business for all its locations |
| `general_sentiment()` | Classifies the general sentiment for a business' reviews and provides a mean confidence in that sentiment <br> *Note: To be run after a DataFrame has been passed through* `apply_roberto()` |

Function outlined in more detail below require a DataFrame with the following features:

| **Feature** | **Notes** |
| :--- | :--- |
| `bus_name_col` | A text column with the name of a business |
| `bus_add` | A text column with the street address of a business' location |
| `rev_col` | A text column with available reviews |
| `sent_lbl` | A text column with the generated sentiment classification <br> *Note: Generated through* `apply_roberto()` |
| `sent_scr` | A text column with the generated sentiment classification <br> *Note: Generated through* `apply_roberto()` |

In [25]:
# Function to apply `roberto` to any DF
def apply_roberto(df,rev_col):
    '''
    Applies the `roberto` model to generate sentiment analysis for the reviews in a DataFrame.

    Args:
        df (DataFrame):     Any DataFrame with sufficient data.
        rev_col (str):      A string with the feature name that contains the review text.
    
    Returns:
        df (DataFrame):     The same DataFrame with the appended sentiments and confidence scores.
    
    Raises:
        KeyError: If `rev_col` is not a valid column name in the DataFrame.
        TypeError: If `df` is not a DataFrame or if `review_col` is not a string.
    '''
    #Raises
    if not isinstance(df, pd.DataFrame):
        raise TypeError('The input `df` must be a pandas DataFrame.')
    if not isinstance(rev_col, str):
        raise TypeError('The `rev_col` parameter must be passed as a string.')
    if rev_col not in df.columns:
        raise KeyError(f"Column '{rev_col}' not found in DataFrame.")
    
    # Initializing features for results
    df['sent_label'] = ''
    df['sent_score'] = 0.0

    # Iterating through `df`
    for index,row in df.iterrows():
        # Setting review text as `text`
        text = row[rev_col]
        # Generating results for a given review
        result = roberto(text, truncation=True)[0]
        # Appending the sentiment label
        df.at[index, 'sent_label'] = result['label']
        #Appending the sentiment score
        df.at[index, 'sent_score'] = result['score']
    
    # Returning `df`
    return df

In [None]:
# Function to retrieve unique business names
def business_names_list(df, bus_name_col):
    '''
    Places unique names from a list of businesses into a list.

    Args:
        df (DataFrame):     Any DataFrame with sufficient data.
        bus_name_col (str): A string with the feature name that contains the business name.

    Returns:
        names (list):       A list of strings with only unique values.

    Raises:
        KeyError:           If `bus_name` is not a valid column name in the DataFrame.
        TypeError:          If `df` is not a DataFrame or if `bus_name` is not a string.
    '''
    # Raises
    if not isinstance(df, pd.DataFrame):
        raise TypeError('The input `df` must be a pandas DataFrame.')
    if not isinstance(bus_name_col, str):
        raise TypeError('The `bus_name` parameter must be passed as a string.')
    if bus_name_col not in df.columns:
        raise KeyError(f"Column '{bus_name_col}' not found in DataFrame.")

    # Generating a list of business names
    names = df[bus_name_col].unique().tolist()
    
    # Returning the list
    return names

In [36]:
# Function to retrieve all reviews
def reviews_list(df, bus_name_col, bus_name, bus_add, rev_col):
    '''
    Places all reviews for a given business into a list, attributing each review to its specific location.

    Args:
        df (DataFrame):     Any DataFrame with sufficient data.
        bus_name_col (str): A string with the feature name that contains the business name.
        bus_name (str):     A string of a specific business' name for which to map the locations.
        bus_add (str):      A string with the feature name that contains the business street address.
        rev_col (str):      A string with the feature name that contains the review text.

    Returns:
        reviews (list):     A list of strings with all the reviews for a given business.

    Raises:
        KeyError:           If any passed str is not a valid column name in the DataFrame, or if `bus_name` is not a value in `bus_name_col`.
        TypeError:          If `df` is not a DataFrame  if and feature is not a string, or if `bus_name` is not a string.
    '''
    # Raises
    if not isinstance(df, pd.DataFrame):
        raise TypeError('The input `df` must be a pandas DataFrame.')
    for param, name in zip(
        [bus_name_col, bus_add, rev_col],
        ['bus_name_col', 'bus_add', 'rev_col']
    ):
        if not isinstance(param, str):
            raise TypeError(f"The '{name}' parameter must be passed as a string.")
        if param not in df.columns:
            raise KeyError(f"Column '{param}' not found in DataFrame.")
    if not isinstance(bus_name, str):
        raise TypeError('The `bus_name` parameter must be passed as a string.')
    if bus_name not in df[bus_name_col].values:
        raise KeyError(f"Value '{bus_name}' not found in column '{bus_name_col}'.")
    
    # Filtering `df`
    filtered_df = df[[bus_add, rev_col]][df[bus_name_col] == bus_name].copy()

    # Handling missing or empty reviews
    filtered_df[rev_col] = filtered_df[rev_col].fillna('No review provided.')

    # Creating a list of all reviews
    reviews = filtered_df[bus_add] + ':\n' + filtered_df[rev_col] + '\n\n'

    # Converting to a list
    reviews = reviews.to_list()

    # Returning reviews
    return reviews

In [33]:
# Function to generalize the overall sentiment
def general_sentiment(df, bus_name_col, bus_name, sent_lbl, sent_scr):
    '''
    Compares the total positive, negative, and neutral reviews to classify an overall sentiment.

    Note:
        To be run after passing a DataFrame through `apply_roberto()`.

    Args:
        df (DataFrame):     Any DataFrame with sufficient data.
        bus_name_col (str): A string with the feature name that contains the business name.
        bus_name (str):     A string of a specific business' name for which to map the locations.
        sent_lbl (str):     A string with the feature name that contains the modeled sentiment label.
        sent_scr (str):     A string with the feature name that contains the modeled sentiment confidence.

    Returns:
        gen_sent (str):     A string with the overall sentiment, and the model's mean confidence in that classification.
    
    Raises:
        KeyError:           If any passed str is not a valid column name in the DataFrame, or if `bus_name` is not a value in `bus_name_col`.
        TypeError:          If `df` is not a DataFrame  if and feature is not a string, or if `bus_name` is not a string.
    '''
    # Raises
    if not isinstance(df, pd.DataFrame):
        raise TypeError('The input `df` must be a pandas DataFrame.')
    for param, name in zip(
        [bus_name_col, sent_lbl, sent_scr],
        ['bus_name_col', 'sent_lbl', 'sent_scr']
    ):
        if not isinstance(param, str):
            raise TypeError(f"The '{name}' parameter must be passed as a string.")
        if param not in df.columns:
            raise KeyError(f"Column '{param}' not found in DataFrame.")
    if not isinstance(bus_name, str):
        raise TypeError('The `bus_name` parameter must be passed as a string.')
    if bus_name not in df[bus_name_col].values:
        raise KeyError(f"Value '{bus_name}' not found in column '{bus_name_col}'.")
    
    # Filtering `df`
    filtered_df = df[[sent_lbl, sent_scr]][df[bus_name_col] == bus_name].copy()

    # Converting `sentiment` to lower case
    filtered_df[sent_lbl] = filtered_df[sent_lbl].str.lower()

    # Calculating total `positive` sentiment
    pos = filtered_df.loc[filtered_df[sent_lbl] == 'positive'].shape[0]
    # Calculating total `neutral` sentiment
    ntrl = filtered_df.loc[filtered_df[sent_lbl] == 'neutral'].shape[0]
    # Calculating total `negative` sentiment
    neg = filtered_df.loc[filtered_df[sent_lbl] == 'negative'].shape[0]

    # Match case to generate general sentiment
    match (pos, ntrl, neg):
        case (p, n, ng) if p > n > ng:
            sent = 'highly positive'
        case (p, n, ng) if p > n + ng:
            sent = 'strongly positive'
        case (p, n, ng) if p + n > ng:
            sent = 'moderately positive'
        case (p, n, ng) if p < n > ng:
            sent = 'generally neutral'
        case (p, n, ng) if p < n < ng:
            sent = 'moderately negative'
        case (p, n, ng) if p + n < ng:
            sent = 'strongly negative'
        case (p, n, ng) if ng > n > p:
            sent = 'highly negative'
        case (p, n, ng) if p == n == ng:
            sent = 'perfectly neutral'
        case _:
            sent = 'undetermined'
    
    # Calculate the mean confidence
    conf = filtered_df[sent_scr].mean() * 100

    # Generating the final sentiment
    if pos + ntrl + neg != 0:
        # Concatenating sentiment and confidence
        gen_sent = f'The general sentiment is {sent}, with an average confidence of {conf:.1f}%.'
    else:
        # When no sentment available due to no reviews
        gen_sent = 'Cannot confirm sentiment due to a lack of reviews.'

    # Returning sentiment
    return gen_sent

In [None]:
# Install necessary libraries (uncomment if needed)
# ! pip install selenium --quiet
# ! pip install webdriver-manager --quiet
# ! pip install beautifulsoup4 --quiet

# **Leigh's Code Space**

*End Code Space*

# **Angelica's Code Space**

In [18]:
#import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

In [19]:
# define a function to create df with the business name, avg rating and address of each location
def business_Overview(business_name,avg_rating,address1,lat,long,df):
    df['bus_id'] = business_name
    df['avg_rating'] = avg_rating
    address_list = address1.split(',')
    df['bus_add'] = address_list[0]
    df['bus_city'] = address_list[1]
    df['lat'] = lat
    df['lon'] = long
    return df

# import business df with Google Maps url for web scrapping
url_df = pd.read_csv('Resources/business_urls.csv')

# select the fist 11 business urls from the file
url_df = url_df.head(11)

# create list of urls and lat/long for web scrapping step 
url = url_df['url'].tolist()
lat = url_df['lat'].astype(str).tolist()
long = url_df['long'].astype(str).tolist()

# initiate driver
driver = webdriver.Chrome(service = ChromeService(ChromeDriverManager().install()))

#create for loop to parse through the different locations in the url list above
c = 0
df_list = []

for i in range(0,len(url)):
    c += 1
    driver.get(url[i])
    time.sleep(5)

    # get parameters needed for business overview function
    response = BeautifulSoup(driver.page_source, 'html.parser')
    business_name = response.find('h1',class_='DUwDvf lfPIob').text
    avg_rating = response.find('div',class_='fontDisplayLarge').text
    address = response.find('div',class_= 'rogA2c').text
    lat_ = lat[i]
    long_ = long[i]
    
    # navigate to Reviews tab
    driver.find_element(By.CLASS_NAME, "RWPxGd").click()
    time.sleep(3)

    #Find the total number of reviews
    total_number_of_reviews = driver.find_element('xpath','//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[2]/div/div[2]/div[3]').text.split(" ")[0]
    total_number_of_reviews = int(total_number_of_reviews.replace(',','')) if ',' in total_number_of_reviews else int(total_number_of_reviews)

    total_number_of_reviews = 50

    #Find scroll layout
    scrollable_div = driver.find_element('xpath','//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]')

    #Scroll as many times as necessary to load all reviews - 10 reviews shown at a time
    for i in range(0,(round(total_number_of_reviews/10 - 1))):
        driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)
        time.sleep(1)

    # #parse HTML and Data Extraction
    # loop over the number of reviews 
    next_item = driver.find_elements('xpath','//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]/div[9]/div[1]/div/div')
    time.sleep(3)

    #expand review by click on 'more' button
    for i in next_item:
        button = i.find_elements(By.TAG_NAME,'button')
        for m in button:
            if m.text == "More":
                m.click()
        time.sleep(5)

    # parse through the HTML 
    response = BeautifulSoup(driver.page_source, 'html.parser')
    reviews = response.find_all('div',class_ = 'jftiEf')

    # define function to gather relevant data from the reviews result set obtained by parsing through HTML
    def get_review_summary(result_set):
            rev_dict = {
                'review' : [],
                'rating' : []}

            for result in result_set:
                #review_name = result.find(class_='d4r55').text
                review_text = result.find('span',class_='wiI7pd').text
                review_rating = result.find(class_='kvMYJc')['aria-label']
                review_rating = review_rating[0]
                rev_dict['review'].append(review_text)
                rev_dict['rating'].append(review_rating)
            
            import pandas as pd
            return(pd.DataFrame(rev_dict))
    
    # gather relevant data using newly created function above 
    summary_df = get_review_summary(reviews)

    # access the number of locations in the url list
    df = business_Overview(business_name,avg_rating,address,lat_,long_,summary_df)

    # append df to df list 
    df_list.append(df)


#concat list of data frames into one 
spooder_df = pd.concat(df_list, ignore_index=True)
spooder_df


Unnamed: 0,review,rating,bus_id,avg_rating,bus_add,bus_city,lat,lon
0,"The cakes are beautiful and delicious, but you...",5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
1,The entire experience was excellent: well-brew...,5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
2,"Everything was great.\n\nFood, service, and th...",5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
3,The soup is amazing. My family likes the Itali...,5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
4,Absolutely love Dulce de Leche Bakery in Jerse...,5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
...,...,...,...,...,...,...,...,...
545,"Absolutely amazing. We ordered from their ""fa...",5,Texas Roadhouse,4.4,2105 NJ-35 Suite 103,Holmdel,40.412299,-74.145973
546,This place’s onion rings are my comfort food (...,5,Texas Roadhouse,4.4,2105 NJ-35 Suite 103,Holmdel,40.412299,-74.145973
547,Very good chain restaurant the food is always ...,4,Texas Roadhouse,4.4,2105 NJ-35 Suite 103,Holmdel,40.412299,-74.145973
548,Loaded sweet potato and the rolls are the best...,5,Texas Roadhouse,4.4,2105 NJ-35 Suite 103,Holmdel,40.412299,-74.145973


In [20]:
# Exporting spooder_df for later use (commented out because `.csv` in `Resources/`)
spooder_df.to_csv('./Resources/spooder.csv')

## Apply the Sentiment Analysis Model to the Web Scrapped Data

In [28]:
# apply Roberto to web scrapping dataframe with Google Reviews
roberto_df = apply_roberto(spooder_df,'review')
roberto_df

Unnamed: 0,review,rating,bus_id,avg_rating,bus_add,bus_city,lat,lon,sent_label,sent_score
0,"The cakes are beautiful and delicious, but you...",5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625,Positive,0.763528
1,The entire experience was excellent: well-brew...,5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625,Positive,0.889528
2,"Everything was great.\n\nFood, service, and th...",5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625,Positive,0.810901
3,The soup is amazing. My family likes the Itali...,5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625,Positive,0.791038
4,Absolutely love Dulce de Leche Bakery in Jerse...,5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625,Positive,0.932732
...,...,...,...,...,...,...,...,...,...,...
545,"Absolutely amazing. We ordered from their ""fa...",5,Texas Roadhouse,4.4,2105 NJ-35 Suite 103,Holmdel,40.412299,-74.145973,Positive,0.650678
546,This place’s onion rings are my comfort food (...,5,Texas Roadhouse,4.4,2105 NJ-35 Suite 103,Holmdel,40.412299,-74.145973,Neutral,0.456376
547,Very good chain restaurant the food is always ...,4,Texas Roadhouse,4.4,2105 NJ-35 Suite 103,Holmdel,40.412299,-74.145973,Positive,0.725514
548,Loaded sweet potato and the rolls are the best...,5,Texas Roadhouse,4.4,2105 NJ-35 Suite 103,Holmdel,40.412299,-74.145973,Negative,0.718005


In [29]:
# Exporting roberto_df for later use (commented out because `.csv` in `Resources/`)
roberto_df.to_csv('./Resources/roberto.csv')

In [34]:
# run function to get general sentiment per business to be used as input into chatgpt model 
general_sentiment_web_scrapping = general_sentiment(roberto_df, 'bus_id', 'Dulce de Leche Bakery', 'sent_label', 'sent_score')
general_sentiment_web_scrapping

'The general sentiment is highly positive, with an average confidence of 75.4%.'

In [39]:
# run function to add all reviews of a given business as a list
review_list_web_scrapping = reviews_list(roberto_df, 'bus_id', 'Dulce de Leche Bakery', 'bus_add', 'review')
review_list_web_scrapping[:10]

['55 W Palisade Ave:\nAwesome bakery right in the heart of Englewood. They are very busy, especially on weekends and holidays, but the staff keep the line moving quickly. All of their pastries are delicious, and their coffee is just right. The inside of the establishment is quite beautiful as well, with a minimalist open air decor and aesthetic. Highly recommend a visit!\n\n',
 '55 W Palisade Ave:\nLove! Love! Love! My favorite bakery! The cakes (slice or whole) is always so fresh and delicious! I order the Dulce De Leche Chantilly Cake Slice and it’s honestly omg. The top frosting tastes like hot chocolate. The chocolate cake is …\n\n',
 "55 W Palisade Ave:\nThis place gets crazy! This place is spacious and gets busy quickly. The staff is awesome and professional. They definitely try to get you out in advance. They have all kinds I'd pastries and they make cakes. I ordered chicken empanadas and …\n\n",
 '55 W Palisade Ave:\nThe cake was beautiful and delicious! Perfect for the  Birthd

## Use Reviews from Selected Business to run ChatGPT Model

In [40]:
# import ChatOpenAI and os
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os

# Additional imports for prompt template and LLM chain.
from langchain import PromptTemplate
from langchain.chains import LLMChain

In [41]:
# Load environment variables.
load_dotenv()

# Set the model name for our LLMs.
OPENAI_MODEL = "gpt-3.5-turbo"
# Store the API key in a variable.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [42]:
# Initialize the model.
llm=ChatOpenAI(openai_api_key=OPENAI_API_KEY, model_name=OPENAI_MODEL, temperature=0.3)

# Define the format for the template.
format = """

Provide a summary of the given reviews:{review_list} and three ways in which to improve the business. The summary should capture the main points and key details of the text 
while conveying the author's intended meaning accurately. The recommendations should be actionable, clear and conscise. Please ensure that the summary is well-organized and easy to read, 
with clear headings and subheadings to guide the reader through each section. The length of the summary should be appropriate to capture the main points and key details of the text, 
without including unnecessary information or becoming overly long.

reviews = {review_list}

"""

# Construct the prompt template.
prompt_template = PromptTemplate(
    input_variables=["review_list"],
    template=format
)

# Construct a chain using this template.
chain = LLMChain(llm=llm, prompt=prompt_template)

# Define the input variable as a dictionary
review_list = {"review_list": review_list}

# Run the chain using the query as input and get the result.
result = chain.invoke(review_list)
results = result["text"]

# split the results by new lines to extract review summary and business recommendations
results_list = results.split('\n')
reviews_summary = results_list[1]
recommendations = results_list[4]


print(result["text"])

### Summary:
The reviews for 55 W Palisade Ave highlight the bakery's delicious pastries, cakes, and coffee, as well as its beautiful interior and efficient staff. Customers appreciate the variety of offerings, reasonable prices, and friendly service. However, some reviews mention issues with service quality, cleanliness, and specific food items.

### Recommendations for Improvement:
1. **Enhance Customer Service:** Address complaints about lazy service and improve the overall attitude of staff members.
   
2. **Maintain Cleanliness:** Ensure that all food items are prepared and served in a hygienic manner to avoid issues like finding foreign objects in the food.
   
3. **Consistent Quality:** Maintain the high standard of food quality and service to meet customer expectations and avoid any negative experiences.


*End Code Space*

# **Odele's Code Space**

## Additional Libraries and Dependencies;

Application being developed with `Dash` by `Plotly`. Additional `pip install`s will be necessary.

In [None]:
# Install necessary packages
# Note: Uncomment if needed
# ! pip install dash
# ! pip install dash-bootstrap-components

In [None]:
# Import libraries and dependencies
import pandas as pd

# Dash
from dash import Dash, dcc, html, callback, callback_context
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate

# Dash Boostrap Components
import dash_bootstrap_components as dbc

# Plotly
import plotly.express as px
import plotly.graph_objects as go

# Other
import math

## Components;

Existing variables and functions may necessistate refactoring. Temporary facsimiles used for development.

In [None]:
sample_set_df.columns

In [None]:
# Components
'''
Inputs and declearations necessary to make app function:

user_input: a URL or name of a business (tbd) for which to fetch reviews

sbmt_bttn: a submit button

bus_id: an ID str or name of a business (tbd) to display for a given business

bus_loc: an array or a str (tbd) representing the location(s) of a given business

avg_rating: an int representing the average rating for a given business

tot_ratings: an int representing the toal reviews submitted for a given business

reviews: a dictionary of reviews containing a given rating and associated comment

sentiment: a str representing the generated sentiment analysis based on all reviews

recommendation: a str representing the generated recommendations based on the generated sentiment
'''

# Functions (Pt 2)

While trained on Yelp! data, and developed for Google Reviews, the goal of the application is to be as univerally applicable to business reviews as possible - regardless of the source. The following functions were developed with their annotated purposes in mind:

| **Function** | **Notes** |
| :--- | :---|
| `unique_locs_df()` | Creates a DataFrame with all unique locations in a given dataset |
| `location_details()` | Generates a dictionary with geographic coordinates for all locations of a given business <br> *Note: To be run on the DataFrame generated by* `unique_locs_df()` |
| `build_map()` | Constructs a Scattermapbox based on the locations from `location_details()` |
| `apply_davidlingo()` | Generates the final summary of a business' reviews, or recommendations for improvement based off the reviews and overall sentiment <br> *Note: To be used with the ouputs of* `reviews_list()` *and* `general_sentiment()` |

Each function outlined in more detail below requires a DataFrame with the following features:

| **Feature** | **Notes** |
| :--- | :--- |
| `bus_name_col` | A text column with the name of a business |
| `bus_add` | A text column with the street address of a business' location |
| `bus_lat` | A float column with the latitude coordinate for a business' location |
| `bus_lon` | A float column with the longitude coordinate for a business' location |
| `rev_col` | A text column with available reviews |
| `sent_lbl` | A text column with the generated sentiment classification <br> *Note: Generated through* `apply_roberto()` |
| `sent_scr` | A text column with the generated sentiment classification <br> *Note: Generated through* `apply_roberto()` |

In [None]:
# Function to retrieve unique business locations
def unique_locs_df(df, bus_name_col, bus_add, bus_lat, bus_lon):
    '''
    Gathers unique adresses and coordinates for unique locations into a DataFrame.

    Args:
        df (DataFrame):     Any DataFrame with sufficient data.
        bus_name_col (str): A string with the feature name that contains the business name.
        bus_add (str):      A string with the feature name that contains the business street address.
        bus_lat (str):      A string with the feature name that contains the latitude of a location.
        bus_lon (str):      A string with the feature name that contains the longitude of a location.
    
    Returns:
        loc (DataFrame):    A DataFrame with only unique locations.

    Raises:
        KeyError:           If any passed str is not a valid column name in the DataFrame.
        TypeError:          If `df` is not a DataFrame or if and feature is not a string.
    '''
    # Raises
    if not isinstance(df, pd.DataFrame):
        raise TypeError('The input `df` must be a pandas DataFrame.')
    for param, name in zip(
        [bus_name_col, bus_add, bus_lat, bus_lon],
        ['bus_name_col', 'bus_add', 'bus_lat', 'bus_lon']
    ):
        if not isinstance(param, str):
            raise TypeError(f"The '{name}' parameter must be passed as a string.")
        if param not in df.columns:
            raise KeyError(f"Column '{param}' not found in DataFrame.")
    
    # Generating a DataFrame of unique locations
    locs = df[[bus_name_col, bus_add, bus_lat, bus_lon]].drop_duplicates()
    
    # Returning the DataFrame
    return locs

In [None]:
# Function to generate a `locations` list of dictionaries
def location_details(df, bus_name_col, bus_name, bus_add, bus_lat, bus_lon):
    '''
    Transfers the latitude, longitude, and a concatenated identifier into a dictionary for later use in generating a Scattermapbox figure.

    Note:
        Advised to run `location_details()` on the DataFrame generated by `unique_locs_df()`

    Args:
        df (DataFrame):     Any DataFrame with sufficient data.
        bus_name_col (str): A string with the feature name that contains the business name.
        bus_name (str):     A string of a specific business' name for which to map the locations.
        bus_add (str):      A string with the feature name that contains the business street address.
        bus_lat (str):      A string with the feature name that contains the latitude of a location.
        bus_lon (str):      A string with the feature name that contains the longitude of a location.

    Returns:
        locs (dict):        A list of dictionaries with the necessary details for building a figure.
    
    Raises:
        KeyError:           If any passed str is not a valid column name in the DataFrame.
        TypeError:          If `df` is not a DataFrame or if and feature is not a string.
        ValueError:         If `bus_name` is not a value in `bus_col_name`.
    '''
    # Raises
    if not isinstance(df, pd.DataFrame):
        raise TypeError('The input `df` must be a pandas DataFrame.')
    for param, name in zip(
        [bus_name_col, bus_add, bus_lat, bus_lon],
        ['bus_name_col', 'bus_add', 'bus_lat', 'bus_lon']
    ):
        if not isinstance(param, str):
            raise TypeError(f"The '{name}' parameter must be passed as a string.")
        if param not in df.columns:
            raise KeyError(f"Column '{param}' not found in DataFrame.")
    if bus_name not in df[bus_name_col].values:
        raise ValueError(f"'{bus_name}' not found in column '{bus_name_col}'.")
    
    # Creating a list of features to retain
    retain = [bus_name_col, bus_add, bus_lat, bus_lon]

    # Filtering `df`
    filtered_df = df[retain][df[bus_name_col] == bus_name].copy()

    # Intializing a `name` feature

    # Creating a concatenated `name` feature with a business' name and location address
    filtered_df['loc_name'] = filtered_df[bus_name_col] + ' - ' + filtered_df[bus_add]

    # Renaming features
    filtered_df.rename(columns={bus_lat: 'lat', bus_lon: 'lon'}, inplace=True)

    # Dropping features
    filtered_df.drop([bus_name_col, bus_add], axis=1, inplace=True)

    # Converting `filtered_df` to a dictionary
    locs = filtered_df.to_dict('records')

    # Returning list of dictionaries
    return locs

In [None]:
# Function to build a map
def build_map(locs):
    '''
    Generates and updates a Scattermapbox figure based on the location details previously generated.

    Note:
        To be run on the dictionary returned by `location_details()`.

    Args:
        locs (dict):    A dictionary containing the latitude and longitude coordinates, as well as the business name and street address, of all given locations for that business.
    
    Returns:
        fig (fig):      A Scattermapbox formated to an appropriate zoom level and centered on all given locations for a business.
    
    Raises:
        TypeError:      If `locs` is not a list of dictionaries, or if the dictionaries do not contain the expected keys.
        KeyError:       If any of the expected keys are missing from the dictionaries.
        ValueError:     If `locs` is empty, or if latitude and longitude values are not valid numbers.
    '''
    # Raises
    if not isinstance(locs, list) or not all(isinstance(loc, dict) for loc in locs):
        raise TypeError("`locs` must be a list of dictionaries.")
    required_keys = {'lat', 'lon', 'loc_name'}
    for loc in locs:
        if not required_keys.issubset(loc):
            raise KeyError(f"Each dictionary in `locs` must contain the keys: {required_keys}.")
    if not locs:
        raise ValueError("`locs` cannot be an empty list.")

    # Generating location text
    hover_text = [loc['loc_name'] for loc in locs]

    # Generating location lat and lon
    lat_loc = [loc['lat'] for loc in locs]
    lon_loc = [loc['lon'] for loc in locs]

    # Calculating middle point for lat and lon
    lat_mean = sum(lat_loc)/len(lat_loc)
    lon_mean = sum(lon_loc)/len(lon_loc)

    # Calculating borders of locatoins
    lat_min, lat_max = min(lat_loc), max(lat_loc)
    lon_min, lon_max = min(lon_loc), max(lon_loc)

    # Calculating size of borders
    lat_diff = lat_max - lat_min
    lon_diff = lon_max - lon_min

    # Using `log()` to scale zoom based on distances at slower rates for larger geographic areas
    zoom = min(7 - math.log(lat_diff + 0.1), 7 - math.log(lon_diff + 0.1))

    # Creating the map figure
    fig = go.Figure(go.Scattermapbox(
        lat=lat_loc,
        lon=lon_loc,
        mode='markers',
        hovertext=hover_text,
        marker=dict(size=10)
    ))

    # Updating layout with map style and properties
    fig.update_layout(
        mapbox={
            'style': 'open-street-map',
            'center': {'lon': lon_mean, 'lat': lat_mean},
            'zoom': zoom
        },
        margin={"r":0,"t":0,"l":0,"b":0},
        height=500
    )

    # Returning figure
    return fig

In [None]:
# Function to generate OpenAI summary or recommendations
def apply_davidlingo(gen_sent, reviews):
    '''
    Assesses the general sentiment and reviews for a given business to then conditionally output either summary of positive reviews, or recommendations based on neutral or negative reviews.

    Note:
        To be run after `apply_roberto()`, `general_sentiment()`, and `reviews_list()`.
    '''
    # Raises
    
    # Placeholder
    gen_sent = gen_sent
    reviews = reviews

    # Placeholder
    rev_recs = 'This is a placeholder for applying a final text object generated by an OpenAI LangChain.'
    # Return final summary or recommendations
    return rev_recs

#### Temporary Components;

Starting features and figures for app loading state

In [None]:
# Temporary default map

# Create a default map centered on the US
fig_placeholder = go.Figure(go.Scattermapbox())
fig_placeholder.update_layout(
    mapbox={
        'style': "open-street-map",
        'center': {'lon': -98.583, 'lat': 39.833},
        'zoom': 2.5
    },
    margin={"r":0,"t":0,"l":0,"b":0},
    height=400
)

## App Development;

Initialization, construction, and launch of app.

Still very much in development!

In [None]:
# Initialize app
app = Dash(external_stylesheets=[dbc.themes.QUARTZ])

# Declare a DataFrame to be used for the app
app_df = roberto_df.copy()

# Creating list of business names
drop_opts = business_names_list(app_df, 'bus_id') # Replace args with live data

# Creating a DataFrame of uniqur locations
uniq_locs = unique_locs_df(app_df, 'bus_id', 'bus_add', 'bus_lat', 'bus_lon') # Replace args with live data

# Loading markdown content for guide
with open('Resources/SpooderApp_Guide.md', 'r') as file:
    guide_content = file.read()

# App layout
app.layout = html.Div([
    # Wrapping the whole GUI in a stack for uinform formatting
    dbc.Stack(
        [
            # Blank col for spacing (1/12 of parent container)
            dbc.Col('', width=1),
            # Col with all of GUI
            dbc.Col(
                [
                    # Row for header
                    dbc.Row(
                        # Header (as is evident by the `H1` method)
                        html.H1(
                            # Does whatever a SpooderApp™ can
                            'SpooderApp™',
                            # Placing in the middle of the page
                            style={'textAlign':'center'}
                        ),
                        # Give us some room, please
                        style={'margin-top': '20px', 'margin-bottom': '20px'}
                    ),
                    # Row for subheader
                    dbc.Row(
                        # Subheader (as is less evident by the `H3` method)
                        html.H3(
                            # Taglines are important
                            'Leveraging business reviews to gain insights for potential improvements.',
                            # Placing in the middle of the page
                            style={'textAlign':'center'}
                        ),
                        # Buffer space
                        style={'margin-bottom': '20px'}
                    ),
                    # Row for business name and ratings
                    dbc.Row(
                        [
                            # Col for user input
                            dbc.Col(
                                # InputGroup for user input
                                dbc.InputGroup(
                                    [
                                        # Dropdown menu for user input
                                        dbc.DropdownMenu(
                                            # Instructions for user input
                                            label = 'Select a business',
                                            # To know it's user input
                                            id = 'business_dropdown',
                                            # Selections for user input
                                            children = [
                                                dbc.DropdownMenuItem(
                                                    name,
                                                    id=f"menu_item_{i}",
                                                    style={'color': 'grey'}
                                                ) for i, name in enumerate(drop_opts)
                                            ],
                                            # Making it pretty ([insert sparkles here])
                                            class_name='btn-info'
                                        ),
                                        # Not actually user input, but reflects it
                                        dbc.InputGroupText(
                                            # Blank until user input selected
                                            children='',
                                            # To know where to put user input
                                            id='chld_nm',
                                            # Making it pretty, but not AS pretty
                                            class_name='form-control'
                                        )
                                    ],
                                    # Be tall, but only so tall, please
                                    style={'width': '100%', 'height': '60px'}
                                ),
                                # 6/12 of parent container, because math
                                width=6
                            ),
                            # Col for average rating information
                            dbc.Col(
                                # Card display for averate rating information
                                dbc.Card(
                                    # Blank until user input selected
                                    children='',
                                    # To know where to put average rating information
                                    id='avg_rtng',
                                    # Making it pretty-ish
                                    body=True,
                                    # Be no taller than the column to your left
                                    style={
                                        'width': '100%',
                                        'height': '60px',
                                        'display': 'flex',
                                        'align-items': 'left',
                                        'justify-content': 'center'
                                    }
                                ),
                                # 3/12 of parent container, or 1/4 but HTML/CSS doesn't like quarters as much
                                width=3
                            ),
                            # Column for total reviews information
                            dbc.Col(
                                # Card display for total reviews information
                                dbc.Card(
                                    # Blank until user input selected
                                    children='',
                                    # To know where to put total reviews information
                                    id='tot_rvws',
                                    # Making it pretty-ish like its sibling to the left
                                    body=True,
                                    # You must be this short to display
                                    style={
                                        'width': '100%',
                                        'height': '60px',
                                        'display': 'flex',
                                        'align-items': 'left',
                                        'justify-content': 'center'
                                    }
                                ),
                                # 3/12 of parent container, beacuse 12 - 6 - 3 leaves 3
                                width=3
                            )
                        ],
                        # Usually a good place to begin - The beginning
                        justify='start',
                        # Matching buffer space for that glossy, uniform look ([more sparkles])
                        style={'margin-bottom': '20px'},
                    ),
                    # "Row" for map and accordion
                    dbc.Stack(
                        [
                            # Col for map
                            dbc.Col(
                                # Map
                                dcc.Graph(figure=fig_placeholder, id='bus_map'),
                                # 5/12 of parent container, because the map wanted to be special
                                width=5
                            ),
                            # Col for accordion
                            dbc.Col(
                                # Unfortunately, an accodion menu, not a Weird Al cameo
                                dbc.Accordion(
                                    [
                                        # Menu item for reviews
                                        dbc.AccordionItem(
                                            # Paragraph - in the loosest sense - for reviews
                                            html.P(
                                                # To know where to put the reviews
                                                id='reviews',
                                                # Blank until user input selected
                                                children='',
                                                # Only be so tall, and scroll if longer
                                                style={'max-height': '295px', 'overflow-y': 'auto'}
                                            ),
                                            # So you know it's got the reviews in it
                                            title='Reviews'
                                        ),
                                        # Menu item for sentiment analysis
                                        dbc.AccordionItem(
                                            html.P(
                                                # To know where to put the sentiment analysis
                                                id='sentiment',
                                                # Blank until user input selected
                                                children='',
                                                # Overkill, since this will only ever be a single line of text
                                                style={'max-height': '295px', 'overflow-y': 'auto'}
                                            ),
                                            # To identify it as the container for the sentiment analysis
                                            title='Sentiment Analysis'
                                        ),
                                        # Menu item for recommendations
                                        dbc.AccordionItem(
                                            html.P(
                                                # To know where to put the OpenAI feedback
                                                id='feedback',
                                                # Blank until user input selected
                                                children='',
                                                # Only be so tall, and scroll if longer
                                                style={'max-height': '295px', 'overflow-y': 'auto'}
                                            ),
                                            # For the purposes of labeling it as the recepticle for feedback
                                            title='Feedback'
                                        )
                                    ]
                                ),
                                # Again, a good palce to begin
                                align='start',
                                # 7/12 of parent container, because that's what was left and it looks good
                                width=7
                            )
                        ],
                        # That's what was meant by "row", earlier - go this way <-->
                        direction='horizontal',
                        # Little bit of breathing room in there, too, please
                        gap=1
                    ),
                    # Row for markdown guide
                    dbc.Row(
                        # Markdown guide
                        dcc.Markdown(
                            # Content for the markdown guide
                            guide_content,
                            # Making the markdown guide pretty
                            style={
                                'margin-top': '50px',
                                'padding': '20px',
                                'background-color': 'rgba(255, 255, 255, 0.35)', # This one is super important!
                                'border-radius': '10px'
                            }
                        )
                    )
                ],
                # 10/12 of parent container, because this really is the star of the show, right here
                width=10
            ),
             # Blank col for spacing (1/12 of parent container)
            dbc.Col('', width=1),
        ],
        # Another go this way <--> bit
        direction='horizontal',
        # We like negative space, let's have more of that between things
        gap=1
    )
])

# Callback to populate the `DropdownMenu`
@callback(
    Output('chld_nm', 'children'),
    Output('avg_rtng', 'children'),
    Output('tot_rvws', 'children'),
    Output('bus_map', 'figure'),
    Output('reviews', 'children'),
    Output('sentiment', 'children'),
    Output('feedback', 'children'),
    [Input(f"menu_item_{i}", "n_clicks") for i in range(len(drop_opts))],
    [State(f"menu_item_{i}", "children") for i in range(len(drop_opts))]
)
def update_content(*args):
    # Default states for elements
    load_input = 'Use the dropdown menu on the left'
    load_avg_rtng = 'Average Rating: '
    load_tot_rvws = 'Total Available Reviews: '
    load_fig = fig_placeholder
    load_revs = 'Select a business to see reviews.'
    load_sent = 'Select a business to generate sentiment analysis.'
    load_rev_rec = 'Select a business to generate dynamic feedback.'

    # Confirming a dropdown selection has been made
    ctx = callback_context
    if ctx.triggered:
        # Finding which business was clicked
        selected_item_id = ctx.triggered[0]['prop_id'].split('.')[0]
        # Finding the index of the clicked business
        selected_index = int(selected_item_id.split('_')[-1])
        # Getting the selected business name
        selected_business = args[len(drop_opts) + selected_index]

        # Getting the average rating for the selected business
        # NOTE: Since `avg_rating` is stores on a per-business basis,
        # not a per-record basis, the first record's value will suffice
        rvw_avg = app_df.loc[app_df['bus_id'] == selected_business, 'avg_rating'].iloc[0]
        # Returning the average rating value
        avg_rtng = f'Average Rating: {rvw_avg:.1f}'

        # Gathering the reviews for the selected business
        reviews = reviews_list(app_df, 'bus_id', selected_business, 'bus_add', 'review')
        # Calculating the total number of reviews
        if len(reviews) >= 1:
            # If 1 or more, returning a count of available reviews
            rev_tot = f'Total Available Reviews: {len(reviews)}'
            # Preparing an empty list
            rev_list = []
            # Appending each review into `rev_list` with HTML formatting
            for rev in reviews:
                rev_list.append(html.P([rev.split(':\n')[0], ':', html.Br(), rev.split(':\n')[1], html.Br()]))
        else:
            rev_tot = 'Total Available Reviews: 0'
            rev_list = 'Too few reviews available to display.'

        # Preparing location details for the selected business
        locations = location_details(uniq_locs,'bus_id', selected_business, 'bus_add','bus_lat', 'bus_lon')
        # Building map based on locations for the selected business
        fig = build_map(locations)

        # Gather the general sentiment for the selected business
        gen_sent = general_sentiment(app_df, 'bus_id', selected_business, 'sentiment', 'accuracy')

        # Generate OpenAI response
        rev_rec = apply_davidlingo(gen_sent, reviews)

        # Returning the label corresponding to the clicked item
        return selected_business, avg_rtng, rev_tot, fig, rev_list, gen_sent, rev_rec
    # Returning original placeholder text if none selected
    return load_input, load_avg_rtng, load_tot_rvws, load_fig, load_revs, load_sent, load_rev_rec

# Launch app (in browser tab) (comment out if running in notebook)
app.run(jupyter_mode='tab')
# Launch app (in notebook) (uncomment to run)
# app.run_server(debug=True)

*End Code Space*

# **Vanessa's Code Space**

*End Code Space*

# **Train Test Splitting**

# **Scaling and Encoding**

# **Modeling**

# **Application (?)**

# **Findings**

# **Citations and Licenses**

## Citations

## Licenses