# **Project Title**
#### *Project Subtitle*

## Hypothesis

Project thesis

In [None]:
# Importing dependencies
import pandas as pd

# **Data**

## Dataset

Dataset details

In [None]:
# Reading in data
# (potentially?)

### EDA

In [None]:
# Beginning EDA

# **Ramona's Code Space**

*End Code Space*

# **Christian's Code Space**

### Dependencies

In [None]:
# Installing gdown (uncomment if needed)
# %pip install gdown --quiet

In [None]:
# Imports and dependencies
import os
import re
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# json
import json

# gdown
import gdown

#### Resources path

In [None]:
# Defining a function to access datasets through `gdown`
def fetch_data(set):
    """Load one of the project datasets, downloading it from Google Drive if needed.

    The CSV is cached under ``Resources/``: when the file is already on disk it
    is read directly, so re-running the notebook does not download it again.

    Parameters
    ----------
    set : str
        Dataset key: 'business', 'checkin', 'reviews', 'tip' or 'user'.
        (The name shadows the builtin ``set``; it is kept so existing
        callers, including keyword callers, keep working.)

    Returns
    -------
    pandas.DataFrame or None
        The dataset read from the CSV, or ``None`` (after printing a message)
        when the key is not one of the known datasets.
    """
    # Mapping each dataset key to its (`url`, `output`) pair
    sources = {
        'business': (
            'https://drive.google.com/file/d/1t-_rOjZ8oMqPcMJunVaMgY3OEbhnuSCv/view?usp=sharing',
            'Resources/business_dataset.csv',
        ),
        'checkin': (
            'https://drive.google.com/file/d/1_AVWp31ymfvf4QgTiMN_WLAeapfr0omf/view?usp=sharing',
            'Resources/checkin_dataset.csv',
        ),
        'reviews': (
            'https://drive.google.com/file/d/1L8rFjhOQyU90Ycr9t_OLA70vCYM0e7ck/view?usp=sharing',
            'Resources/reviews_dataset.csv',
        ),
        'tip': (
            'https://drive.google.com/file/d/1LMkCi5AFC_58_m7ELmn1hR8YDykuXwqq/view?usp=sharing',
            'Resources/tip_dataset.csv',
        ),
        'user': (
            'https://drive.google.com/file/d/1kQ522qcod7AjD5DO9vj8qFcSKxwJCDrO/view?usp=sharing',
            'Resources/user_dataset.csv',
        ),
    }

    # Non-string keys are treated as invalid instead of failing on hashing
    entry = sources.get(set) if isinstance(set, str) else None
    if entry is None:
        print('Invalid dataset selected, please try again')
        return None
    url, output = entry

    # Downloading the dataset only when it is not already cached locally
    if not os.path.exists(output):
        # Make sure the target folder exists before gdown writes into it
        os.makedirs(os.path.dirname(output), exist_ok=True)
        gdown.download(url, output, fuzzy=True, quiet=True)

    # Reading in the dataset
    df = pd.read_csv(output)

    # Returning the dataset
    return df

---

#### Business dataset

#### <font color='blue'> Description:</font> 
**Contains business data including location data, attributes, and categories.**

#### Loading data

In [None]:
# Fetching `business_dataset`
business_df = fetch_data('business')

#### Overview

In [None]:
business_df.head()

#### Info

In [None]:
business_df.info()

---

#### Checkin dataset

#### <font color='blue'> Description:</font>
**Checkins on a business.**

#### Loading Data

In [None]:
# Fetching `checkin_dataset`
checkin_df = fetch_data('checkin')

#### Overview

In [None]:
checkin_df.head()

#### Info

In [None]:
checkin_df.info()

#### **<font color='orange'> Notes:</font>**
**The team has determined this dataset would not add any value to our training data.**

---

#### Reviews dataset

#### <font color = 'blue'>Description:</font>
**Contains full review text data including the user_id that wrote the review and the business_id the review is written for.**

#### Loading Data

In [None]:
# Fetching `reviews_dataset`
reviews_df = fetch_data('reviews')

#### Overview

In [None]:
reviews_df.head()

#### Info

In [None]:
reviews_df.info()

#### Na count

In [None]:
reviews_df.isna().sum()

#### Dropping columns:
- **review_id**
- **useful**
- **funny**
- **cool**

In [None]:
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
reviews_df = reviews_df.drop(columns=['review_id', 'useful', 'funny', 'cool'],
                             errors='ignore')

#### Renaming the 'text' field to 'review'

In [None]:
# Give the free-text column a dataset-specific name, then preview the result
reviews_df.rename({'text': 'review'}, axis='columns', inplace=True)
reviews_df.head()

#### **<font color='orange'> Notes:</font>**
- **review_id: Eliminated due to low informational value.**
- **useful: Eliminated due to low relevance.**
- **funny: Eliminated due to low relevance.**
- **cool: Eliminated due to low relevance.**

  **The *<font color='green'>'business_id'</font>* feature will be used as the identifier, *<font color='green'>'stars'</font>* is the rating metric and the *<font color='grey'>'review'</font>*  field encapsulates**<br>
  **the data to be processed. The *<font color='green'>'date'</font>* variable is in place if time series analysis is needed.**

---

#### Tips dataset

#### <font color='blue'>Description:</font>
**Tips written by a user on a business. Tips are shorter than reviews and tend to convey quick suggestions.**

#### Loading Data

In [None]:
# Fetching `tips_dataset`
tips_df = fetch_data('tip')

#### Overview

In [None]:
tips_df.head()

#### Info

In [None]:
tips_df.info()

#### Dropping columns:
- **compliment_count**

In [None]:
# Reassign instead of mutating in place, and ignore an already-missing column,
# so re-running this cell does not raise a KeyError.
tips_df = tips_df.drop(columns=['compliment_count'], errors='ignore')

#### Renaming the 'text' column to 'recommendations'

In [None]:
# Give the free-text column a dataset-specific name, then preview the result
tips_df.rename({'text': 'recommendations'}, axis='columns', inplace=True)
tips_df.head()

#### **<font color='orange'> Notes:</font>**
- **compliment_count: Eliminated due to low informational value.**


 **Since this data set has recommendations from the user to improve customer experience, the 'recommendations' field could be a useful target variable.**

---

#### User dataset

#### <font color = 'blue'>Description:</font>
**User data including the user's friend mapping and all the metadata associated with the user.**

#### Loading Data

In [None]:
# Fetching `user_dataset`
user_df = fetch_data('user')

#### Overview

In [None]:
user_df.head()

#### Info

In [None]:
user_df.info()

#### **<font color='orange'> Notes:</font>**
**This data set will not be included in the training data to preserve user anonymity.**

# //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

## **<font color = 'darkgrey'>Merging the reviews data set and the business data set</font>**

#### <font color = 'blue'>Description:</font>
**This data set contains the fields that will be used to train the model**

In [None]:
# Left join: keep every review and attach its business's columns
data_df = pd.merge(reviews_df, business_df, how='left', on='business_id')

In [None]:
data_df.info()

In [None]:
data_df.head()

#### Na count

In [None]:
data_df.isna().sum()

In [None]:
# Share of missing values (in percent) for the three sparsest columns
na_columns = ['attributes', 'categories', 'hours']
na_counts = data_df[na_columns].isna().sum()
na_prcnt = na_counts / len(data_df) * 100

# One row ('percentage'), one column per feature
nas_df = pd.DataFrame(na_prcnt, columns=['percentage']).transpose()
nas_df.round(4)

In [None]:
# One bar per column of `nas_df`, i.e. per feature with missing values
na_ax = sns.barplot(data=nas_df)
na_ax.set_title('Na percentage')

#### **<font color='orange'> Notes:</font>**
**After consulting with the team we decided to drop all three columns.**

#### Dropping columns with na values

In [None]:
# Reassign instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
data_df = data_df.drop(columns=['attributes', 'categories', 'hours'],
                       errors='ignore')

In [None]:
data_df.isna().sum()

#### *<font color='grey'>stars_x</font>* and *<font color='grey'>stars_y</font>* comparison

In [None]:
# Rows where the review's own rating differs from the business-level rating
stars_differ = data_df['stars_x'].ne(data_df['stars_y'])
data_df.loc[stars_differ, ['stars_x', 'stars_y']].head()

#### *<font color='grey'>stars_x</font>* and *<font color='grey'>stars_y</font>* for the same business

In [None]:
# Both star columns for one example business
is_example_business = data_df['business_id'].eq('XQfwVwDr-v0ZS3_CbbE5Xw')
data_df.loc[is_example_business, ['stars_x', 'stars_y']].head()

#### *<font color='grey'>stars_x* average

In [None]:
# Mean of the per-review ratings for the same example business
is_example_business = data_df['business_id'].eq('XQfwVwDr-v0ZS3_CbbE5Xw')
round(data_df.loc[is_example_business, 'stars_x'].mean(), 2)

#### **<font color='orange'> Notes:</font>**
**Because** *<font color='grey'>stars_y</font>* **represents the average star rating, renaming** *<font color='grey'>stars_y</font>* **to** *<font color='grey'>stars_avg</font>*

#### Renaming

In [None]:
# stars_y comes from the business table, stars_x from the individual review
data_df.rename({'stars_y': 'stars_avg', 'stars_x': 'stars'},
               axis='columns',
               inplace=True)

#### Dropping is_open feature

In [None]:
# Class balance of the `is_open` flag
fig, ax = plt.subplots()
sns.countplot(data=data_df, x='is_open', hue='is_open', ax=ax)
ax.set_title('is_open Feature')

#### Dropping is_open

In [None]:
# Reassign instead of mutating in place, and ignore an already-missing column,
# so re-running this cell does not raise a KeyError.
data_df = data_df.drop(columns=['is_open'], errors='ignore')

#### **<font color='orange'> Notes:</font>**
**After consulting with the team we decided to drop this feature due to its low informational value and feature imbalance.**

# //////////////////////////////////////////////////////////////////////////////////////////////////

## **<font color='darkgrey'>Merging with the tips data set exploration</font>**

#### <font color = 'blue'>Description:</font>
**Contains customer recommendations to improve experience**

In [None]:
tips_df.head()

In [None]:
tips_df.info()

#### Quantity of unique business_id in the tips data set

In [None]:
display(tips_df['business_id'].unique().shape[0])

#### Quantity of unique business_id in  data_df

In [None]:
data_df['business_id'].unique().shape[0]

#### Subset of *<font color='grey'>business_id</font>* in *<font color='grey'>data_df</font>* not found in *<font color='grey'>tips_df</font>*.

In [None]:
# Rows of data_df whose business never appears in tips_df
has_tips = data_df['business_id'].isin(tips_df['business_id'])
no_tips_df = data_df[~has_tips]
no_tips_df.head()

#### Number of *<font color='grey'>business_id</font>* in *<font color='grey'>data_df</font>* not found in *<font color='grey'>tips_df</font>*.

In [None]:
# Businesses that have reviews in data_df but no entry at all in tips_df
no_tips_df = data_df[~data_df['business_id'].isin(tips_df['business_id'])]
not_found = no_tips_df['business_id'].unique().shape[0]
# The message previously stated the reverse direction of what is computed here
print(f'Number of business_ids in data_df not found in tips_df: {not_found}')

#### Evidence

In [None]:
# Pick one business without tips (the row at position 33 of no_tips_df) and
# look it up in tips_df
sample_business_id = no_tips_df['business_id'].iloc[33]
tips_df[tips_df['business_id'] == sample_business_id]

#### Merge

In [None]:
# Keep only (business, user) pairs that have both a tip and a review
merge_keys = ['business_id', 'user_id']
test_df = tips_df.merge(data_df, how='inner', on=merge_keys)
                         

#### Overview

In [None]:
test_df.info()

In [None]:
test_df.head()

#### Comparison review vs. recommendations

In [None]:
test_df[['review','recommendations']].head()

##### **<font color='orange'> Notes:</font>**
**The <font color='grey'>data_df</font> has approximately <font color='green'>7 million</font> entries and <font color='grey'>tips_df</font> about <font color='green'>1 million</font>; after merging them we end up with a little under half a million**.<br>
**In the comparison above I don't see a difference between a review from the *reviews data set* and a recommendation from the *tips data set***.<br>
**As shown above we stand to lose a significant amount of data if a merge is performed**.

# ///////////////////////////////////////////////////////////////////////////////////////////////////

## <font color='darkgrey'>Final Data Overview</font>

#### Dropping the user_id column to preserve user anonymity

In [None]:
# Reassign instead of mutating in place, and ignore an already-missing column,
# so re-running this cell does not raise a KeyError.
data_df = data_df.drop(columns=['user_id'], errors='ignore')

#### Overview

In [None]:
data_df.head()

#### Info

In [None]:
data_df.info()

#### Na verification

In [None]:
data_df.isna().sum()

*End Code Space*

# **Leigh's Code Space**

*End Code Space*

# **Angelica's Code Space**

In [2]:
#import required libraries
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

In [3]:
# define a function to create df with the business name, avg rating and address of each location
def business_Overview(business_name,avg_rating,address1,lat,long,df):
    """Attach business-level columns to a DataFrame of reviews.

    Every value is broadcast to all rows of ``df``. ``df`` is modified in
    place and also returned.

    Parameters
    ----------
    business_name : value stored in the 'bus_id' column.
    avg_rating : value stored in the 'avg_rating' column.
    address1 : str
        Comma-separated address; the first piece goes to 'bus_add' and the
        second piece to 'bus_city'.
    lat, long : values stored in the 'lat' and 'lon' columns.
    df : pandas.DataFrame
        Frame to extend.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the six columns added.
    """
    # Strip each piece so "376 Central Ave, Jersey City" does not yield ' Jersey City'
    address_parts = [part.strip() for part in address1.split(',')]

    df['bus_id'] = business_name
    df['avg_rating'] = avg_rating
    df['bus_add'] = address_parts[0]
    # An address without a comma has no city piece; store a missing value
    # instead of raising IndexError
    df['bus_city'] = address_parts[1] if len(address_parts) > 1 else None
    df['lat'] = lat
    df['lon'] = long
    return df

# import business df with Google Maps url for web scraping
url_df = pd.read_csv('Resources/business_urls.csv')

# create list of urls and lat/long for web scraping step
url = url_df['url'].tolist()
lat = url_df['lat'].astype(str).tolist()
long = url_df['long'].astype(str).tolist()

# Cap on the number of reviews loaded per business (previously a hard-coded
# override inside the loop), and the page size used to derive the scroll count.
MAX_REVIEWS = 10
REVIEWS_PER_SCROLL = 10

# XPath of the side panel; the review count and review cards are addressed relative to it
REVIEWS_PANEL_XPATH = '//*[@id="QA0Szd"]/div/div/div[1]/div[2]/div/div[1]/div/div/div[2]'
REVIEW_COUNT_XPATH = REVIEWS_PANEL_XPATH + '/div[2]/div/div[2]/div[3]'
REVIEW_CARDS_XPATH = REVIEWS_PANEL_XPATH + '/div[9]/div[1]/div/div'


# define function to gather relevant data from the reviews result set obtained by parsing through HTML
def get_review_summary(result_set):
    """Build a DataFrame with one row per review element.

    Parameters
    ----------
    result_set : iterable of BeautifulSoup tags, one per review card.

    Returns
    -------
    pandas.DataFrame
        Columns 'review' (text of the 'wiI7pd' span, '' when the review has no
        text) and 'rating' (first character of the 'kvMYJc' element's
        aria-label, None when that element or attribute is absent).
    """
    rev_dict = {
        'review' : [],
        'rating' : []}

    for result in result_set:
        # Rating-only reviews have no text span; keep the row with empty text
        text_tag = result.find('span',class_='wiI7pd')
        review_text = text_tag.text if text_tag is not None else ''

        rating_tag = result.find(class_='kvMYJc')
        rating_label = rating_tag.get('aria-label') if rating_tag is not None else None
        # Only the first character of the aria-label is kept as the rating
        review_rating = rating_label[0] if rating_label else None

        rev_dict['review'].append(review_text)
        rev_dict['rating'].append(review_rating)

    return pd.DataFrame(rev_dict)


# initiate driver
driver = webdriver.Chrome(service = ChromeService(ChromeDriverManager().install()))

# one DataFrame per scraped location
df_list = []

try:
    # parse through the different locations in the url list above
    for idx in range(len(url)):
        driver.get(url[idx])
        time.sleep(5)

        # get parameters needed for business overview function
        response = BeautifulSoup(driver.page_source, 'html.parser')
        business_name = response.find('h1',class_='DUwDvf lfPIob').text
        avg_rating = response.find('div',class_='fontDisplayLarge').text
        address = response.find('div',class_= 'rogA2c').text
        lat_ = lat[idx]
        long_ = long[idx]

        # navigate to Reviews tab
        driver.find_element(By.CLASS_NAME, "RWPxGd").click()
        time.sleep(3)

        # Find the total number of reviews (first token of the label, thousands separators removed)
        count_label = driver.find_element(By.XPATH, REVIEW_COUNT_XPATH).text.split(" ")[0]
        total_number_of_reviews = int(count_label.replace(',',''))

        # Never load more than MAX_REVIEWS per business
        total_number_of_reviews = min(total_number_of_reviews, MAX_REVIEWS)

        # Find scroll layout
        scrollable_div = driver.find_element(By.XPATH, REVIEWS_PANEL_XPATH)

        # Scroll as many times as necessary to load the reviews - 10 reviews shown at a time
        for _ in range(0,(round(total_number_of_reviews/REVIEWS_PER_SCROLL - 1))):
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scrollable_div)
            time.sleep(1)

        # collect the review cards currently present in the panel
        review_cards = driver.find_elements(By.XPATH, REVIEW_CARDS_XPATH)
        time.sleep(3)

        # expand review by click on 'more' button
        for review_card in review_cards:
            for button in review_card.find_elements(By.TAG_NAME,'button'):
                if button.text == "More":
                    button.click()
            # pause after each card, as in the original pacing
            time.sleep(5)

        # parse through the HTML
        response = BeautifulSoup(driver.page_source, 'html.parser')
        reviews = response.find_all('div',class_ = 'jftiEf')

        # gather relevant data using the function defined above
        summary_df = get_review_summary(reviews)

        # add the business-level columns to this location's reviews
        df = business_Overview(business_name,avg_rating,address,lat_,long_,summary_df)

        # append df to df list
        df_list.append(df)
finally:
    # always close the browser, even when a page fails to parse
    driver.quit()

# concat list of data frames into one (pd.concat raises on an empty list)
final_df = pd.concat(df_list, ignore_index=True) if df_list else pd.DataFrame()
final_df


Unnamed: 0,review,rating,bus_id,avg_rating,bus_add,bus_city,lat,lon
0,"The cakes are beautiful and delicious, but you...",5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
1,The entire experience was excellent: well-brew...,5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
2,"Everything was great.\n\nFood, service, and th...",5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
3,The soup is amazing. My family likes the Itali...,5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
4,Absolutely love Dulce de Leche Bakery in Jerse...,5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
5,We had the coconut mousse cake. It was beautif...,3,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
6,I'd rate the food an impressive 8.3 out of 10 ...,5,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
7,"Embarrasing. They raised the price of the ham,...",1,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
8,I’ve been going to this bakery in this locatio...,4,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625
9,This bakery offers a delightful atmosphere wit...,4,Dulce De Leche Bakery,4.7,376 Central Ave,Jersey City,40.7476428,-74.0527625


*End Code Space*

# **Odele's Code Space**

*End Code Space*

# **Vanessa's Code Space**

*End Code Space*

# **Train Test Splitting**

# **Scaling and Encoding**

# **Modeling**

# **Application (?)**

# **Findings**

# **Citations and Licenses**

## Citations

## Licenses