# Data acquisition

In [None]:
import re
import sys
from concurrent import futures
from datetime import datetime

import chromedriver_autoinstaller
import pandas as pd
import yaml
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.auto import tqdm

# Install a chromedriver binary for Selenium to use.
chromedriver_autoinstaller.install()

# Chrome options shared by every driver the scraper creates.
op = webdriver.ChromeOptions()
op.add_argument("--disable-extensions")
op.add_argument("--headless")
op.add_argument("--disable-gpu")
op.add_argument("--no-sandbox")
# NOTE(review): Chrome's switch is usually written "--window-size=1200,600";
# confirm whether the "x" form is honoured before relying on the window size.
op.add_argument("--window-size=1200x600")
op.add_argument("--blink-settings=imagesEnabled=false")


def driver_Create():
    """Return a new Chrome WebDriver configured with the shared options."""
    return webdriver.Chrome(options=op)


# Load the scraper configuration; the context manager makes sure the file
# handle is closed again.
with open("config.yml", "r") as cfg_file:
    cfg = yaml.safe_load(cfg_file)

method = cfg["method"]  # one of "one", "fast" or "defined"
base = cfg["base"]  # URL prefix the search URLs are appended to
city = cfg["city"]  # value of the `find_loc` search parameter
thread = cfg["thread"]  # pages per chunk when method == "defined"
shops = cfg["shops"]  # value of the `cflt` search parameter

# Result table: one row per scraped review (or one row per shop when the shop
# has no reviews).
df = pd.DataFrame(
    columns=[
        "Name",
        "href",
        "Rating",
        "Name_reviewer",
        "Rating_reviewer",
        "Elite",
        "Ort_reviewer",
        "Friends_count",
        "Reviews_count",
        "Pictures_count",
        "Date",
        "Comment",
        "Names_pictures",
        "Hilfreich",
        "Lustig",
        "Cool",
    ]
)

# Query the review info for every page of a shop
def get_pages(page_, driver_, rest_review):
    """Scrape the reviews of one shop and add them to the global ``df``.

    The function calls itself again while ``rest_review`` is still positive.
    Once the shop is finished, the progress bar is advanced and the driver is
    closed and quit. When the page cannot be read at all, the driver is quit
    and nothing is added.

    Args:
        page_: ``[name, href]`` pair describing the shop.
        driver_: WebDriver dedicated to this shop.
        rest_review: Number of reviews still to be fetched, or ``-1`` to have
            the total read from the page first.
    """
    global df
    global pbar

    driver_.implicitly_wait(10)
    # NOTE(review): every recursive call loads the same URL again, without a
    # paging offset; confirm that later review pages are really fetched.
    driver_.get(page_[1])

    def get_review_rating(page_, driver_, rest_review, retry):
        """Read the review elements, shop rating and reviewer star widgets.

        Returns:
            ``(reviews, rating, reviewer_rating, rest_review)`` with
            ``rest_review`` already reduced by 10, or ``None`` when the page
            could not be read.
        """
        while True:
            remaining = rest_review
            # True while nothing at all could be read from the page.
            allnull = False
            try:
                if remaining == -1:
                    try:
                        counter = WebDriverWait(driver_, 10).until(
                            EC.presence_of_element_located(
                                (
                                    By.XPATH,
                                    '//*[@id="wrap"]/div[2]/yelp-react-root/div[1]/div[3]/div[1]/div[1]/div/div/div[2]/div[2]/span',
                                )
                            )
                        )
                        remaining = dig(counter.text)
                    except Exception:
                        allnull = True
                        remaining = 0
                try:
                    stars = WebDriverWait(driver_, 10).until(
                        EC.presence_of_element_located(
                            (
                                By.XPATH,
                                '//*[@id="wrap"]/div[2]/yelp-react-root/div[1]/div[3]/div[1]/div[1]/div/div/div[2]/div[1]/span/div',
                            )
                        )
                    )
                    rating = _parse_stars(stars.get_attribute("aria-label"))
                    allnull = False
                except Exception:
                    rating = ""
                try:
                    reviews = driver_.find_elements(
                        By.CLASS_NAME,
                        "review__373c0__3MsBX.border-color--default__373c0__1WKlL",
                    )
                    allnull = False
                except Exception:
                    reviews = []
                reviewer_rating = driver_.find_elements(
                    By.CLASS_NAME, "i-stars__373c0___sZu0"
                )
                return reviews, rating, reviewer_rating, remaining - 10
            except Exception:
                # Only retry when nothing could be read; give up after 10
                # retries or as soon as part of the page was readable.
                if retry == 10 or not allnull:
                    print("Error by :", page_[0])
                    return None
                retry += 1
                print("Retry get_review_rating for ", page_[0], " :", retry)

    result = get_review_rating(page_, driver_, rest_review, 0)
    if result is None:
        # Do not leave the browser running for a shop that could not be read.
        driver_.quit()
        return
    reviews, rating, reviewer_rating, rest_review = result

    # "." is left unescaped on purpose: any separator character is accepted.
    re_date = r"\d{1,2}.\d{1,2}.\d{1,4}"
    rows = []
    if len(reviews) != 0:
        for stack, element in enumerate(reviews):
            try:
                # Offset 2: presumably the first two star widgets on the page
                # do not belong to a review -- TODO confirm.
                rew_rat = _parse_stars(
                    reviewer_rating[stack + 2].get_attribute("aria-label")
                )
            except Exception:
                print(
                    "error in reviews by getting rating in",
                    page_[0],
                    "for",
                    str(stack + 1) + ".",
                    "review",
                )
                rew_rat = 0

            try:
                # Lines of the review card: name, location, friends, reviews,
                # pictures, date, comment, picture names, Hilfreich, Lustig,
                # Cool. Elite reviewers have one extra line after the name.
                liste = element.text.split("\n")
                x = 1 if "Elite" in liste[1] else 0

                try:
                    pictures_count = dig(liste[6 + x])
                except Exception:
                    pictures_count = 0

                review = {
                    "Name": page_[0],
                    "href": page_[1],
                    "Rating": rating,
                    "Name_reviewer": liste[0],
                    "Rating_reviewer": rew_rat,
                    "Elite": "Yes" if x else "No",
                    "Ort_reviewer": liste[1 + x],
                    "Friends_count": liste[2 + x],
                    "Reviews_count": liste[3 + x],
                    "Pictures_count": pictures_count,
                    "Hilfreich": dig(liste[-3]),
                    "Lustig": dig(liste[-2]),
                    "Cool": dig(liste[-1]),
                }

                if re.match(re_date, liste[5 + x]):
                    review["Date"] = liste[5 + x]
                else:
                    review["Date"] = ""

                if pictures_count <= 4:
                    review["Names_pictures"] = " ".join(
                        liste[-3 - pictures_count - x : -3]
                    )
                    review["Comment"] = " ".join(liste[7 + x : -3 - pictures_count - x])
                else:
                    review["Names_pictures"] = ""
                    review["Comment"] = " ".join(liste[7 + x : -4])

                rows.append(review)
            except Exception:
                print(
                    "error in reviews by:",
                    page_[0],
                    "for",
                    str(stack + 1) + ".",
                    "review",
                )
    else:
        # No reviews found: still record the shop itself.
        rows.append({"Name": page_[0], "href": page_[1], "Rating": rating})

    if rows:
        # DataFrame.append no longer exists in pandas 2.x; concatenate once
        # per page instead.
        # NOTE(review): df is rebound from several worker threads without a
        # lock, so concurrent updates may overwrite each other.
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

    if rest_review > 0:
        get_pages(page_, driver_, rest_review)
    else:
        print(page_[0], " processed...")
        pbar.update(1)
        driver_.close()
        driver_.quit()


def _parse_stars(label):
    """Return the first number in a star-rating ``aria-label`` as a float.

    Both "." and "," are accepted as decimal separator so that half-star
    values such as "4.5" or "4,5" keep their fraction.

    Raises:
        ValueError: If *label* contains no number.
        TypeError: If *label* is not a string (for example ``None``).
    """
    match = re.search(r"\d+(?:[.,]\d+)?", label)
    if match is None:
        raise ValueError(f"no rating found in {label!r}")
    return float(match.group().replace(",", "."))


# Query the links and names of all shops
def get_shops(driver_):
    """Scrape every shop linked from the search-result page loaded in *driver_*.

    The CSS class of one known shop link is looked up first; every element
    carrying that class whose ``href`` contains "biz" is then handed to
    ``get_pages`` together with a freshly created driver.

    Args:
        driver_: WebDriver that already has the search-result page loaded.
    """
    try:
        anchor = driver_.find_element(
            By.XPATH,
            '//*[@id="main-content"]/div/ul/li[4]/div/div/div/div[2]/div[1]/div[1]/div[1]/div/div/h4/span/a',
        )
    except Exception:
        # The first XPath did not match; try the alternative page layout.
        print("except")
        anchor = driver_.find_element(
            By.XPATH,
            '//*[@id="main-content"]/div/ul/li[4]/div/div/div/div[2]/div[1]/div/div[1]/div/div[1]/div/div/h4/span/a',
        )
    class_name = anchor.get_attribute("class")
    for link in driver_.find_elements(By.CLASS_NAME, class_name):
        href = link.get_attribute("href")
        # get_attribute returns None when the attribute is missing; skip such
        # elements instead of failing on them.
        if href and "biz" in href:
            page = [link.get_attribute("name"), href]
            driver2 = driver_Create()
            get_pages(page, driver2, -1)


# Get the last number contained in a string
def dig(word):
    """Return the last run of digits in *word* as an int, or 0 if there is none."""
    last = None
    # Walk through all digit runs; only the final one is kept.
    for last in re.finditer(r"\d+", word):
        pass
    if last is None:
        return 0
    return int(last.group())


# Get the total number of result pages from the first search page.
base_driver = driver_Create()
try:
    # NOTE(review): this URL is hard-coded to yelp.de while proc() builds its
    # URLs from `base`; confirm that this is intended.
    base_driver.get(f"https://www.yelp.de/search?cflt={shops}&find_loc={city}")
    total_pages = WebDriverWait(base_driver, 10).until(
        EC.presence_of_element_located(
            (By.XPATH, '//*[@id="main-content"]/div/ul/li[14]/div/div[2]/span')
        )
    )
    # dig() keeps the last number of the element's text.
    total_pages = dig(total_pages.text)
finally:
    # Quit the browser even when the element never shows up.
    base_driver.quit()

# total_pages = thread
# The bar is advanced once per processed shop; the total assumes 10 shops per
# result page.
pbar = tqdm(total=total_pages * 10)

# Run the scraper on one search-result page of the given city
def proc(url):
    """Scrape all shops on the search-result page starting at offset *url*.

    Args:
        url: Value for the ``start`` query parameter of the search URL.
    """
    driver = driver_Create()
    try:
        driver.get(
            base + f"search?cflt={shops}&find_loc=" + city + "&start=" + str(url)
        )
        driver.implicitly_wait(10)
        get_shops(driver)
    finally:
        # The search-page driver is not used after get_shops returns; quit it
        # so that no browser is left running per page.
        driver.quit()


if method == "defined":
    # Threads, processed chunk by chunk: `thread` search pages are scraped
    # concurrently and the next chunk only starts once the current one is
    # done (executor.map alone returns immediately and would not wait).
    offsets = [i * 10 for i in range(total_pages)]
    chunk = [offsets[i : i + thread] for i in range(0, len(offsets), thread)]
    with futures.ThreadPoolExecutor(max_workers=thread) as executor:
        for i in chunk:
            # wait() blocks until the chunk has finished without re-raising
            # worker errors, which keeps the run best-effort.
            futures.wait([executor.submit(proc, start) for start in i])

elif method == "fast":
    # Threads with the executor's default worker count; leaving the `with`
    # block waits for all pages to finish.
    with futures.ThreadPoolExecutor() as executor:
        executor.map(proc, [i * 10 for i in range(total_pages)])

elif method == "one":
    # One search page after the other, in the current thread.
    for i in range(total_pages):
        proc(i * 10)

else:
    print(f"Unknown method {method!r}: expected 'one', 'fast' or 'defined'")


pbar.close()
# File name: city, shop category, today's date and the text after the last "."
# of `base` minus its final character.
df.to_csv(
    f"datas/data {city}'s {shops} {datetime.now().strftime('%d-%m-%Y')} {base.split('.')[-1][:-1]}.csv"
)

  0%|          | 0/240 [00:00<?, ?it/s]

Basilique du Sacré-Cœur de Montmartre  processed...
The Hardware Société  processed...
Le Temps des Cerises  processed...
La Droguerie du Marais  processed...
La Crêperie de Josselin  processed...
Le Cinq  processed...
Hank  processed...
Le Hide  processed...
Le Petit Canard  processed...
Pyramide du Louvre  processed...
