In [None]:
# smoke-test cell: prints a fixed greeting (presumably to check that the kernel runs)
print('hello')

In [None]:
# standard python
import os
import json
import time
import random
import warnings
import csv
from datetime import datetime

# external libs
import pandas as pd
import requests
from tqdm.notebook import tqdm

# beautiful soup
from bs4 import BeautifulSoup

# selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
# selenium with undetected tracking
import undetected_chromedriver as uc

# better visualization in terminal
from IPython.display import display, Image, HTML, clear_output
from PIL import Image as PILImage
from colorama import Fore, Style, init

# no warnings
warnings.filterwarnings("ignore", message="Pyarrow will become a required dependency")


# 0 - Web scraping demo

## A - Functions

In [None]:
# human-like typing helper (cosmetic only)
def type_with_human_effect(element, text, min_delay=0.05, max_delay=0.15):
    """Send `text` to `element` one character at a time.

    After every character the function pauses for a random duration drawn
    uniformly between `min_delay` and `max_delay` seconds, so the typing
    looks human when the browser is being watched.

    Args:
        element: object exposing ``send_keys`` (a Selenium element in this notebook).
        text: iterable of characters to send, in order.
        min_delay: lower bound of the pause, in seconds.
        max_delay: upper bound of the pause, in seconds.
    """
    for keystroke in text:
        element.send_keys(keystroke)
        pause = random.uniform(min_delay, max_delay)
        time.sleep(pause)

# scrolling function
def smooth_scroll(driver, scroll_to=None, duration=2, steps=25):
    """Scroll the page vertically in equal increments.

    Args:
        driver: object exposing ``execute_script`` (a Selenium WebDriver in this notebook).
        scroll_to: target vertical offset; when None, the value returned by
            ``document.body.scrollHeight`` is used (i.e. scroll to the bottom).
        duration: total time, in seconds, spread evenly over the increments.
        steps: number of increments. Values below 1 are treated as 1 so the
            function performs a single jump instead of dividing by zero.
    """
    # guard: a step count of 0 would raise ZeroDivisionError below
    if steps < 1:
        steps = 1

    # default target is the full height of the document body
    if scroll_to is None:
        target_position = driver.execute_script("return document.body.scrollHeight")
    else:
        target_position = scroll_to

    start_position = driver.execute_script("return window.pageYOffset")
    step_size = (target_position - start_position) / steps
    pause = duration / steps  # loop-invariant, computed once

    for i in range(steps):
        new_position = start_position + step_size * (i + 1)
        driver.execute_script(f"window.scrollTo(0, {new_position})")
        time.sleep(pause)

# screenshot function
def take_and_display_screenshot(driver, filename="screenshot.png", width=800):
    """Save a screenshot, shrink/enlarge it to `width` pixels and show it in the notebook.

    Args:
        driver: object exposing ``save_screenshot`` (a Selenium WebDriver in this notebook).
        filename: path of the image file; it is written by the driver and then
            overwritten with the resized version.
        width: width in pixels of the resized image; the height is scaled by
            the same ratio to keep the aspect ratio.
    """
    driver.save_screenshot(filename)

    # resize inside a context manager so the source file handle is released
    # before the same path is overwritten below
    with PILImage.open(filename) as source:
        scale = width / float(source.size[0])
        height = int(float(source.size[1]) * float(scale))
        resized = source.resize((width, height), PILImage.LANCZOS)
    resized.save(filename)

    # display image
    display(Image(filename=filename, width=width))

# highlight elements
def highlight_element(driver, element, duration=2):
    """Temporarily highlight an element on the page (cosmetic only).

    The element's inline style gets a red border and yellow background
    appended, the function sleeps for `duration` seconds, then the inline
    style read at the start is written back.

    Args:
        driver: object exposing ``execute_script`` (a Selenium WebDriver in this notebook).
        element: element to highlight; must expose ``get_attribute``.
        duration: how long, in seconds, the highlight stays visible.
    """
    # a missing attribute may come back as None; JavaScript would turn that
    # into the literal text "null" when concatenating, so fall back to ""
    original_style = element.get_attribute("style") or ""
    driver.execute_script("""
    arguments[0].setAttribute('style', arguments[1] + 
        '; border: 2px solid red; background: yellow; color: black;');
    """, element, original_style)
    try:
        time.sleep(duration)
    finally:
        # always put the original style back, even if the pause is interrupted
        driver.execute_script("""
    arguments[0].setAttribute('style', arguments[1]);
    """, element, original_style)


## B - Example of Browser - Filling out a form

In [None]:
# colorama init for terminal
init()

# setting up the browser
print(f"{Fore.CYAN}browser configuration...{Style.RESET_ALL}")

# browser options
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")  # full screen window
options.add_argument("--disable-notifications")  # no notifs

# cosmetic progress bar (only sleeps, no real initialisation happens here)
for i in tqdm(range(10), desc="Webdriver init"):
    time.sleep(0.2)

# NOTE: this starts a regular Selenium Chrome driver; undetected_chromedriver
# (`uc`) is imported at the top of the notebook but is not used in this cell
driver = webdriver.Chrome(options=options)

try:
    print(f"\n{Fore.CYAN}📝 Filling out a Form{Style.RESET_ALL}")
    driver.get("https://httpbin.org/forms/post")
    time.sleep(2)
    take_and_display_screenshot(driver, "form_initial.png")

    print("...")

    # fill in the form fields with a visual effect
    try:
        # customer name
        customer_name = driver.find_element(By.NAME, "custname")
        highlight_element(driver, customer_name)
        type_with_human_effect(customer_name, "Amaury Gellé")

        # phone number
        phone = driver.find_element(By.NAME, "custtel")
        highlight_element(driver, phone)
        type_with_human_effect(phone, "0123456789")

        # email
        email = driver.find_element(By.NAME, "custemail")
        highlight_element(driver, email)
        type_with_human_effect(email, "amaurygelle@gmail.com")

        # pizza size
        pizza_choice = driver.find_element(By.CSS_SELECTOR, "input[value='medium']")
        highlight_element(driver, pizza_choice)
        pizza_choice.click()

        # topping: tick the mushroom checkbox. Overwriting the `value` of the
        # first "topping" input (as done previously) renames that option but
        # never selects it, so no topping was submitted.
        toppings = driver.find_element(
            By.CSS_SELECTOR, "input[name='topping'][value='mushroom']"
        )
        highlight_element(driver, toppings)
        toppings.click()

        # delivery time
        time_elem = driver.find_element(By.NAME, "delivery")
        highlight_element(driver, time_elem)
        type_with_human_effect(time_elem, "18:30")

        # delivery instructions
        instructions = driver.find_element(By.NAME, "comments")
        highlight_element(driver, instructions)
        type_with_human_effect(instructions, "Please leave it in front of the doorway - thanks!")

        take_and_display_screenshot(driver, "form_filled.png")

        # submit the form
        print("Soumission du formulaire...")
        # match the form's button whether or not it declares type="submit":
        # an attribute selector only matches an explicitly written attribute
        submit_button = driver.find_element(By.CSS_SELECTOR, "form button")
        highlight_element(driver, submit_button)
        submit_button.click()

        time.sleep(2)
        take_and_display_screenshot(driver, "form_submitted.png")

    except Exception as e:
        # best-effort demo: report the problem instead of stopping the notebook
        print(f"Erreur lors du remplissage du formulaire: {e}")
finally:
    # close the browser so it does not stay open once the demo is over
    driver.quit()

## C - Mimicking a Google search

In [None]:
# colorama init for terminal
init()

# setting up the browser
print(f"{Fore.CYAN}browser configuration...{Style.RESET_ALL}")

# browser options
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")  # full screen window
options.add_argument("--disable-notifications")  # no notifs

# cosmetic progress bar (only sleeps, no real initialisation happens here)
for i in tqdm(range(10), desc="Webdriver init"):
    time.sleep(0.2)

# NOTE: this starts a regular Selenium Chrome driver; undetected_chromedriver
# (`uc`) is imported at the top of the notebook but is not used in this cell
driver = webdriver.Chrome(options=options)

# collected search results: one dict per link (Title / URL / Description)
search_results = []

try:
    print(f"{Fore.GREEN}Interactive Google Search{Style.RESET_ALL}")
    print("Opening Google and accepting cookies...")

    # open Google
    driver.get("https://www.google.com")
    time.sleep(2)

    # Accept cookies if we have to (might have been done in the past)
    try:
        cookie_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Accept all')]"))
        )
        cookie_button.click()
        print("Cookies accepted ✓")
        time.sleep(1)
    except Exception:  # not a bare except: Ctrl-C must still interrupt the cell
        print("No cookies found or cookies already accepted")

    # Typing with human-like behavior
    search_box = driver.find_element(By.NAME, "q")
    highlight_element(driver, search_box)
    search_term = "Albert School data courses"
    print(f"Looking: '{search_term}'")

    type_with_human_effect(search_box, search_term)
    take_and_display_screenshot(driver, "google_search_typing.png")

    # Submitting the request
    search_box.send_keys(Keys.RETURN)
    time.sleep(3)
    take_and_display_screenshot(driver, "google_results.png")

    # Scrolling the page and taking screenshots
    print("Scrolling the page...")
    smooth_scroll(driver, duration=4)
    take_and_display_screenshot(driver, "google_results_scrolled.png")

    # Extracting data
    print(f"{Fore.BLUE} Extracting links and descriptions...{Style.RESET_ALL}")

    # Waiting for the elements of the page to have loaded
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "a[href^='https://']"))
        )
        print("Loaded ✓")
    except Exception:
        print("Timeout for research terms")

    time.sleep(1)

    # all https links whose href does not contain "google"
    links = driver.find_elements(By.CSS_SELECTOR, "a[href^='https://']:not([href*='google'])")

    print(f"Links found: {len(links)}")

    # NOTE(review): an XPath union is evaluated in document order and the first
    # branch reaches every <h3> below any ancestor, so this probably yields the
    # first <h3> of the page rather than the one closest to the link — confirm.
    title_xpath = "./ancestor::*//h3 | .//*//h3 | ./following::h3[1] | ./preceding::h3[1]"

    # description candidates, tried in this order until one has text
    description_xpaths = (
        "./ancestor::*[3]//div[string-length(text()) > 50]",
        "./following::div[string-length(text()) > 50][1]",
        "./ancestor::*//div[contains(@class, 'desc') or contains(@class, 'snippet') or contains(@class, 'description')]",
    )

    for link in links:
        try:
            # get the URL
            url = link.get_attribute("href")

            # Get the text of the link
            title = link.text

            # if the link itself has no text, fall back to a nearby <h3>
            if not title.strip():
                try:
                    h3 = link.find_element(By.XPATH, title_xpath)
                    title = h3.text
                except Exception:
                    title = "No title found"

            # Find descriptions. Each candidate is looked up on its own:
            # previously all three lookups sat in one list literal, so a single
            # missing candidate discarded the other two as well.
            description = "No description found"
            for xpath in description_xpaths:
                try:
                    candidates = link.find_elements(By.XPATH, xpath)
                    candidate_text = candidates[0].text.strip() if candidates else ""
                except Exception:
                    # best effort: a failing candidate just means "try the next one"
                    continue
                if candidate_text:
                    description = candidate_text
                    break

            # keep only rows with a title and a non-Google URL
            if title.strip() and url and "google" not in url.lower():
                search_results.append({
                    "Title": title,
                    "URL": url,
                    "Description": description
                })
        except Exception as e:
            print(f"Error when extracting: {str(e)[:100]}...")
            continue
finally:
    # everything below works on plain Python data, so the browser can go now
    driver.quit()

# No duplicates: keep the first result seen for each URL
unique_results = []
seen_urls = set()
for result in search_results:
    if result["URL"] not in seen_urls:
        seen_urls.add(result["URL"])
        unique_results.append(result)

search_results = unique_results

# Display first results
print(f"\n{Fore.YELLOW}Looking at firt results ({len(search_results)} au total):{Style.RESET_ALL}")
for i, result in enumerate(search_results[:3], 1):
    print(f"\nRésultat {i}:")
    print(f"Title: {result['Title']}")
    print(f"URL: {result['URL']}")
    print(f"Description: {result['Description'][:100]}...")

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"google_search_results_{timestamp}.csv"

# Writing in csv
with open(csv_filename, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Title', 'URL', 'Description']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(search_results)

print(f"\n{Fore.GREEN}Exported results in {csv_filename} ({len(search_results)} entrées){Style.RESET_ALL}")

print("\nExportation terminée!")
# cosmetic progress bar
for i in tqdm(range(5), desc="Finalisation"):
    time.sleep(0.3)


# I - Web scraping IMDB (Beautiful Soup)

## A - Making a request on the Top 250 movies page

In [None]:
# looking at the robots.txt

#https://www.imdb.com/robots.txt


In [None]:
# request the IMDB Top 250 Movies page and report the HTTP status

url = "https://www.imdb.com/chart/top/"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9"
}

# your code here
#
#
#
# (the lines below read `response.status_code`, so the code written above
#  has to define a `response` object)

status_code = response.status_code
print(f"request status code: {status_code}")
print("all good" if status_code == 200 else "no webpage found")
print('')


In [None]:
# creating a Beautiful Soup object based on the response.text (the HTML we parsed from the URL)



## B - Starting small - retrieving the information of the first movie

In [None]:
# soup.select_one() to fetch the first instance of the element we want to fetch

#first_movie = 


In [None]:
# getting the title with first_movie.select_one() to fetch the first instance of the list category

#first_title = 


In [None]:
# fetch the first year

#first_year = 


In [None]:
# fetch the first rating

#first_rating = 


## C - Retrieve data for the whole page

In [None]:
# one empty list per column that will be collected from the page

movie_titles, movie_years, movie_ratings = [], [], []

#
#
#

# show what has been collected, one labelled line per list
for label, collected in (
    ('titles:', movie_titles),
    ('years:', movie_years),
    ('ratings:', movie_ratings),
):
    print(label, collected)


## D - Creating a dataframe and saving the data

In [None]:
# assemble the three collected lists into a dataframe (one column per list)

movie_columns = {
    'Title': movie_titles,
    'Year': movie_years,
    'Rating': movie_ratings,
}
movies_df = pd.DataFrame(data=movie_columns)

# last expression of the cell: rich display of the first five rows
movies_df.head(5)


In [None]:
# saving the dataframe in a dedicated folder

path = 'xxxx'  # placeholder: replace with the destination folder
file = 'imdb_top_movies.csv'

# create the destination folder when it is missing; to_csv does not create
# parent directories and would fail otherwise
if path:
    os.makedirs(path, exist_ok=True)
movies_df.to_csv(f'{path}/{file}', index=False)
print(f"data exported to {path}/{file}")


# II - Web scraping an Olist Website

## A - Fetch one product

In [None]:
# headless Chrome configuration: arguments are applied in the order listed
options = Options()
chrome_arguments = [
    '--headless',  # remove this entry if you want to see what the browser is doing
    '--no-sandbox',
    '--disable-dev-shm-usage',
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
]
for chrome_argument in chrome_arguments:
    options.add_argument(chrome_argument)

# install the Chrome driver and wrap it in a service
driver_path = ChromeDriverManager().install()
service = Service(driver_path)

# browser handle used by the rest of the cell
driver = webdriver.Chrome(service=service, options=options)

# product page to open
url = "https://www.sunsetcosmeticos.com.br/produto/base-cobertura-total-maple-absolute-new-york-208"

driver.get(url)

#
#
#


In [None]:
# transforming it into a function

def fetch_product(driver, url):
    '''
    Fetch a product's title, price, stars and description (exercise skeleton).

    The body is not implemented yet: apart from this docstring the function
    only contains placeholder comments, so calling it currently returns None.

    Args:
        driver: browser handle; presumably the Selenium driver created in the
            cell above - confirm once the body is written.
        url: address of the product page; presumably a product URL such as
            the one opened in the cell above - confirm once the body is written.
    '''
    #
    #
    #

# executing the function to make sure it works



## B - Fetch one brand

In [None]:
# start by fetching the url of the first product of the page, then use the function defined above to fetch its information

# headless Chrome configuration: arguments are applied in the order listed
options = Options()
for chrome_argument in (
    '--headless',  # remove this entry if you want to see what the browser is doing
    '--no-sandbox',
    '--disable-dev-shm-usage',
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
):
    options.add_argument(chrome_argument)

# install the Chrome driver and wrap it in a service
driver_path = ChromeDriverManager().install()
service = Service(driver_path)

# browser handle used by the rest of the cell
driver = webdriver.Chrome(service=service, options=options)

# brand page listing the products
url = 'https://www.sunsetcosmeticos.com.br/absolute_new_york'

# fetch the url of the first product
#
#

# fetch its information
#
#


In [None]:
# generalize this method for every product of the page (no need to add a scroll for the time being)

#
#
#


In [None]:
# create a function to fetch all products of the page (and add a scroll this time, to get them all)

def scroll_and_load_more(max_scrolls=20):
    '''
    Scroll the page to load more products (exercise skeleton).

    Intended strategy, as described by the exercise: count the products in
    the page before scrolling, compare with the count after scrolling, and
    stop as soon as the count no longer changes or after `max_scrolls`
    scrolls (20 by default).

    Only the initial product count and the scroll counter are set up so far;
    the scrolling loop itself is still to be written.

    NOTE(review): this function reads the module-level `driver` instead of
    receiving it as a parameter, unlike fetch_product and
    fetch_brand_products - a `driver` must exist before it is called.
    '''
    # number of elements with the class "product-block" currently in the page
    previous_count = len(driver.find_elements(By.CLASS_NAME, "product-block"))
    # number of scrolls performed so far
    scroll_count = 0
    
    #
    #
    #

def fetch_brand_products(driver, brand='absolute_new_york'):
    '''
    Fetch all products of a brand page (exercise skeleton).

    Intended steps, as described by the exercise: load every product of the
    page with the scroll function, capture the product urls, then call
    fetch_product on each url to retrieve its information.

    The body is not implemented yet, so calling it currently returns None.

    Args:
        driver: browser handle; presumably a Selenium driver - confirm once
            the body is written.
        brand: brand identifier, 'absolute_new_york' by default; presumably
            the last segment of the brand page url - confirm once the body
            is written.
    '''
    
    #
    #
    #



In [None]:
# testing the code

# add browser configuration as we did previously
# (`driver` is not created in this cell: it has to exist already, either from
#  an earlier cell or from the configuration added here)

brand = 'absolute_new_york'
# result of the brand scrape; it is passed as-is to pd.DataFrame below
new_york = fetch_brand_products(driver, brand)
df = pd.DataFrame(new_york)

# last expression of the cell: rich display of the first five rows
df.head(5)


In [None]:
# saving the dataframe in the folder you want

# file name derived from the brand, with dashes replaced by underscores
file_name = brand.replace('-', '_') + '.csv'
print(file_name)
path = 'xxxx'  # placeholder: replace with the destination folder

# create the destination folder when it is missing; to_csv does not create
# parent directories and would fail otherwise
if path:
    os.makedirs(path, exist_ok=True)
df.to_csv(f'{path}/{file_name}', index=False)
print(f"Data saved to {file_name} ({len(new_york)} products)")


## C - All brands

In [None]:
# fill the final below function 


def get_all_brands(driver, url= 'https://www.sunsetcosmeticos.com.br/p/marcas'):
    """
    Fetch every brand and its products (exercise skeleton).

    Intended steps, as described by the exercise: collect all the brand url
    links present in `url`, then use the previous functions to get the
    product urls of each brand and the information of each product.

    The body is not implemented yet, so calling it currently returns None.

    Args:
        driver: browser handle; presumably a Selenium driver - confirm once
            the body is written.
        url: page listing the brands; defaults to
            https://www.sunsetcosmeticos.com.br/p/marcas
    """
    #
    #
    #


In [None]:
# test your code, retrieve all products from sunsetcosmeticos, and save it to a file
