# Scraping Crypto

In [1]:
import os
import time
import random
import pandas as pd
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [None]:

def configure_driver():
    """
    Build and return a Selenium Chrome driver instance.

    The browser is started with the ``--disable-gpu``, ``--no-sandbox`` and
    ``--disable-dev-shm-usage`` flags. The chromedriver binary path comes
    from ``ChromeDriverManager().install()``.
    """
    chrome_options = Options()
    # Headless mode is deliberately left off so the browser can be watched
    # live; add the "--headless" flag below to run without a window.
    for flag in ("--disable-gpu", "--no-sandbox", "--disable-dev-shm-usage"):
        chrome_options.add_argument(flag)

    chrome_service = Service(ChromeDriverManager().install())
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

def random_sleep(min_seconds=2, max_seconds=5):
    """
    Pause for a random duration between ``min_seconds`` and ``max_seconds``.

    The duration is drawn with ``random.uniform``. Called without arguments
    it sleeps between 2 and 5 seconds, as before.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))

def convert_date_to_unix(date_str):
    """
    Convert a date string in ``DD/MM/YYYY`` format to a UNIX timestamp.

    The date is interpreted as midnight UTC, so the result is the same on
    every machine regardless of its local time zone (a naive datetime would
    be converted using the local zone instead).

    Args:
        date_str: Date formatted as ``DD/MM/YYYY``.

    Returns:
        Whole seconds since 1970-01-01 00:00:00 UTC, as an ``int``.

    Raises:
        ValueError: If ``date_str`` does not match ``DD/MM/YYYY``.
    """
    # NOTE(review): the callers put this value in Yahoo "period1"/"period2"
    # URL parameters, which are presumably UTC epoch seconds -- confirm.
    from datetime import datetime, timezone

    midnight_utc = datetime.strptime(date_str, "%d/%m/%Y").replace(tzinfo=timezone.utc)
    return int(midnight_utc.timestamp())

def process_crypto_data(driver, name, url):
    """
    Scrape the historical-data table of one crypto and save it as a CSV file.

    Loads ``url`` in ``driver`` (intended for a Yahoo Finance history page),
    tries to dismiss a cookie banner, reads the first ``<table>`` on the page
    and writes it to ``data/raw/<name>_yahoo_data.csv`` (spaces in ``name``
    replaced by underscores) with an extra ``Crypto`` column set to ``name``.

    Any ``Exception`` raised along the way is printed rather than propagated.

    Args:
        driver: Selenium WebDriver used to load the page.
        name: Display name of the crypto, used in messages and the file name.
        url: Address of the page holding the table.
    """
    try:
        print(f"Scraping pour {name}")
        driver.get(url)
        random_sleep()

        # Block (up to 20 s) until the page exposes a <body> element.
        page_wait = WebDriverWait(driver, 20)
        page_wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        # Best effort: click an "Accept" button if one becomes clickable within 5 s.
        try:
            accept_locator = (By.XPATH, "//button[contains(text(),'Accept')]")
            WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable(accept_locator)
            ).click()
        except Exception:
            print(f"Pas de bouton de cookies trouvé pour {name}")

        random_sleep()

        # Read the table, build a DataFrame and write it to disk.
        try:
            table_locator = (By.TAG_NAME, "table")
            history_table = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located(table_locator)
            )
            column_names = [
                th.text for th in history_table.find_elements(By.TAG_NAME, "th")
            ]
            # Skip the first <tr> (presumably the header row).
            body_rows = history_table.find_elements(By.TAG_NAME, "tr")[1:]

            records = [
                [td.text.strip() for td in tr.find_elements(By.TAG_NAME, "td")]
                for tr in body_rows
            ]

            frame = pd.DataFrame(records, columns=column_names)
            frame['Crypto'] = name

            output_dir = "data/raw"
            os.makedirs(output_dir, exist_ok=True)
            csv_name = f"{name.replace(' ', '_')}_yahoo_data.csv"
            csv_path = os.path.join(output_dir, csv_name)
            frame.to_csv(csv_path, index=False, encoding="utf-8")
            print(f"Données sauvegardées pour {name} : {csv_path}")
        except Exception as table_error:
            print(f"Erreur pour {name}: {table_error}")

    except Exception as outer_error:
        print(f"Erreur générale pour {name}: {outer_error}")


# Cryptocurrencies to scrape, each written as "Name (TICKER)".
AVAILABLE_CRYPTOS = [
    "Bitcoin (BTC)", "Ethereum (ETH)", "Cardano (ADA)", "Dogecoin (DOGE)",
    "Litecoin (LTC)", "Cosmos (ATOM)", "Chainlink (LINK)",
    "Polygon (MATIC)", "XRP (XRP)", "Filecoin (FIL)"
]


# Requested history window, as DD/MM/YYYY strings.
start_date = "01/01/2020"
end_date = "31/12/2024"

start_timestamp = convert_date_to_unix(start_date)
end_timestamp = convert_date_to_unix(end_date)


def _ticker_from_label(label):
    """Return the text between the parentheses of a "Name (TICKER)" label."""
    return label[label.index("(") + 1:label.rindex(")")]


# One history-page URL per entry of AVAILABLE_CRYPTOS, in the same order.
# The URLs are derived from the list so the two cannot drift apart.
yahoo_urls = {
    label: (
        f"https://finance.yahoo.com/quote/{_ticker_from_label(label)}-USD/history"
        f"?period1={start_timestamp}&period2={end_timestamp}&interval=1d"
    )
    for label in AVAILABLE_CRYPTOS
}


driver = configure_driver()

# Scrape every configured crypto with the same browser session. The finally
# clause makes sure the browser is closed even if the loop is interrupted
# (for example by KeyboardInterrupt, which process_crypto_data does not catch).
try:
    for name, url in yahoo_urls.items():
        process_crypto_data(driver, name, url)
finally:
    driver.quit()

print("Fusion des données en cours...")
raw_dir = "data/raw"
processed_dir = "data/processed"
merged_file = os.path.join(processed_dir, "merged_data.csv")

# data/raw is only created by process_crypto_data once a table has been
# scraped, so it may be missing if every scrape failed. Sorting makes the
# row order of the merged file independent of the directory listing order.
if os.path.isdir(raw_dir):
    all_files = sorted(f for f in os.listdir(raw_dir) if f.endswith(".csv"))
else:
    all_files = []

df_list = []
for file in all_files:
    file_path = os.path.join(raw_dir, file)
    df = pd.read_csv(file_path)
    df_list.append(df)

if df_list:
    merged_df = pd.concat(df_list, ignore_index=True)

    os.makedirs(processed_dir, exist_ok=True)
    merged_df.to_csv(merged_file, index=False)
    print(f"Données fusionnées sauvegardées dans : {merged_file}")

    print(merged_df.head())
    print("Nombre de lignes par crypto :")
    print(merged_df['Crypto'].value_counts())
else:
    # pd.concat raises ValueError on an empty list; report the situation
    # instead and keep merged_df defined as an empty frame.
    merged_df = pd.DataFrame()
    print(f"Aucun fichier CSV trouvé dans {raw_dir} : fusion ignorée.")


Scraping pour Bitcoin (BTC)
Données sauvegardées pour Bitcoin (BTC) : data/raw\Bitcoin_(BTC)_yahoo_data.csv
Scraping pour Ethereum (ETH)
Pas de bouton de cookies trouvé pour Ethereum (ETH)
Données sauvegardées pour Ethereum (ETH) : data/raw\Ethereum_(ETH)_yahoo_data.csv
Scraping pour Cardano (ADA)
Pas de bouton de cookies trouvé pour Cardano (ADA)
Données sauvegardées pour Cardano (ADA) : data/raw\Cardano_(ADA)_yahoo_data.csv
Scraping pour Dogecoin (DOGE)
Pas de bouton de cookies trouvé pour Dogecoin (DOGE)
Données sauvegardées pour Dogecoin (DOGE) : data/raw\Dogecoin_(DOGE)_yahoo_data.csv
Scraping pour Litecoin (LTC)
Pas de bouton de cookies trouvé pour Litecoin (LTC)
Données sauvegardées pour Litecoin (LTC) : data/raw\Litecoin_(LTC)_yahoo_data.csv
Scraping pour Cosmos (ATOM)
Pas de bouton de cookies trouvé pour Cosmos (ATOM)
Données sauvegardées pour Cosmos (ATOM) : data/raw\Cosmos_(ATOM)_yahoo_data.csv
Scraping pour Chainlink (LINK)
Pas de bouton de cookies trouvé pour Chainlink (LI