# Extraction de données

In [32]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Cas Pratique 1

### Chargement des données

In [33]:
url = "https://www.basketball-reference.com/leagues/NBA_2022_per_game.html"

In [34]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

table = soup.find("table", {"id": "per_game_stats"})

In [35]:
headers = [th.text for th in table.find("thead").find_all("th")][1:]  # Exclure la première cellule vide

### Extraction des données

In [36]:
data = []
for row in table.find("tbody").find_all("tr"):
    player_data = []
    for td in row.find_all("td"):
        player_data.append(td.text if td.text else None)
    data.append(player_data)

df_nba = pd.DataFrame(data, columns=headers)

In [37]:
df_nba

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
0,Joel Embiid,27,PHI,C,68,68,33.8,9.8,19.6,.499,...,2.1,9.6,11.7,4.2,1.1,1.5,3.1,2.7,30.6,"MVP-2,AS,NBA2"
1,LeBron James,37,LAL,C,56,56,37.2,11.4,21.8,.524,...,1.1,7.1,8.2,6.2,1.3,1.1,3.5,2.2,30.3,"MVP-10,AS,NBA3"
2,Giannis Antetokounmpo,27,MIL,PF,67,67,32.9,10.3,18.6,.553,...,2.0,9.6,11.6,5.8,1.1,1.4,3.3,3.2,29.9,"MVP-3,DPOY-6,AS,NBA1"
3,Kevin Durant,33,BRK,PF,55,55,37.2,10.5,20.3,.518,...,0.5,6.9,7.4,6.4,0.9,0.9,3.5,2.1,29.9,"MVP-10,AS,NBA2"
4,Luka DonÄiÄ,22,DAL,PG,65,65,35.4,9.9,21.6,.457,...,0.9,8.3,9.1,8.7,1.2,0.6,4.5,2.2,28.4,"MVP-5,AS,NBA1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
808,Trayvon Palmer,27,DET,SG,1,0,17.0,0.0,1.0,.000,...,0.0,2.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,
809,Emanuel Terry,25,PHO,PF,3,0,6.0,0.0,1.7,.000,...,2.7,2.3,5.0,0.7,0.3,0.0,1.7,1.3,0.0,
810,Jon Teske,24,MEM,C,3,0,2.7,0.0,0.3,.000,...,0.0,0.7,0.7,0.3,0.3,0.0,0.0,0.3,0.0,
811,M.J. Walker,23,PHO,SG,2,0,4.0,0.0,2.0,.000,...,0.0,0.5,0.5,0.5,1.0,0.0,0.0,0.5,0.0,


### Export des données au format csv

In [38]:
df_nba.to_csv("./output/NBA/data_NBA_per_game_2022.csv", index=False)
print("Fichier CSV exporté : data_NBA_per_game_2022.csv")

Fichier CSV exporté : data_NBA_per_game_2022.csv


## Cas Pratique 2

### Chargement des données

In [39]:
url = "https://content.codecademy.com/courses/beautifulsoup/cacao/index.html"

In [40]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

table = soup.find("table", {"id": "cacaoTable"})

In [41]:
headers = [th.text.strip() for th in table.find_all("th")]

### Extraction des données

In [42]:
data = []
for row in table.find_all("tr")[1:]:
    cells = row.find_all("td")
    cocoa_percent = cells[4].text.strip().replace('%', '') 
    rating = cells[6].text.strip()
    data.append([float(rating), float(cocoa_percent)])
        
df_cacao = pd.DataFrame(data, columns=["Rating", "CocoaPercentage"])

### Export des données au format csv

In [43]:
df_cacao.to_csv("./output/CACAO/cacao_ratings.csv", index=False)

In [44]:
print("Fichier CSV exporté avec succès.")

Fichier CSV exporté avec succès.


### Export des données au format json

In [45]:
df_cacao.to_json("./output/CACAO/cacao_ratings_index.json", orient="index")
df_cacao.to_json("./output/CACAO/cacao_ratings_records.json", orient="records")
df_cacao.to_json("./output/CACAO/cacao_ratings_values.json", orient="values")

In [46]:
print("Fichiers JSON exportés avec succès.")

Fichiers JSON exportés avec succès.


## Cas Pratique 3 : Laptops

### Chargement des données

In [47]:
base_url = "https://www.webscraper.io/test-sites/e-commerce/static/computers/laptops"

In [48]:
response = requests.get(base_url)
soup = BeautifulSoup(response.text, "html.parser")

In [49]:
last_page = int(soup.find_all("a", class_="page-link")[-2].text)

### Extraction des données

In [50]:
data = []

for page in range(1, last_page + 1):
    url = f"{base_url}?page={page}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    items = soup.find_all("div", class_="thumbnail")
    
    for item in items:
        name = item.find("a", class_="title").text.strip()
        description = item.find("p", class_="description").text.strip()
        price = float(item.find("h4", class_="price").text.replace("$", "").strip())
        rating_balise = item.find("p", {"data-rating": True})
        rating = int(rating_balise["data-rating"]) if rating_balise else None
        
        data.append([name, description, price, rating])

df_laptop = pd.DataFrame(data, columns=["Name", "Description", "Price", "Rating"])

In [51]:
df_laptop

Unnamed: 0,Name,Description,Price,Rating
0,Packard 255 G2,"15.6"", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows...",416.99,2
1,Aspire E1-510,"15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux",306.99,3
2,ThinkPad T540p,"15.6"", Core i5-4200M, 4GB, 500GB, Win7 Pro 64bit",1178.99,1
3,ProBook,"14"", Core i5 2.6GHz, 4GB, 500GB, Win7 Pro 64bit",739.99,4
4,ThinkPad X240,"12.5"", Core i5-4300U, 8GB, 240GB SSD, Win7 Pro...",1311.99,3
...,...,...,...,...
112,Lenovo ThinkPa...,"Lenovo ThinkPad T470, 14"" FHD IPS, Core i5-720...",1349.23,1
113,Lenovo ThinkPa...,"Lenovo ThinkPad Yoga 370 Black, 13.3"" FHD IPS ...",1362.24,2
114,Toshiba Porteg...,"Toshiba Portege X20W-D-10V Black/Blue, 12.5"" F...",1366.32,1
115,Asus ASUSPRO B...,"Asus ASUSPRO B9440UA-GV0279R Gray, 14"" FHD, Co...",1381.13,1


### Export des données au format csv

In [52]:
df_laptop.to_csv("./output/LAPTOPS/laptops_data.csv", sep=";", index=False)

### Analyse

Les 3 laptops les mieux notés.

In [53]:
top_rated = df_laptop.nlargest(3, "Rating")
top_rated

Unnamed: 0,Name,Description,Price,Rating
3,ProBook,"14"", Core i5 2.6GHz, 4GB, 500GB, Win7 Pro 64bit",739.99,4
15,Dell Vostro 15,"Dell Vostro 15 (3568) Black, 15.6"" FHD, Core i...",488.78,4
26,Asus ROG Strix...,"Asus ROG Strix GL702ZC-GC154T, 17.3"" FHD, Ryze...",1769.0,4


Les 3 laptops les moins bien notés.

In [54]:
lowest_rated = df_laptop.nsmallest(3, "Rating")
lowest_rated

Unnamed: 0,Name,Description,Price,Rating
2,ThinkPad T540p,"15.6"", Core i5-4200M, 4GB, 500GB, Win7 Pro 64bit",1178.99,1
5,Aspire E1-572G,"15.6"", Core i5-4200U, 8GB, 1TB, Radeon R7 M265...",581.99,1
7,Pavilion,"15.6"", Core i5-4200U, 6GB, 750GB, Windows 8.1",609.99,1


Les 3 laptops les plus chers.

In [55]:
most_expensive = df_laptop.nlargest(3, "Price")
most_expensive

Unnamed: 0,Name,Description,Price,Rating
28,Asus ROG Strix...,"Asus ROG Strix SCAR Edition GL503VM-ED115T, 15...",1799.0,3
26,Asus ROG Strix...,"Asus ROG Strix GL702ZC-GC154T, 17.3"" FHD, Ryze...",1769.0,4
27,Asus ROG Strix...,"Asus ROG Strix GL702ZC-GC209T, 17.3"" FHD IPS, ...",1769.0,1


Les 3 laptops les moins chers.

In [56]:
least_expensive = df_laptop.nsmallest(3, "Price")
least_expensive

Unnamed: 0,Name,Description,Price,Rating
29,Asus VivoBook...,"Asus VivoBook X441NA-GA190 Chocolate Black, 14...",295.99,3
30,Prestigio Smar...,"Prestigio SmartBook 133S Dark Grey, 13.3"" FHD ...",299.0,2
31,Prestigio Smar...,"Prestigio SmartBook 133S Gold, 13.3"" FHD IPS, ...",299.0,4


## Cas Pratique 3 : Tablets

### Chargement des données

In [57]:
base_url = "https://www.webscraper.io/test-sites/e-commerce/static/computers/tablets"

In [58]:
last_page = int(soup.find_all("a", class_="page-link")[-2].text)

### Extraction des données

In [59]:
data = []

for page in range(1, last_page + 1):
    url = f"{base_url}?page={page}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    items = soup.find_all("div", class_="thumbnail")
    
    for item in items:
        name = item.find("a", class_="title").text.strip()
        description = item.find("p", class_="description").text.strip()
        price = float(item.find("h4", class_="price").text.replace("$", "").strip())
        rating_balise = item.find("p", {"data-rating": True})
        rating = int(rating_balise["data-rating"]) if rating_balise else None
        
        data.append([name, description, price, rating])

df_tablets = pd.DataFrame(data, columns=["Name", "Description", "Price", "Rating"])

In [60]:
df_tablets

Unnamed: 0,Name,Description,Price,Rating
0,Lenovo IdeaTab,"7"" screen, Android",69.99,3
1,Acer Iconia,"7"" screen, Android, 16GB",96.99,1
2,Asus MeMO Pad,"7"" screen, Android, 8GB",102.99,4
3,Amazon Kindle,"6"" screen, wifi",103.99,4
4,iPad Mini Reti...,"Wi-Fi + Cellular, 32GB, Silver",537.99,2
5,IdeaTab A3500L,"Black, 7"" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2",88.99,4
6,Galaxy Tab,"16GB, White",251.99,3
7,IdeaTab A3500-...,"Blue, 7"" IPS, Quad-Core 1.3GHz, 8GB, 3G, Andro...",148.99,2
8,Galaxy Tab 3,"7"", 8GB, Wi-Fi, Android 4.2, White",97.99,2
9,Galaxy Note,"12.2"", 32GB, WiFi, Android 4.4, White",489.99,3


### Export des données au format csv

In [61]:
df_tablets.to_csv("./output/TABLETS/tablets_data.csv", sep=";", index=False)

### Analyse

Les 3 tablettes les mieux notés.

In [62]:
top_rated = df_tablets.nlargest(3, "Rating")
top_rated

Unnamed: 0,Name,Description,Price,Rating
2,Asus MeMO Pad,"7"" screen, Android, 8GB",102.99,4
3,Amazon Kindle,"6"" screen, wifi",103.99,4
5,IdeaTab A3500L,"Black, 7"" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2",88.99,4


Les 3 tablettes les moins bien notés.

In [63]:
lowest_rated = df_tablets.nsmallest(3, "Rating")
lowest_rated

Unnamed: 0,Name,Description,Price,Rating
1,Acer Iconia,"7"" screen, Android, 16GB",96.99,1
10,MeMO Pad 7,"White, 7"", Atom 1.2GHz, 8GB, Android 4.4",130.99,1
4,iPad Mini Reti...,"Wi-Fi + Cellular, 32GB, Silver",537.99,2


Les 3 tablettes les plus chers.

In [64]:
most_expensive = df_tablets.nlargest(3, "Price")
most_expensive

Unnamed: 0,Name,Description,Price,Rating
20,Apple iPad Air,"Wi-Fi, 64GB, Silver",603.99,3
14,Galaxy Note 10...,"10.1"", 32GB, Black",587.99,2
4,iPad Mini Reti...,"Wi-Fi + Cellular, 32GB, Silver",537.99,2


Les 3 tablettes les moins chers.

In [65]:
least_expensive = df_tablets.nsmallest(3, "Price")
least_expensive

Unnamed: 0,Name,Description,Price,Rating
0,Lenovo IdeaTab,"7"" screen, Android",69.99,3
5,IdeaTab A3500L,"Black, 7"" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2",88.99,4
1,Acer Iconia,"7"" screen, Android, 16GB",96.99,1
