# Scraping avec BeautifulSoup

In [1]:
!pip install requests beautifulsoup4 pandas





## Cas pratique 1

In [3]:
from pathlib import Path

import requests
from bs4 import BeautifulSoup
import pandas as pd

# Download the NBA_2022 per-game statistics page.
url = "https://www.basketball-reference.com/leagues/NBA_2022_per_game.html"
# A timeout prevents the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces HTTP errors (e.g. rate limiting) immediately
# instead of letting an error page be parsed as if it were the data.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

table = soup.find('table', {'id': 'per_game_stats'})
if table is None:
    # Fail with an explicit message rather than an AttributeError on None below.
    raise RuntimeError("Table 'per_game_stats' introuvable dans la page téléchargée.")

# The first header cell is skipped so the header count lines up with the <td>
# cells collected per row — presumably it is a rank column rendered as <th> in
# the body rows; verify against the page markup.
headers = [th.text for th in table.find('thead').find_all('th')][1:]

# Keep only rows whose number of <td> cells matches the header count; any other
# row in the body is ignored.
rows = []
for row in table.find('tbody').find_all('tr'):
    cells = row.find_all('td')
    if len(cells) == len(headers):
        rows.append([cell.text for cell in cells])

df_nba = pd.DataFrame(rows, columns=headers)

# Make sure the output directory exists so the export works on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)
df_nba.to_csv('data/nba_player_stats_2022.csv', index=False)

print("Données de statistiques des joueurs NBA exportées avec succès.")


Données de statistiques des joueurs NBA exportées avec succès.


In [4]:
# Show the scraped NBA table as the cell's rich output.
df_nba

Unnamed: 0,Player,Age,Team,Pos,G,GS,MP,FG,FGA,FG%,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Awards
0,Joel Embiid,27,PHI,C,68,68,33.8,9.8,19.6,.499,...,2.1,9.6,11.7,4.2,1.1,1.5,3.1,2.7,30.6,"MVP-2,AS,NBA2"
1,LeBron James,37,LAL,C,56,56,37.2,11.4,21.8,.524,...,1.1,7.1,8.2,6.2,1.3,1.1,3.5,2.2,30.3,"MVP-10,AS,NBA3"
2,Giannis Antetokounmpo,27,MIL,PF,67,67,32.9,10.3,18.6,.553,...,2.0,9.6,11.6,5.8,1.1,1.4,3.3,3.2,29.9,"MVP-3,DPOY-6,AS,NBA1"
3,Kevin Durant,33,BRK,PF,55,55,37.2,10.5,20.3,.518,...,0.5,6.9,7.4,6.4,0.9,0.9,3.5,2.1,29.9,"MVP-10,AS,NBA2"
4,Luka Dončić,22,DAL,PG,65,65,35.4,9.9,21.6,.457,...,0.9,8.3,9.1,8.7,1.2,0.6,4.5,2.2,28.4,"MVP-5,AS,NBA1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
808,Trayvon Palmer,27,DET,SG,1,0,17.0,0.0,1.0,.000,...,0.0,2.0,2.0,0.0,0.0,0.0,1.0,2.0,0.0,
809,Emanuel Terry,25,PHO,PF,3,0,6.0,0.0,1.7,.000,...,2.7,2.3,5.0,0.7,0.3,0.0,1.7,1.3,0.0,
810,Jon Teske,24,MEM,C,3,0,2.7,0.0,0.3,.000,...,0.0,0.7,0.7,0.3,0.3,0.0,0.0,0.3,0.0,
811,M.J. Walker,23,PHO,SG,2,0,4.0,0.0,2.0,.000,...,0.0,0.5,0.5,0.5,1.0,0.0,0.0,0.5,0.0,


## Cas pratique 2

In [13]:
from pathlib import Path

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://content.codecademy.com/courses/beautifulsoup/cacao/index.html"
# Bound the wait time and stop early on an HTTP error instead of parsing an
# error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# Text of every cell tagged with the CocoaPercent / Rating classes; the '%'
# characters at either end of the percentage text are stripped.
cocoa_percent = [td.text.strip('%') for td in soup.find_all('td', class_='CocoaPercent')]
ratings = [td.text for td in soup.find_all('td', class_='Rating')]

df_cacao = pd.DataFrame({
    'Cocoa percent': cocoa_percent,
    'Rating': ratings
})

# Drop the first collected row — presumably the table's header cells, which
# carry the same CSS classes as the data cells; confirm against the page.
df_cacao = df_cacao.drop(0)

# Make sure the output directory exists before writing the three exports.
Path('data').mkdir(parents=True, exist_ok=True)
df_cacao.to_csv('data/cacao_ratings.csv', index=False)
df_cacao.to_json('data/cacao_ratings_records.json', orient='records')
df_cacao.to_json('data/cacao_ratings_values.json', orient='values')

print("Données des pourcentages de cacao et des évaluations exportées avec succès.")


Données des pourcentages de cacao et des évaluations exportées avec succès.


In [14]:
# Show the cocoa-percentage / rating table as the cell's rich output.
df_cacao

Unnamed: 0,Cocoa percent,Rating
1,63,3.75
2,70,2.75
3,70,3
4,70,3.5
5,70,3.5
...,...,...
1791,70,3.75
1792,65,3
1793,65,3.5
1794,62,3.25


## Cas pratique 3

## PC

In [17]:
from pathlib import Path

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.webscraper.io/test-sites/e-commerce/static/computers/laptops"
# Bound the wait time and stop early on an HTTP error instead of parsing an
# error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# One entry per product for each field, collected by tag and CSS class.
names = [item.text for item in soup.find_all('a', class_='title')]
descriptions = [item.text for item in soup.find_all('p', class_='description')]
prices = [item.text for item in soup.find_all('h4', class_='price')]

# NOTE(review): the first <p> of each ratings block yields text such as
# "2 reviews" (see the displayed frame), i.e. a review count rather than a star
# score — confirm which metric the "Rating" column is meant to hold.
ratings = []
for rating_div in soup.find_all('div', class_='ratings'):
    first_paragraph = rating_div.find('p')
    ratings.append(first_paragraph.text if first_paragraph is not None else 'N/A')

df_laptops = pd.DataFrame({
    'Name': names,
    'Description': descriptions,
    'Price': prices,
    'Rating': ratings
})

# Convert the price text to a float here, as the tablets cell does, so the
# column is numeric as soon as the frame exists and does not depend on a later
# cell having been run first.
df_laptops['Price'] = df_laptops['Price'].replace(r'[\$,]', '', regex=True).astype(float)

# Make sure the output directory exists so the export works on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)
df_laptops.to_csv('data/laptops.csv', index=False)

print("Données des laptops exportées avec succès.")


Données des laptops exportées avec succès.


In [30]:
# Inspect the column dtypes of the laptops frame.
df_laptops.dtypes

Name            object
Description     object
Price          float64
Rating          object
dtype: object

In [32]:
# Show the laptops table as the cell's rich output.
df_laptops

Unnamed: 0,Name,Description,Price,Rating
0,Packard 255 G2,"15.6"", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows...",416.99,2 reviews
1,Aspire E1-510,"15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux",306.99,2 reviews
2,ThinkPad T540p,"15.6"", Core i5-4200M, 4GB, 500GB, Win7 Pro 64bit",1178.99,2 reviews
3,ProBook,"14"", Core i5 2.6GHz, 4GB, 500GB, Win7 Pro 64bit",739.99,8 reviews
4,ThinkPad X240,"12.5"", Core i5-4300U, 8GB, 240GB SSD, Win7 Pro...",1311.99,12 reviews
5,Aspire E1-572G,"15.6"", Core i5-4200U, 8GB, 1TB, Radeon R7 M265...",581.99,2 reviews


In [33]:
# Keep only the numeric part of values such as "12 reviews" and store it as a
# float. Going through astype(str) + str.extract makes the cell safe to re-run
# (the column may already be numeric) and turns values without a number, such
# as the 'N/A' placeholder, into NaN instead of raising.
df_laptops['Rating'] = (
    df_laptops['Rating']
    .astype(str)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)


## Tablette

In [20]:
from pathlib import Path

url_tablets = "https://www.webscraper.io/test-sites/e-commerce/static/computers/tablets"
# Bound the wait time and stop early on an HTTP error instead of parsing an
# error page.
response = requests.get(url_tablets, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')

# Extract name, description, price and rating text for every product.
names = [item.text for item in soup.find_all('a', class_='title')]
descriptions = [item.text for item in soup.find_all('p', class_='description')]
prices = [item.text for item in soup.find_all('h4', class_='price')]

# NOTE(review): the first <p> of each ratings block yields text such as
# "7 reviews" (see the displayed frame), i.e. a review count rather than a star
# score — confirm which metric the "Rating" column is meant to hold.
ratings = []
for rating_div in soup.find_all('div', class_='ratings'):
    first_paragraph = rating_div.find('p')
    ratings.append(first_paragraph.text if first_paragraph is not None else 'N/A')

df_tablets = pd.DataFrame({
    'Name': names,
    'Description': descriptions,
    'Price': prices,
    'Rating': ratings
})

# Strip the currency symbol and thousands separators, then convert to float.
# The pattern is a raw string: '\$' in a normal string literal is an invalid
# escape sequence and triggers a warning.
df_tablets['Price'] = df_tablets['Price'].replace(r'[\$,]', '', regex=True).astype(float)

# Make sure the output directory exists so the export works on a fresh checkout.
Path('data').mkdir(parents=True, exist_ok=True)
df_tablets.to_csv('data/tablets.csv', index=False)

print("Données des tablets exportées avec succès.")


  df_tablets['Price'] = df_tablets['Price'].replace('[\$,]', '', regex=True).astype(float)


Données des tablets exportées avec succès.


In [34]:
# Show the tablets table as the cell's rich output.
df_tablets

Unnamed: 0,Name,Description,Price,Rating
0,Lenovo IdeaTab,"7"" screen, Android",69.99,7 reviews
1,Acer Iconia,"7"" screen, Android, 16GB",96.99,7 reviews
2,Asus MeMO Pad,"7"" screen, Android, 8GB",102.99,14 reviews
3,Amazon Kindle,"6"" screen, wifi",103.99,3 reviews
4,iPad Mini Reti...,"Wi-Fi + Cellular, 32GB, Silver",537.99,8 reviews
5,IdeaTab A3500L,"Black, 7"" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2",88.99,7 reviews


In [35]:
# Keep only the numeric part of values such as "14 reviews" and store it as a
# float. Going through astype(str) + str.extract makes the cell safe to re-run
# (the column may already be numeric) and turns values without a number, such
# as the 'N/A' placeholder, into NaN instead of raising.
df_tablets['Rating'] = (
    df_tablets['Rating']
    .astype(str)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
    .astype(float)
)


## Requêtes

## PC

In [36]:
# The three laptops with the highest values in the "Rating" column.
top_3_rated_laptops = df_laptops.nlargest(n=3, columns='Rating')
print("Top 3 laptops les mieux notés :")
top_3_rated_laptops

Top 3 laptops les mieux notés :


Unnamed: 0,Name,Description,Price,Rating
4,ThinkPad X240,"12.5"", Core i5-4300U, 8GB, 240GB SSD, Win7 Pro...",1311.99,12.0
3,ProBook,"14"", Core i5 2.6GHz, 4GB, 500GB, Win7 Pro 64bit",739.99,8.0
0,Packard 255 G2,"15.6"", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows...",416.99,2.0


In [37]:
# The three laptops with the lowest values in the "Rating" column.
bottom_3_rated_laptops = df_laptops.nsmallest(n=3, columns='Rating')
print("Bottom 3 laptops les moins bien notés :")
bottom_3_rated_laptops

Bottom 3 laptops les moins bien notés :


Unnamed: 0,Name,Description,Price,Rating
0,Packard 255 G2,"15.6"", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows...",416.99,2.0
1,Aspire E1-510,"15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux",306.99,2.0
2,ThinkPad T540p,"15.6"", Core i5-4200M, 4GB, 500GB, Win7 Pro 64bit",1178.99,2.0


In [38]:
# The three most expensive laptops.
# Clean the price first: strip the currency symbol and thousands separators,
# then convert to float. The pattern is a raw string because '\$' in a normal
# string literal is an invalid escape sequence and triggers a warning.
df_laptops['Price'] = df_laptops['Price'].replace(r'[\$,]', '', regex=True).astype(float)
top_3_expensive_laptops = df_laptops.nlargest(3, 'Price')
print("Top 3 laptops les plus chers :")
top_3_expensive_laptops

Top 3 laptops les plus chers :


  df_laptops['Price'] = df_laptops['Price'].replace('[\$,]', '', regex=True).astype(float)  # Nettoyer le prix


Unnamed: 0,Name,Description,Price,Rating
4,ThinkPad X240,"12.5"", Core i5-4300U, 8GB, 240GB SSD, Win7 Pro...",1311.99,12.0
2,ThinkPad T540p,"15.6"", Core i5-4200M, 4GB, 500GB, Win7 Pro 64bit",1178.99,2.0
3,ProBook,"14"", Core i5 2.6GHz, 4GB, 500GB, Win7 Pro 64bit",739.99,8.0


In [39]:
# The three laptops with the lowest values in the "Price" column.
bottom_3_expensive_laptops = df_laptops.nsmallest(n=3, columns='Price')
print("Bottom 3 laptops les moins chers :")
bottom_3_expensive_laptops

Bottom 3 laptops les moins chers :


Unnamed: 0,Name,Description,Price,Rating
1,Aspire E1-510,"15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux",306.99,2.0
0,Packard 255 G2,"15.6"", AMD E2-3800 1.3GHz, 4GB, 500GB, Windows...",416.99,2.0
5,Aspire E1-572G,"15.6"", Core i5-4200U, 8GB, 1TB, Radeon R7 M265...",581.99,2.0


## Tablette

In [40]:
# The three tablets with the highest values in the "Rating" column.
top_3_rated_tablets = df_tablets.nlargest(n=3, columns='Rating')
print("Top 3 tablets les mieux notés :")
top_3_rated_tablets

Top 3 tablets les mieux notés :


Unnamed: 0,Name,Description,Price,Rating
2,Asus MeMO Pad,"7"" screen, Android, 8GB",102.99,14.0
4,iPad Mini Reti...,"Wi-Fi + Cellular, 32GB, Silver",537.99,8.0
0,Lenovo IdeaTab,"7"" screen, Android",69.99,7.0


In [41]:
# The three tablets with the lowest values in the "Rating" column.
bottom_3_rated_tablets = df_tablets.nsmallest(n=3, columns='Rating')
print("Bottom 3 tablets les moins bien notés :")
bottom_3_rated_tablets

Bottom 3 tablets les moins bien notés :


Unnamed: 0,Name,Description,Price,Rating
3,Amazon Kindle,"6"" screen, wifi",103.99,3.0
0,Lenovo IdeaTab,"7"" screen, Android",69.99,7.0
1,Acer Iconia,"7"" screen, Android, 16GB",96.99,7.0


In [28]:
# The three tablets with the lowest values in the "Price" column.
bottom_3_expensive_tablets = df_tablets.nsmallest(n=3, columns='Price')
print("Bottom 3 tablets les moins chers :")
bottom_3_expensive_tablets

Bottom 3 tablets les moins chers :


Unnamed: 0,Name,Description,Price,Rating
0,Lenovo IdeaTab,"7"" screen, Android",69.99,7 reviews
5,IdeaTab A3500L,"Black, 7"" IPS, Quad-Core 1.2GHz, 8GB, Android 4.2",88.99,7 reviews
1,Acer Iconia,"7"" screen, Android, 16GB",96.99,7 reviews


## Bonus

In [1]:
from pathlib import Path

import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "http://books.toscrape.com/"
# A timeout prevents the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, "html.parser")

    # Title attribute of the link inside every <h3>, and the text of every
    # price paragraph, in document order.
    books = [book.a['title'] for book in soup.find_all('h3')]
    prices = [price.text for price in soup.find_all('p', class_='price_color')]

    df = pd.DataFrame({'Title': books, 'Price': prices})

    # Make sure the output directory exists so the export works on a fresh checkout.
    Path('data').mkdir(parents=True, exist_ok=True)
    df.to_csv('data/static_scraping_data.csv', index=False)

    print("Données extraites et sauvegardées avec succès.")
else:
    print(f"Erreur lors de la requête, code de statut : {response.status_code}")


Données extraites et sauvegardées avec succès.
