## Scrapping et nettoyage des données

### Scrapping

In [2]:
# installation des packages nécessaires

# !pip install beautifulsoup4
# !pip install requests

In [3]:
# importation des librairies

from requests import get 
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
# importation de l'url

url_3 = "https://www.expat-dakar.com/cuisinieres-fours?page=4"

In [5]:
# récupération du contenu de la page

source = get(url_3)
source.text # voir le contenu de la page récupérée
# source

'<!DOCTYPE html>\n<html\n    lang="fr"\n    class=""\n    prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#"\n><head><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><meta name="imagemode" content="force"/><link rel="dns-prefetch" href="https://i.roamcdn.net"><link rel="preconnect" href="https://i.roamcdn.net"><link rel="dns-prefetch" href="https://adservice.google.com"><link rel="preconnect" href="https://adservice.google.com"><link rel="dns-prefetch" href="https://cdn.cookielaw.org"><link rel="preconnect" href="https://cdn.cookielaw.org"><link rel="dns-prefetch" href="https://cdn.onesignal.com"><link rel="preconnect" href="https://cdn.onesignal.com"><link rel="dns-prefetch" href="https://connect.facebook.net"><link rel="preconnect" href="https://connect.facebook.net"><link rel="dns-prefetch" href="https://creativecdn.com"><link rel="preconnect" href="https://creativecdn.com"><link rel="dns-prefetch" href="https://googleads.g.doubleclick.net">

In [6]:
# vérification de la récupération de contenu
source.status_code

200

In [7]:
# stockage de la page dans beautifulsoup

soup = BeautifulSoup(source.text, 'html.parser')
soup

<!DOCTYPE html>

<html class="" lang="fr" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#"><head><meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/><meta content="force" name="imagemode"><link href="https://i.roamcdn.net" rel="dns-prefetch"/><link href="https://i.roamcdn.net" rel="preconnect"/><link href="https://adservice.google.com" rel="dns-prefetch"/><link href="https://adservice.google.com" rel="preconnect"/><link href="https://cdn.cookielaw.org" rel="dns-prefetch"/><link href="https://cdn.cookielaw.org" rel="preconnect"/><link href="https://cdn.onesignal.com" rel="dns-prefetch"/><link href="https://cdn.onesignal.com" rel="preconnect"/><link href="https://connect.facebook.net" rel="dns-prefetch"/><link href="https://connect.facebook.net" rel="preconnect"/><link href="https://creativecdn.com" rel="dns-prefetch"/><link href="https://creativecdn.com" rel="preconnect"/><link href="https://googleads.g.doubleclick.net" rel="dns-prefetch"/><link

In [8]:
# récupérer les conteneurs qui contiennent les informations : ce sont les blocs refrigerateurs/congelateurs

conteneurs = soup.find_all('div', {'class': 'listings-cards__list-item'})
# conteneurs = soup.find_all('div', class_ = 'listings-cards__list-item')

In [9]:
# nombre de conteneurs
len(conteneurs)

10

1. Récupérer les infos sur chaque conteneur (détails, état, adresse, prix, image_lien)

In [10]:
## récupérer les informations sur chaque conteneur 
conteneur_unique = conteneurs[2]

In [11]:
# récupération "details"

details = conteneur_unique.find('div', {'class': 'listing-card__header__title'}).text.strip()
details

'PLAQUE HAIER ENCASTRABLE 4FEUX MIXTE 3F A GAZ 1F ELECTRIQUE'

In [12]:
# récupération "etat"
etat = conteneur_unique.find('span', {'class': 'listing-card__header__tags__item listing-card__header__tags__item--condition listing-card__header__tags__item--condition_new'}).text
etat

'Neuf'

In [13]:
# récupération "adresse"
adresse = conteneur_unique.find('div', {'class': 'listing-card__header__location'}).text.strip().replace(',\n', '').strip()
adresse

'Plateau                                                                                Dakar'

In [14]:
# récupération "prix"
prix = conteneur_unique.find('span', {'class': 'listing-card__price__value 1'}).text.strip().replace('\u202f', '').replace(' F Cfa', '')
prix

'139000'

In [15]:
# récupération "image _lien"
image_lien = soup.find('img', {'class': 'listing-card__image__resource vh-img'})['src']
image_lien

'https://i.roamcdn.net/hz/ed/listing-thumb-224w/8c3c1e940980896de38c2fd336c30285/-/horizon-files-prod/ed/picture/qkr8epm8/74b0033a23785aa1af1f5216cc1ca5759d0bb934.jpg'

In [16]:
# Afficher toutes les infos récupérées
print("Détails : " + details,
      "\nEtat : " + str(etat),
      "\nAdresse : " + (adresse),
      "\nPrix : " + prix,
      "\nImage_lien : " + image_lien)

Détails : PLAQUE HAIER ENCASTRABLE 4FEUX MIXTE 3F A GAZ 1F ELECTRIQUE 
Etat : Neuf 
Adresse : Plateau                                                                                Dakar 
Prix : 139000 
Image_lien : https://i.roamcdn.net/hz/ed/listing-thumb-224w/8c3c1e940980896de38c2fd336c30285/-/horizon-files-prod/ed/picture/qkr8epm8/74b0033a23785aa1af1f5216cc1ca5759d0bb934.jpg


2. Généraliser sur tous les conteneurs

In [21]:
# Initialiser une liste pour stocker les données

base = []

for conteneur_unique in conteneurs:
    try:
        conteneurs = soup.find_all('div', {'class': 'listings-cards__list-item'})
        details = conteneur_unique.find('div', {'class': 'listing-card__header__title'}).text.strip()
        etat = conteneur_unique.find('span', {'class': 'listing-card__header__tags__item listing-card__header__tags__item--condition listing-card__header__tags__item--condition_new'}).text
        adresse = conteneur_unique.find('div', {'class':'listing-card__header__location'}).text.strip().replace(',\n','').strip()
        prix = soup.find('span', {'class': 'listing-card__price__value 1'}).text.strip().replace('\u202f', '').replace(' F Cfa', '')
        image_lien = soup.find('img', {'class': 'listing-card__image__resource vh-img'})['src']

        dict =  {'details':details, 'etat':etat, 'adresse':adresse, 'prix':prix, 'image_lien':image_lien}
        base.append(dict)
    except:
        pass

base_ = pd.DataFrame(base)

In [22]:
# affichage de la base
base_.head()

Unnamed: 0,details,etat,adresse,prix,image_lien
0,"Friteuse Sans Huile Air Fryer Capacité de 5,0 L",Neuf,Plateau ...,35000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
1,CUISINIÈRE REALCE 4 FEUX 50X50 WHITE,Neuf,Plateau ...,35000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
2,CUISINIÈRE - ASTECH - 4 FEUX - 60×60 - FULL OP...,Neuf,Ouakam ...,35000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
3,Friteuse à air numérique RAF 8L,Neuf,Plateau ...,35000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
4,FOUR -ELECTRIQUE - ASTECH - 42 LITRES - AVEC G...,Neuf,Ouakam ...,35000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...


In [23]:
# vérification de la dimension
base_.shape

(10, 5)

In [20]:
# scraper sur toutes pages

base_donnee = pd.DataFrame()

for page in range(1,105):
    url_3 = f'https://www.expat-dakar.com/cuisinieres-fours?page={page}'
    source = get(url_3)
    soup = BeautifulSoup(source.text, 'html.parser')
    base = []
    for conteneur_unique in conteneurs:
        try:
            conteneurs = soup.find_all('div', {'class': 'listings-cards__list-item'})
            details = conteneur_unique.find('div', {'class': 'listing-card__header__title'}).text.strip()
            etat = conteneur_unique.find('span', {'class': 'listing-card__header__tags__item listing-card__header__tags__item--condition listing-card__header__tags__item--condition_new'}).text
            adresse = conteneur_unique.find('div', {'class':'listing-card__header__location'}).text.strip().replace(',\n','').strip()
            prix = soup.find('span', {'class': 'listing-card__price__value 1'}).text.strip().replace('\u202f', '').replace(' F Cfa', '')
            image_lien = soup.find('img', {'class': 'listing-card__image__resource vh-img'})['src']

            dict =  {'details':details, 'etat':etat, 'adresse':adresse, 'prix':prix, 'image_lien':image_lien}
            base.append(dict)
        except:
             pass
    
    base_ = pd.DataFrame(base)
    base_donnee = pd.concat([base_, base_donnee], axis = 0).reset_index(drop = True)

In [24]:
# affichage de la base finale
base_donnee.head()

Unnamed: 0,details,etat,adresse,prix,image_lien
0,PLAQUE ENCASTRABLE TECHNOLUX 5 FEUX 70×60 À GAZ,Neuf,Plateau ...,35000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
1,CUISINIÈRE CAC 4 FEUX 50×50 BLACK CAC50G,Neuf,Plateau ...,35000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
2,CUISINIÈRE CAC 4 FEUX 60×60 BLACK CAC60G,Neuf,Plateau ...,35000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
3,Friteuse Double bacs 9 L,Neuf,Plateau ...,35000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
4,GAZINIERE 2 Feux et 3 feux,Neuf,Point-e ...,35000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...


In [25]:
# vérification de la dimension finale
base_donnee.shape

(1019, 5)

In [26]:
# enrégistrement de la base

base_donnee.to_csv('base_donnee_url3.csv', index = False, header = True)
base_donnee.to_excel('base_donnee_url3.xlsx', index = False, header = True)