## Scrapping et nettoyage des données

### Scrapping

In [1]:
# installation des packages nécessaires

# !pip install beautifulsoup4
# !pip install requests

In [2]:
# importation des librairies

from requests import get 
from bs4 import BeautifulSoup
import pandas as pd

In [3]:
# importation de l'url

url_4 = "https://www.expat-dakar.com/machines-a-laver?page=77"

In [4]:
# récupération du contenu de la page

source = get(url_4)
source.text # voir le contenu de la page récupérée
# source

'<!DOCTYPE html>\n<html\n    lang="fr"\n    class=""\n    prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#"\n><head><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><meta name="imagemode" content="force"/><link rel="dns-prefetch" href="https://i.roamcdn.net"><link rel="preconnect" href="https://i.roamcdn.net"><link rel="dns-prefetch" href="https://adservice.google.com"><link rel="preconnect" href="https://adservice.google.com"><link rel="dns-prefetch" href="https://cdn.cookielaw.org"><link rel="preconnect" href="https://cdn.cookielaw.org"><link rel="dns-prefetch" href="https://cdn.onesignal.com"><link rel="preconnect" href="https://cdn.onesignal.com"><link rel="dns-prefetch" href="https://connect.facebook.net"><link rel="preconnect" href="https://connect.facebook.net"><link rel="dns-prefetch" href="https://creativecdn.com"><link rel="preconnect" href="https://creativecdn.com"><link rel="dns-prefetch" href="https://googleads.g.doubleclick.net">

In [5]:
# vérification de la récupération de contenu
source.status_code

200

In [6]:
# stockage de la page dans beautifulsoup

soup = BeautifulSoup(source.text, 'html.parser')
soup

<!DOCTYPE html>

<html class="" lang="fr" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#"><head><meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/><meta content="force" name="imagemode"><link href="https://i.roamcdn.net" rel="dns-prefetch"/><link href="https://i.roamcdn.net" rel="preconnect"/><link href="https://adservice.google.com" rel="dns-prefetch"/><link href="https://adservice.google.com" rel="preconnect"/><link href="https://cdn.cookielaw.org" rel="dns-prefetch"/><link href="https://cdn.cookielaw.org" rel="preconnect"/><link href="https://cdn.onesignal.com" rel="dns-prefetch"/><link href="https://cdn.onesignal.com" rel="preconnect"/><link href="https://connect.facebook.net" rel="dns-prefetch"/><link href="https://connect.facebook.net" rel="preconnect"/><link href="https://creativecdn.com" rel="dns-prefetch"/><link href="https://creativecdn.com" rel="preconnect"/><link href="https://googleads.g.doubleclick.net" rel="dns-prefetch"/><link

In [7]:
# récupérer les conteneurs qui contiennent les informations : ce sont les blocs refrigerateurs/congelateurs

conteneurs = soup.find_all('div', {'class': 'listings-cards__list-item'})
# conteneurs = soup.find_all('div', class_ = 'listings-cards__list-item')

In [8]:
# nombre de conteneurs
len(conteneurs)

9

1. Récupérer les infos sur chaque conteneur (détails, état, adresse, prix, image_lien)

In [9]:
## récupérer les informations sur chaque conteneur 
conteneur_unique = conteneurs[5]

In [10]:
# récupération "details"

details = conteneur_unique.find('div', {'class': 'listing-card__header__title'}).text.strip()
details


'MACHINE A LAVER LG 2 EN 1 20KG F0L2CRV2T2'

In [11]:
# récupération "etat"
etat = conteneur_unique.find('span', {'class': 'listing-card__header__tags__item listing-card__header__tags__item--condition listing-card__header__tags__item--condition_new'}).text
etat

'Neuf'

In [12]:
# récupération "adresse"
adresse = conteneur_unique.find('div', {'class': 'listing-card__header__location'}).text.strip().replace(',\n', '').strip()
adresse

'Plateau                                                                                Dakar'

In [13]:
# récupération "prix"
prix = conteneur_unique.find('span', {'class': 'listing-card__price__value 1'}).text.strip().replace('\u202f', '').replace(' F Cfa', '')
prix

'1100000'

In [14]:
# récupération "image _lien"
image_lien = soup.find('img', {'class': 'listing-card__image__resource vh-img'})['src']
image_lien

'https://i.roamcdn.net/hz/ed/listing-thumb-224w/63de3fb3319aa1aba8b7dfb9c0521d14/-/horizon-files-prod/ed/picture/q44jwr9p/768e839fa225880bc916ca306994443d942ee649.jpg'

In [15]:
# Afficher toutes les infos récupérées
print("Détails : " + details,
      "\nEtat : " + str(etat),
      "\nAdresse : " + adresse,
      "\nPrix : " + prix,
      "\nImage_lien : " + image_lien)

Détails : MACHINE A LAVER LG 2 EN 1 20KG F0L2CRV2T2 
Etat : Neuf 
Adresse : Plateau                                                                                Dakar 
Prix : 1100000 
Image_lien : https://i.roamcdn.net/hz/ed/listing-thumb-224w/63de3fb3319aa1aba8b7dfb9c0521d14/-/horizon-files-prod/ed/picture/q44jwr9p/768e839fa225880bc916ca306994443d942ee649.jpg


2. Généraliser sur tous les conteneurs

In [16]:
# Initialiser une liste pour stocker les données

base = []

for conteneur_unique in conteneurs:
    try:
        conteneurs = soup.find_all('div', {'class': 'listings-cards__list-item'})
        details = conteneur_unique.find('div', {'class': 'listing-card__header__title'}).text.strip()
        etat = conteneur_unique.find('span', {'class': 'listing-card__header__tags__item listing-card__header__tags__item--condition listing-card__header__tags__item--condition_new'}).text
        adresse = conteneur_unique.find('div', {'class':'listing-card__header__location'}).text.strip().replace(',\n','').strip()
        prix = soup.find('span', {'class': 'listing-card__price__value 1'}).text.strip().replace('\u202f', '').replace(' F Cfa', '')
        image_lien = soup.find('img', {'class': 'listing-card__image__resource vh-img'})['src']

        dict =  {'details':details, 'etat':etat, 'adresse':adresse, 'prix':prix, 'image_lien':image_lien}
        base.append(dict)
    except:
        pass

base_ = pd.DataFrame(base)

In [17]:
# affichage de la base
base_.head()

Unnamed: 0,details,etat,adresse,prix,image_lien
0,Machine À Laver Semi - Automatique 7 - 8 - 9KG,Neuf,Point-e ...,130000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
1,MACHINE A LAVER HISENSE 10.5KG TOP- LOAD,Neuf,Ouakam ...,130000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
2,MACHINE A LAVER - HISENSE - 16 KG - TOP- LOAD,Neuf,Ouakam ...,130000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
3,MACHINE A LAVER - CANDY – 8 KG – SMART TOUCH –...,Neuf,Ouakam ...,130000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
4,MACHINE A LAVER LG 15KG / 8KG SECHANTE,Neuf,Plateau ...,130000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...


In [18]:
# vérification de la dimension
base_.shape

(9, 5)

In [19]:
# scraper sur toutes pages

base_donnee = pd.DataFrame()

for page in range(1,78):
    url_4 = f'https://www.expat-dakar.com/machines-a-laver?page={page}'
    source = get(url_4)
    soup = BeautifulSoup(source.text, 'html.parser')
    base = []
    for conteneur_unique in conteneurs:
        try:
            conteneurs = soup.find_all('div', {'class': 'listings-cards__list-item'})
            details = conteneur_unique.find('div', {'class': 'listing-card__header__title'}).text.strip()
            etat = conteneur_unique.find('span', {'class': 'listing-card__header__tags__item listing-card__header__tags__item--condition listing-card__header__tags__item--condition_new'}).text
            adresse = conteneur_unique.find('div', {'class':'listing-card__header__location'}).text.strip().replace(',\n','').strip()
            prix = soup.find('span', {'class': 'listing-card__price__value 1'}).text.strip().replace('\u202f', '').replace(' F Cfa', '')
            image_lien = soup.find('img', {'class': 'listing-card__image__resource vh-img'})['src']

            dict =  {'details':details, 'etat':etat, 'adresse':adresse, 'prix':prix, 'image_lien':image_lien}
            base.append(dict)
        except:
             pass
    
    base_ = pd.DataFrame(base)
    base_donnee = pd.concat([base_, base_donnee], axis = 0).reset_index(drop = True)

In [20]:
# affichage de la base finale
base_donnee.head()

Unnamed: 0,details,etat,adresse,prix,image_lien
0,MACHINE A LAVER INDESIT,Neuf,Plateau ...,130000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
1,MACHINE A LAVER IBERNA,Neuf,Plateau ...,130000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
2,MACHINE À LAVER ASTECH 7 KG MLG73V730DG,Neuf,Plateau ...,130000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
3,MACHINE A LAVER ASTECH 12KG MLG12C-V120C-CI,Neuf,Plateau ...,130000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
4,MACHINE À LAVER 9KG ASTECH SILVER,Neuf,Plateau ...,130000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...


In [21]:
# vérification de la dimension finale
base_donnee.shape

(752, 5)

In [22]:
# enrégistrement de la base

base_donnee.to_csv('base_donnee_url4.csv', index = False, header = True)
base_donnee.to_excel('base_donnee_url4.xlsx', index = False, header = True)