## Scrapping et nettoyage des données

### Scrapping

In [1]:
# installation des packages nécessaires

# !pip install beautifulsoup4
# !pip install requests

In [35]:
# importation des librairies

from requests import get 
from bs4 import BeautifulSoup
import pandas as pd

In [36]:
# importation de l'url

url_2 = "https://www.expat-dakar.com/climatisation?page=5"

In [37]:
# récupération du contenu de la page

source = get(url_2)
source.text # voir le contenu de la page récupérée
# source

'<!DOCTYPE html>\n<html\n    lang="fr"\n    class=""\n    prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#"\n><head><meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"><meta name="imagemode" content="force"/><link rel="dns-prefetch" href="https://i.roamcdn.net"><link rel="preconnect" href="https://i.roamcdn.net"><link rel="dns-prefetch" href="https://adservice.google.com"><link rel="preconnect" href="https://adservice.google.com"><link rel="dns-prefetch" href="https://cdn.cookielaw.org"><link rel="preconnect" href="https://cdn.cookielaw.org"><link rel="dns-prefetch" href="https://cdn.onesignal.com"><link rel="preconnect" href="https://cdn.onesignal.com"><link rel="dns-prefetch" href="https://connect.facebook.net"><link rel="preconnect" href="https://connect.facebook.net"><link rel="dns-prefetch" href="https://creativecdn.com"><link rel="preconnect" href="https://creativecdn.com"><link rel="dns-prefetch" href="https://googleads.g.doubleclick.net">

In [38]:
# vérification de la récupération de contenu
source.status_code

200

In [40]:
# stockage de la page dans beautifulsoup

soup = BeautifulSoup(source.text, 'html.parser')
soup

<!DOCTYPE html>

<html class="" lang="fr" prefix="og: http://ogp.me/ns# fb: http://ogp.me/ns/fb#"><head><meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/><meta content="force" name="imagemode"><link href="https://i.roamcdn.net" rel="dns-prefetch"/><link href="https://i.roamcdn.net" rel="preconnect"/><link href="https://adservice.google.com" rel="dns-prefetch"/><link href="https://adservice.google.com" rel="preconnect"/><link href="https://cdn.cookielaw.org" rel="dns-prefetch"/><link href="https://cdn.cookielaw.org" rel="preconnect"/><link href="https://cdn.onesignal.com" rel="dns-prefetch"/><link href="https://cdn.onesignal.com" rel="preconnect"/><link href="https://connect.facebook.net" rel="dns-prefetch"/><link href="https://connect.facebook.net" rel="preconnect"/><link href="https://creativecdn.com" rel="dns-prefetch"/><link href="https://creativecdn.com" rel="preconnect"/><link href="https://googleads.g.doubleclick.net" rel="dns-prefetch"/><link

In [41]:
# récupérer les conteneurs qui contiennent les informations : ce sont les blocs refrigerateurs/congelateurs

conteneurs = soup.find_all('div', {'class': 'listings-cards__list-item'})
# conteneurs = soup.find_all('div', class_ = 'listing-card__content 1')

In [42]:
# nombre de conteneurs
len(conteneurs)

11

1. Récupérer les infos sur chaque conteneur (détails, état, adresse, prix, image_lien)

In [43]:
## récupérer les informations sur chaque conteneur 
conteneur_unique = conteneurs[1]

In [44]:
# récupération "details"

details = conteneur_unique.find('div', {'class': 'listing-card__header__title'}).text.strip()
details


'ASTECH 12000BTU 1.5 CV R410'

In [45]:
# récupération "etat"
etat = conteneur_unique.find('span', {'class': 'listing-card__header__tags__item listing-card__header__tags__item--condition listing-card__header__tags__item--condition_new'}).text
etat

'Neuf'

In [46]:
# récupération "adresse"
adresse = conteneur_unique.find('div', {'class':'listing-card__header__location'}).text.strip().replace(',\n','').strip()
adresse


'Plateau                                                                                Dakar'

In [47]:
# récupération "prix"
prix = soup.find('span', {'class': 'listing-card__price__value 1'}).text.strip().replace('\u202f', '').replace(' F Cfa', '')
prix

'165000'

In [48]:
# récupération "image _lien"
image_lien = soup.find('img', {'class': 'listing-card__image__resource vh-img'})['src']
image_lien

'https://i.roamcdn.net/hz/ed/listing-thumb-224w/99d3a444b0b5a675500fdb727e14ddeb/-/horizon-files-prod/ed/picture/qj4rkx44/9ebdde3c8c6e87d7ae60ceccf16f2dc9a09f54a0.jpg'

In [49]:
# Afficher toutes les infos récupérées
print("Détails : " + details,
      "\nEtat : " + str(etat),
      "\nAdresse : " + adresse,
      "\nPrix : " + prix,
      "\nImage_lien : " + image_lien)

Détails : ASTECH 12000BTU 1.5 CV R410 
Etat : Neuf 
Adresse : Plateau                                                                                Dakar 
Prix : 165000 
Image_lien : https://i.roamcdn.net/hz/ed/listing-thumb-224w/99d3a444b0b5a675500fdb727e14ddeb/-/horizon-files-prod/ed/picture/qj4rkx44/9ebdde3c8c6e87d7ae60ceccf16f2dc9a09f54a0.jpg


2. Généraliser sur tous les conteneurs

In [50]:
# Initialiser une liste pour stocker les données

base = []

for conteneur_unique in conteneurs:
    try:
        conteneurs = soup.find_all('div', {'class': 'listings-cards__list-item'})
        details = conteneur_unique.find('div', {'class': 'listing-card__header__title'}).text.strip()
        etat = conteneur_unique.find('span', {'class': 'listing-card__header__tags__item listing-card__header__tags__item--condition listing-card__header__tags__item--condition_new'}).text
        adresse = conteneur_unique.find('div', {'class':'listing-card__header__location'}).text.strip().replace(',\n','').strip()
        prix = soup.find('span', {'class': 'listing-card__price__value 1'}).text.strip().replace('\u202f', '').replace(' F Cfa', '')
        image_lien = soup.find('img', {'class': 'listing-card__image__resource vh-img'})['src']

        dict =  {'details':details, 'etat':etat, 'adresse':adresse, 'prix':prix, 'image_lien':image_lien}
        base.append(dict)
    except:
        pass

base_ = pd.DataFrame(base)

In [51]:
# affichage de la base
base_.head()

Unnamed: 0,details,etat,adresse,prix,image_lien
0,Kit Ventilateur Clim Solaire,Neuf,Keur Massar ...,165000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
1,ASTECH 12000BTU 1.5 CV R410,Neuf,Plateau ...,165000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
2,SPLIT LG 24000BTU 3CV DUAL INVERTER S4Q24K2QAL,Neuf,Plateau ...,165000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
3,SPLIT HISENSE 18000BTU 2CV R410A,Neuf,Plateau ...,165000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
4,SPLIT HISENSE 12000BTU INVERTER AS12TR4SYETG,Neuf,Plateau ...,165000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...


In [52]:
# vérification de la dimension
base_.shape

(11, 5)

In [53]:
# scraper sur toutes pages

base_donnee = pd.DataFrame()

for page in range(1,126):
    url_2 = f'https://www.expat-dakar.com/climatisation?page={page}'
    source = get(url_2)
    soup = BeautifulSoup(source.text, 'html.parser')
    base = []
    for conteneur_unique in conteneurs:
        try:
            conteneurs = soup.find_all('div', {'class': 'listings-cards__list-item'})
            details = conteneur_unique.find('div', {'class': 'listing-card__header__title'}).text.strip()
            etat = conteneur_unique.find('span', {'class': 'listing-card__header__tags__item listing-card__header__tags__item--condition listing-card__header__tags__item--condition_new'}).text
            adresse = conteneur_unique.find('div', {'class':'listing-card__header__location'}).text.strip().replace(',\n','').strip()
            prix = soup.find('span', {'class': 'listing-card__price__value 1'}).text.strip().replace('\u202f', '').replace(' F Cfa', '')
            image_lien = soup.find('img', {'class': 'listing-card__image__resource vh-img'})['src']

            dict =  {'details':details, 'etat':etat, 'adresse':adresse, 'prix':prix, 'image_lien':image_lien}
            base.append(dict)
        except:
             pass
    
    base_ = pd.DataFrame(base)
    base_donnee = pd.concat([base_, base_donnee], axis = 0).reset_index(drop = True)

In [54]:
# affichage de la base finale
base_donnee.head()

Unnamed: 0,details,etat,adresse,prix,image_lien
0,Kit Ventilateur Clim Solaire,Neuf,Keur Massar ...,149000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
1,Split Grée 1.5cv 12000btu R410,Neuf,Plateau ...,149000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
2,SPLIT HISENSE 12000BTU R410 AS12HRASC,Neuf,Plateau ...,149000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
3,SPLIT HISENSE 18000 BTU INVERTER 2CV R410A,Neuf,Plateau ...,149000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...
4,SPLIT SMART 1CV /9000BTU GAZ 410 2022,Neuf,Plateau ...,149000,https://i.roamcdn.net/hz/ed/listing-thumb-224w...


In [55]:
# vérification de la dimension finale
base_donnee.shape

(1367, 5)

In [56]:
# enrégistrement de la base

base_donnee.to_csv('base_donnee_url2.csv', index = False, header = True)
base_donnee.to_excel('base_donnee_url2.xlsx', index = False, header = True)