In [1]:
import requests
import pprint
from pprint import pformat
from bs4 import BeautifulSoup
import json
import pandas as pd
import logging
import matplotlib.pyplot as plt
import numpy as np
import math

In [2]:
# Browser-like request headers, so the site does not treat the scraper as a robot.
# Alternative User-Agent values that were tried:
#   '*'
#   'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:65.0) Gecko/20100101 Firefox/65.0'
headers = {
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/77.0.3865.90 Safari/537.36 OPR/64.0.3417.47"
    ),
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "fr-FR,en;q=0.5",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
}



In [3]:
# Root logger used throughout the notebook: only CRITICAL records are emitted,
# and they are rendered as the bare message text.
logger = logging.getLogger()
logging.basicConfig(format='%(message)s', level=logging.CRITICAL)

In [4]:
class Scraper():
    """Base class for scraping listings embedded as JSON in a page's <script> tags.

    The pipeline is: download the page (`reach_website`), collect its
    <script> elements (`find_scripts_on_page`), extract the embedded JSON
    (`find_json_in_scripts`) and turn it into a list of listings
    (`get_appt_list`). The last two steps are placeholders marked
    "TO OVERLOAD" and are expected to be overridden by subclasses.
    """

    # Seconds to wait for the server before `requests` raises a Timeout, so an
    # unresponsive site cannot block the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, page_suffix):
        """Store the target URL and the request configuration.

        Args:
            url: address of the page to scrape.
            page_suffix: pagination suffix; stored on the instance but not
                used by any method of this class.
        """
        self.logger = logging.getLogger()
        # Module-level `headers` dict defined earlier in the notebook.
        self.headers = headers
        self.url = url
        self.page_suffix = page_suffix

    def reach_website(self, url, headers):
        """Send a GET request to `url` using `headers`.

        Returns:
            The `requests` response when the status code is 200, otherwise
            None (the response body is logged at ERROR level).
        """
        # The context manager closes the session once the request is done.
        # The body is downloaded by `get` (no streaming), so the returned
        # response remains usable afterwards.
        with requests.Session() as s:
            s.headers.update(headers)
            print("URL : : : ", url )
            r = s.get(url, timeout=self.REQUEST_TIMEOUT)

        if r.status_code == 200:
            self.logger.info("WebSite reached")
            return r

        self.logger.error(f"WebSite unreachable, response : {r.text}")
        return None

    def find_scripts_on_page(self, response):
        """Parse the response HTML and return all of its <script> elements.

        Returns an empty list when `response` is None, i.e. when
        `reach_website` could not get the page.
        """
        print("response" , response)
        if response is None:
            self.logger.error("No response to parse")
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        self.logger.debug(f"Raw Soup {soup}")

        script_list = soup.find_all('script')
        self.logger.debug(f"Script list {script_list}")

        return script_list

    ## TO OVERLOAD ##
    def find_json_in_scripts(self, script_list):
        """Placeholder: locate the script holding the data and return its JSON.

        This base version only looks for the dummy marker 'BALISE' and returns
        None either way; subclasses must override it.
        """
        self.logger.critical("TO OVERLOAD")

        for script_item in script_list:
            if 'BALISE' in script_item.text:
                json_data = None  ## To replace

                return json_data

        self.logger.error("No BALISE on loaded Page")
        return None

    ## / TO OVERLOAD ##
    def get_appt_list(self, json_data):
        """Placeholder: concatenate the 'ads' and 'ads_alu' lists of `json_data`.

        Returns an empty list when `json_data` is None, which is what
        `find_json_in_scripts` returns when it finds nothing.
        """
        self.logger.critical("TO OVERLOAD")

        if json_data is None:
            return []

        tmp_list_appt = json_data['ads'] + json_data['ads_alu']
        self.logger.debug(f"List appt : {pformat(tmp_list_appt)} ")

        return tmp_list_appt

    def show_data(self, data):
        """Print `data` as a DataFrame without truncating rows or columns."""
        df = pd.DataFrame(data)
        self.logger.debug(f" Data Frame : {df}")

        with pd.option_context('display.max_rows', None, 'display.max_columns',
                               None):  # more options can be specified also
            print(df)

    def get_appt_from_url(self, url):
        """Run the whole pipeline on `url`.

        Returns:
            A `(list_appt, json_data)` tuple. When the page cannot be
            downloaded or no JSON payload is found, `([], None)` is returned
            instead of failing on the missing data.
        """
        # Use the headers stored on the instance rather than the notebook global.
        response = self.reach_website(url, self.headers)
        if response is None:
            return [], None

        script_list = self.find_scripts_on_page(response)
        json_data = self.find_json_in_scripts(script_list)
        if json_data is None:
            return [], None

        list_appt = self.get_appt_list(json_data)
        self.logger.info(f"Added {len(list_appt)} appartements.")

        return list_appt, json_data

    def get_full_list_appt(self):
        """Scrape `self.url` (a single page) and return the listings found."""
        self.logger.info("Page scrapped : 1")
        list_appt, json_data = self.get_appt_from_url(self.url)

        self.logger.info(f"Total appartement scrapped {len(list_appt)} appartements.")
        return list_appt


In [5]:


class LebonCoin_Scraper(Scraper):
    """Scraper for pages that embed their data in `window.__REDIAL_PROPS__`."""

    # Text identifying the <script> element that carries the page data.
    PROPS_MARKER = 'window.__REDIAL_PROPS__'
    # Index, in the decoded props array, of the entry whose "data" field is returned.
    PROPS_DATA_INDEX = 4

    def find_json_in_scripts(self, script_list):
        """Return the decoded "data" payload of the first matching script.

        Args:
            script_list: script elements exposing a `.text` attribute.

        Returns:
            The value found at `[PROPS_DATA_INDEX]["data"]` of the decoded
            JSON, or None (with an ERROR log) when no script contains
            `PROPS_MARKER`.
        """
        for script_item in script_list:
            if self.PROPS_MARKER in script_item.text:
                self.logger.debug(f"script_item window.__REDIAL_PROPS__ : {script_item.text} ")
                # Everything after the first '=' is the JSON assigned to the variable.
                string_data = script_item.text.split('=', 1)[1]

                self.logger.debug(f"string_data : {pformat(string_data)} ")
                json_data = json.loads(string_data)[self.PROPS_DATA_INDEX]["data"]
                self.logger.debug(f"Raw JSON : {pformat(json_data)} ")

                return json_data

        self.logger.error("Non __REDIAL_PROPS__ on loaded Page")
        return None

    def get_appt_list(self, json_data):
        """Concatenate the 'ads' and 'ads_alu' lists of `json_data`.

        Returns an empty list when `json_data` is None, which is what
        `find_json_in_scripts` returns when the marker is absent from the page.
        """
        if json_data is None:
            return []

        tmp_list_appt = json_data['ads'] + json_data['ads_alu']
        self.logger.debug(f"List appt : {pformat(tmp_list_appt)} ")

        return tmp_list_appt



In [6]:
# Show the request headers as pretty-printed JSON with sorted keys.
print(json.dumps(headers, sort_keys=True, indent=4))

{
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "fr-FR,en;q=0.5",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36 OPR/64.0.3417.47"
}


In [7]:
# Listing page to scrape.
url_leboncoin = "https://www.leboncoin.fr/_immobilier_/offres/ile_de_france/"
# Pagination suffix handed to the scraper constructor.
page_add_leboncoin = "&page=1"

In [8]:
# Build the scraper only; the scrape itself is run in its own cell.
scraper_1 = LebonCoin_Scraper(
    url=url_leboncoin,
    page_suffix=page_add_leboncoin,
)

In [9]:
# Run the scrape: downloads the scraper's URL once and returns the listings found on that page.
list_appt_1 = scraper_1.get_full_list_appt()

URL : : :  https://www.leboncoin.fr/_immobilier_/offres/ile_de_france/
response <Response [200]>


In [10]:
# Put the scraped listings into a DataFrame.
df = pd.DataFrame(list_appt_1)

In [11]:
# Inspect which fields the scraped listings expose.
df.columns

Index(['ad_type', 'attributes', 'body', 'category_id', 'category_name',
       'expiration_date', 'first_publication_date', 'has_phone', 'images',
       'index_date', 'list_id', 'location', 'options', 'owner', 'price',
       'price_calendar', 'status', 'subject', 'url'],
      dtype='object')

In [12]:
import datetime as dt

In [14]:
# Check that `index_date` parses as a datetime; displays the entry labelled 0.
pd.to_datetime(df.index_date)[0]

Timestamp('2019-11-02 22:34:48')

In [26]:
# Show the three date columns side by side.
df[["index_date","expiration_date", "first_publication_date"]]

Unnamed: 0,index_date,expiration_date,first_publication_date
0,2019-11-02 22:34:48,2020-01-01 22:34:48,2019-11-02 22:34:48
1,2019-11-02 22:33:15,2019-12-16 23:40:47,2019-10-17 23:40:47
2,2019-11-02 22:29:00,2020-01-01 22:29:00,2019-11-02 22:29:00
3,2019-11-02 22:28:47,2020-01-01 22:28:47,2019-11-02 22:28:47
4,2019-11-02 22:28:32,2020-01-01 21:18:34,2019-11-02 21:18:34
5,2019-11-02 22:27:44,2020-01-01 22:27:44,2019-11-02 22:27:44
6,2019-11-02 22:25:01,2020-01-01 16:02:17,2019-11-02 16:02:17
7,2019-11-02 22:23:18,2020-01-01 22:23:18,2019-11-02 22:23:18
8,2019-11-02 22:23:14,2020-01-01 22:23:14,2019-11-02 22:23:14
9,2019-11-02 22:21:25,2020-01-01 22:21:25,2019-11-02 22:21:25


In [27]:
def preprocessing_leboncoin(df=None):
    """Print the url, images, location, body and subject of every row of `df`.

    Args:
        df: object exposing `location`, `body`, `images`, `url` and `subject`
            as iterable attributes (e.g. a DataFrame with these columns).
            When omitted, the notebook-level `df` is looked up at call time.

    Returns:
        None; the fields are written to stdout, each row followed by two
        dashed separator lines.
    """
    if df is None:
        # Resolve the default when the function is called, not when it is
        # defined: a `df=df` default keeps pointing at the frame that existed
        # when the cell was run, even after `df` has been rebuilt.
        df = globals()["df"]

    for loc, body, img, url, sub in zip(df.location, df.body, df.images, df.url, df.subject):
        print(url)
        print(img)
        print(loc)
        print(body)
        print(sub)
        print("\n---------"*2)
    
    
# Dump every listing to stdout for manual inspection (one block of lines per row).
preprocessing_leboncoin(df)

https://www.leboncoin.fr/locations/1700372349.htm
{'thumb_url': 'https://img1.leboncoin.fr/ad-thumb/0674ab933e07cb13b12137f5e061cac41523f922.jpg', 'small_url': 'https://img1.leboncoin.fr/ad-small/0674ab933e07cb13b12137f5e061cac41523f922.jpg', 'nb_images': 1, 'urls': ['https://img1.leboncoin.fr/ad-image/0674ab933e07cb13b12137f5e061cac41523f922.jpg'], 'urls_thumb': ['https://img1.leboncoin.fr/ad-thumb/0674ab933e07cb13b12137f5e061cac41523f922.jpg'], 'urls_large': ['https://img1.leboncoin.fr/ad-large/0674ab933e07cb13b12137f5e061cac41523f922.jpg']}
{'region_id': '12', 'region_name': 'Ile-de-France', 'department_id': '93', 'department_name': 'Seine-Saint-Denis', 'city_label': 'Drancy 93700', 'city': 'Drancy', 'zipcode': '93700', 'lat': 48.9234, 'lng': 2.44597, 'source': 'city', 'provider': 'here', 'is_shape': True}
1er contact par mail en précisant votre situation tél et mail
Grand appartement 3 pièces de 62 m² très agréable dans une belle résidence familiale, propre et sécurisée. L'appartem

In [16]:
# Distinct key layouts found in the `location` dicts (one sorted key list per layout).
{str(sorted(loc.keys())) for loc in df.location}

{"['city', 'city_label', 'department_id', 'department_name', 'is_shape', 'lat', 'lng', 'provider', 'region_id', 'region_name', 'source', 'zipcode']"}

In [33]:
# NOTE(review): `str_json` is not defined in any visible cell, so this cell depends on
# hidden kernel state and fails on a fresh run. The recorded ValueError suggests it holds
# a raw string; presumably it needs decoding (e.g. json.loads) first. TODO confirm.
pd.DataFrame(str_json)

ValueError: DataFrame constructor not properly called!