# Extracting Info from Ads

`Author: James Smith`

`Date: 22/12/2019`

Given an add link, we want to extract all information that we want from it

- Phone Number - using workaround
- All other info - using JSON dictionary located at bottom of page code

Example Ad: https://www.donedeal.ie/cars-for-sale/very-clean-4x4/23802005

## Function to get info from ad link

In [63]:
def get_info_from_ad_link(url):
    """
    Given a url it returns a dataframe of info
    """
    
    #----- Define custom functions -----#
    
    def find_char_index(s, ch):
        """
        Function to return indexes of a character in a given string
        https://stackoverflow.com/questions/11122291/how-to-find-char-in-string-and-get-all-the-indexes
        """
        return [i for i, ltr in enumerate(s) if ltr == ch]
    
    def get_phone_number(url):
        """
        Given a url this function return a dictionary containing 
        the sellers phone number.
        Returns None if an error occurs
        This happens due to frequent scraping - "Please contact customer support"
        """
        phone_headers = {"User-Agent": "Mozilla/5", "Referer": url}

        try:
            phone_url= "https://www.donedeal.ie/cadview/api/v3/view/ad/" + str(details_dict["id"]) + "/phone/"
            phone_request = requests.post(phone_url, headers = phone_headers)
            return json.loads(phone_request.text)
        except:
            return {"phone" : None}

    
    #----- Get page HTML code -----#
    headers = {'User-Agent':'Mozilla/5'}
    r = requests.get(url, headers = headers)
    html_contents = r.text
    html_soup = BeautifulSoup(html_contents, 'html.parser')
    
    #----- Obtain Main JSON Dictionary -----#
    scripts = html_soup.find_all('script')
    window_ad_details_script = [script for script in scripts if 'window.adDetails' in script.text][0]
    window_ad_details_script

    #----- Clean in order to represent as Python Dictionary -----#

    beginning = find_char_index(window_ad_details_script.text, '{')[0] # Begins at first opening bracket
    end = find_char_index(window_ad_details_script.text, '}')[-1] # Ends at last closing bracket

    window_ad_details_script_short = window_ad_details_script.text[beginning :end+1]
    window_ad_details_script_short

    details_dict = json.loads(window_ad_details_script_short)
    
    #----- Extract Data from Python Dictionary -----#

    # passing in list rather than string
    # https://stackoverflow.com/questions/17839973/constructing-pandas-dataframe-from-values-in-variables-gives-valueerror-if-usi

    # Contains car info as a dictionary as a list in a value of the main dictionary
    # The 'name' key contains the wanted key and similarly for 'value'
    # Only the first 14 elements contain car attributes

    car_attributes_list = details_dict["displayAttributes"][0:14]
    car_attributes_dict = {attribute["name"] : attribute["value"] for attribute in car_attributes_list}

    #----- Collect Data -----#
    
    # Obtaining values from keys directly can throw up a KeyValue error when it does not exist.
    # The method .get() is a safer way to access the dictionary

    data = {"ad_header" : details_dict.get("header"),
            "ad_description" : details_dict.get("description"),
            "ad_age" : details_dict.get("age"),
            "ad_id" : details_dict.get("id"),
            "ad_url" : [url],
            "ad_price" : details_dict.get("price"),
            "ad_currency" : details_dict.get("currency"),

            "seller_id" : details_dict.get("seller").get("id"),
            "seller_name" : details_dict.get("seller").get("name"),
            "seller_num_ads" : details_dict.get("seller").get("adCount"),
            "seller_type" : details_dict.get("seller").get("type"),
            "seller_county" : details_dict.get("seller").get("county"),
            "seller_county_town" : details_dict.get("seller").get("countyTown"),
            
            "seller_reg_date" : details_dict.get("seller").get("registrationDate"),

            "seller_phone_number" : [get_phone_number(url)["phone"]],

            "green_light_verified" : details_dict.get("greenlightVerified"),

            "car_make" : car_attributes_dict.get("make"),
            "car_model" : car_attributes_dict.get("model"),
            "car_year" : car_attributes_dict.get("year"),
            "car_mileage" : car_attributes_dict.get("mileage"),
            "car_fuel_type" : car_attributes_dict.get("fuelType"),
            "car_transmission" : car_attributes_dict.get("transmission"),
            "car_body_type" : car_attributes_dict.get("bodyType"),
            "car_engine" : car_attributes_dict.get("engine"),
            "car_road_tax" : car_attributes_dict.get("roadTax"),
            "car_nct_expiry" : car_attributes_dict.get("NCT"),
            "car_num_prev_owners" : car_attributes_dict.get("previousOwners"),
            "car_reg_country" : car_attributes_dict.get("country"),
            "car_colour" : car_attributes_dict.get("colour"),
            "car_num_doors" : car_attributes_dict.get("numDoors")
           }

    return pd.DataFrame.from_dict(data)

Test Function

In [64]:
# # Imports
# import pandas as pd
# import requests
# from bs4 import BeautifulSoup
# import json

url = 'https://www.donedeal.ie/cars-for-sale/very-clean-4x4/23802005'
get_info_from_ad_link(url)

Unnamed: 0,ad_age,ad_currency,ad_description,ad_header,ad_id,ad_price,ad_url,car_body_type,car_colour,car_engine,...,car_year,green_light_verified,seller_county,seller_county_town,seller_id,seller_name,seller_num_ads,seller_phone_number,seller_reg_date,seller_type
0,4 days,EUR,Suzuki grand virara for sale.\r\nExcellent con...,Suzuki grand vitara,23802005,1299,https://www.donedeal.ie/cars-for-sale/very-cle...,SUV,Red,1.6 litre,...,2005,False,Donegal,Carndonagh,146465,John,3,,2009-09-20 23:02:31.154,PRIVATE


## Function to gather links

In [69]:
def get_ad_links(max_links = 100):
    """
    Collects all car ads from DoneDeal
    Options to include number of links to collect
    Can later change criteria to automatically collect all ads
    """
    
    #----- Define custom functions -----#

    def generate_url(base_url, page_number):
        """
        Based on the structure of the donedeal website. 
        This function will returns the url given the desired page number
        """
        if page_number < 1:
            raise Exception("page_number cannot be less than 1")

        if page_number == 1:
            url = base_url
        else:
            extra_pages_url_addition = "&start={}".format(str((page_number - 1) * 28))
            url = base_url + extra_pages_url_addition

        return url  
    
    #----- Set up parameters -----#
    
    max_links = max_links
    base_url = 'https://www.donedeal.ie/cars?source=private'
    sub_str_car_add = "https://www.donedeal.ie/cars-for-sale"


    car_links = []
    page_number = 1
    status_code = 200

    headers = {'User-Agent':'Mozilla/5'}

    #----- Loop over pages based on stopping criteria -----#

    while len(car_links) < max_links and status_code == 200:

        url = generate_url(base_url, page_number = page_number)
        r = requests.get(url, headers = headers)
        status_code = r.status_code

        print("Scraping info from page number", page_number)
        print("url: ", url)

        if status_code == 200:

            html_contents = r.text
            html_soup = BeautifulSoup(html_contents, 'html.parser')

            all_links_on_page = html_soup.find_all('a', class_ = 'card__link', href = True)
            car_links_on_page = [link['href'] for link in all_links_on_page 
                                 if sub_str_car_add in link['href']]
            # Join lists together
            car_links = car_links + car_links_on_page

            # Update page number to get info from the next page
            page_number = page_number + 1

        else:

            print("Oops... This page returned the status code: ", status_code)
            print("The url ", url, " likely doesn't exist")

    print("We have collected ", len(car_links), " links to DoneDeal car adds")

    return car_links

Test Function

In [70]:
# car_links = get_ad_links(base_url, max_links = 100)

## Function to generate table of ad info

In [75]:
def generate_table_of_ad_info(max_links = 100):
    """
    Function to get table of ad info
    """
    
    ad_columns = ["ad_header","ad_description","ad_age","ad_id","ad_url","ad_price","ad_currency","seller_id",
                  "seller_name","seller_num_ads","seller_type","seller_county","seller_county_town","seller_reg_date",
                  "seller_phone_number","green_light_verified","car_make","car_model","car_year","car_mileage","car_fuel_type",
                  "car_transmission","car_body_type","car_engine","car_road_tax","car_nct_expiry","car_num_prev_owners",
                  "car_reg_country","car_colour","car_num_doors"]

    ad_info_table = pd.DataFrame(columns = ad_columns)

    car_links = get_ad_links(max_links = 100)
    
    for i in car_links:
        # Clear print statements
        clear_output(wait = True)
        ad_info = get_info_from_ad_link(i)
        ad_info_table = ad_info_table.append(ad_info)
        print("Data collected for", str(len(ad_info_table)), "out of", str(len(car_links)), "ads")

    return ad_info_table 

In [76]:
from IPython.display import clear_output

ad_info_table = generate_table_of_ad_info(max_links = 100)

Data collected for 132 out of 132 ads


In [77]:
ad_info_table

Unnamed: 0,ad_age,ad_currency,ad_description,ad_header,ad_id,ad_price,ad_url,car_body_type,car_colour,car_engine,...,car_year,green_light_verified,seller_county,seller_county_town,seller_id,seller_name,seller_num_ads,seller_phone_number,seller_reg_date,seller_type
0,1 day,EUR,"Nissan qashqai very well looked after,, like n...",Car Nissan qashqai,23809138,9750,https://www.donedeal.ie/cars-for-sale/car-niss...,SUV,Grey,1.6 litre,...,2012,True,Dublin,South County,1463616,harry,1,,2010-03-29 18:48:24.607,PRIVATE
0,15 hours,EUR,"Skoda Octavia Diesel, Immaculate condition, Fu...",132 1.6tdi Ambition Skoda Octavia Estate,23609135,6750,https://www.donedeal.ie/cars-for-sale/132-1-6t...,Estate,Beige,1.6 litre,...,2013,False,Leitrim,,147276,Seller,1,,2010-01-11 20:30:10.063,PRIVATE
0,3 mins,EUR,2011 Toyota Avensis Estate 2.0D \nFull Main De...,Toyota Avensis Estate FSH,23761659,4650,https://www.donedeal.ie/cars-for-sale/toyota-a...,Estate,Grey,2.0 litre,...,2011,True,Wicklow,,905642,Peter,1,,2014-03-10 08:24:23.529,PRIVATE
0,3 mins,EUR,ABSOLUTELY BEAUTIFUL CAR IT REALLY IS LIKE NEW...,2016 OPEL INSIGNIA LOW MILEAGE IRISH CAR,23782038,9600,https://www.donedeal.ie/cars-for-sale/2016-ope...,Saloon,Black,1.6 litre,...,2016,True,Limerick,,1092178,call,5,,2014-07-16 14:23:34.041,PRIVATE
0,14 mins,EUR,AUTOMATIC HYUNDAI DELUX I 10 - 2015\n\nONLY 16...,16 k Miles - 998 cc Auto Hyundai Delux i 10 - ...,23722633,8895,https://www.donedeal.ie/cars-for-sale/16-k-mil...,Hatchback,Grey,1.0 litre,...,2015,True,Dublin,City Centre,739240,brian,3,,2010-03-21 22:19:26.56,PRIVATE
0,15 mins,EUR,Manual 7 seater car for sale! \r\n\r\n2.0 TDI...,Ford S-max 2.0 Tdi 7 Seater,23363577,11850,https://www.donedeal.ie/cars-for-sale/ford-s-m...,MPV,Silver,2.0 litre,...,2014,True,Waterford,Waterford City,363336,RR,2,,2010-03-01 16:12:32.703,PRIVATE
0,19 mins,EUR,Car driving perfect \nVery well maintained \...,2009 Ford Focus 1.8 tdci,23527430,2100,https://www.donedeal.ie/cars-for-sale/2009-for...,Hatchback,,1.8 litre,...,2009,False,Cavan,,199182,Alan,1,,2012-01-21 11:21:00.255,PRIVATE
0,20 mins,EUR,2005 Toyota Corolla 1.4 Strata 5dr\r\n\r\nIris...,2005 Toyota Corolla 1.4i Strata 5dr,23653318,950,https://www.donedeal.ie/cars-for-sale/2005-toy...,Hatchback,Silver,1.4 litre,...,2005,True,Louth,Ardee,1059015,James,3,,2014-06-26 16:00:40.479,PRIVATE
0,22 mins,EUR,Very good condition,Car,23734144,1250,https://www.donedeal.ie/cars-for-sale/car/2373...,Hatchback,Red,1.8 litre,...,2006,False,Laois,,1567568,benson,1,,2011-11-12 23:38:25.445,PRIVATE
0,29 mins,EUR,2005 ford fairlane cardinal 4 door hearse. Aut...,Hearse/removal ambulance,23433837,11000,https://www.donedeal.ie/cars-for-sale/hearse-r...,,Black,,...,2005,False,Donegal,Ballyshannon,255920,jackie carron,2,,2009-09-17 21:18:40.196,PRIVATE
