# Extracting Info from Ads

`Author: James Smith`

`Date: 22/12/2019`

Given an add link, we want to extract all information that we want from it

- Phone Number - using workaround
- All other info - using JSON dictionary located at bottom of page code

Example Ad: https://www.donedeal.ie/cars-for-sale/very-clean-4x4/23802005

## Function to get info from ad link

In [85]:
def get_info_from_ad_link(url):
    """
    Given a url it returns a dataframe of info
    """
    
    #----- Define custom functions -----#
    
    def find_char_index(s, ch):
        """
        Function to return indexes of a character in a given string
        https://stackoverflow.com/questions/11122291/how-to-find-char-in-string-and-get-all-the-indexes
        """
        return [i for i, ltr in enumerate(s) if ltr == ch]
    
    def get_phone_number(url):
        """
        Given a url this function return a dictionary containing 
        the sellers phone number.
        Returns None if an error occurs
        This happens due to frequent scraping - "Please contact customer support"
        """
        phone_headers = {"User-Agent": "Mozilla/5", "Referer": url}

        try:
            phone_url= "https://www.donedeal.ie/cadview/api/v3/view/ad/" + str(details_dict["id"]) + "/phone/"
            phone_request = requests.post(phone_url, headers = phone_headers)
            return json.loads(phone_request.text)
        except:
            return {"phone" : None}

    
    #----- Get page HTML code -----#
    headers = {'User-Agent':'Mozilla/5'}
    r = requests.get(url, headers = headers)
    html_contents = r.text
    html_soup = BeautifulSoup(html_contents, 'html.parser')
    
    #----- Obtain Main JSON Dictionary -----#
    scripts = html_soup.find_all('script')
    window_ad_details_script = [script for script in scripts if 'window.adDetails' in script.text][0]
    window_ad_details_script

    #----- Clean in order to represent as Python Dictionary -----#

    beginning = find_char_index(window_ad_details_script.text, '{')[0] # Begins at first opening bracket
    end = find_char_index(window_ad_details_script.text, '}')[-1] # Ends at last closing bracket

    window_ad_details_script_short = window_ad_details_script.text[beginning :end+1]
    window_ad_details_script_short

    details_dict = json.loads(window_ad_details_script_short)
    
    #----- Extract Data from Python Dictionary -----#

    # passing in list rather than string
    # https://stackoverflow.com/questions/17839973/constructing-pandas-dataframe-from-values-in-variables-gives-valueerror-if-usi

    # Contains car info as a dictionary as a list in a value of the main dictionary
    # The 'name' key contains the wanted key and similarly for 'value'
    # Only the first 14 elements contain car attributes

    car_attributes_list = details_dict["displayAttributes"][0:14]
    car_attributes_dict = {attribute["name"] : attribute["value"] for attribute in car_attributes_list}

    #----- Collect Data -----#
    
    # Obtaining values from keys directly can throw up a KeyValue error when it does not exist.
    # The method .get() is a safer way to access the dictionary

    data = {"ad_header" : details_dict.get("header"),
            "ad_description" : details_dict.get("description"),
            "ad_age" : details_dict.get("age"),
            "ad_id" : details_dict.get("id"),
            "ad_url" : [url],
            "ad_price" : details_dict.get("price"),
            "ad_currency" : details_dict.get("currency"),

            "seller_id" : details_dict.get("seller").get("id"),
            "seller_name" : details_dict.get("seller").get("name"),
            "seller_num_ads" : details_dict.get("seller").get("adCount"),
            "seller_type" : details_dict.get("seller").get("type"),
            "seller_county" : details_dict.get("seller").get("county"),
            "seller_county_town" : details_dict.get("seller").get("countyTown"),
            
            "seller_reg_date" : details_dict.get("seller").get("registrationDate"),

            "seller_phone_number" : [get_phone_number(url)["phone"]],

            "green_light_verified" : details_dict.get("greenlightVerified"),

            "car_make" : car_attributes_dict.get("make"),
            "car_model" : car_attributes_dict.get("model"),
            "car_year" : car_attributes_dict.get("year"),
            "car_mileage" : car_attributes_dict.get("mileage"),
            "car_fuel_type" : car_attributes_dict.get("fuelType"),
            "car_transmission" : car_attributes_dict.get("transmission"),
            "car_body_type" : car_attributes_dict.get("bodyType"),
            "car_engine" : car_attributes_dict.get("engine"),
            "car_road_tax" : car_attributes_dict.get("roadTax"),
            "car_nct_expiry" : car_attributes_dict.get("NCT"),
            "car_num_prev_owners" : car_attributes_dict.get("previousOwners"),
            "car_reg_country" : car_attributes_dict.get("country"),
            "car_colour" : car_attributes_dict.get("colour"),
            "car_num_doors" : car_attributes_dict.get("numDoors")
           }

    return pd.DataFrame.from_dict(data)

Test Function

In [86]:
# # Imports
# import pandas as pd
# import requests
# from bs4 import BeautifulSoup
# import json

url = 'https://www.donedeal.ie/cars-for-sale/very-clean-4x4/23802005'
get_info_from_ad_link(url)

Unnamed: 0,ad_age,ad_currency,ad_description,ad_header,ad_id,ad_price,ad_url,car_body_type,car_colour,car_engine,...,car_year,green_light_verified,seller_county,seller_county_town,seller_id,seller_name,seller_num_ads,seller_phone_number,seller_reg_date,seller_type
0,4 days,EUR,Suzuki grand virara for sale.\r\nExcellent con...,Suzuki grand vitara,23802005,1299,https://www.donedeal.ie/cars-for-sale/very-cle...,SUV,Red,1.6 litre,...,2005,False,Donegal,Carndonagh,146465,John,3,,2009-09-20 23:02:31.154,PRIVATE


## Function to gather links

In [87]:
def get_ad_links(max_links):
    """
    Collects all car ads from DoneDeal
    Options to include number of links to collect
    Can later change criteria to automatically collect all ads
    """
    
    #----- Define custom functions -----#

    def generate_url(base_url, page_number):
        """
        Based on the structure of the donedeal website. 
        This function will returns the url given the desired page number
        """
        if page_number < 1:
            raise Exception("page_number cannot be less than 1")

        if page_number == 1:
            url = base_url
        else:
            extra_pages_url_addition = "&start={}".format(str((page_number - 1) * 28))
            url = base_url + extra_pages_url_addition

        return url  
    
    #----- Set up parameters -----#
    
    max_links = max_links
    base_url = 'https://www.donedeal.ie/cars?source=private'
    sub_str_car_add = "https://www.donedeal.ie/cars-for-sale"


    car_links = []
    page_number = 1
    status_code = 200

    headers = {'User-Agent':'Mozilla/5'}

    #----- Loop over pages based on stopping criteria -----#

    while len(car_links) < max_links and status_code == 200:

        url = generate_url(base_url, page_number = page_number)
        r = requests.get(url, headers = headers)
        status_code = r.status_code

        print("Scraping info from page number", page_number)
        print("url: ", url)

        if status_code == 200:

            html_contents = r.text
            html_soup = BeautifulSoup(html_contents, 'html.parser')

            all_links_on_page = html_soup.find_all('a', class_ = 'card__link', href = True)
            car_links_on_page = [link['href'] for link in all_links_on_page 
                                 if sub_str_car_add in link['href']]
            # Join lists together
            car_links = car_links + car_links_on_page

            # Update page number to get info from the next page
            page_number = page_number + 1

        else:

            print("Oops... This page returned the status code: ", status_code)
            print("The url ", url, " likely doesn't exist")

    print("We have collected ", len(car_links), " links to DoneDeal car adds")

    return car_links

Test Function

In [88]:
# car_links = get_ad_links(base_url, max_links = 100)

## Function to generate table of ad info

In [93]:
def generate_table_of_ad_info(max_links):
    """
    Function to get table of ad info
    """
    
    ad_columns = ["ad_header","ad_description","ad_age","ad_id","ad_url","ad_price","ad_currency","seller_id",
                  "seller_name","seller_num_ads","seller_type","seller_county","seller_county_town","seller_reg_date",
                  "seller_phone_number","green_light_verified","car_make","car_model","car_year","car_mileage","car_fuel_type",
                  "car_transmission","car_body_type","car_engine","car_road_tax","car_nct_expiry","car_num_prev_owners",
                  "car_reg_country","car_colour","car_num_doors"]

    ad_info_table = pd.DataFrame(columns = ad_columns)

    car_links = get_ad_links(max_links = max_links)
    
    for i in car_links:
        # Error handling, will try next iteration if it fails
        try:
            # Clear print statements
            clear_output(wait = True)
            ad_info = get_info_from_ad_link(i)
            ad_info_table = ad_info_table.append(ad_info)
            print("Data collected for", str(len(ad_info_table)), "out of", str(len(car_links)), "ads")
        except:
            pass
        
    return ad_info_table 


In [94]:
from IPython.display import clear_output

ad_info_table = generate_table_of_ad_info(max_links = 10000)

Data collected for 10007 out of 10021 ads


In [95]:
ad_info_table

Unnamed: 0,ad_age,ad_currency,ad_description,ad_header,ad_id,ad_price,ad_url,car_body_type,car_colour,car_engine,...,car_year,green_light_verified,seller_county,seller_county_town,seller_id,seller_name,seller_num_ads,seller_phone_number,seller_reg_date,seller_type
0,5 hours,EUR,VW CC GT in excellent condition with 67 mile o...,Volkswagen CC GT (price drop),22850112,13000,https://www.donedeal.ie/cars-for-sale/volkswag...,Coupe,Brown,2.0 litre,...,2015,True,Limerick,,45000,Mike,1,,2010-06-28 14:20:52.312,PRIVATE
0,11 days,EUR,2007 mercedes c180 PETROL 1.8 cc.\r\n NCT NOVE...,Mercedes c180 NCT NOV 2020 taxed,23728903,1495,https://www.donedeal.ie/cars-for-sale/mercedes...,Saloon,Blue,1.8 litre,...,2007,True,Cork,Mallow,31572,Jim S,2,,2009-09-08 22:43:12.027,PRIVATE
0,7 mins,EUR,Hello selling my snow white Ford Focus mk3 20...,Ford Focus mk3 2011 1.6 tdci zetec Nct 01.08.21,23749372,5599,https://www.donedeal.ie/cars-for-sale/ford-foc...,Saloon,White,1.6 litre,...,2011,True,Tipperary,Tipperary Town,3394285,Ford,1,,2019-12-13 21:40:41.166,PRIVATE
0,8 mins,EUR,Tax 5/20\nNct 14/2/20\n1.6 hdi 7 seater\nAll g...,Peugeot 307 1.6hdi 7 seater tax & nct,23753474,675,https://www.donedeal.ie/cars-for-sale/peugeot-...,Hatchback,Blue,1.6 litre,...,2007,False,Kilkenny,,17268,DAVE,8,,2010-10-15 18:39:45.67,PRIVATE
0,12 mins,EUR,A very good car drives great new NCT call for ...,Volkswagen,23815100,10500,https://www.donedeal.ie/cars-for-sale/volkswag...,MPV,Black,1.6 litre,...,2014,False,Dublin,,1305041,Joseph,1,,2014-12-09 17:10:33.103,PRIVATE
0,14 mins,EUR,2007 Toyota Corolla 1.4 diesel serviced 2thous...,2007 Toyota Corolla,23792855,1575,https://www.donedeal.ie/cars-for-sale/2007-toy...,Saloon,Blue,1.4 litre,...,2007,False,Galway,Tuam,3294883,Seller,2,,2019-08-24 13:46:54.711,PRIVATE
0,16 mins,EUR,2006 Micra Sport for sale in Dublin with low m...,2006 Nissan Micra Sport,23815086,1250,https://www.donedeal.ie/cars-for-sale/2006-nis...,Hatchback,Gold,1.2 litre,...,2006,False,Dublin,,2683024,Karen,1,,2017-07-09 15:45:53.435,PRIVATE
0,16 mins,EUR,"Renault Scenic, 2004 \r\nClean and good condi...",Renault scenic 2004,23815087,300,https://www.donedeal.ie/cars-for-sale/renault-...,Hatchback,Blue,1.4 litre,...,2004,True,Dublin,South County,3200762,Victoria,6,,2019-05-04 20:49:29.546,PRIVATE
0,17 mins,EUR,"I am selling my golf, It is a 152 gti with the...",Golf gti performance 230 bhp,23395706,22500,https://www.donedeal.ie/cars-for-sale/golf-gti...,Hatchback,White,2.0 litre,...,2015,True,Cork,,916648,Brian,1,,2014-03-19 18:18:37.946,PRIVATE
0,19 mins,EUR,Nissan qashqai for sale \nIn great condition \...,Nissan qashqai for sale,23525665,2000,https://www.donedeal.ie/cars-for-sale/nissan-q...,Hatchback,Brown,1.5 litre,...,2007,True,Wicklow,,1619223,Ford7810,3,,2015-06-02 15:38:11.201,PRIVATE


In [96]:
table_location = r"C:\Users\User\Documents\ITB Year 2\Text Analytics and Web Content Mining\Assignments\Assignment 3\github\DoneDeal_Analytics\data\ad_info_table.csv"
ad_info_table.to_csv(table_location)