# Extracting Info from Ads

`Author: James Smith`

`Date: 22/12/2019`

In [1]:
# Imports
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json

Given an ad link, we want to extract all the information we need from it:

- Phone Number - using workaround
- All other info - using JSON dictionary located at bottom of page code

Example Ad: https://www.donedeal.ie/cars-for-sale/very-clean-4x4/23802005

In [2]:
#----- Config Info -----#
# Ad page to scrape and the headers sent with the request.
url = 'https://www.donedeal.ie/cars-for-sale/very-clean-4x4/23802005'
headers = {'User-Agent':'Mozilla/5'}

#----- Get page HTML code -----#
# requests has no default timeout; without one an unresponsive server would
# leave this cell hanging indefinitely.
r = requests.get(url, headers = headers, timeout = 30)
# Stop here with the HTTP error itself rather than parsing an error page and
# failing further down with an unrelated exception.
r.raise_for_status()
html_contents = r.text
html_soup = BeautifulSoup(html_contents, 'html.parser')

In [3]:
#----- Obtain Main JSON Dictionary -----#
scripts = html_soup.find_all('script')

# Take the first <script> tag whose text mentions window.adDetails.
# next() stops at the first match and lets us report a missing tag explicitly
# instead of failing with a bare "list index out of range".
window_ad_details_script = next(
    (script for script in scripts if 'window.adDetails' in script.text),
    None,
)
if window_ad_details_script is None:
    raise ValueError("No <script> tag containing 'window.adDetails' was found in the page")

#----- Clean in order to represent as Python Dictionary -----#
def find_char_index(s, ch):
    """
    Collect every position in `s` at which the element equals `ch`.

    The positions are returned as a list in ascending order; the list is
    empty when `ch` never occurs.
    Approach taken from:
    https://stackoverflow.com/questions/11122291/how-to-find-char-in-string-and-get-all-the-indexes
    """
    positions = []
    for position, character in enumerate(s):
        if character == ch:
            positions.append(position)
    return positions

# The JSON object is everything from the first opening bracket to the last
# closing bracket of the script text. str.find / str.rfind locate those two
# positions directly, without building a list of every bracket position.
script_text = window_ad_details_script.text
beginning = script_text.find('{')  # Begins at first opening bracket
end = script_text.rfind('}')  # Ends at last closing bracket

# find/rfind return -1 when the character is absent; report that clearly
# instead of handing a meaningless slice to the JSON parser.
if beginning == -1 or end < beginning:
    raise ValueError("Could not locate a JSON object in the window.adDetails script")

window_ad_details_script_short = script_text[beginning : end + 1]

# Parse the extracted text into a Python dictionary
details_dict = json.loads(window_ad_details_script_short)

In [5]:
#----- Extract Data from Python Dictionary -----#

# Values are handed to pandas as lists rather than bare strings, see
# https://stackoverflow.com/questions/17839973/constructing-pandas-dataframe-from-values-in-variables-gives-valueerror-if-usi

# details_dict["displayAttributes"] is a list of dictionaries: each entry keeps
# the attribute's key under 'name' and the attribute's value under 'value'.
# Only the first 14 elements contain car attributes

car_attributes_list = details_dict["displayAttributes"][:14]
car_attributes_dict = dict(
    (attribute["name"], attribute["value"]) for attribute in car_attributes_list
)

# Phone 
def get_phone_number(url, ad_id=None):
    """
    Given the url of an ad, return a dictionary containing
    the seller's phone number.

    url:    full url of the ad page. It is sent as the Referer header and,
            when `ad_id` is not supplied, the ad id is read from the last
            segment of its path (e.g. ".../very-clean-4x4/23802005").
    ad_id:  optional id of the ad. Pass it explicitly when the id cannot be
            read from the url.

    Returns the decoded JSON body of the phone endpoint's response.

    Raises ValueError when no numeric ad id can be read from `url`, and
    requests.HTTPError when the endpoint answers with an error status.
    """
    from urllib.parse import urlparse

    if ad_id is None:
        # Use the id of the ad that was actually asked for, not whichever ad
        # happens to be loaded in a notebook-level variable.
        ad_id = urlparse(url).path.rstrip("/").rsplit("/", 1)[-1]
        if not ad_id.isdigit():
            raise ValueError("Could not read a numeric ad id from url: " + str(url))

    phone_headers = {"User-Agent": "Mozilla/5", "Referer": url}

    phone_url = "https://www.donedeal.ie/cadview/api/v3/view/ad/" + str(ad_id) + "/phone/"
    # Timeout so that an unresponsive server cannot hang the notebook.
    phone_request = requests.post(phone_url, headers = phone_headers, timeout = 30)
    # Surface HTTP errors directly instead of trying to decode an error body.
    phone_request.raise_for_status()
    return json.loads(phone_request.text)

    
#----- Collect Data -----#

# Each value is wrapped in a one-element list, so `data` maps a column name to
# that column's values and produces a single-row DataFrame.
# Entries are added in the order of the final column layout.
data = {}

# Ad fields
data.update(
    (column, [details_dict[key]])
    for column, key in (
        ("ad_header", "header"),
        ("ad_description", "description"),
        ("ad_age", "age"),
        ("ad_id", "id"),
    )
)
data["ad_url"] = [url]
data.update(
    (column, [details_dict[key]])
    for column, key in (
        ("ad_price", "price"),
        ("ad_currency", "currency"),
    )
)

# Seller fields
data.update(
    (column, [details_dict["seller"][key]])
    for column, key in (
        ("seller_id", "id"),
        ("seller_name", "name"),
        ("seller_num_ads", "adCount"),
        ("seller_type", "type"),
        ("seller_county", "county"),
        ("seller_county_town", "countyTown"),
        ("seller_reg_date", "registrationDate"),
    )
)

# The phone number needs its own request
data["seller_phone_number"] = [get_phone_number(url)["phone"]]

data["green_light_verified"] = [details_dict["greenlightVerified"]]

# Car fields
data.update(
    (column, [car_attributes_dict[key]])
    for column, key in (
        ("car_make", "make"),
        ("car_model", "model"),
        ("car_year", "year"),
        ("car_mileage", "mileage"),
        ("car_fuel_type", "fuelType"),
        ("car_transmission", "transmission"),
        ("car_body_type", "bodyType"),
        ("car_engine", "engine"),
        ("car_road_tax", "roadTax"),
        ("car_nct_expiry", "NCT"),
        ("car_num_prev_owners", "previousOwners"),
        ("car_reg_country", "country"),
        ("car_colour", "colour"),
        ("car_num_doors", "numDoors"),
    )
)

ad_data = pd.DataFrame.from_dict(data)
ad_data

Unnamed: 0,ad_age,ad_currency,ad_description,ad_header,ad_id,ad_price,ad_url,car_body_type,car_colour,car_engine,...,car_year,green_light_verified,seller_county,seller_county_town,seller_id,seller_name,seller_num_ads,seller_phone_number,seller_reg_date,seller_type
0,4 hours,EUR,Suzuki grand virara for sale.\r\nExcellent con...,Very clean 4x4,23802005,1299,https://www.donedeal.ie/cars-for-sale/very-cle...,SUV,Red,1.6 litre,...,2005,False,Kildare,Kill,146465,John,3,862788511,2009-09-20 23:02:31.154,PRIVATE


In [1]:
def generate_url(base_url, page_number, ads_per_page=28):
    """
    Based on the structure of the donedeal website,
    this function returns the url of the desired results page.

    base_url:      url of the first results page.
    page_number:   1-based number of the page wanted.
    ads_per_page:  amount by which the `start` query parameter grows from
                   one page to the next (defaults to 28).

    Page 1 is `base_url` unchanged. Any later page is `base_url` with
    `start=(page_number - 1) * ads_per_page` added to its query string.

    Raises ValueError if page_number is less than 1.
    """
    if page_number < 1:
        # ValueError is a subclass of Exception, so code that catches
        # Exception around this call keeps working.
        raise ValueError("page_number cannot be less than 1")

    if page_number == 1:
        return base_url

    # Start the query string with "?" when base_url has none yet; otherwise
    # append to the existing one with "&".
    separator = "&" if "?" in base_url else "?"
    return "{}{}start={}".format(base_url, separator, (page_number - 1) * ads_per_page)

In [None]:
# Walk through the results pages and collect the link of every car ad.
# Scraping stops on the first non-200 response, or on the first page that
# contributes no link we have not already collected. Without the second rule
# the loop would never end for as long as the site keeps answering 200.
# TODO: this still doesn't exclude the ads shown at the bottom of each page.

car_links = []
seen_car_links = set()  # links already collected, used to skip repeats
page_number = 1
status_code = 200

headers = {'User-Agent':'Mozilla/5'}
base_url = 'https://www.donedeal.ie/cars?source=private'
sub_str_car_add = "https://www.donedeal.ie/cars-for-sale"


while status_code == 200:

    url = generate_url(base_url, page_number = page_number)
    # Timeout so that an unresponsive server cannot hang the loop.
    r = requests.get(url, headers = headers, timeout = 30)
    status_code = r.status_code

    print("Scraping info from page number", page_number)
    print("url: ", url)

    if status_code == 200:

        html_contents = r.text
        html_soup = BeautifulSoup(html_contents, 'html.parser')

        all_links_on_page = html_soup.find_all('a', class_ = 'card__link', href = True)
        car_links_on_page = [link['href'] for link in all_links_on_page
                             if sub_str_car_add in link['href']]

        # Keep only links not collected before (order preserved), so ads that
        # show up on several pages are stored once.
        new_car_links = []
        for car_link in car_links_on_page:
            if car_link not in seen_car_links:
                seen_car_links.add(car_link)
                new_car_links.append(car_link)

        if not new_car_links:
            # Nothing new on this page: we have run past the last page of results.
            print("No new car ads found on page number", page_number, "- stopping")
            break

        # Join lists together (in place, no copy of the growing list)
        car_links.extend(new_car_links)

        # Update page number to get info from the next page
        page_number = page_number + 1

    else:

        print("Oops... This page returned the status code: ", status_code)
        print("The url ", url, " likely doesn't exist")

print("We have collected ", len(car_links), " links to DoneDeal car adds")