### Import libraries

In [None]:
import requests as req
import certifi as cert
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import pandas as pd
import json
from tqdm import tqdm
from scraping_scripts import scrape_type, scrape_address, scrape_location, scrape_price, scrape_shared_ownership, scrape_specs_list, scrape_description_list, scrape_description_text, scrape_listing_data, scrape_travel_times_list, scrape_average_sale_price

### Define baseurl and headers

In [None]:
# Site root; listing hrefs found on the search pages are appended to it.
baseurl = 'https://www.zoopla.co.uk'

# User-Agent string sent with every request made by this notebook.
user_agent = 'Mozilla/5.0 (compatible; GrapeshotCrawler/2.0; +http://www.grapeshot.co.uk/crawler.php)'

# Header mapping passed to requests via the `headers=` argument.
headers = {'User-Agent': user_agent}

### Get listing urls, listing ids, and shared ownership

In [None]:
# Collected (absolute listing url, listing id, shared-ownership value) tuples.
urls_ids_shared_ownerships = []

# Width of each price band; every band is queried with its own
# price_min / price_max pair.
step = 50000

# Seconds to wait on the server before giving up. requests applies no timeout
# by default, so a single stalled connection would hang the whole crawl.
request_timeout = 30

for price in tqdm(range(150000, 200000, step)):
    for page_n in tqdm(range(1, 2)):

        url = f'{baseurl}/for-sale/property/london/?price_max={price+step}&price_min={price}&q=London&results_sort=newest_listings&search_source=for-sale&pn={page_n}'

        try:
            r = req.get(url, headers=headers, verify=cert.where(), timeout=request_timeout)
            # Treat 4xx/5xx answers as failures instead of parsing an error page.
            r.raise_for_status()
        except req.exceptions.RequestException as e:
            # Best effort: report the failed search page and move on to the next one.
            print(f"Error: {e}")
            continue

        soup = BeautifulSoup(r.content, 'html.parser')
        propertylist = soup.find_all('div', class_='_1lw0o5c0')

        for listing in propertylist:
            # href=True skips anchors without an href attribute, which would
            # otherwise raise KeyError on the lookup below.
            for anchor in listing.find_all('a', href=True):
                href = anchor['href']
                # Takes the eight characters that precede the last character of
                # the href. NOTE(review): assumes the href ends with an
                # 8-character listing id followed by '/' -- confirm.
                listingId = href[-9:-1]
                shared_ownership = scrape_shared_ownership.run(anchor)
                urls_ids_shared_ownerships.append((baseurl + href, listingId, shared_ownership))

print(f"{len(urls_ids_shared_ownerships)} listings have been found.")

#### Save the urls_ids_shared_ownerships list as a JSON file

In [None]:
# Path of the JSON file that stores the collected listing tuples.
json_location = '../data/urls_ids_shared_ownerships_2.json'

with open(json_location, mode='w') as out_file:
    # Serialise to one string (2-space indent) and write it in a single call.
    out_file.write(json.dumps(urls_ids_shared_ownerships, indent=2))

#### Load the urls_ids_shared_ownerships list back from the JSON file

In [None]:
# Read the stored listing tuples back from json_location.
with open(json_location, mode='r') as in_file:
    urls_ids_shared_ownerships = json.loads(in_file.read())

# Bare expression: as the last statement of the cell it displays the loaded list.
urls_ids_shared_ownerships

### Scrape and store listing data

In [None]:
# One dict per successfully scraped listing; turned into a DataFrame later.
property_dicts = []

# Seconds to wait on the server before giving up. requests applies no timeout
# by default, so a single stalled connection would hang the whole loop.
request_timeout = 30

# Retry failed connection attempts up to three times, backing off between tries.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)

# The context manager closes the session even when a scraper raises part-way
# through the loop, so pooled connections are never leaked.
with req.Session() as session:
    session.mount('http://', adapter)
    session.mount('https://', adapter)

    for url, listingId, shared_ownership in tqdm(urls_ids_shared_ownerships):

        try:
            r = session.get(url, headers=headers, verify=cert.where(), timeout=request_timeout)
            # Treat 4xx/5xx answers as failures instead of scraping an error page;
            # HTTPError is a RequestException, so the handler below reports it.
            r.raise_for_status()
            soup = BeautifulSoup(r.content, 'html.parser')

            # The scrapers stay inside the try block: the ones that receive
            # listingId rather than soup presumably issue their own requests
            # (verify in scraping_scripts), and those failures must keep being
            # reported and skipped rather than aborting the run.
            property_dict = {
                'url': url,
                'type': scrape_type.run(soup),
                'address': scrape_address.run(soup),
                'location': scrape_location.run(soup),
                'price': scrape_price.run(soup),
                'average_sale_price': scrape_average_sale_price.run(listingId),
                'shared_ownership': shared_ownership,
                'specs_list': scrape_specs_list.run(soup),
                'description_list': scrape_description_list.run(soup),
                'description_text': scrape_description_text.run(soup),
                'listing_data': scrape_listing_data.run(listingId),
                'travel_times_list': scrape_travel_times_list.run(soup),
            }

            property_dicts.append(property_dict)

        except req.exceptions.RequestException as e:
            # Best effort: report the failed listing and continue with the next one.
            print(f"Error: {e}")

### Create Pandas DataFrame

In [None]:
# Build the table from the scraped listing dicts (one row per dict).
df = pd.DataFrame(data=property_dicts)

# Preview the first five rows as the cell output.
df.head(n=5)

### Save data into CSV

In [None]:
# Output path of the scraped dataset; the row index is not written.
csv_location = '../data/scraped_data_2.csv'
df.to_csv(csv_location, index=False)