# Scraping Ads from PakWheels

In this notebook we scrape ads (car listings) from the PakWheels website using Python libraries such as `requests` and `BeautifulSoup`.

**Imports**

In [2]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
from json import loads, dumps
from tqdm import tqdm

## Scraping a Single Ad

A function that scrapes a single ad page and tags the resulting record with the given body type.

In [2]:
def scrape_ad(url, b_type, timeout=30):
    """Download one ad page and extract its fields into a dictionary.

    Parameters
    ----------
    url : str
        Address of the ad page to download.
    b_type : str
        Body type label; stored unchanged under the ``'Body Type'`` key.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout a stalled connection would block its worker
        thread indefinitely.

    Returns
    -------
    dict
        On success, a record with the keys ``'Ad Ref'``, ``'url'``,
        ``'Featured'``, ``'Vehichle'``, ``'Location'``, ``'Model'``,
        ``'Body Type'``, ``'Mileage'``, ``'Engine Type'``, ``'Transmission'``,
        ``'Features'``, ``'Details'``, ``'Price'``, ``'Seller Details'`` and
        ``"Seller's Comments"``.
        If the download or the parsing fails for any reason, a dict that
        holds only ``'url'`` so the failed page can still be identified.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error responses as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

        featured = 1 if soup.find('div', class_='mb40 pos-rel').find('div', class_='featured-ribbon pointer') else 0
        car_name = soup.find('h1').text
        location = soup.find('p', class_='detail-sub-heading').find('a').text.strip()

        # The first four cells of the engine-detail table are read positionally.
        car_specifics = soup.find('table', class_=re.compile(r'table table-bordered text-center table-engine-detail fs16')).find_all('td')
        model = car_specifics[0].text.strip()
        mileage = car_specifics[1].text.strip()
        engine_type = car_specifics[2].text.strip()
        transmission = car_specifics[3].text.strip()

        # The feature list is optional: an ad without one gets an empty list.
        feature_list = soup.find('ul', class_=re.compile(r'list-unstyled car-feature-list nomargin'))
        if feature_list is not None:
            features = [feature.text.strip() for feature in feature_list.find_all('li')]
        else:
            features = []

        sellers_comments = soup.find('h2', id='scroll_seller_comments').find_next_sibling('div').text.strip()

        # The detail list alternates label, value, label, value, ...; zip pairs
        # them up and simply ignores a trailing label that has no value.
        car_details = soup.find('ul', id='scroll_car_detail').find_all('li')
        details = {
            label.text.strip(): value.text.strip()
            for label, value in zip(car_details[::2], car_details[1::2])
        }
        # The ad reference becomes its own field rather than staying in details;
        # a missing reference raises KeyError and marks the ad as failed.
        ad_no = details.pop('Ad Ref #')

        price = soup.find('div', class_='price-box').text.strip()
        # Keep only the text before the first blank line of the owner box.
        seller_details = soup.find('div', class_='owner-detail-main').text.strip().split('\n\n')[0]

        return {
            'Ad Ref': ad_no,
            'url': url,
            'Featured': featured,
            # NOTE(review): key is misspelt ('Vehichle'); kept as-is because
            # consumers of the saved JSON may rely on it - confirm before renaming.
            'Vehichle': car_name,
            'Location': location,
            'Model': model,
            'Body Type': b_type,
            'Mileage': mileage,
            'Engine Type': engine_type,
            'Transmission': transmission,
            'Features': features,
            'Details': details,
            'Price': price,
            'Seller Details': seller_details,
            "Seller's Comments": sellers_comments,
        }
    except Exception:
        # Best-effort scraping: any network or parsing error yields a stub
        # record. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return {
            'url': url
        }

## Scraping Ads in Parallel

This function takes a list of ad URLs and a body type, and scrapes the ads concurrently using a thread pool.

In [18]:
def scrape_ads_in_parallel(urls, b_type, max_workers=10):
    """Scrape many ad pages concurrently and return the records in input order.

    Parameters
    ----------
    urls : list of str
        Ad page addresses; each one is passed to ``scrape_ad``.
    b_type : str
        Body type label forwarded to ``scrape_ad`` and shown in the
        progress bar description.
    max_workers : int, optional
        Size of the thread pool (default 10).

    Returns
    -------
    list
        One entry per URL, in the same order as ``urls``. Entries for which
        ``scrape_ad`` returned ``None`` are left out.

    Notes
    -----
    An exception that escapes ``scrape_ad`` is re-raised here by
    ``future.result()``.
    """
    # One slot per URL: futures finish in arbitrary order, so each result is
    # written to the position of the URL it belongs to.
    results = [None] * len(urls)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {
            executor.submit(scrape_ad, url, b_type): index
            for index, url in enumerate(urls)
        }

        # Progress bar advances once per finished ad.
        with tqdm(total=len(urls), desc=f"Scraping {b_type}", unit="ad") as pbar:
            for future in as_completed(future_to_index):
                results[future_to_index[future]] = future.result()
                pbar.update(1)

    # Only keep valid results.
    return [result for result in results if result is not None]

In [5]:
# Body types to scrape in this run, and how many URLs to take from each list.
BODY_TYPES_TO_SCRAPE = ['Hatchback']
MAX_URLS_PER_BODY_TYPE = 10

for body_type in os.listdir('data'):
    # Apply the name filter first so unrelated entries in data/ (including
    # plain files, which cannot be listed as directories) are never touched.
    if body_type not in BODY_TYPES_TO_SCRAPE:
        continue

    body_dir = os.path.join('data', body_type)
    urls_path = os.path.join(body_dir, 'urls.txt')
    output_path = os.path.join(body_dir, 'data.json')

    # A URL list is required, and an existing data.json means this body type
    # has already been scraped.
    if not os.path.isfile(urls_path) or os.path.exists(output_path):
        continue

    with open(urls_path, 'r') as f:
        # Drop blank lines so an empty file does not produce an empty URL.
        urls = [line.strip() for line in f if line.strip()]
    if not urls:
        continue

    results = scrape_ads_in_parallel(urls[:MAX_URLS_PER_BODY_TYPE], body_type)
    with open(output_path, 'w') as f:
        f.write(dumps(results))


Scraping Hatchback: 100%|██████████| 10/10 [00:02<00:00,  3.64ad/s]


**A sample from the car listings data**

In [5]:
# Load the listings CSV, using its first column as the DataFrame index.
df = pd.read_csv(
    'datasets/dataset_1.csv',
    index_col=0,
)
# Display five randomly chosen rows.
df.sample(n=5)

Unnamed: 0_level_0,url,Featured,Vehicle,Location,Model,Vehicle Type,Mileage,Engine Type,Transmission,Features,Details,Price,Seller Details,Seller's Comments
Ad Ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
9315921,https://www.pakwheels.com/used-cars/honda-civi...,0,Honda Civic Oriel 1.8 i-VTEC CVT 2018,Karachi Sindh,2018,Sedan,"55,000 km",Petrol,Automatic,"['ABS', 'AM/FM Radio', 'Air Bags', 'Air Condit...","{'Registered In': 'Sindh', 'Color': 'Taffeta W...","PKR 53.5 lacs\n\nFinancing starts at PKR 105,2...","Ghouss\nMember Since Oct 14, 2024",Civic oriel 1.8 ug top of the lineHonda Civic ...
9333938,https://www.pakwheels.com/used-cars/honda-civi...,0,Honda Civic Oriel 1.8 i-VTEC CVT 2020,"Gulshan-e-Iqbal, Karachi Sindh",2020,Sedan,"97,000 km",Petrol,Automatic,"['ABS', 'AM/FM Radio', 'Air Bags', 'Air Condit...","{'Registered In': 'Sindh', 'Color': 'Taffeta W...","PKR 60.5 lacs\n\nFinancing starts at PKR 97,39...","Mohammad Mhad\nMember Since Feb 17, 2022",Original Alloy rims. Inside out fully original...
9287755,https://www.pakwheels.com/used-cars/suzuki-meh...,0,Suzuki Mehran VX Euro II 2014,"North Gulgasht, Multan Punjab",2014,Hatchback,"60,000 km",Petrol,Manual,"['AM/FM Radio', 'CD Player', 'Front Speakers',...","{'Registered In': 'Lahore', 'Color': 'Gold', '...",PKR 12 lacs,"Kahuram Rana\nMember Since Oct 07, 2024",new tyre Lahore restoration\n Mention PakWh...
9300295,https://www.pakwheels.com/used-cars/suzuki-mr-...,0,Suzuki MR Wagon G 2011,"Baghban Pura, Lahore Punjab",2011,Mini Van,"160,000 km",Petrol,Automatic,"['ABS', 'AM/FM Radio', 'Air Bags', 'Air Condit...","{'Registered In': 'Islamabad', 'Color': 'Blue'...",PKR 18.85 lacs,"Rana Kamran\nMember Since Apr 11, 2022",Lightweight allow rims . All token taxes are p...
9314985,https://www.pakwheels.com/used-cars/suzuki-meh...,0,Suzuki Mehran VXR (CNG) 2007,"Bahria Town, Lahore Punjab",2007,Hatchback,"100,000 km",Petrol,Automatic,"['AM/FM Radio', 'Air Conditioning', 'Cassette ...","{'Registered In': 'Punjab', 'Color': 'Solid Wh...",PKR 8.5 lacs,"Adnan\nMember Since May 02, 2021",I have a white mehran 2007 model with 660 cc e...
