# Scraping Ads from PakWheels

In this notebook we scrape ads (car listings) from the PakWheels website using Python libraries such as `requests` and `BeautifulSoup`.

**Imports**

In [2]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
from json import loads, dumps
from tqdm import tqdm

## Scraping a Single Ad

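A function that scrapes a single ad page and returns its fields as a dictionary, tagged with the given body type. If the page cannot be downloaded or parsed, only the ad's URL is returned.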

In [2]:
def scrape_ad(url, b_type, timeout=30):
    """Scrape one ad page and return its fields as a dictionary.

    Args:
        url: Address of the ad page to download.
        b_type: Body type label; stored unchanged under 'Vehicle Type'.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot block the calling thread forever.

    Returns:
        On success, a dict with the keys 'Ad Ref', 'url', 'Featured',
        'Vehicle', 'Location', 'Model', 'Vehicle Type', 'Mileage',
        'Engine Type', 'Transmission', 'Features', 'Details', 'Price',
        'Seller Details' and "Seller's Comments".
        On any download or parsing failure, a dict holding only 'url', which
        lets the caller see which ads could not be scraped.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error responses as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'lxml')

        ribbon = soup.find('div', class_='mb40 pos-rel').find('div', class_='featured-ribbon pointer')
        featured = 1 if ribbon is not None else 0
        car_name = soup.find('h1').text
        location = soup.find('p', class_='detail-sub-heading').find('a').text.strip()

        # The first four cells of the engine-detail table are read positionally.
        car_specifics = soup.find('table', class_=re.compile(r'table table-bordered text-center table-engine-detail fs16')).find_all('td')
        model = car_specifics[0].text.strip()
        mileage = car_specifics[1].text.strip()
        engine_type = car_specifics[2].text.strip()
        transmission = car_specifics[3].text.strip()

        # The feature list is optional: a missing list yields an empty list
        # rather than failing the whole ad.
        feature_list = soup.find('ul', class_=re.compile(r'list-unstyled car-feature-list nomargin'))
        if feature_list is not None:
            features = [feature.text.strip() for feature in feature_list.find_all('li')]
        else:
            features = []

        sellers_comments = soup.find('h2', id='scroll_seller_comments').find_next_sibling('div').text.strip()

        # The detail list alternates label, value, label, value, ...; pair them
        # up. zip() ignores a trailing unpaired item instead of raising.
        detail_texts = [item.text.strip() for item in soup.find('ul', id='scroll_car_detail').find_all('li')]
        details = dict(zip(detail_texts[::2], detail_texts[1::2]))
        # The ad reference becomes its own field and is removed from the details.
        ad_no = details.pop('Ad Ref #')

        price = soup.find('div', class_='price-box').text.strip()
        # Keep only the text before the first blank line of the owner box.
        seller_details = soup.find('div', class_='owner-detail-main').text.strip().split('\n\n')[0]

        return {
            'Ad Ref': ad_no,
            'url': url,
            'Featured': featured,
            'Vehicle': car_name,
            'Location': location,
            'Model': model,
            'Vehicle Type': b_type,
            'Mileage': mileage,
            'Engine Type': engine_type,
            'Transmission': transmission,
            'Features': features,
            'Details': details,
            'Price': price,
            'Seller Details': seller_details,
            "Seller's Comments": sellers_comments,
        }
    except Exception:
        # Best-effort scraping: any network or parsing problem degrades to a
        # URL-only record. KeyboardInterrupt/SystemExit are deliberately not
        # caught so the run can still be interrupted.
        return {
            'url': url
        }

## Scraping Ads in Parallel

This function takes a list of ad URLs and a vehicle body type, and scrapes the ads concurrently using a thread pool while showing a progress bar.

In [18]:
def scrape_ads_in_parallel(urls, b_type, max_workers=10):
    """Scrape many ad pages concurrently and return their records.

    Args:
        urls: Sequence of ad page URLs; each one is passed to ``scrape_ad``.
        b_type: Body type label forwarded to ``scrape_ad`` and shown in the
            progress bar description.
        max_workers: Maximum number of worker threads in the pool.

    Returns:
        A list with the non-None results of ``scrape_ad``, ordered like
        ``urls`` rather than by completion time, so repeated runs over the
        same URL list produce the same ordering.
    """
    results_by_index = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember each URL's position so the output can be put back in input order.
        future_to_index = {
            executor.submit(scrape_ad, url, b_type): index
            for index, url in enumerate(urls)
        }

        # The progress bar advances once per finished ad, whatever its outcome.
        with tqdm(total=len(urls), desc=f"Scraping {b_type}", unit="ad") as pbar:
            for future in as_completed(future_to_index):
                result = future.result()
                if result is not None:
                    results_by_index[future_to_index[future]] = result
                pbar.update(1)

    return [results_by_index[index] for index in sorted(results_by_index)]

In [5]:
# Scrape each selected body-type folder under data/ that has a URL list but
# no saved results yet, and store the scraped records next to the URL list.
for body_type in os.listdir('data'):
    # Check the allow-list first: entries that are not selected (including
    # stray regular files inside data/) are skipped without touching the disk.
    if body_type not in ['Hatchback']:
        continue

    body_dir = os.path.join('data', body_type)
    urls_path = os.path.join(body_dir, 'urls.txt')
    output_path = os.path.join(body_dir, 'data.json')

    # Requires the URL list to exist, and never overwrites an existing data.json.
    if not os.path.isfile(urls_path) or os.path.exists(output_path):
        continue

    with open(urls_path, 'r') as f:
        # One URL per line; blank lines are dropped so no empty URL is scraped.
        urls = [line.strip() for line in f if line.strip()]

    # NOTE(review): only the first 10 URLs are scraped -- presumably a
    # smoke-test limit; confirm before a full run.
    results = scrape_ads_in_parallel(urls[:10], body_type)
    with open(output_path, 'w') as f:
        f.write(dumps(results))


Scraping Hatchback: 100%|██████████| 10/10 [00:02<00:00,  3.64ad/s]


**A sample from the car listings data**

In [3]:
# Load the saved listings dataset from disk.
df = pd.read_csv(filepath_or_buffer='datasets/dataset_1.csv')
# Show five randomly chosen rows; no seed is set, so the rows differ per run.
df.sample(n=5)

Unnamed: 0,Ad Ref,url,Featured,Vehicle,Location,Model,Vehicle Type,Mileage,Engine Type,Transmission,Features,Details,Price,Seller Details,Seller's Comments
7187,9247871,https://www.pakwheels.com/used-cars/changan-ka...,0,Changan Karvaan Base Model 1.0 2021,"Shakarghar Road, Narowal Punjab",2021,Mini Van,"71,000 km",Petrol,Manual,"['AM/FM Radio', 'Air Conditioning', 'Immobiliz...","{'Registered In': 'Punjab', 'Color': 'White', ...",PKR 26 lacs,"Mohsin Khan\nMember Since Mar 01, 2024","Power Steering, Inside out fully original. Dri..."
3395,9236306,https://www.pakwheels.com/used-cars/suzuki-alt...,0,Suzuki Alto VXR 2011,"North Nazimabad, Karachi Sindh",2011,Hatchback,"19,999 km",Petrol,Manual,"['AM/FM Radio', 'Air Conditioning', 'CD Player...","{'Registered In': 'Sindh', 'Color': 'White', '...",PKR 11.75 lacs,"Unknown\nMember Since Oct 07, 2017",asalam o alikum i am selling my alto vxr 2011...
24874,9295247,https://www.pakwheels.com/used-cars/mitsubishi...,0,Mitsubishi Ek Wagon Limited 2011,"I- 9, Islamabad Islamabad",2011,Hatchback,"120,000 km",Petrol,Automatic,"['AM/FM Radio', 'Air Bags', 'Air Conditioning'...","{'Registered In': 'Lahore', 'Color': 'Silver',...",PKR 17.5 lacs,"Waseem Ahmed\nMember Since Sep 30, 2017",. MODEL 2011. REGISTRATION 2014 (RAWALPINDI). ...
4337,9297367,https://www.pakwheels.com/used-cars/changan-al...,0,Changan Alsvin 1.3L MT Comfort 2021,"Eden Gardens, Faisalabad Punjab",2021,Sedan,"65,000 km",Petrol,Manual,"['ABS', 'AM/FM Radio', 'Air Bags', 'Air Condit...","{'Registered In': 'Punjab', 'Color': 'Steller ...","PKR 29.8 lacs\n\nFinancing starts at PKR 74,62...","Ali Hassan\nMember Since May 06, 2022",After Market Alloy rims. Looking to sell the c...
5650,9327793,https://www.pakwheels.com/used-cars/toyota-cor...,0,Toyota Corolla SE Saloon Automatic 2002,"Misryal Road, Rawalpindi Punjab",2002,Sedan,"200,005 km",Petrol,Automatic,"['AM/FM Radio', 'Air Bags', 'Air Conditioning'...","{'Registered In': 'Sindh', 'Color': 'Beige', '...",PKR 16.8 lacs,"Bag\nMember Since Jun 12, 2021",Corolla SE saloon company auto home used car r...
