In [None]:
from bs4 import BeautifulSoup
import requests

import numpy as np
import pandas as pd

import csv

In [None]:

# URL prefix for numbered listing pages; the crawl loop below appends the page number.
base_url = "https://riyasewana.com/search/cars?page="
# Last listing page to request. Hard-coded.
# NOTE(review): presumably the page count shown on the site when this was written — confirm before re-running.
total_pages = 323
# Accumulates the listing dicts returned by scrape_page() across all pages.
all_data = []

# Function to scrape data from a given page
def scrape_page(url, timeout=30):
    """Fetch one search-results page and return the listings found on it.

    Args:
        url: Address of the listing page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A list of ``{'Car Name': ..., 'Car Link': ...}`` dicts, one per
        element matched by the listing selector. An empty list is returned
        when the request raises a ``requests.RequestException`` or the server
        answers with anything other than HTTP 200.
    """
    # Browser-like headers sent with every request.
    # NOTE(review): presumably needed so the site does not reject the default client — confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://www.google.com/',
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # A network error on one page must not abort a crawl of hundreds of
        # pages; report it and behave exactly like a bad status code.
        print(f"Failed to fetch page {url}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to fetch page {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    # Title links of listing items; items carrying the "promoted" class are
    # excluded by the selector itself.
    car_elements = soup.select('li.item.round:not(.promoted) h2 a')
    return [
        {'Car Name': car_element.get('title'), 'Car Link': car_element.get('href')}
        for car_element in car_elements
    ]

# Crawl every listing page. Page 1 lives at the bare search URL; pages 2..total_pages
# are addressed by appending the page number to base_url.
for page_number in range(1, total_pages + 1):
    if page_number == 1:
        page_url = "https://riyasewana.com/search/cars"
    else:
        page_url = f"{base_url}{page_number}"

    print(f"Scraping data from {page_url}...", end=" \t - ")
    data = scrape_page(page_url)
    # Finish the progress line started above; previously nothing followed the
    # separator, so all progress messages ran together on one line.
    print(f"{len(data)} listings found")
    all_data.extend(data)

# Save data to CSV
csv_file_path = "car_data.csv"
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Car Name', 'Car Link']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(all_data)

print(f"Data saved to {csv_file_path}")

In [None]:
# Function to scrape additional details from a car detail page
def scrape_additional_details(url, timeout=30):
    """Fetch a car detail page and extract its specification table.

    Each value is read from the ``td.aleft`` cell that immediately follows
    the ``td.aleft`` cell containing the field's label.

    Args:
        url: Address of the ad's detail page.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A dict keyed by 'Price', 'Make', 'Model', 'YOM', 'Mileage (km)',
        'Gear', 'Fuel Type', 'Options' and 'Engine (cc)'. A field whose row is
        absent from the page is stored as ``None`` instead of raising.
        Returns ``None`` when the request fails, the status is not HTTP 200,
        or the page has no Price row (treated as "ad not available").
    """
    # Browser-like headers sent with every request.
    # NOTE(review): presumably needed so the site does not reject the default client — confirm.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Referer': 'https://www.google.com/',
    }

    # Announce the URL before the request so a hang or error can be traced to it.
    print("Fetching data from "+ url)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable ad must not abort the whole enrichment run.
        print(f"Failed to fetch details for {url}. Error: {exc}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch details for {url}. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # 'Price' comes first on purpose: its absence marks the ad as unavailable
    # and short-circuits before any other lookup.
    field_labels = (
        'Price', 'Make', 'Model', 'YOM', 'Mileage (km)',
        'Gear', 'Fuel Type', 'Options', 'Engine (cc)',
    )

    details = {}
    for label in field_labels:
        value_cell = soup.select_one(f'td.aleft:-soup-contains("{label}") + td.aleft')
        if value_cell is None:
            if label == 'Price':
                print("Ad not available")
                return None
            # Any other missing row is recorded as None rather than crashing
            # on ``None.text``.
            details[label] = None
            continue
        details[label] = value_cell.text.strip()

    print(details)
    return details


# Input: a CSV file named 'car_data.csv' with 'Car Name' and 'Car Link' columns.
csv_file_path = "car_data.csv"
# Bound before the loop so the final message works even when no ad could be scraped.
updated_csv_file_path = "updated_car_data.csv"
fieldnames = ['Car Name', 'Car Link', 'Price', 'Make', 'Model', 'YOM', 'Mileage (km)', 'Gear', 'Fuel Type', 'Options', 'Engine (cc)']
updated_data = []

# Distinct handles for input and output: reusing one name for both would
# shadow the file that is still being read.
with open(csv_file_path, 'r', newline='', encoding='utf-8') as source_file, \
        open(updated_csv_file_path, 'w', newline='', encoding='utf-8') as output_file:
    reader = csv.DictReader(source_file)
    writer = csv.DictWriter(output_file, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        car_name = row['Car Name']
        car_link = row['Car Link']

        additional_info = scrape_additional_details(car_link)

        # None means the detail page could not be fetched or the ad is no
        # longer available (not only a 410 response); skip such rows.
        if additional_info is None:
            continue

        # Add the additional information to the existing row
        enriched_row = {'Car Name': car_name, 'Car Link': car_link, **additional_info}
        updated_data.append(enriched_row)

        # Append and flush each row as soon as it is scraped: partial progress
        # still survives an interrupted run, without rewriting the whole file
        # on every iteration.
        writer.writerow(enriched_row)
        output_file.flush()

print(f"Updated data saved to {updated_csv_file_path}")

Fetching data from https://riyasewana.com/buy/kia-rio-sale-dehiwala-mount-lavinia-7251743
{'Price': 'Negotiable', 'Make': 'Kia', 'Model': 'RIO', 'YOM': '2001', 'Mileage (km)': '-', 'Gear': 'Manual', 'Fuel Type': 'Petrol', 'Options': 'AIR CONDITION, POWER STEERING, POWER MIRROR, POWER WINDOW', 'Engine (cc)': '1493'}
Fetching data from https://riyasewana.com/buy/mazda-familia-323-sale-kurunegala-7251735
{'Price': 'Rs. 865,000', 'Make': 'Mazda', 'Model': 'Familia 323', 'YOM': '1988', 'Mileage (km)': '-', 'Gear': 'Manual', 'Fuel Type': 'Petrol', 'Options': '-', 'Engine (cc)': '1300'}
Fetching data from https://riyasewana.com/buy/suzuki-wagon-r-sale-anuradapura-7251716
{'Price': 'Rs. 2,100,000', 'Make': 'Suzuki', 'Model': 'Wagon R', 'YOM': '2003', 'Mileage (km)': '-', 'Gear': 'Manual', 'Fuel Type': 'Petrol', 'Options': 'AIR CONDITION, POWER STEERING, POWER WINDOW', 'Engine (cc)': '1000'}
Fetching data from https://riyasewana.com/buy/micro-panda-sale-kiribathgoda-7251672
{'Price': 'Rs. 1,050

In [None]:
# NOTE: depends on `car_link` left behind by the detail-scraping loop in the cell above,
# i.e. the link of the last row read from car_data.csv; that cell must be run first.
url = car_link

In [None]:
# Browser-like request headers, identical to the ones the scraping functions send.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Referer': 'https://www.google.com/',
}

# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
response = requests.get(url, headers=headers, timeout=30)
response

<Response [200]>

In [None]:
# Parse the detail page held in `response`.
soup = BeautifulSoup(markup=response.text, features='html.parser')

# Nine left-aligned <td> selectors chained with the general-sibling combinator (~);
# select_one returns the first element matching the whole chain, or None.
x = soup.select_one(' ~ '.join(['td[style="text-align:left;"]'] * 9))

In [None]:
# Only the last expression of a cell is rendered as its output: the YOM lookup
# on the first line is evaluated but its result is discarded.
soup.select_one('td.aleft:-soup-contains("YOM") + td.aleft')
soup.select_one('td.aleft:-soup-contains("Gear") + td.aleft')



<td class="aleft">Automatic</td>

In [None]:
# Cell immediately following the one whose text contains "YOM".
yom_cell = soup.select_one('td.aleft:-soup-contains("YOM") + td.aleft')
# Display its text with surrounding whitespace removed.
yom_cell.text.strip()

'2007'