# HW3 for the Data Science course at Sharif University of Technology
## Created by: Mohammad Mahdi Hossein Beiky     SI: 400100995
## GitHub URL: https://github.com/Mmhb1382/data-science.git
---

### Importing Python Packages here:

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re

# Start Chrome through Selenium in headless mode (no visible browser window)
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# Open the Samand listings page
url = "https://bama.ir/car/samand"
driver.get(url)

# Jump to the bottom of the page a fixed number of times, pausing after each
# jump; per the original note, the scrolling is what makes more listings load
scroll_pause_time = 3
scrolls_remaining = 20
while scrolls_remaining > 0:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(scroll_pause_time)
    scrolls_remaining -= 1

# Hand the rendered HTML to BeautifulSoup
page_html = driver.page_source
soup = BeautifulSoup(page_html, "html.parser")

# Every advertisement card is a <div class="bama-ad-holder">
car_listings = soup.find_all("div", class_="bama-ad-holder")
print(f"Total cars found: {len(car_listings)}")  # Debug check

Total cars found: 668


In [2]:
def convert_price_to_number(price_text):
    """Convert a scraped price string to an integer.

    Every character that is not a digit (commas, letters, whitespace,
    currency words, ...) is stripped and the remaining digits are parsed
    as one integer.

    Args:
        price_text: The raw price text, the placeholder "N/A", or None.

    Returns:
        The parsed integer, or 0 when the input is None, "N/A", or
        contains no digits.
    """
    # None means the element was missing; treat it like the "N/A" placeholder
    # instead of letting re.sub raise TypeError.
    if price_text is None or price_text == "N/A":
        return 0

    # Remove non-numeric characters (commas, Persian letters, etc.)
    cleaned_text = re.sub(r"[^\d]", "", price_text)

    # An empty digit string (e.g. a text-only price) raises ValueError -> 0
    try:
        return int(cleaned_text)
    except ValueError:
        return 0

# Create a list to store extracted car data (one dict per kept listing)
car_data = []

try:
    # Loop through the listing cards parsed from the search page
    for car in car_listings:
        # Extract Car Name
        car_name_element = car.select_one("div.bama-ad-large__title-row p.bama-ad__title span.text")
        car_name = car_name_element.text.strip() if car_name_element else "N/A"

        # Extract Production Year (first span inside the detail row)
        year_element = car.select_one("div.bama-ad__detail-row span")
        year_text = year_element.text.strip() if year_element else "N/A"

        # Skip listings whose year is missing or is not an integer
        try:
            year = int(year_text)
        except ValueError:
            continue

        # Keep only cars whose year is greater than 1385
        if year <= 1385:
            continue

        # Extract Car Model
        model_element = car.select_one("div.bama-ad__detail-row span.bama-ad__detail-trim")
        car_model = model_element.text.strip() if model_element else "N/A"

        # Extract Prices (missing price fields default to the string "0")
        preprice_element = car.select_one(
            "div.bama-ad__price-row div.bama-ad__installment-price-holder "
            "div.bama-ad__pre-price span"
        )
        preprice = preprice_element.text.strip() if preprice_element else "0"

        monprice_element = car.select_one(
            "div.bama-ad__price-row div.bama-ad__installment-price-holder "
            "div.bama-ad__monthly-price span"
        )
        monprice = monprice_element.text.strip() if monprice_element else "0"

        price_element = car.select_one("div.bama-ad__price-row div.bama-ad__price-holder span.bama-ad__price")
        price = price_element.text.strip() if price_element else "0"

        # Extract Mileage
        mileage_element = car.select_one("div.bama-ad__detail-row span.dir-ltr")
        mileage = mileage_element.text.strip() if mileage_element else "N/A"

        # Extract Location
        loc_element = car.select_one("div.bama-ad__price-row div.bama-ad__address-box div.bama-ad__address span")
        location = loc_element.text.strip() if loc_element else "N/A"

        # Extract the car advertisement link; an anchor without an href is
        # treated the same as a missing anchor instead of raising KeyError
        ad_link_element = car.select_one("a.bama-ad.listing")
        ad_href = ad_link_element.get("href") if ad_link_element else None
        ad_link = "https://bama.ir" + ad_href if ad_href else "N/A"

        # Visit the car detail page for more data
        if ad_link != "N/A":
            driver.get(ad_link)  # Open the car's page
            time.sleep(3)  # Wait for the page to load

            # Parse the new page
            detail_soup = BeautifulSoup(driver.page_source, "html.parser")

            # Convert extracted prices to numbers
            preprice_num = convert_price_to_number(preprice)
            monprice_num = convert_price_to_number(monprice)
            price_num = convert_price_to_number(price)

            # Determine the final price:
            if price_num == 0:
                # No cash price: look up the number of installments.
                # Each selector fragment ends with a space so the adjacent
                # string literals join into a descendant selector rather than
                # one fused class name.
                # NOTE(review): assumes the count span is nested inside the
                # text span - confirm against the live page markup.
                installment_element = detail_soup.select_one(
                    "div.bama-ad-detail-price__installment-info "
                    "span.bama-ad-detail-price__installment-text "
                    "span.bama-ad-detail-price__installment-count"
                )

                # Keep every digit character found in the span's text
                if installment_element:
                    installment_text = installment_element.get_text(strip=True)
                    installment_number = "".join(filter(str.isdigit, installment_text))
                else:
                    installment_number = "N/A"
                installment_num = convert_price_to_number(installment_number)

                # Down payment plus all monthly payments; if the count is
                # unavailable it is 0 and only the down payment remains
                total_price = preprice_num + monprice_num * installment_num
            else:
                total_price = price_num  # Use total price directly

            # Extract other features:
            details = detail_soup.select(
                "div.bama-vehicle-detail-with-icon div.bama-vehicle-detail-with-icon__detail-holder p"
            )

            # Defaults used when no matching label is found
            transmission = "N/A"
            color = "N/A"

            # Loop through details to match correct labels
            for detail in details:
                text = detail.get_text(strip=True)

                if "گیربکس" in text or "Transmission" in text:  # Persian & English check
                    transmission = text.replace("گیربکس:", "").replace("Transmission:", "").strip()

                elif "رنگ" in text or "Color" in text:
                    color = text.replace("رنگ:", "").replace("Color:", "").strip()

            # Extract Description
            description_element = detail_soup.select_one("div.description-wrapper div.description div.desc p")
            description = description_element.text.strip() if description_element else "N/A"

        else:
            # No detail page to visit: every detail-derived field is unknown
            total_price, transmission, color, description = "N/A", "N/A", "N/A", "N/A"

        # Append data to list
        car_data.append({
            "Car Name": car_name,
            "Model": car_model,
            "PrePrice": preprice,
            "Monthly Price": monprice,
            "Total Price": total_price,
            "Mileage": mileage,
            "Year": year,
            "Location": location,
            "Transmission": transmission,
            "Color": color,
            "Description": description
        })
finally:
    # Close the browser even if scraping fails part-way through, so the
    # headless Chrome process is not left running
    driver.quit()

# Convert data to DataFrame
df = pd.DataFrame(car_data)

# Save to CSV file
csv_filename = "samand_cars_extended.csv"
df.to_csv(csv_filename, index=False)

print(f"Data successfully saved to {csv_filename}")

Data successfully saved to samand_cars_extended.csv


In [7]:
# Re-extract the car names with a looser title selector and write them back
# into df. df only contains the listings that passed the year filter in the
# scraping loop, so the same filter is applied here; otherwise the names would
# be attached to the wrong rows after the first skipped listing.
car_names = []
for car in car_listings:
    # Same year extraction and filter as the scraping loop
    year_element = car.select_one("div.bama-ad__detail-row span")
    year_text = year_element.text.strip() if year_element else "N/A"
    try:
        year = int(year_text)
    except ValueError:
        continue
    if year <= 1385:
        continue

    # Extract Car Name
    car_name_element = car.select_one("p.bama-ad__title span.text")
    car_name = car_name_element.text.strip() if car_name_element else "N/A"
    car_names.append(car_name)

# Refuse to write names that cannot line up one-to-one with df's rows
if len(car_names) != len(df):
    raise ValueError(
        f"Expected {len(df)} car names to match df, found {len(car_names)}"
    )

# Reuse df's index so the assignment aligns row-for-row
car_names_series = pd.Series(car_names, index=df.index)
df["Car Name"] = car_names_series
df.to_csv(csv_filename, index=False)