## Itamar Melnik
## Tomer Sabag

In [167]:

import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

# Function to fetch car data from a single page
def _parse_price(text):
    """Convert a listing price label such as "37,000 ₪" to an int.

    Returns None when the label is empty or is not a plain number, so a single
    odd listing does not abort parsing of the whole page.
    """
    digits = text.strip().replace("₪", "").replace(",", "").strip()
    if not digits:
        return None
    try:
        return int(digits)
    except ValueError:
        return None


def get_car_data_from_page(url, timeout=30):
    """Fetch one search-results page and extract the per-listing summary fields.

    Args:
        url: Address of the results page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A 5-tuple of parallel lists
        ``(listing_urls, prices, data_images, models, manufacturers)``.
        On a download or parsing failure the error is printed and a tuple of
        five ``None`` values is returned instead.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
    failure = (None, None, None, None, None)

    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return failure

    try:
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look the container up once instead of once per field.
        main_section = soup.find('main', class_="main")
        if main_section is None:
            print(f"Error parsing {url}: main listing container not found")
            return failure

        prices = main_section.find_all('div', class_="price")
        data_images = main_section.find_all('div', class_="card-block")
        cars_url = main_section.find_all('div', class_="card-body p-md-3")
        models_manufctur = main_section.find_all('h2', class_="card-title mb-0 mb-sm-1")

        l1 = ["https://www.ad.co.il" + i.find('a').get('href') for i in cars_url]
        l2 = [_parse_price(i.get_text()) for i in prices]
        # .get() keeps the lists aligned when a card lacks the attribute.
        l3 = [i.get('data-images') for i in data_images]

        l4, l5 = [], []
        for title in models_manufctur:
            # Split on any whitespace for both fields so leading spaces or
            # newlines in the heading cannot end up in the manufacturer name.
            words = title.get_text().split()
            l4.append(' '.join(words[1:][::-1]))
            l5.append(words[0] if words else '')
        return l1, l2, l3, l4, l5

    except Exception as e:
        print(f"Error parsing {url}: {e}")
        return failure

# Function to fetch additional car data from page.
def get_additional_car_data(soup):
    """Collect the free-text description and the "label: value" details of a listing.

    Args:
        soup: Parsed listing page exposing ``find_all``.

    Returns:
        A dict whose ``'Description'`` entry is the list of cleaned description
        paragraphs (``None`` when there are none), followed by one entry per
        ``div.px-3`` element whose text contains exactly one colon.
    """
    paragraphs = []
    for paragraph in soup.find_all('p', class_='text-word-break'):
        text = paragraph.get_text().strip()
        # Flatten the paragraph to one line: newlines become spaces, CR/TAB vanish.
        text = text.replace('\r', '').replace('\n', ' ').replace('\t', '')
        paragraphs.append(text.strip())

    details = {'Description': paragraphs or None}

    for block in soup.find_all('div', class_='px-3'):
        pieces = block.get_text().split(':')
        if len(pieces) != 2:
            # Not a simple "label: value" pair.
            continue
        label, value = pieces
        details[label.strip()] = value.strip()

    return details

# Function to collect car data from all pages.
def collect_car_data(base_url):
    """Crawl every results page under *base_url* and scrape each listing page.

    Result pages are requested (``&pageindex=N`` from the second page on) until
    one yields no data. Each listing found is then downloaded and its details
    table, description and "label: value" details are merged with the summary
    fields taken from the results page.

    Args:
        base_url: Address of the first results page.

    Returns:
        A list with one dict per successfully downloaded listing. Listings
        whose page cannot be fetched are reported and skipped.
    """
    # Same browser-like User-Agent the results pages are requested with.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}
    request_timeout = 30

    # One (url, price, pic_num, model, manufacturer) tuple per listing.
    listings = []
    page_index = 1

    while True:
        current_url = f"{base_url}&pageindex={page_index}" if page_index > 1 else base_url
        cars_url, prices, data_images, models, manufacturer = get_car_data_from_page(current_url)

        if not (cars_url or prices or data_images or models or manufacturer):
            break

        page_lists = (cars_url, prices, data_images, models, manufacturer)
        if len({len(values) for values in page_lists}) > 1:
            print(f"Warning: field counts differ on {current_url}; extra entries are ignored")

        # Pairing the fields page by page keeps one malformed page from
        # shifting the fields of every listing that follows it.
        listings.extend(zip(*page_lists))
        page_index += 1

    all_cars_data = []

    with requests.Session() as session:
        session.headers.update(headers)

        for car_url, price, pic_num, model, manufacturer_name in listings:
            try:
                response = session.get(car_url, timeout=request_timeout)
                response.raise_for_status()
            except requests.RequestException as e:
                print(f"Error fetching {car_url}: {e}")
                continue

            soup = BeautifulSoup(response.content, 'html.parser')

            car_data = {}

            # Two-cell rows of the details table are key/value pairs.
            car_table = soup.find('table', class_="table table-sm mb-4")
            if car_table:
                for row in car_table.find_all('tr'):
                    columns = row.find_all('td')
                    if len(columns) == 2:
                        key = columns[0].get_text(strip=True)
                        value = columns[1].get_text(strip=True)
                        car_data[key] = value

            car_data.update(get_additional_car_data(soup))

            # Summary fields from the results page are written last, so they
            # win over same-named keys scraped from the listing page.
            car_data['Price'] = price
            car_data['Pic_num'] = pic_num
            car_data['Model'] = model
            car_data['Manufactor'] = manufacturer_name
            all_cars_data.append(car_data)

    return all_cars_data

# Function to calculate days until next test date.
def days_until_test(test_date_str, now=None):
    """Return the number of whole days from *now* until a "MM/YYYY" test date.

    The date is interpreted as the first day of the given month, so the result
    is negative once that day has passed.

    Args:
        test_date_str: Date text in ``"%m/%Y"`` form, e.g. ``"09/2024"``.
        now: Reference moment; defaults to the current local time.

    Returns:
        The day count as an int, or ``None`` when *test_date_str* is not a
        string in the expected format (for example ``None`` or NaN).
    """
    try:
        test_date = datetime.strptime(test_date_str, "%m/%Y")
    except (TypeError, ValueError):
        # TypeError: not a string (missing value); ValueError: wrong format.
        return None

    reference = datetime.now() if now is None else now
    return (test_date - reference).days

# Function to process car data and organize the dataframe.
def process_car_data(all_cars_data):
    """Build a typed, consistently ordered DataFrame from the scraped listings.

    Hebrew field names are mapped to English column names, the "test until"
    date is replaced by the number of days until it, and every column is
    converted to its target dtype. Columns that no listing provided are
    created empty rather than raising ``KeyError``.

    Args:
        all_cars_data: List of per-listing dicts.

    Returns:
        A DataFrame with exactly the columns listed in ``output_columns``,
        in that order.
    """
    column_names = {
        'יצרן': 'Manufactor',
        'שנה': 'Year',
        'דגם': 'Model',
        'יד': 'Hand',
        'ת. הילוכים': 'Gear',
        'נפח': 'Engine_Capacity',
        'סוג מנוע': 'Engine_type',
        'בעלות קודמת': 'Prev_ownership',
        'בעלות נוכחית': 'Curr_ownership',
        'אזור': 'Area',
        'עיר': 'City',
        'מחיר': 'Price',
        'מספר תמונות': 'Pic_num',
        'תאריך יצירה': 'Cre_date',
        'תאריך הקפצה אחרון': 'Repub_date',
        'תיאור': 'Description',
        'צבע': 'Color',
        'ק"מ': 'Km',
        'טסט עד': 'Test'
    }
    output_columns = [
        'Manufactor', 'Year', 'Model', 'Hand', 'Gear', 'Engine_Capacity',
        'Engine_type', 'Prev_ownership', 'Curr_ownership', 'Area', 'City',
        'Price', 'Pic_num', 'Cre_date', 'Repub_date', 'Description', 'Color', 'Km', 'Test'
    ]

    df = pd.DataFrame(all_cars_data).rename(columns=column_names)

    # A listing can carry both the Hebrew and the English spelling of a field;
    # after renaming that yields duplicate labels, which would make df[col]
    # a DataFrame and break the conversions below. Keep the first occurrence.
    df = df.loc[:, ~df.columns.duplicated()]

    # Fix the column set and order up front; absent columns become all-NaN.
    df = df.reindex(columns=output_columns)

    df['Test'] = df['Test'].apply(days_until_test)

    # Converting relevant columns type.
    for column in ('Engine_Capacity', 'Km'):
        # astype(str) lets the thousands separator be stripped even when the
        # column holds no strings; non-numeric text is coerced to NA below.
        df[column] = df[column].astype(str).str.replace(',', '', regex=False)

    for column in ('Test', 'Year', 'Hand', 'Engine_Capacity', 'Km', 'Pic_num'):
        df[column] = pd.to_numeric(df[column], errors='coerce').astype('Int64')

    df['Price'] = pd.to_numeric(df['Price'], errors='coerce').astype('Float64')

    for column in ('Gear', 'Engine_type', 'Prev_ownership', 'Curr_ownership'):
        df[column] = df[column].astype('category')

    for column in ('Cre_date', 'Repub_date'):
        # The site's dates are day-first; without dayfirst=True an ambiguous
        # value such as 01/06/2024 is read as January 6th instead of June 1st.
        df[column] = pd.to_datetime(df[column], errors='coerce', dayfirst=True)

    return df

# Function to get supply data from the Ministry of Transportation.
def get_supply_data():
    """Download the vehicle supply table from the data.gov.il datastore API.

    The datastore returns results in pages, so the resource is requested
    repeatedly with an increasing offset until every record has been read.

    Returns:
        A DataFrame with one row per record, or an empty DataFrame when the
        download fails or the response does not have the expected shape
        (the error is printed).
    """
    url = 'https://data.gov.il/api/3/action/datastore_search?resource_id=5e87a7a1-2f6f-41c1-8aec-7216d52a6cf6'
    page_size = 5000
    request_timeout = 30

    records = []
    try:
        while True:
            response = requests.get(
                url,
                params={'limit': page_size, 'offset': len(records)},
                timeout=request_timeout,
            )
            response.raise_for_status()
            result = response.json()['result']
            batch = result['records']
            records.extend(batch)

            if not batch:
                break
            total = result.get('total')
            if total is not None:
                # The server may cap the page size below what was asked for,
                # so rely on the reported total when it is available.
                if len(records) >= total:
                    break
            elif len(batch) < page_size:
                break
    except (requests.RequestException, ValueError, KeyError, TypeError) as e:
        # ValueError: body is not JSON; KeyError/TypeError: unexpected layout.
        print(f"Error fetching supply data: {e}")
        return pd.DataFrame()

    return pd.DataFrame(records)

# Function to calculate supply score and add it to the dataframe.
def calculate_supply_score(df, supply_data):
    """Attach a ``Supply_score`` column: the summed count for each car's make, model and year.

    Args:
        df: Listings table with ``Manufactor``, ``Model`` and ``Year`` columns.
        supply_data: Supply table with ``tozar``, ``kinuy_mishari``,
            ``shnat_yitzur`` and ``mispar_rechavim_pailim`` columns. It is not
            modified.

    Returns:
        A new DataFrame equal to *df* plus ``Supply_score``. Rows without a
        matching supply record get a missing value, as does every row when
        *supply_data* is empty or lacks the required columns.
    """
    key_columns = ['Manufactor', 'Model', 'Year']

    # rename() without inplace returns a copy, leaving the caller's frame intact.
    supply = supply_data.rename(columns={
        'tozar': 'Manufactor',
        'shnat_yitzur': 'Year',
        'kinuy_mishari': 'Model',
        'mispar_rechavim_pailim': 'Supply_score',
    })

    required = set(key_columns) | {'Supply_score'}
    if supply.empty or not required.issubset(supply.columns):
        # Nothing to join against (e.g. the download failed).
        result = df.copy()
        result['Supply_score'] = float('nan')
        return result

    # Make sure the counts are numbers so that sum() adds rather than concatenates.
    supply['Supply_score'] = pd.to_numeric(supply['Supply_score'], errors='coerce')

    supply_score = supply.groupby(key_columns)['Supply_score'].sum().reset_index()
    return df.merge(supply_score, on=key_columns, how='left')

# Main function to execute the whole process.
def main():
    """Run the pipeline end to end: scrape, clean, then attach the supply score.

    Returns:
        The processed listings DataFrame with the supply score merged in.
    """
    listings = process_car_data(
        collect_car_data("https://www.ad.co.il/car?sp261=13896")
    )
    # The supply table is downloaded only after the listings are processed.
    return calculate_supply_score(listings, get_supply_data())

# Run the whole pipeline at cell level and keep the result in `df`.
df = main()
# Bare expression: shows the table when this is the last line of a notebook cell.
df


Unnamed: 0,Manufactor,Year,Model,Hand,Gear,Engine_Capacity,Engine_type,Prev_ownership,Curr_ownership,Area,City,Price,Pic_num,Cre_date,Repub_date,Description,Color,Km,Test,Supply_score
0,פיג'ו,2015,508,3,אוטומטית,1600,בנזין,פרטית,פרטית,באר שבע והסביבה,שדרות,37000.0,6,2024-05-27,2024-01-06,"[מצב מעולה , כיפי לנסיעה טסט 09/24 ,כסאות חשמל...",כסוף מטאלי,100496.0,86.0,182.0
1,פיג'ו,2016,208,3,רובוטית,1200,בנזין,פרטית,פרטית,מושבים בשרון,שער אפרים,25000.0,3,2024-05-20,2024-05-06,"[חדשה !! . זריזה מאוד . שקטה וקלה לנהיגה ., תו...",שחור,99000.0,267.0,2028.0
2,פיג'ו,2014,301,2,ידנית,1200,בנזין,פרטית,פרטית,יישובי השומרון,קדומים,15000.0,3,2024-02-05,2024-02-05,[רכב פרטי שמור טסט לשנה מוכר עקב מעבר לרכב עבודה],אפור מטאלי,130000.0,267.0,484.0
3,פיג'ו,2012,207,3,אוטומטית,1600,בנזין,פרטית,פרטית,באר שבע והסביבה,באר שבע,12500.0,7,2024-09-04,2024-09-04,[למכירה פיגו 207 שנת 2012 דגם אקטיב במצב מעול...,אפור,200000.0,239.0,671.0
4,פיג'ו,2020,2008,1,אוטומטית,1200,בנזין,פרטית,פרטית,בקעת אונו,קרית אונו,69000.0,3,2024-03-04,2024-03-04,[רכב שמור מאד. לאחר טיפול כולל רצועת טיימינג ח...,לבן,55000.0,208.0,1155.0
5,פיג'ו,2018,301,4,ידנית,1600,דיזל,מונית,פרטית,טבריה והסביבה,טבריה,19500.0,7,2024-02-19,2024-02-19,"[פיגו טורבו דיזל שנה 2018 מנוע 1600 טסט ,8 חוד...",לבן מטאלי,170586.0,55.0,42.0
6,פיג'ו,2007,307,3,אוטומטית,1200,בנזין,פרטית,פרטית,חיפה וחוף הכרמל,חיפה,15000.0,1,2023-12-12,2023-11-12,,אפור,258000.0,86.0,313.0
7,פיג'ו,2015,508,3,אוטומטית,1600,בנזין,פרטית,פרטית,חיפה וחוף הכרמל,חיפה,25000.0,5,2023-09-11,2023-09-11,[למכירה בהזדמנות (לא מעוניין בהחלפה) 🚗 רכב מנה...,אפור מטאלי,190000.0,,182.0
8,פיג'ו,2018,5008,2,אוטומטית,1600,דיזל,פרטית,פרטית,מודיעין והסביבה,מודיעין עילית,82000.0,4,2023-10-25,2023-10-25,[דגם פרימיום כל התוספות.גג נפתח מושבי עור מתכו...,לבן,190000.0,,846.0
9,פיג'ו,1995,106,5,ידנית,1200,בנזין,פרטית,פרטית,ירושלים והסביבה,ירושלים,20000.0,1,2023-11-09,2023-11-09,[מוסב ראלי (למעט המנוע) אין טסט שנתיים וחצי מנ...,לבן,140000.0,,


In [169]:

# Save the table to output.csv without the index column. The "utf-8-sig"
# codec writes a UTF-8 byte-order mark first (presumably so spreadsheet
# programs detect the encoding of the Hebrew text - confirm).
df.to_csv('output.csv', index=False,encoding="utf-8-sig")