# Car Listings Preprocessing

## Goal:
##### 🔻 Process HTML files with car listings for analysis.

## Steps:
    ▼ Extract features using BeautifulSoup.
    ▼ Translate column names from Arabic to English.
    ▼ Create a structured DataFrame and save it as a CSV file.

In [1]:
import os
import re

import pandas as pd
from bs4 import BeautifulSoup

Convert the HTML '.txt' files into BeautifulSoup objects:

In [2]:
def html_parser(file_path):
    """Read the file at ``file_path`` as UTF-8 and parse it as HTML.

    Returns a ``BeautifulSoup`` object built with the ``html.parser`` backend.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        markup = handle.read()

    soup = BeautifulSoup(markup, 'html.parser')
    return soup

#### Function to extract the YEAR feature

In [3]:
def extract_year(car_model):
    """Return the model year contained in ``car_model`` as an int, or None.

    ``car_model`` is the full model text of a listing. The first standalone
    four-digit number in the range 1900-2100 is taken as the year, so other
    digits in the text (for example a model number) are not glued onto it.
    If no such number exists, all digits in the text are concatenated and
    returned as a single integer. None is returned when the text is empty,
    contains no digits, or the digits evaluate to 0.
    """
    if not car_model:
        return None

    # \d matches Unicode decimal digits, all of which int() can parse.
    digit_runs = re.findall(r'\d+', car_model)

    # Prefer a plausible standalone year over blindly joining every digit.
    for run in digit_runs:
        if len(run) == 4 and 1900 <= int(run) <= 2100:
            return int(run)

    # Fallback: join all digits; an empty string means there is no year.
    digits = ''.join(digit_runs)
    if not digits:
        return None

    year = int(digits)
    return year if year else None

#### Function to extract the PRICE feature

In [4]:
def extract_price(price_tag):
    """Return the price held in ``price_tag`` as an int, or None.

    ``price_tag`` is any object exposing a ``.text`` string. Every digit
    character in that text is kept and the rest is dropped. None is returned
    when the tag is falsy or its text contains no digits.
    """
    if not price_tag:
        return None

    raw_text = price_tag.text.strip()
    digits = ''.join(ch for ch in raw_text if ch.isdigit())

    return int(digits) if digits else None

#### Function to extract the basic information about the car: MAKE, YEAR, PRICE

In [5]:
def extract_basic_details(soup):
    """Extract the make, year and price of a listing.

    The make is read from the ``h3`` and the model text from the ``h5``
    inside the first ``td`` of ``table.driving-table``. The year is derived
    from the model text with ``extract_year`` and the price from the
    ``h5.post-price`` element with ``extract_price``.

    Returns:
        A ``(car_make, year, price)`` tuple. ``(None, None, None)`` is
        returned when the details table, its headings, or the year are
        missing, so callers can always unpack three values.
    """
    missing = (None, None, None)

    car_details = soup.select('table.driving-table td')
    if not car_details:
        return missing

    header_cell = car_details[0]
    make_tag = header_cell.find('h3')
    model_tag = header_cell.find('h5')
    # A page without these headings cannot yield a make or a year.
    if make_tag is None or model_tag is None:
        return missing

    car_make = make_tag.text.strip()
    car_model = model_tag.text.strip()

    # Extracting the year
    year = extract_year(car_model)
    if year is None:
        # Keep the 3-tuple shape; a bare None would break tuple unpacking.
        return missing

    # Extract the price
    price_tag = soup.find('h5', class_='post-price')
    price = extract_price(price_tag)

    return car_make, year, price

#### Function to extract additional information and details about the car

In [6]:
def extract_other_features(soup):
    """Extract the specification rows and the optional-extras flags.

    Each ``tr.list-row`` of ``table.list_ads`` contributes one entry to the
    first dictionary: the stripped text of its first ``td`` is the key and
    the stripped text of its second ``td`` is the value. Rows with fewer
    than two cells are skipped instead of raising ``IndexError``.

    Returns:
        A ``(features, feature_dict)`` tuple. ``feature_dict`` maps every
        known extra to 1 if it appears in the ``td.list-additions`` list of
        the page and to 0 otherwise.
    """
    # Extract other features
    features = {}
    for row in soup.select('table.list_ads tr.list-row'):
        cells = row.find_all('td')
        if len(cells) < 2:
            continue
        features[cells[0].text.strip()] = cells[1].text.strip()

    # Extract additions
    additions_list = ['مُكيّف','إغلاق مركزي','جهاز إنذار','مسجل CD','فتحة سقف','جنطات مغنيسيوم','فرش جلد','وسادة حماية هوائية']

    # Every known extra defaults to 0 (absent).
    feature_dict = dict.fromkeys(additions_list, 0)

    # Flag the extras that the page lists; unknown entries are ignored.
    for item in soup.select('table.list_ads tr.list-row td.list-additions ul li'):
        name = item.text.strip()
        if name in feature_dict:
            feature_dict[name] = 1

    return features, feature_dict


#### Collecting all the features together

In [7]:
def extract_features(soup):
    """Collect every feature of one listing into a single flat dictionary.

    Combines the result of ``extract_basic_details`` (stored under the keys
    ``'Make'``, ``'Year'`` and ``'Price'``) with both dictionaries returned
    by ``extract_other_features``.

    Returns:
        The combined dictionary, or None when the basic details could not
        be extracted.
    """
    # Extract car details
    details = extract_basic_details(soup)
    # Guard before unpacking: the helper may signal failure with a bare None.
    if details is None:
        return None

    car_make, year, price = details
    if car_make is None:
        return None

    # Extract other features
    features, additions = extract_other_features(soup)

    # Later mappings win on duplicate keys, as with successive dict.update().
    return {
        'Make': car_make,
        'Year': year,
        'Price': price,
        **features,
        **additions,
    }

#### Function to build the car DataFrame from a folder

In [8]:
def car_df_build(folder_path):
    """Build a DataFrame with one row per parsable listing in ``folder_path``.

    Every file whose name ends with ``.txt`` is parsed with ``html_parser``
    and passed to ``extract_features``. Files for which no features could be
    extracted are skipped. Files are processed in sorted name order so the
    row order is reproducible between runs.

    Returns:
        A ``pandas.DataFrame`` built from the extracted feature dictionaries.
    """
    # Create an empty list to store extracted features
    data_list = []

    # os.listdir() returns names in arbitrary order, so sort them first.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue

        soup = html_parser(os.path.join(folder_path, filename))

        # Extract features and append to the list
        extracted_data = extract_features(soup)
        if extracted_data is not None:
            data_list.append(extracted_data)

    # Create a DataFrame from the list of extracted features
    return pd.DataFrame(data_list)

### RUN THE CODE ON OUR DATA

In [None]:
# Do not limit the number of columns shown when a DataFrame is displayed.
pd.options.display.max_columns = None

# Folder that is scanned for the listing files.
folder_path = '../data/'
result_df = car_df_build(folder_path=folder_path)

##### Translate the columns 

In [None]:
# Arabic column name -> English column name.
translation_dict = {
    # Specification rows of the listing
    "لون السيارة": "Color",
    "نوع الوقود": "Fuel Type",
    "أصل السيارة": "Car Origin",
    "رخصة السيارة": "Car License",
    "نوع الجير": "Transmission Type",
    "الزجاج": "Glass",
    "قوة الماتور": "Engine Power",
    "عداد السيارة": "Mileage",
    "وسيلة الدفع": "Payment Method",
    "معروضة": "Available",
    "أصحاب سابقون": "Previous Owners",
    "إضافات": "Additions",
    # Optional extras (0/1 flag columns)
    "مُكيّف": "Air Conditioner",
    "إغلاق مركزي": "Central Locking",
    "جهاز إنذار": "Alarm System",
    "مسجل CD": "CD Player",
    "فتحة سقف": "Sunroof",
    "جنطات مغنيسيوم": "Alloy Wheels",
    "فرش جلد": "Leather Seats",
    "وسادة حماية هوائية": "Airbag",
    # Remaining specification rows
    "عدد الركاب": "Passenger Count",
    "الدفع": "Drive Type",
}

# Rename the columns in place; names missing from the mapping are left as-is.
result_df.rename(translation_dict, axis="columns", inplace=True)

##### Save the df as csv file 

In [None]:
# Write the DataFrame to disk without the index column.
result_df.to_csv(path_or_buf='../csv_files/draft_df.csv', index=False)

##### Test

In [None]:
# Read the saved CSV back to check that it loads.
df_car = pd.read_csv(filepath_or_buffer='../csv_files/draft_df.csv')
# Bare expression: the notebook displays the DataFrame as the cell output.
df_car