# Environment Setup

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM at /content/drive so the project
# folder referenced by the following cell becomes reachable.
drive.mount("/content/drive")

In [None]:
# Switch the working directory to the project folder; later cells use paths
# relative to it (e.g. "data/train.csv").
%cd /content/drive/MyDrive/cs5228_code

# Data Preprocessing

## 1. Load in the dataset and remove unnecessary columns

In [None]:
import pandas as pd


def load_data(path):
    """
    Read the CSV file at `path` and return it without the columns that are
    not used by the rest of the pipeline.

    The discarded columns are those that:
      1. may not be useful for the model
      2. have too many missing values
      3. hold the same value in every row
      4. may be redundant with other features
    """
    unused_columns = [
        "listing_id",
        "indicative_price",
        "eco_category",
        "fuel_type",
        "opc_scheme",
        "lifespan",
        "features",
        "accessories",
    ]
    raw = pd.read_csv(path)
    return raw.drop(columns=unused_columns)


# Load the training and test splits through the same column filter.
df_train, df_test = load_data("data/train.csv"), load_data("data/test.csv")

## 2. Process date features

In [None]:
def proce_date(df, end_date="2024-11-01"):
    """
    Convert the date information from strings to numerical durations.

    The frame is modified in place and the same object is returned:
      * ``reg_date`` is replaced by ``reg_age``: the number of years between
        the registration date and ``end_date`` (days / 365.25, rounded to two
        decimals).
      * Missing ``manufactured`` years are filled from the registration
        dates (see the comments below), then the column is converted to an
        age: ``end_date.year - manufactured``.
      * ``original_reg_date`` is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``reg_date`` and ``original_reg_date`` (strings parsed
        with the format ``%d-%b-%Y``), ``category`` and ``manufactured``.
    end_date : str or datetime-like, default "2024-11-01"
        Reference date that both ages are measured against.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, after the in-place changes described above.
    """
    reference = pd.to_datetime(end_date)

    # Parse the date strings once and keep them local; the raw columns are
    # overwritten / dropped at the end.
    reg_date = pd.to_datetime(df["reg_date"], format="%d-%b-%Y")
    original_reg_date = pd.to_datetime(df["original_reg_date"], format="%d-%b-%Y")
    reg_year = reg_date.dt.year

    # Rows whose category mentions "parf car" take the registration year when
    # `manufactured` is missing. na=False makes a missing category count as
    # "not a PARF car"; without it a NaN in the mask makes boolean indexing raise.
    is_parf = df["category"].str.contains("parf car", regex=False, na=False)
    manufactured = df["manufactured"]
    manufactured = manufactured.mask(is_parf & manufactured.isna(), reg_year)

    # Every other missing year falls back to the original registration year
    # first and to the registration year second.
    manufactured = manufactured.fillna(original_reg_date.dt.year).fillna(reg_year)

    # Store ages instead of calendar values.
    df["manufactured"] = reference.year - manufactured
    df["reg_date"] = ((reference - reg_date).dt.days / 365.25).round(2)

    # Rename / drop in place so the caller's frame is updated as before.
    df.rename(columns={"reg_date": "reg_age"}, inplace=True)
    df.drop(columns=["original_reg_date"], inplace=True)

    return df

# Apply the same date processing to the training split, then the test split.
df_train, df_test = [proce_date(frame) for frame in (df_train, df_test)]

## 3. Encode the basic information

### 3.1 Encode the `type_of_vehicle` and `transmission` features as integer codes

In [None]:
# Build the label -> integer code lookups from the TRAINING split only; the
# same lookups are then applied to both splits by `encode_type_trans`.
# The lists are deliberately not named `type`: that would shadow the builtin
# `type()` for every later cell executed in this kernel.
vehicle_types = list(df_train["type_of_vehicle"].unique())
transmissions = list(df_train["transmission"].unique())

# Codes follow the order of first appearance in the training data.
type_map = {label: code for code, label in enumerate(vehicle_types)}
tran_map = {label: code for code, label in enumerate(transmissions)}

def encode_type_trans(df):
    """
    Replace the `type_of_vehicle` and `transmission` labels with their
    integer codes from the module-level `type_map` / `tran_map` lookups.

    The columns are overwritten on `df` itself, which is also returned.
    A label that is missing from the lookup becomes NaN (`Series.map`
    behaviour).
    """
    lookups = (("type_of_vehicle", type_map), ("transmission", tran_map))
    for column, mapping in lookups:
        df[column] = df[column].map(mapping)
    return df

# Encode the training split first, then the test split, with the shared lookups.
df_train, df_test = [encode_type_trans(frame) for frame in (df_train, df_test)]

### 3.2 Encode free text features utilizing BERT embeddings

In [None]:
import numpy as np
import torch
from transformers import BertModel, BertTokenizer
from sklearn.decomposition import PCA


# Loaded (tokenizer, model, device) triples keyed by model name, so the BERT
# weights are loaded once per kernel instead of once per encode call.
_BERT_CACHE = {}


def _load_bert(model_name="bert-large-uncased"):
    """Return (tokenizer, model, device) for `model_name`, loading it on first use only."""
    if model_name not in _BERT_CACHE:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        tokenizer = BertTokenizer.from_pretrained(model_name)
        model = BertModel.from_pretrained(model_name).to(device)
        _BERT_CACHE[model_name] = (tokenizer, model, device)
    return _BERT_CACHE[model_name]


def _bert_mean_embeddings(texts):
    """Encode each text with BERT and stack the mean-pooled vectors into one 2-D array."""
    tokenizer, model, device = _load_bert()

    def encode_text(text):
        # Tokenize one text (truncated to 512 tokens) and run it through BERT
        inputs = tokenizer(text, return_tensors="pt", padding=True,
                           truncation=True, max_length=512).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Average the last hidden state over dim 1 to get a single vector per text
        return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

    return np.vstack([encode_text(text) for text in texts])


def _reduce_embeddings(embedding_matrix, target_dimension, reducers, key):
    """Project embeddings with PCA, reusing the fitted PCA stored under `reducers[key]` if any."""
    fitted = None if reducers is None else reducers.get(key)
    if fitted is not None:
        # Reuse the projection learned earlier so this split lands in the
        # same component space as the split the PCA was fitted on.
        return fitted.transform(embedding_matrix)

    # random_state pins the result in case scikit-learn selects a randomized
    # SVD solver for a matrix of this size.
    pca = PCA(n_components=target_dimension, random_state=0)
    reduced = pca.fit_transform(embedding_matrix)
    if reducers is not None:
        reducers[key] = pca
    return reduced


def encode_title(df, target_dimension=16, reducers=None):
    """
    Encode the title feature using BERT followed by PCA.

    `df` is modified in place and returned: `title` becomes a list of
    `target_dimension` floats per row, and the `make` / `model` columns are
    dropped as redundant.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the `title`, `make` and `model` columns.
    target_dimension : int, default 16
        Number of PCA components. Only used when a new PCA is fitted.
    reducers : dict or None, default None
        Optional shared store of fitted PCA objects. If it already holds an
        entry under "title", that PCA is reused (transform only); otherwise a
        PCA is fitted on this frame and stored there. With None, a PCA is
        fitted on this frame alone.
    """
    # Strip a trailing parenthesised suffix from the title when the pattern
    # matches; titles that do not match are kept unchanged.
    extracted = df["title"].str.extract(r"^(.*)\s\(\D+\s(.*)\)$")
    # Add the prefix "CAR: "
    df["title"] = "CAR: " + extracted[0].fillna(df["title"])

    embedding_matrix = _bert_mean_embeddings(df["title"])
    reduced_embeddings = _reduce_embeddings(embedding_matrix, target_dimension, reducers, "title")
    df["title"] = reduced_embeddings.tolist()

    # Drop the redundant information
    df.drop(["make", "model"], axis=1, inplace=True)

    return df


def encode_description(df, target_dimension=8, reducers=None):
    """
    Encode the description feature using BERT followed by PCA.

    `df` is modified in place and returned: missing descriptions are replaced
    by "Good condition" and `description` becomes a list of
    `target_dimension` floats per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the `description` column.
    target_dimension : int, default 8
        Number of PCA components. Only used when a new PCA is fitted.
    reducers : dict or None, default None
        Optional shared store of fitted PCA objects, used like in
        `encode_title` but under the key "description".
    """
    # Fill the missing values in the description
    df["description"] = df["description"].fillna("Good condition")

    embedding_matrix = _bert_mean_embeddings(df["description"])
    reduced_embeddings = _reduce_embeddings(embedding_matrix, target_dimension, reducers, "description")
    df["description"] = reduced_embeddings.tolist()

    return df


def encode_free_text(df, reducers=None):
    """
    Encode the title and description columns of `df`.

    Pass the same `reducers` dict for every split so the PCA projections are
    fitted on the first split and only applied to the later ones.
    """
    df = encode_title(df, reducers=reducers)
    df = encode_description(df, reducers=reducers)
    return df


# Fit the PCA projections on the training split and reuse them for the test
# split; fitting a separate PCA per split would give components that are not
# comparable between train and test.
text_reducers = {}
df_train = encode_free_text(df_train, text_reducers)
df_test = encode_free_text(df_test, text_reducers)

### 3.3 Save the basic features encoded dataset to new CSV files

In [None]:
# Checkpoint both splits after the BERT encoding step so it does not have to
# be repeated in a fresh kernel.
# NOTE(review): `title` and `description` hold Python lists at this point, and
# CSV stores them as their string representation — confirm that the code
# consuming these files parses them back into numbers.
df_train.to_csv("data/train_basic_encoded.csv", index=False)
df_test.to_csv("data/test_basic_encoded.csv", index=False)

In [1]:
import pandas as pd

# Resume from the checkpoint files written after the basic encoding step.
df_train, df_test = (
    pd.read_csv("data/train_basic_encoded.csv"),
    pd.read_csv("data/test_basic_encoded.csv"),
)

## 4. Encode the `category` feature

In [2]:
from sklearn.preprocessing import MultiLabelBinarizer

def multi_label_category(df, mlb=None):
    """
    Convert the category feature into multi-label (multi-hot) format.

    `category` is split on ", " and replaced, on `df` itself, by one list of
    0/1 flags per row. `df` is also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string `category` column with ", "-separated labels.
    mlb : MultiLabelBinarizer or None, default None
        Binarizer to use. An already fitted one (it has `classes_`) is only
        applied, so the flags keep the label order it was fitted with; an
        unfitted one is fitted on this frame. With None, a new binarizer is
        created and fitted on this frame alone.
    """
    labels = df["category"].str.split(", ")
    if mlb is None:
        mlb = MultiLabelBinarizer()

    if hasattr(mlb, "classes_"):
        # Already fitted: reuse its label set so the flag positions match the
        # split it was fitted on.
        encoded = mlb.transform(labels)
    else:
        encoded = mlb.fit_transform(labels)

    df["category"] = encoded.tolist()
    return df

# Fit the binarizer on the training split and reuse it for the test split;
# fitting one per split can yield a different label set and column order,
# which would misalign the flags between train and test.
category_binarizer = MultiLabelBinarizer()
df_train = multi_label_category(df_train, category_binarizer)
df_test = multi_label_category(df_test, category_binarizer)

## 5. Fill the missing numerical values utilizing clustering information

### 5.1 Fill the missing values in the features related to the vehicle's condition

### 5.2 Fill the missing values in the features related to the vehicle's mileage or age

In [3]:
# Column overview: the dtypes and non-null counts show which numerical
# features still contain missing values.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            25000 non-null  object 
 1   description      25000 non-null  object 
 2   manufactured     25000 non-null  float64
 3   reg_age          25000 non-null  float64
 4   type_of_vehicle  25000 non-null  int64  
 5   category         25000 non-null  object 
 6   transmission     25000 non-null  int64  
 7   curb_weight      24693 non-null  float64
 8   power            22360 non-null  float64
 9   engine_cap       24404 non-null  float64
 10  no_of_owners     24982 non-null  float64
 11  depreciation     24493 non-null  float64
 12  coe              25000 non-null  int64  
 13  road_tax         22368 non-null  float64
 14  dereg_value      24780 non-null  float64
 15  mileage          19696 non-null  float64
 16  omv              24936 non-null  float64
 17  arf         

In [4]:
# Inspect one encoded row (position 1) to check the feature formats.
print(df_train.iloc[1])

title              [2.260293483734131, -0.15082983672618866, -0.5...
description        [1.51604425907135, -0.05233699455857277, -0.09...
manufactured                                                     7.0
reg_age                                                         6.84
type_of_vehicle                                                    1
category            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]
transmission                                                       0
curb_weight                                                   1465.0
power                                                          135.0
engine_cap                                                    1991.0
no_of_owners                                                     2.0
depreciation                                                 21170.0
coe                                                            47002
road_tax                                                      1202.0
dereg_value                       

In [None]:
# Working frame holding only the encoded descriptive columns of the training
# split: title, vehicle type, category flags and transmission.
df = df_train.loc[:, ["title", "type_of_vehicle", "category", "transmission"]]

# Training

## 1. Normalize the features to avoid the influence of different dataset distributions

## 2. Train the model (XGBoost) with validation set

# Testing