In [19]:
import numpy as np
import pandas as pd
from category_encoders import BinaryEncoder, OneHotEncoder, TargetEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer, OrdinalEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset 
from torch.utils.data import DataLoader
import torch


In [20]:
# Never truncate the column list when a frame is displayed.
pd.set_option("display.max_columns", None)

train_data = pd.read_csv("Data/train.csv")
test_data = pd.read_csv("Data/test.csv")

# Row counts of both splits, followed by the training schema.
print(f"Training data number = {train_data.shape[0]}")
print(f"Test data number = {test_data.shape[0]}\n")
print(train_data.columns)

Training data number = 25000
Test data number = 10000

Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured',
       'original_reg_date', 'reg_date', 'type_of_vehicle', 'category',
       'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category',
       'features', 'accessories', 'indicative_price', 'price'],
      dtype='object')


In [21]:
# Columns removed for the text-only model. What remains after the drop is
# "title", "description", "features", "accessories" and the target "price".
basic_drop_columns = [
    "listing_id", "make", "model", "manufactured", "original_reg_date",
    "reg_date", "type_of_vehicle", "category", "transmission", "curb_weight",
    "power", "fuel_type", "engine_cap", "no_of_owners", "depreciation",
    "coe", "road_tax", "dereg_value", "mileage", "omv",
    "arf", "opc_scheme", "lifespan", "eco_category", "indicative_price",
]
_train_data = train_data.drop(columns=basic_drop_columns)
print(_train_data.columns)

Index(['title', 'description', 'features', 'accessories', 'price'], dtype='object')


In [22]:
def min_max_scaler(
    data: pd.DataFrame, column_name: str, scaler: MinMaxScaler, refit: bool = False
) -> pd.DataFrame:
    """Rescale one column of ``data`` with ``scaler`` and write it back in place.

    Args:
        data: Frame holding the column; it is modified in place.
        column_name: Name of the column to rescale.
        scaler: Object providing ``fit_transform`` and ``transform``.
        refit: When True the scaler is fitted on this column before it is
            applied; when False the scaler is only applied.

    Returns:
        The same ``data`` object, with ``column_name`` holding the scaled values.
    """
    apply_scaler = scaler.fit_transform if refit else scaler.transform
    # The scaler receives the column as an (n, 1) array.
    column_2d = data[column_name].values.reshape(-1, 1)
    data[column_name] = apply_scaler(column_2d)
    return data


def standard_scaler(
    data: pd.DataFrame, column_name: str, scaler: StandardScaler, refit: bool = False
) -> pd.DataFrame:
    """Standardise one column of ``data`` with ``scaler``, in place.

    Args:
        data: Frame holding the column; it is modified in place.
        column_name: Name of the column to transform.
        scaler: Object providing ``fit_transform`` and ``transform``.
        refit: Fit the scaler on this column first (True) or reuse it as is (False).

    Returns:
        The same ``data`` object, with ``column_name`` overwritten by the
        transformed values.
    """
    # The scaler is handed the column as a single-feature 2-D array.
    feature = data[column_name].values.reshape(-1, 1)
    if not refit:
        data[column_name] = scaler.transform(feature)
        return data
    data[column_name] = scaler.fit_transform(feature)
    return data


def binary_encoder(
    data: pd.DataFrame, column_name: str, encoder: BinaryEncoder, refit: bool = False
) -> pd.DataFrame:
    """Replace ``column_name`` with the columns produced by ``encoder``.

    Args:
        data: Input frame. Side effect: ``column_name`` is dropped from it in place.
        column_name: Column to encode.
        encoder: Object providing ``fit_transform`` and ``transform``.
        refit: Fit the encoder on this column first (True) or only apply it (False).

    Returns:
        A new frame: the remaining columns of ``data`` followed by the encoded
        columns, joined column-wise.
    """
    encode = encoder.fit_transform if refit else encoder.transform
    encoded = pd.DataFrame(encode(data[column_name]))
    # The raw column is removed only after encoding has succeeded.
    data.drop(columns=[column_name], inplace=True)
    return pd.concat([data, encoded], axis=1)


def onehot_encoder(
    data: pd.DataFrame, column_name: str, encoder: OneHotEncoder, refit: bool = False
) -> pd.DataFrame:
    """Swap ``column_name`` for the output columns of ``encoder``.

    Args:
        data: Input frame. Side effect: ``column_name`` is dropped from it in place.
        column_name: Column to encode.
        encoder: Object providing ``fit_transform`` and ``transform``.
        refit: When True the encoder is fitted on the column before transforming.

    Returns:
        A new frame made of the remaining columns of ``data`` with the encoded
        columns appended on the right.
    """
    raw_column = data[column_name]
    if refit:
        encoded_columns = encoder.fit_transform(raw_column)
    else:
        encoded_columns = encoder.transform(raw_column)
    encoded_columns = pd.DataFrame(encoded_columns)

    data.drop(columns=[column_name], inplace=True)
    pieces = [data, encoded_columns]
    return pd.concat(pieces, axis=1)


def ordinal_encoder(
    data: pd.DataFrame, column_name: str, encoder: OrdinalEncoder, refit: bool = False
) -> pd.DataFrame:
    """Replace ``column_name`` with the ordinal codes produced by ``encoder``.

    Args:
        data: Input frame. Side effect: ``column_name`` is dropped from it in place.
        column_name: Column to encode.
        encoder: Object providing ``fit_transform`` and ``transform`` that accept
            a single-column frame and return one code per row.
        refit: Fit the encoder on this column first (True) or only apply it (False).

    Returns:
        A new frame with the remaining columns of ``data`` followed by the encoded
        column (same name), with exactly the rows and index of ``data``.
    """
    column_frame = data[[column_name]]
    if refit:
        codes = encoder.fit_transform(column_frame)
    else:
        codes = encoder.transform(column_frame)

    # Reuse data's index: a default RangeIndex would make the column-wise concat
    # below misalign rows (and add NaN rows) whenever `data` has a non-contiguous
    # index, e.g. after dropna().
    labels = pd.DataFrame(
        np.asarray(codes).ravel(), columns=[column_name], index=data.index
    )
    data.drop(columns=[column_name], inplace=True)
    return pd.concat([data, labels], axis=1)


def target_encoder(
    data: pd.DataFrame,
    column_name: str,
    encoder: TargetEncoder,
    refit: bool = False,
    target_column: str = "price",
) -> pd.DataFrame:
    """Replace ``column_name`` with the output of a target encoder.

    Args:
        data: Input frame. Side effect: ``column_name`` is dropped from it in place.
        column_name: Column to encode.
        encoder: Object providing ``fit_transform(X, y)`` and ``transform(X)``.
        refit: When True the encoder is fitted on ``column_name`` against
            ``data[target_column]`` before transforming; when False the encoder
            is only applied and the target column is not read.
        target_column: Name of the target column used for fitting. Defaults to
            ``"price"``, the value that used to be hard-coded.

    Returns:
        A new frame with the remaining columns of ``data`` followed by the
        encoder's output columns.

    Raises:
        KeyError: If ``refit`` is True and ``target_column`` is not in ``data``.
    """
    if refit:
        labels = encoder.fit_transform(data[column_name], data[target_column])
    else:
        labels = encoder.transform(data[column_name])
    labels = pd.DataFrame(labels)
    data.drop(columns=[column_name], inplace=True)
    return pd.concat([data, labels], axis=1)


def multi_label_binarizer(
    data: pd.DataFrame,
    column_name: str,
    encoder: MultiLabelBinarizer,
    refit: bool = False,
) -> pd.DataFrame:
    """Expand a comma-separated multi-label column into indicator columns.

    Each cell is split on ``","``; every piece is stripped of surrounding
    whitespace and empty pieces and the ``"-"`` placeholder are discarded.
    The resulting label lists are passed to ``encoder``.

    Args:
        data: Input frame. Side effect: ``column_name`` is dropped from it in place.
        column_name: Column holding comma-separated labels.
        encoder: Object providing ``fit_transform``, ``transform`` and, once
            fitted, ``classes_``.
        refit: Fit the encoder on this column first (True) or only apply it (False).

    Returns:
        A new frame with the remaining columns of ``data`` followed by one
        float64 column per class, named ``f"{column_name}_{class}"``, with
        exactly the rows and index of ``data``.
    """

    def split_labels(cell):
        # Non-string cells (e.g. NaN/None for a missing value) carry no labels.
        if not isinstance(cell, str):
            return []
        stripped = (piece.strip() for piece in cell.split(","))
        # Filter AFTER stripping so that " " and " -" are discarded as well;
        # filtering first let them through as the labels "" and "-".
        return [label for label in stripped if label not in ("", "-")]

    label_lists = data[column_name].map(split_labels)
    binarize = encoder.fit_transform if refit else encoder.transform
    indicators = binarize(label_lists).astype(np.float64)

    # Reuse data's index: a default RangeIndex would make the column-wise concat
    # below misalign rows whenever `data` has a non-contiguous index, e.g. after
    # dropna().
    labels = pd.DataFrame(
        indicators,
        columns=[column_name + "_" + c for c in encoder.classes_],
        index=data.index,
    )
    data.drop(columns=[column_name], inplace=True)
    return pd.concat([data, labels], axis=1)


# Extra columns to drop on top of `basic_drop_columns`; currently none.
# Candidates: make, model, manufactured, type_of_vehicle, category, transmission,
# curb_weight, power, fuel_type, engine_cap, no_of_owners, depreciation, coe,
# road_tax, dereg_value, mileage, omv, arf, opc_scheme, price.
other_drop_columns = []

# (column, transform function, encoder/scaler class). The dict built below keeps
# this order and gives every column its own freshly constructed encoder instance.
# `binary_encoder` with `BinaryEncoder` is the alternative that was tried for
# "make", "model" and "type_of_vehicle".
_column_encoder_specs = [
    ("make", ordinal_encoder, OrdinalEncoder),
    ("model", ordinal_encoder, OrdinalEncoder),
    ("manufactured", min_max_scaler, MinMaxScaler),
    ("type_of_vehicle", ordinal_encoder, OrdinalEncoder),
    ("category", multi_label_binarizer, MultiLabelBinarizer),
    ("transmission", ordinal_encoder, OrdinalEncoder),
    ("curb_weight", standard_scaler, StandardScaler),
    ("power", min_max_scaler, MinMaxScaler),
    ("engine_cap", min_max_scaler, MinMaxScaler),
    ("no_of_owners", min_max_scaler, MinMaxScaler),
    ("depreciation", min_max_scaler, MinMaxScaler),
    ("coe", min_max_scaler, MinMaxScaler),
    ("road_tax", min_max_scaler, MinMaxScaler),
    ("dereg_value", min_max_scaler, MinMaxScaler),
    ("mileage", min_max_scaler, MinMaxScaler),
    ("omv", min_max_scaler, MinMaxScaler),
    ("arf", min_max_scaler, MinMaxScaler),
    ("price", min_max_scaler, MinMaxScaler),
]
column_encoders = {
    column: (transform, encoder_cls())
    for column, transform, encoder_cls in _column_encoder_specs
}


# Rebuild `_train_data` from the raw `train_data` frame rather than from its own
# previous value, so this cell can be re-run safely. Otherwise a re-run would fit
# the fresh "price" scaler on already-scaled prices, turning the later
# inverse_transform into an identity and reporting RMSE in scaled units.
_train_data = (
    train_data.drop(columns=basic_drop_columns)
    .drop(columns=other_drop_columns)
    .dropna()  # rows missing any remaining column (text fields or price) are discarded
)
# Fit the price scaler on the cleaned rows and scale the target column.
_train_data = column_encoders["price"][0](
    _train_data, "price", column_encoders["price"][1], refit=True
)

# Checkpoint to fine-tune; "camembert-base" is the alternative that was tried.
BASE_MODEL = "bert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    cache_dir="./modelCache",
    clean_up_tokenization_spaces=True,
)
# num_labels=1 gives the classification head a single output value per sequence.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=1,
    cache_dir="./modelCache",
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
X = _train_data[["title", "description", "features", "accessories"]]
Y = _train_data["price"]

# One string per listing: the four text fields joined with "|".
X_combined = ["|".join(fields) for fields in X.itertuples(index=False, name=None)]
tokens = tokenizer(X_combined, padding=True, truncation=True, return_tensors="pt")

# Token ids, targets and attention masks are split together so rows stay paired.
(
    x_train, x_test,
    y_train, y_test,
    mask_train, mask_test,
) = train_test_split(
    tokens["input_ids"],
    torch.Tensor(Y.values),
    tokens["attention_mask"],
    test_size=0.2,
    random_state=42,
)

In [24]:
# Wrap each split as a `Dataset` with the columns input_ids / label / attention_mask.
train_dataset, test_dataset = (
    Dataset.from_dict({"input_ids": ids, "label": target, "attention_mask": mask})
    for ids, target, mask in (
        (x_train, y_train, mask_train),
        (x_test, y_test, mask_test),
    )
)

print(x_train.shape, y_train.shape, mask_train.shape)

def compute_metrics_for_regression(eval_pred):
    """Compute RMSE after undoing the price scaling.

    ``eval_pred`` unpacks into ``(logits, labels)``. Both are passed through
    ``inverse_transform`` of the "price" scaler stored in ``column_encoders``
    before the error is computed; ``labels`` is reshaped to one column first.

    Returns:
        ``{"rmse": <root mean squared error>}``.
    """
    predictions, targets = eval_pred
    price_scaler = column_encoders["price"][1]

    true_prices = price_scaler.inverse_transform(targets.reshape(-1, 1))
    predicted_prices = price_scaler.inverse_transform(predictions)
    return {"rmse": np.sqrt(mean_squared_error(true_prices, predicted_prices))}


# Optimisation hyper-parameters.
LEARNING_RATE = 2e-5
BATCH_SIZE = 8
EPOCHS = 20

training_args = TrainingArguments(
    output_dir="./modelCache/mymodel",
    # Optimisation
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Model selection: lower "rmse" (from compute_metrics) is better.
    load_best_model_at_end=True,
    metric_for_best_model="rmse",
    greater_is_better=False,
)

trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics_for_regression,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

torch.Size([16771, 230]) torch.Size([16771]) torch.Size([16771, 230])


  0%|          | 0/41940 [00:00<?, ?it/s]

{'loss': 0.0081, 'grad_norm': 0.3073815405368805, 'learning_rate': 1.9761564139246544e-05, 'epoch': 0.24}
{'loss': 0.0023, 'grad_norm': 0.3855680823326111, 'learning_rate': 1.9523128278493086e-05, 'epoch': 0.48}
{'loss': 0.0019, 'grad_norm': 0.23669053614139557, 'learning_rate': 1.928469241773963e-05, 'epoch': 0.72}
{'loss': 0.0017, 'grad_norm': 0.28194108605384827, 'learning_rate': 1.904625655698617e-05, 'epoch': 0.95}


  0%|          | 0/525 [00:00<?, ?it/s]

{'eval_loss': 0.0017062259139493108, 'eval_rmse': 119735.078125, 'eval_runtime': 32.0583, 'eval_samples_per_second': 130.793, 'eval_steps_per_second': 16.376, 'epoch': 1.0}
{'loss': 0.0011, 'grad_norm': 0.39601826667785645, 'learning_rate': 1.8807820696232715e-05, 'epoch': 1.19}
{'loss': 0.0013, 'grad_norm': 0.2727905213832855, 'learning_rate': 1.8569384835479257e-05, 'epoch': 1.43}
{'loss': 0.001, 'grad_norm': 0.27829667925834656, 'learning_rate': 1.83309489747258e-05, 'epoch': 1.67}
{'loss': 0.001, 'grad_norm': 0.549113392829895, 'learning_rate': 1.8092513113972344e-05, 'epoch': 1.91}


  0%|          | 0/525 [00:00<?, ?it/s]

{'eval_loss': 0.000999927637167275, 'eval_rmse': 91661.59375, 'eval_runtime': 32.8501, 'eval_samples_per_second': 127.64, 'eval_steps_per_second': 15.982, 'epoch': 2.0}


KeyboardInterrupt: 

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in ./model.
trainer.save_model("./model")
# Hugging Face tokenizers have no `.save()` method (calling it raises
# AttributeError); `save_pretrained` writes the files that
# `AutoTokenizer.from_pretrained("./model")` can load back.
tokenizer.save_pretrained("./model")
# Also write the single-file JSON at the originally intended path, via the
# underlying `tokenizers.Tokenizer`.
# NOTE(review): `backend_tokenizer` is only available on fast tokenizers, which
# is what AutoTokenizer is expected to return for BASE_MODEL by default — confirm
# if the checkpoint changes.
tokenizer.backend_tokenizer.save("./model/mytokenizer.json")