In [115]:
import numpy as np
import pandas as pd
from category_encoders import BinaryEncoder, OneHotEncoder, TargetEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MultiLabelBinarizer, OrdinalEncoder
import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import joblib
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset 
from torch.utils.data import DataLoader
import torch
from sklearn.linear_model import LinearRegression

In [116]:
pd.set_option('display.max_columns', None)

train_data = pd.read_csv('Data/train.csv')
test_data = pd.read_csv("Data/test.csv")

print('Training data number = {}'.format(train_data.shape[0]))
print('Test data number = {}\n'.format(test_data.shape[0]))
print(train_data.columns) 

Training data number = 25000
Test data number = 10000

Index(['listing_id', 'title', 'make', 'model', 'description', 'manufactured',
       'original_reg_date', 'reg_date', 'type_of_vehicle', 'category',
       'transmission', 'curb_weight', 'power', 'fuel_type', 'engine_cap',
       'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value',
       'mileage', 'omv', 'arf', 'opc_scheme', 'lifespan', 'eco_category',
       'features', 'accessories', 'indicative_price', 'price'],
      dtype='object')


In [117]:
basic_columns = [
    "depreciation",
    "dereg_value",
    "manufactured",
    "coe",
    "power",
    "category",
    "arf",
    "mileage",
    "omv",
]

nlp_columns = [
    # "listing_id",
    "title",
    # "make",
    # "model",
    "description",
    # "manufactured",
    # "original_reg_date",
    # "reg_date",
    # "type_of_vehicle",
    # "category",
    # "transmission",
    # "curb_weight",
    # "power",
    # "fuel_type",
    # "engine_cap",
    # "no_of_owners",
    # "depreciation",
    # "coe",
    # "road_tax",
    # "dereg_value",
    # "mileage",
    # "omv",
    # "arf",
    # "opc_scheme",
    # "lifespan",
    # "eco_category",
    "features",
    "accessories",
    # "indicative_price",
    # "price",
]

price_columns = [
    "price",
]

basic_train_data = train_data[basic_columns].copy()
nlp_train_data = train_data[nlp_columns].copy()
price_train_data = train_data[price_columns].copy()

print(f"basic_train_data: {basic_train_data.columns}")
print(f"nlp_train_data: {nlp_train_data.columns}")
print(f"price_train_data: {price_train_data.columns}")

basic_train_data: Index(['depreciation', 'dereg_value', 'manufactured', 'coe', 'power',
       'category', 'arf', 'mileage', 'omv'],
      dtype='object')
nlp_train_data: Index(['title', 'description', 'features', 'accessories'], dtype='object')
price_train_data: Index(['price'], dtype='object')


In [118]:
def min_max_scaler(
    data: pd.DataFrame, column_name: str, scaler: MinMaxScaler, refit: bool = False
) -> pd.DataFrame:
    if refit:
        data[column_name] = scaler.fit_transform(
            data[column_name].values.reshape(-1, 1)
        )
    else:
        data[column_name] = scaler.transform(data[column_name].values.reshape(-1, 1))
    return data


def standard_scaler(
    data: pd.DataFrame, column_name: str, scaler: StandardScaler, refit: bool = False
) -> pd.DataFrame:
    if refit:
        data[column_name] = scaler.fit_transform(
            data[column_name].values.reshape(-1, 1)
        )
    else:
        data[column_name] = scaler.transform(data[column_name].values.reshape(-1, 1))
    return data


def binary_encoder(
    data: pd.DataFrame, column_name: str, encoder: BinaryEncoder, refit: bool = False
) -> pd.DataFrame:
    if refit:
        labels = encoder.fit_transform(data[column_name])
    else:
        labels = encoder.transform(data[column_name])
    labels = pd.DataFrame(labels)
    data.drop(columns=[column_name], inplace=True)
    return pd.concat([data, labels], axis=1)


def onehot_encoder(
    data: pd.DataFrame, column_name: str, encoder: OneHotEncoder, refit: bool = False
) -> pd.DataFrame:
    if refit:
        labels = encoder.fit_transform(data[column_name])
    else:
        labels = encoder.transform(data[column_name])
    labels = pd.DataFrame(labels)
    data.drop(columns=[column_name], inplace=True)
    return pd.concat([data, labels], axis=1)


def ordinal_encoder(
    data: pd.DataFrame, column_name: str, encoder: OrdinalEncoder, refit: bool = False
) -> pd.DataFrame:
    if refit:
        labels = encoder.fit_transform(pd.DataFrame(data[column_name]))
    else:
        labels = encoder.transform(pd.DataFrame(data[column_name]))
    labels = labels.ravel()
    labels = pd.DataFrame(labels, columns=[column_name])
    data.drop(columns=[column_name], inplace=True)
    return pd.concat([data, labels], axis=1)


def target_encoder(
    data: pd.DataFrame, column_name: str, encoder: TargetEncoder, refit: bool = False
) -> pd.DataFrame:
    if refit:
        labels = encoder.fit_transform(data[column_name], data["price"])
    else:
        labels = encoder.transform(data[column_name])
    labels = pd.DataFrame(labels)
    data.drop(columns=[column_name], inplace=True)
    return pd.concat([data, labels], axis=1)


def multi_label_binarizer(
    data: pd.DataFrame,
    column_name: str,
    encoder: MultiLabelBinarizer,
    refit: bool = False,
) -> pd.DataFrame:
    data[column_name] = data[column_name].map(
        lambda c: [_c.strip() for _c in c.split(",") if _c != "" and _c != "-"]
    )
    if refit:
        labels = encoder.fit_transform(data[column_name]).astype(np.float64)
    else:
        labels = encoder.transform(data[column_name]).astype(np.float64)
    labels = pd.DataFrame(
        labels, columns=[column_name + "_" + c for c in encoder.classes_]
    )
    data.drop(columns=[column_name], inplace=True)
    return pd.concat([data, labels], axis=1)


other_drop_columns = [
    # "make",
    # "model",
    # "manufactured",
    # "type_of_vehicle",
    # "category",
    # "transmission",
    # "curb_weight",
    # "power",
    # "fuel_type",
    # "engine_cap",
    # "no_of_owners",
    # "depreciation",
    # "coe",
    # "road_tax",
    # "dereg_value",
    # "mileage",
    # "omv",
    # "arf",
    # "opc_scheme",
    # "price",
]

column_encoders = {
    # "make": (binary_encoder, BinaryEncoder()),
    "make": (ordinal_encoder, OrdinalEncoder()),
    # "model": (binary_encoder, BinaryEncoder()),
    "model": (ordinal_encoder, OrdinalEncoder()),
    "manufactured": (min_max_scaler, MinMaxScaler()),
    # "type_of_vehicle": (binary_encoder, BinaryEncoder()),
    "type_of_vehicle": (ordinal_encoder, OrdinalEncoder()),
    "category": (multi_label_binarizer, MultiLabelBinarizer()),
    "transmission": (ordinal_encoder, OrdinalEncoder()),
    "curb_weight": (standard_scaler, StandardScaler()),
    "power": (min_max_scaler, MinMaxScaler()),
    "engine_cap": (min_max_scaler, MinMaxScaler()),
    "no_of_owners": (min_max_scaler, MinMaxScaler()),
    "depreciation": (min_max_scaler, MinMaxScaler()),
    "coe": (min_max_scaler, MinMaxScaler()),
    "road_tax": (min_max_scaler, MinMaxScaler()),
    "dereg_value": (min_max_scaler, MinMaxScaler()),
    "mileage": (min_max_scaler, MinMaxScaler()),
    "omv": (min_max_scaler, MinMaxScaler()),
    "arf": (min_max_scaler, MinMaxScaler()),
    "price": (min_max_scaler, MinMaxScaler()),
}

def preprocess_basic_data(data: pd.DataFrame, refit: bool = False):
    data = data.reset_index(drop=True)
    for column_name in data.columns:
        # print("====================================={}================================".format(column_name))
        if data[column_name].dtype != "object":
            data[column_name].fillna(
                data[column_name].mean(),
                inplace=True,
            )
                
        assert column_name in column_encoders, f"Column {column_name} not found in column_encoders."
        prefunc, encoder = column_encoders[column_name]
        data = prefunc(data, column_name, encoder, refit)
    return data

def proprocess_nlp_data(data: pd.DataFrame):
    BASE_TOKENIZER = "./modelCache/mymodel/bert1/tokenizer"
    tokenizer = AutoTokenizer.from_pretrained(BASE_TOKENIZER, cache_dir="./modelCache", clean_up_tokenization_spaces=True, local_files_only=True,)
    data.fillna("", inplace=True)
    X_combined = data.apply(lambda x: "|".join(x), axis=1).values.tolist()
    tokens = tokenizer(X_combined, padding=True, truncation=True, return_tensors="pt")
    return tokens


In [119]:
_basic_train_data = preprocess_basic_data(basic_train_data, refit=True)
_nlp_tokens = proprocess_nlp_data(nlp_train_data)
_price_train_data = preprocess_basic_data(price_train_data, refit=True)

basicx_train, basicx_test, nlpx_train, nlpx_test, nlpmask_train, nlpmask_test, y_train, y_test = train_test_split(
    _basic_train_data.values,
    _nlp_tokens['input_ids'], 
    _nlp_tokens['attention_mask'], 
    _price_train_data['price'].values,
    test_size=0.05, random_state=21
)

In [120]:
# prepare model
modelCount = 5
lgb_models = []
lgb_model_dir = "./modelCache/mymodel"
for i in range(modelCount):
    model = joblib.load(f"{lgb_model_dir}/model_{i}.pkl")
    lgb_models.append(model)

BASE_MODEL = "./modelCache/mymodel/bert1/model"
nlp_model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL, cache_dir="./modelCache", num_labels=1, local_files_only=True,
)

In [121]:
# get light gbm prediction on train data and test data
lgb_train_predictions = np.zeros(len(basicx_train))
for model in lgb_models:
    lgb_train_predictions += model.predict(basicx_train)
lgb_train_predictions /= modelCount
lgb_train_predictions = lgb_train_predictions.reshape(-1, 1)

lgb_test_predictions = np.zeros(len(basicx_test))
for model in lgb_models:
    lgb_test_predictions += model.predict(basicx_test)
lgb_test_predictions /= modelCount
lgb_test_predictions = lgb_test_predictions.reshape(-1, 1)

In [122]:
# get nlp prediction on train data
train_dataset = Dataset.from_dict({'input_ids': nlpx_train, 'attention_mask': nlpmask_train})
print(f"nlp train shape: {nlpx_train.shape}, {nlpmask_train.shape}")
train_trainer = Trainer(model=nlp_model)
nlp_train_predictions = train_trainer.predict(train_dataset)
nlp_train_predictions = nlp_train_predictions.predictions

test_dataset = Dataset.from_dict({'input_ids': nlpx_test, 'attention_mask': nlpmask_test})
print(f"nlp test shape: {nlpx_test.shape}, {nlpmask_test.shape}")
test_trainer = Trainer(model=nlp_model)
nlp_test_predictions = test_trainer.predict(test_dataset)
nlp_test_predictions = nlp_test_predictions.predictions

nlp train shape: torch.Size([23750, 230]), torch.Size([23750, 230])


  0%|          | 0/2969 [00:00<?, ?it/s]

nlp test shape: torch.Size([1250, 230]), torch.Size([1250, 230])


  0%|          | 0/157 [00:00<?, ?it/s]

In [123]:
# train a linear model based on the 2 precitions
combined_train_data = np.concatenate((lgb_train_predictions, nlp_train_predictions), axis=1)
combined_test_data = np.concatenate((lgb_test_predictions, nlp_test_predictions), axis=1)

meta_model = LinearRegression()
meta_model.fit(combined_train_data, y_train)

combined_test_pred = meta_model.predict(combined_test_data)
combined_test_pred = column_encoders["price"][1].inverse_transform(combined_test_pred.reshape(-1, 1))
actual_test_pred = column_encoders["price"][1].inverse_transform(y_test.reshape(-1, 1))
rmse = np.sqrt(mean_squared_error(actual_test_pred, combined_test_pred))
print(f"evaluate rmse: {rmse}.")

joblib.dump(meta_model, f"{lgb_model_dir}/meta_model.pkl")


evaluate rmse: 21977.465062154468.


['./modelCache/mymodel/meta_model.pkl']

In [124]:
# generate result_4.csv for submission: prepare the data
basic_sub_data = test_data[basic_columns].copy()
nlp_sub_data = test_data[nlp_columns].copy()
basic_sub_data = preprocess_basic_data(basic_sub_data)
nlp_sub_tokens = proprocess_nlp_data(nlp_sub_data)

In [125]:
# generate result_4.csv for submission: get lightgbm prediction
lgb_sub_predictions = np.zeros(len(basic_sub_data))
for model in lgb_models:
    lgb_sub_predictions += model.predict(basic_sub_data)
lgb_sub_predictions /= modelCount
lgb_sub_predictions = lgb_sub_predictions.reshape(-1, 1)

In [126]:
# generate result_4.csv for submission: get nlp prediction
nlp_sub_data = nlp_sub_tokens['input_ids']
nlp_sub_mask = nlp_sub_tokens['attention_mask']
train_dataset = Dataset.from_dict({'input_ids': nlp_sub_data, 'attention_mask': nlp_sub_mask})
print(f"nlp data shape: {nlp_sub_data.shape}, {nlp_sub_mask.shape}")
sub_trainer = Trainer(model=nlp_model)
nlp_sub_predictions = sub_trainer.predict(train_dataset)
nlp_sub_predictions = nlp_sub_predictions.predictions

nlp data shape: torch.Size([10000, 240]), torch.Size([10000, 240])


  0%|          | 0/1250 [00:00<?, ?it/s]

In [127]:
# generate result_4.csv for submission: get final prediction
combined_sub_data = np.concatenate((lgb_sub_predictions, nlp_sub_predictions), axis=1)

sub_meta_model = joblib.load(f"{lgb_model_dir}/meta_model.pkl")

combined_sub_pred = sub_meta_model.predict(combined_sub_data)
combined_sub_pred = column_encoders["price"][1].inverse_transform(combined_sub_pred.reshape(-1, 1))
res_df = pd.DataFrame({"Id": range(0, combined_sub_pred.shape[0]), "Predicted": combined_sub_pred.ravel()})
res_df.to_csv('result_4.csv', index=False)



In [114]:
check_gt = column_encoders["price"][1].inverse_transform(y_train.reshape(-1, 1))
check_lgb_pred = column_encoders["price"][1].inverse_transform(lgb_train_predictions)
check_nlp_pred = column_encoders["price"][1].inverse_transform(nlp_train_predictions)
rmse_lgb = np.sqrt(mean_squared_error(check_gt, check_lgb_pred))
rmse_nlp = np.sqrt(mean_squared_error(check_gt, check_nlp_pred))
print(f"rmse_lgb: {rmse_lgb}, rmse_nlp: {rmse_nlp}")
print(f"rmse_combined: {rmse}")

rmse_lgb: 22074.994318534722, rmse_nlp: 28886.207178926772
rmse_combined: 21977.465062154468
