In [None]:
!pip install -r requirements.txt

In [None]:
import math
import os

import boto3
import joblib
import lightgbm as lgb
import matplotlib.pyplot as plt
import optuna
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.linear_model import QuantileRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder



## Getting the data from S3

In [None]:
# Session created with no explicit arguments.
session = boto3.Session()
# Credentials object associated with that session.
# NOTE(review): presumably this is None when no credentials are configured,
# in which case the next line fails with an AttributeError — confirm.
credentials = session.get_credentials()
# Frozen credentials; access_key, secret_key and token are read from this
# object when the session is rebuilt in the next cell.
current_credentials = credentials.get_frozen_credentials()

In [None]:
# Rebuild the session, this time passing the access key, secret key and
# session token from `current_credentials` explicitly. This replaces the
# `session` object created in the previous cell.
session = boto3.Session(
    aws_access_key_id=current_credentials.access_key,
    aws_secret_access_key=current_credentials.secret_key,
    aws_session_token=current_credentials.token
)

In [None]:
# S3 client bound to the explicit-credential session
s3_client = session.client('s3')

# Location of the sample data set inside the curated bucket
curated_bucket = 'curated-bucket-car-price-tk'
file_key = 'ml_sample_data_snapsoft.csv'

# Fetch the object and hand its body straight to pandas for CSV parsing
response = s3_client.get_object(Key=file_key, Bucket=curated_bucket)
car_data = pd.read_csv(response['Body'])

## Data preparation for the training

In [None]:
# Fill the float64/int64 columns of car_data using a median-strategy imputer.
# NOTE(review): columns are selected by dtype only, so a numeric target column
# (e.g. 'Price', which is used as the label later in the notebook) would be
# imputed as well — confirm this is intended.
numerical_cols = car_data.select_dtypes(include=['float64', 'int64']).columns
imputer = SimpleImputer(strategy='median')
# fit_transform fits the imputer on these columns and returns the filled values,
# which overwrite the original columns in place.
car_data[numerical_cols] = imputer.fit_transform(car_data[numerical_cols])
# Show the first rows as the cell output for a quick visual check.
car_data.head()

In [None]:
from datetime import datetime
import numpy as np

# Parse the sale date column once; the parsed values are reused below.
car_data['saledate'] = pd.to_datetime(car_data['saledate'])

# Numeric feature derived from the sale date: the raw integer representation
# of the datetimes, floor-divided by 1e9.
# NOTE(review): this yields epoch *seconds* only if the parsed column has
# nanosecond resolution — confirm for the pandas version in use.
car_data['saledate_timestamp'] = (
    car_data['saledate'].values.astype(np.int64) // 1_000_000_000
)
car_data['saledate_timestamp']

In [None]:
# One-hot encode every object-typed column and append the indicator columns
# to the remaining (non-object) columns of car_data.
categorical_columns = car_data.select_dtypes(include=['object']).columns

encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(car_data[categorical_columns])

# Densify the encoder output and label the columns with the generated feature names.
encoded_df = pd.DataFrame(
    encoded_data.toarray(),
    columns=encoder.get_feature_names_out(categorical_columns),
)

# encoded_df carries a fresh default index, and concat aligns rows on the index.
# NOTE(review): assumes car_data still has its default 0..n-1 index — confirm.
df = car_data.drop(columns=categorical_columns)
df = pd.concat([df, encoded_df], axis=1)


## Price underestimation methods

In [None]:


# Applied before training: cap values at a percentile threshold
# (the notebook calls it with the 95th percentile of the prices).
def upper_percentile_cutoff(percentile, source_dataset, destination_dataset):
    """Cap ``destination_dataset`` at a percentile of ``source_dataset``.

    Args:
        percentile: Percentile passed to ``np.percentile``.
        source_dataset: Values the threshold is computed from.
        destination_dataset: Values to cap.

    Returns:
        The ``np.where`` result: entries greater than the threshold are
        replaced by the threshold, all other entries are kept.
    """
    threshold = np.percentile(source_dataset, percentile)
    above_threshold = destination_dataset > threshold
    return np.where(above_threshold, threshold, destination_dataset)

# Applied after training: shift values down by a percentile of a reference set
# (the notebook calls it with the 5th percentile).
def lower_percentile_subtract(percentile, source_dataset, destination_dataset):
    """Subtract a percentile of ``source_dataset`` from ``destination_dataset``.

    Args:
        percentile: Percentile passed to ``np.percentile``.
        source_dataset: Values the offset is computed from.
        destination_dataset: Values to shift.

    Returns:
        ``destination_dataset`` with the computed percentile value subtracted
        from every element.
    """
    offset = np.percentile(source_dataset, percentile)
    return destination_dataset - offset

# Applied before training: rescale values linearly into a target range
def min_max_scale(data, feature_range=(0, 1)):
    """Linearly rescale ``data`` so its min/max map onto ``feature_range``.

    Args:
        data: Array-like of numeric values.
        feature_range: ``(low, high)`` bounds of the output range.

    Returns:
        ``data`` rescaled so that its minimum maps to ``low`` and its maximum
        to ``high``. If all values are equal there is no spread to scale, so
        every element is mapped to ``low`` (instead of the NaN a 0/0 division
        would produce).
    """
    min_val = feature_range[0]
    max_val = feature_range[1]

    data_min = np.min(data)
    data_max = np.max(data)
    span = data_max - data_min

    if np.all(span == 0):
        # Constant input: (data - data_min) is all zeros; multiplying by 0.0
        # keeps the container type and forces a float result before adding
        # the lower bound.
        return (data - data_min) * 0.0 + min_val

    return (data - data_min) / span * (max_val - min_val) + min_val


## Model training

In [None]:

def tune_and_train_model(X_train, y_train_scaled, X_test, y_test_scaled, n_trials=100, seed=None):
    """Tune LightGBM hyper-parameters with Optuna, then fit a final model.

    Each trial trains a booster on ``(X_train, y_train_scaled)`` and is scored
    by the RMSE of its predictions on ``(X_test, y_test_scaled)``. The final
    model is trained with the fixed settings merged with the best trial's
    parameters, so it uses the same objective/metric/verbosity as the trials.

    NOTE(review): the callers in this notebook pass the held-out test split as
    ``X_test`` / ``y_test_scaled``, so hyper-parameters are selected on the
    same data that is later used for reporting.

    Args:
        X_train: Training features.
        y_train_scaled: Training labels (possibly transformed by the caller).
        X_test: Validation features used to score each trial.
        y_test_scaled: Validation labels used to score each trial.
        n_trials: Number of Optuna trials to run.
        seed: Optional seed for the Optuna sampler; ``None`` keeps Optuna's
            default (unseeded) sampler.

    Returns:
        The LightGBM booster trained with the best parameters found.
    """
    # Settings shared by every trial and by the final fit.
    fixed_params = {
        'objective': 'root_mean_squared_error',
        'metric': 'root_mean_squared_error',
        'verbosity': -1,
        'boosting_type': 'gbdt',
    }

    def build_datasets():
        # Fresh Dataset objects for every fit, exactly as the trials need them.
        train_data = lgb.Dataset(X_train, label=y_train_scaled)
        valid_data = lgb.Dataset(X_test, label=y_test_scaled, reference=train_data)
        return train_data, valid_data

    def objective(trial):
        # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
        param = {
            **fixed_params,
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 2, 256),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        }

        train_data, valid_data = build_datasets()
        model = lgb.train(param, train_data, valid_sets=[valid_data])
        preds = model.predict(X_test)
        # Square root so the optimised value really is the RMSE (same optimum as MSE).
        return math.sqrt(mean_squared_error(y_test_scaled, preds))

    sampler = optuna.samplers.TPESampler(seed=seed) if seed is not None else None
    study = optuna.create_study(direction='minimize', sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    best_params = study.best_params
    print(f"Best parameters: {best_params}")

    # best_params only contains the suggested values; merge in the fixed
    # settings so the final fit matches the configuration that was tuned.
    final_params = {**fixed_params, **best_params}

    train_data, valid_data = build_datasets()
    model = lgb.train(final_params, train_data, valid_sets=[valid_data])

    return model



## Predictions and plotting

In [None]:
def make_predictions_and_plot(model,y, X_test, y_test, saledate_y,title,models,after_training_cutoff=None, percentile=5):
    """Predict on the test set, report the RMSE, record it and plot the result.

    Args:
        model: Fitted model exposing ``predict``.
        y: Values passed as the source data set to ``after_training_cutoff``.
        X_test: Test features.
        y_test: Actual target values for the test rows.
        saledate_y: Sale dates of the test rows (x-axis of the plot).
        title: Plot title.
        models: Dict updated in place with ``models[model] = rmse``.
        after_training_cutoff: Optional callable
            ``(percentile, source, predictions) -> predictions`` applied to
            the raw predictions before scoring.
        percentile: Percentile handed to ``after_training_cutoff``.

    Returns:
        None. The RMSE is printed and stored in ``models``; the figure is shown.
    """
    predictions = model.predict(X_test)
    if after_training_cutoff is not None:
        predictions = after_training_cutoff(percentile, y, predictions)

    # RMSE is computed once; print it as-is (no second square root).
    rmse = math.sqrt(mean_squared_error(y_test, predictions))
    print(f'Root Mean Squared Error: {rmse}')

    plot_df = pd.DataFrame({
        'saledate': saledate_y,
        'actual_price': y_test,
        'predictions': predictions
    })

    # Record the model together with its RMSE so the best one can be picked later.
    models[model] = rmse

    # Order chronologically so both series are drawn left to right in time.
    plot_df = plot_df.sort_values(by='saledate')

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(plot_df['saledate'], plot_df['actual_price'], label='Actual Prices', marker='o')
    ax.plot(plot_df['saledate'], plot_df['predictions'], label='Predictions', marker='x')
    ax.set_xlabel('Sale Date')
    ax.set_ylabel('Price')
    ax.set_title(title)
    ax.legend()
    plt.show()

In [None]:
# Prepare the modelling inputs.
# `os` comes from the notebook's import cell.
# NOTE(review): LOKY_MAX_CPU_COUNT presumably caps the CPU count seen by
# joblib's loky backend — confirm it is still needed.
os.environ['LOKY_MAX_CPU_COUNT'] = '8'

# Features: every column except the target and the raw datetime column.
X = df.drop(columns=['Price', 'saledate'])
# Target
y = df['Price']

# Kept separately so it can be split alongside X/y and used as the plot x-axis.
saledate = df['saledate']
# Filled by make_predictions_and_plot: fitted model -> RMSE
models = {}

In [None]:
# Experiment 1: cap the training targets at the 95th percentile of y
(X_train, X_test,
 y_train, y_test,
 saledate_X, saledate_y) = train_test_split(X, y, saledate, test_size=0.2, random_state=42)

# Only the training labels are capped; the test labels stay untouched.
y_train_upper_cut = upper_percentile_cutoff(95, y, y_train)
model = tune_and_train_model(X_train, y_train_upper_cut, X_test, y_test)

make_predictions_and_plot(
    model, y, X_test, y_test, saledate_y,
    'Actual Prices vs Underestimated Predictions Based on Sale Date on training dataset 95% impute',
    models,
    after_training_cutoff=None,
)


In [None]:
# Experiment 2: train on the raw targets, then subtract the 5th percentile
# of y from the predictions
(X_train, X_test,
 y_train, y_test,
 saledate_X, saledate_y) = train_test_split(X, y, saledate, test_size=0.2, random_state=42)

model = tune_and_train_model(X_train, y_train, X_test, y_test)

make_predictions_and_plot(
    model, y, X_test, y_test, saledate_y,
    'Actual Prices vs Underestimated Predictions Based on Sale Date on training dataset 5% substract',
    models,
    after_training_cutoff=lower_percentile_subtract,
)


In [None]:
# Experiment 3: min-max scale the training targets into [0, 95th percentile of y]
X_train, X_test, y_train, y_test, saledate_X,saledate_y = train_test_split(X, y,saledate, test_size=0.2, random_state=42)
# Keep the labels one-dimensional: LightGBM's Dataset documents 1-D labels,
# so the former (n, 1) column vector is no longer created.
y_train = y_train.to_numpy()
percentile_95 = np.percentile(y, 95)
y_train_scaled = min_max_scale(y_train, feature_range=(0, percentile_95))
model = tune_and_train_model(X_train, y_train_scaled, X_test, y_test)
make_predictions_and_plot(model,y, X_test, y_test, saledate_y,'Actual Prices vs Underestimated Predictions Based on Sale Date on training dataset 95% percentile min max scale',models,after_training_cutoff=None)

In [None]:
# Persist the model with the lowest recorded RMSE.
# `models` maps fitted model -> RMSE, so the winning key is the model itself.
key = min(models, key=lambda candidate: models[candidate])
joblib.dump(key, 'best_model.pkl')

## Conclusions
- Based on the dataset, LightGBM was a good model for predictions because of its capabilities for feature engineering, fast training, and robustness to over-fitting.

- The Root Mean Squared Error (RMSE) is a popular evaluation metric for regression tasks. In this case, lower RMSE indicates better model performance. It is also in the same unit as the results of the regression.

- The actual prices and underestimated predictions based on sale date are visualized for each preprocessing method.

- The 95 percentile impute method resulted in the lowest RMSE, while the min-max scale method had the highest RMSE.