In [727]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import KFold
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# import the necessary packages

In [728]:
target_column = 'Price'

In [729]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 150)

In [730]:
def handleOriginal(original, test_df):
    original.insert(0, 'id', range(test_df['id'].max()+1, test_df['id'].max() + 1 + len(original)))

In [731]:
train = pd.read_csv("train.csv")
train_extra = pd.read_csv("training_extra.csv")
original = pd.read_csv("original.csv")
test = pd.read_csv("test.csv")

handleOriginal(original, train_extra)

train['Dataset'] = 'train'
train_extra['Dataset'] = 'train_extra'
original['Dataset'] = 'original'
test['Dataset'] = 'test'

data = pd.concat([train, test, train_extra, original]).reset_index(drop=True)

In [732]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4246818 entries, 0 to 4246817
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Brand                 object 
 2   Material              object 
 3   Size                  object 
 4   Compartments          float64
 5   Laptop Compartment    object 
 6   Waterproof            object 
 7   Style                 object 
 8   Color                 object 
 9   Weight Capacity (kg)  float64
 10  Price                 float64
 11  Dataset               object 
dtypes: float64(3), int64(1), object(8)
memory usage: 388.8+ MB


In [733]:
def shift_target_column_to_end(data):
    target_data = data.pop(target_column)
    data[target_column] = target_data
    return data

In [734]:
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category')

In [735]:
data['Size'] = pd.Categorical(
    data['Size'], 
    categories=['Small', 'Medium', 'Large'],
    ordered=True
) # make size ordered

data['Compartments'] = data['Compartments'].astype('Int64')

In [736]:
for col in data.columns:
    if data[col].dtype != 'category' or col == 'Dataset':
        continue
    data[col] = data[col].cat.codes
    data[col] = data[col].astype('category')

In [737]:
ce_columns = ['Brand', 'Material', 'Color']

for col in ce_columns:
    data[f'{col}_FE'] = data.groupby(col)[col].transform('count') / len(data) # frequency encode color category

In [738]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4246818 entries, 0 to 4246817
Data columns (total 15 columns):
 #   Column                Dtype   
---  ------                -----   
 0   id                    int64   
 1   Brand                 category
 2   Material              category
 3   Size                  category
 4   Compartments          Int64   
 5   Laptop Compartment    category
 6   Waterproof            category
 7   Style                 category
 8   Color                 category
 9   Weight Capacity (kg)  float64 
 10  Price                 float64 
 11  Dataset               category
 12  Brand_FE              float64 
 13  Material_FE           float64 
 14  Color_FE              float64 
dtypes: Int64(1), category(8), float64(5), int64(1)
memory usage: 263.3 MB


In [739]:
data.head(10)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price,Dataset,Brand_FE,Material_FE,Color_FE
0,0,1,1,1,7,1,0,2,0,11.611723,112.15875,train,0.187578,0.244332,0.155389
1,1,1,0,0,10,1,1,1,3,27.078537,68.88056,train,0.187578,0.226276,0.154516
2,2,4,1,0,2,1,0,1,5,16.64376,39.1732,train,0.200442,0.244332,0.157754
3,3,2,2,0,8,1,0,1,3,12.93722,80.60793,train,0.191386,0.236051,0.154516
4,4,0,0,1,1,1,1,1,3,17.749338,86.02312,train,0.199439,0.226276,0.154516
5,5,2,0,1,10,0,1,-1,0,7.241812,20.01553,train,0.191386,0.226276,0.155389
6,6,2,-1,2,3,0,0,0,3,6.828123,84.805,train,0.191386,0.028068,0.154516
7,7,3,0,0,1,1,1,0,1,21.488864,27.15815,train,0.189222,0.226276,0.159844
8,8,4,3,1,8,1,0,2,2,10.20778,25.98652,train,0.200442,0.265273,0.166636
9,9,4,2,1,2,1,1,1,4,15.8951,38.48741,train,0.200442,0.236051,0.172183


In [740]:
train_data = data[data['Dataset'] == 'train']
original_data = data[data['Dataset'] == 'original']
extra_data = data[data['Dataset'] == 'train_extra']
train_and_extra_data = data[data['Dataset'].isin(['train', 'train_extra'])]
full_data = data[data['Dataset'].isin(['train', 'train_extra', 'original'])]
test_data = data[data['Dataset'] == 'test']
test_data = test_data.drop(target_column, axis=1)

test_id = test_data[['id']].copy()
train_data = train_data.drop('id', axis=1)
original_data = original_data.drop('id', axis=1)
train_and_extra_data = train_and_extra_data.drop('id', axis=1)
full_data = full_data.drop('id', axis=1)
extra_data = extra_data.drop('id', axis=1)
test_data = test_data.drop('id', axis=1)

In [741]:
def getKFoldScores(kf, train_data, params):
    X = train_data.drop(target_column, axis=1)
    y = (train_data[target_column])

    scores = []
    for i, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        cat_columns = ['Brand', 'Material', 'Laptop Compartment', 'Waterproof', 'Size', 'Style', 'Color', 'Dataset']
        train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_columns)
        val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_columns)
        
        model = lgb.train(
            params,
            train_data,
            valid_sets=[val_data],
        )

        scores.append(model.best_score['valid_0']['rmse'])
    return scores

In [742]:
kf = KFold(n_splits=5)

In [743]:
params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'n_jobs': -1
}

train_scores = getKFoldScores(kf, train_data, params)
original_scores = getKFoldScores(kf, original_data, params)
extra_scores = getKFoldScores(kf, extra_data, params)
train_extra_scores = getKFoldScores(kf, train_and_extra_data, params)
full_scores = getKFoldScores(kf, full_data, params)

print("Mean RMSE (train):", np.mean(train_scores))
print("Mean RMSE (original):", np.mean(original_scores))
print("Mean RMSE (extra):", np.mean(extra_scores))
print("Mean RMSE (train_extra):", np.mean(train_extra_scores))
print("Mean RMSE (full):", np.mean(full_scores))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002857 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 12
[LightGBM] [Info] Start training from score 81.474972
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002687 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 12
[LightGBM] [Info] Start training from score 81.369008
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002766 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not eno

In [744]:
def getPrediction(estimator, test_id, test_data, target):
    y_pred_submission = estimator.predict(test_data)
    
    test_submission = test_id
    test_submission[target] = y_pred_submission

    return test_submission

In [745]:
def createSubmission(data, params, type_str):
    X = data.drop(target_column, axis=1)
    y = (data[target_column])
    data = lgb.Dataset(X, label=y)

    model = lgb.train(
            params,
            data
        )
    
    test_submission = getPrediction(model, test_id, test_data, target_column)
    test_submission.to_csv(f"lgbm_gbdt_{type_str}.csv", index=False)

In [746]:
createSubmission(train_data, params, "train")
createSubmission(original_data, params, "original")
createSubmission(extra_data, params, "extra")
createSubmission(train_and_extra_data, params, "train_extra")
createSubmission(full_data, params, "full")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003586 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 326
[LightGBM] [Info] Number of data points in the train set: 300000, number of used features: 12
[LightGBM] [Info] Start training from score 81.411107
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001558 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 327
[LightGBM] [Info] Number of data points in the train set: 52500, number of used features: 12
[LightGBM] [Info] Start training from score 77.820789
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.041776 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enou