In [407]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import KFold
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')
# import the necessary packages

In [408]:
target_column = 'Price'

In [409]:
pd.set_option('display.max_columns', 200)
pd.set_option('display.width', 150)

In [410]:
def handleOriginal(original, test_df):
    original.insert(0, 'id', range(test_df['id'].max()+1, test_df['id'].max() + 1 + len(original)))

In [411]:
train = pd.read_csv("train.csv")
train_extra = pd.read_csv("training_extra.csv")
original = pd.read_csv("original.csv")
test = pd.read_csv("test.csv")

handleOriginal(original, train_extra)

train['Dataset'] = 'train'
train_extra['Dataset'] = 'train_extra'
original['Dataset'] = 'original'
test['Dataset'] = 'test'

data = pd.concat([train, test, original]).reset_index(drop=True)

In [412]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552500 entries, 0 to 552499
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    552500 non-null  int64  
 1   Brand                 533943 non-null  object 
 2   Material              535915 non-null  object 
 3   Size                  538899 non-null  object 
 4   Compartments          549875 non-null  float64
 5   Laptop Compartment    537469 non-null  object 
 6   Waterproof            538014 non-null  object 
 7   Style                 536752 non-null  object 
 8   Color                 533140 non-null  object 
 9   Weight Capacity (kg)  549660 non-null  float64
 10  Price                 349875 non-null  float64
 11  Dataset               552500 non-null  object 
dtypes: float64(3), int64(1), object(8)
memory usage: 50.6+ MB


In [413]:
def shift_target_column_to_end(data):
    target_data = data.pop(target_column)
    data[target_column] = target_data
    return data

In [414]:
for col in data.columns:
    if data[col].dtype == 'object':
        data[col] = data[col].astype('category')

In [415]:
for col in data.columns:
    if data[col].dtype != 'category' or col == 'Dataset':
        continue
    data[col] = data[col].cat.codes
    data[col] = data[col].astype('category')

data['Laptop Compartment'] = data['Laptop Compartment'].astype('bool')
data['Waterproof'] = data['Waterproof'].astype('bool')

In [416]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 552500 entries, 0 to 552499
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype   
---  ------                --------------   -----   
 0   id                    552500 non-null  int64   
 1   Brand                 552500 non-null  category
 2   Material              552500 non-null  category
 3   Size                  552500 non-null  category
 4   Compartments          549875 non-null  float64 
 5   Laptop Compartment    552500 non-null  bool    
 6   Waterproof            552500 non-null  bool    
 7   Style                 552500 non-null  category
 8   Color                 552500 non-null  category
 9   Weight Capacity (kg)  549660 non-null  float64 
 10  Price                 349875 non-null  float64 
 11  Dataset               552500 non-null  category
dtypes: bool(2), category(6), float64(3), int64(1)
memory usage: 21.1 MB


In [417]:
data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price,Dataset
0,0,1,1,1,7.0,True,False,2,0,11.611723,112.15875,train
1,1,1,0,2,10.0,True,True,1,3,27.078537,68.88056,train
2,2,4,1,2,2.0,True,False,1,5,16.64376,39.1732,train
3,3,2,2,2,8.0,True,False,1,3,12.93722,80.60793,train
4,4,0,0,1,1.0,True,True,1,3,17.749338,86.02312,train


In [418]:
train_data = data[data['Dataset'] == 'train']
original_data = data[data['Dataset'] == 'original']
test_data = data[data['Dataset'] == 'test']
test_data = test_data.drop(target_column, axis=1)

test_id = test_data[['id']].copy()
train_data = train_data.drop('id', axis=1)
test_data = test_data.drop('id', axis=1)

In [419]:
X = train_data.drop(target_column, axis=1)
y = (train_data[target_column])

In [420]:
kf = KFold(n_splits=5)

In [421]:
params = {
          'objective': 'regression',
          'metric': 'rmse',
          'boosting_type': 'gbdt',
          'n_jobs': -1
}

scores = []
for i, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    cat_columns = ['Brand', 'Material', 'Size', 'Style', 'Color', "Dataset"]
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_columns)
    val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_columns)
    
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
    )

    scores.append(model.best_score['valid_0']['rmse'])
print("Mean MAE score:", np.mean(scores))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001780 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 301
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 9
[LightGBM] [Info] Start training from score 81.474972
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001854 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 301
[LightGBM] [Info] Number of data points in the train set: 240000, number of used features: 9
[LightGBM] [Info] Start training from score 81.369008
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001749 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enoug

In [422]:
train_data = lgb.Dataset(X, label=y)

model = lgb.train(
        params,
        train_data
    )

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002087 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 301
[LightGBM] [Info] Number of data points in the train set: 300000, number of used features: 9
[LightGBM] [Info] Start training from score 81.411107


In [423]:
def getPrediction(estimator, test_id, test_data, target):
    y_pred_submission = estimator.predict(test_data)
    
    test_submission = test_id
    test_submission[target] = y_pred_submission

    return test_submission

In [424]:
test_submission = getPrediction(model, test_id, test_data, target_column)
test_submission.to_csv("lgbm_gbdt_train.csv", index=False)