In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler, MinMaxScaler
from catboost import CatBoostRegressor
import os

# Print the path of every file found anywhere under ./inputs/.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('./inputs/')
    for name in names
)
for path in input_paths:
    print(path)

./inputs/sample_submission.csv
./inputs/test.csv
./inputs/train.csv
./inputs/training_extra.csv


In [2]:
# Read both training files and stack them into a single frame.
train = pd.read_csv('./inputs/train.csv')
training_extra = pd.read_csv('./inputs/training_extra.csv')
df = pd.concat([train, training_extra], ignore_index=True, sort=False)

# Split off the regression target, then discard the row identifier so that
# only feature columns remain.
target = df.pop("Price")
del df["id"]

# From here on `training_extra` refers to the combined feature frame
# (train.csv + training_extra.csv), not just the extra file.
training_extra = df
training_extra

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.643760
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.937220
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338
...,...,...,...,...,...,...,...,...,...
3994313,Nike,Canvas,,3.0,Yes,Yes,Messenger,Blue,28.098120
3994314,Puma,Leather,Small,10.0,Yes,Yes,Tote,Blue,17.379531
3994315,Jansport,Canvas,Large,10.0,No,No,Backpack,Red,17.037708
3994316,Puma,Canvas,,2.0,No,No,Backpack,Gray,28.783339


In [3]:
# Integer-encode every column except the two numeric ones.
# A fresh LabelEncoder is fit per column and then discarded, so the fitted
# mapping is not available to later cells.
numeric_columns = ("Weight Capacity (kg)", "Compartments")
for column in training_extra.columns:
    if column in numeric_columns:
        continue
    training_extra[column] = LabelEncoder().fit_transform(training_extra[column])

training_extra

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,1,1,1,7.0,1,0,2,0,11.611723
1,1,0,2,10.0,1,1,1,3,27.078537
2,4,1,2,2.0,1,0,1,5,16.643760
3,2,2,2,8.0,1,0,1,3,12.937220
4,0,0,1,1.0,1,1,1,3,17.749338
...,...,...,...,...,...,...,...,...,...
3994313,2,0,3,3.0,1,1,1,1,28.098120
3994314,3,1,2,10.0,1,1,2,1,17.379531
3994315,1,0,0,10.0,0,0,0,5,17.037708
3994316,3,0,3,2.0,0,0,0,2,28.783339


In [4]:
# Turn the highest code of each label-encoded column back into NaN.
#
# The displayed frames show missing values receiving the largest code during
# label encoding (e.g. a missing Size becomes 3), so blanking that code restores
# the missing-value marker.
# NOTE(review): this assumes every encoded column contained at least one
# missing value; otherwise a real category would be blanked -- confirm.
for column in training_extra:
    values = training_extra[column]

    # Columns that already hold NaN are either raw numeric features or were
    # processed by a previous run of this cell. Skipping them keeps the cell
    # safe to re-run (it will not blank a second category).
    if values.isna().any():
        continue

    # Series.max() is computed once and is NaN-safe, unlike sorted(...)[-1],
    # whose result is unreliable when the values contain NaN.
    top_code = values.max()

    # The `< 8` threshold keeps columns whose largest value is 8 or more
    # (e.g. Compartments, which reaches 10) out of this step.
    if top_code < 8:
        training_extra[column] = np.where(values == top_code, np.nan, values)

training_extra

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,1.0,1.0,1.0,7.0,1.0,0.0,2.0,0.0,11.611723
1,1.0,0.0,2.0,10.0,1.0,1.0,1.0,3.0,27.078537
2,4.0,1.0,2.0,2.0,1.0,0.0,1.0,5.0,16.643760
3,2.0,2.0,2.0,8.0,1.0,0.0,1.0,3.0,12.937220
4,0.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,17.749338
...,...,...,...,...,...,...,...,...,...
3994313,2.0,0.0,,3.0,1.0,1.0,1.0,1.0,28.098120
3994314,3.0,1.0,2.0,10.0,1.0,1.0,2.0,1.0,17.379531
3994315,1.0,0.0,0.0,10.0,0.0,0.0,0.0,5.0,17.037708
3994316,3.0,0.0,,2.0,0.0,0.0,0.0,2.0,28.783339


In [5]:
# Read the test features and discard the row identifier column.
test = pd.read_csv("./inputs/test.csv").drop(columns=["id"])
test

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953
...,...,...,...,...,...,...,...,...,...
199995,Adidas,Canvas,Large,2.0,Yes,No,Messenger,Red,7.383498
199996,Nike,Polyester,Small,9.0,No,Yes,Messenger,Pink,6.058394
199997,Jansport,Nylon,Small,9.0,No,Yes,Tote,Green,26.890163
199998,Puma,Nylon,Large,10.0,Yes,No,Tote,Gray,25.769153


In [6]:
# Integer-encode every test column except the two numeric ones.
# A fresh LabelEncoder is fit per column on the test data alone.
# NOTE(review): the codes line up with the training encoding only if both
# frames contain the same set of values in each column -- confirm.
numeric_columns = ("Weight Capacity (kg)", "Compartments")
for column in test.columns:
    if column in numeric_columns:
        continue
    test[column] = LabelEncoder().fit_transform(test[column])

test

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,3,1,2,2.0,0,0,2,3,20.671147
1,2,0,1,7.0,0,1,0,3,13.564105
2,0,0,0,9.0,0,1,1,1,11.809799
3,0,2,0,1.0,1,0,1,3,18.477036
4,5,2,0,2.0,1,1,2,0,9.907953
...,...,...,...,...,...,...,...,...,...
199995,0,0,0,2.0,1,0,1,5,7.383498
199996,2,3,2,9.0,0,1,1,4,6.058394
199997,1,2,2,9.0,0,1,2,3,26.890163
199998,3,2,0,10.0,1,0,2,2,25.769153


In [7]:
# Turn the highest code of each label-encoded test column back into NaN.
#
# The displayed frames show missing values receiving the largest code during
# label encoding (e.g. a missing Brand becomes 5), so blanking that code
# restores the missing-value marker.
# NOTE(review): this assumes every encoded column contained at least one
# missing value; otherwise a real category would be blanked -- confirm.
for column in test:
    values = test[column]

    # Columns that already hold NaN are either raw numeric features or were
    # processed by a previous run of this cell. Skipping them keeps the cell
    # safe to re-run (it will not blank a second category).
    if values.isna().any():
        continue

    # Series.max() is computed once and is NaN-safe, unlike sorted(...)[-1],
    # whose result is unreliable when the values contain NaN.
    top_code = values.max()

    # The `< 8` threshold keeps columns whose largest value is 8 or more
    # (e.g. Compartments, which reaches 10) out of this step.
    if top_code < 8:
        test[column] = np.where(values == top_code, np.nan, values)

test

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,3.0,1.0,2.0,2.0,0.0,0.0,2.0,3.0,20.671147
1,2.0,0.0,1.0,7.0,0.0,1.0,0.0,3.0,13.564105
2,0.0,0.0,0.0,9.0,0.0,1.0,1.0,1.0,11.809799
3,0.0,2.0,0.0,1.0,1.0,0.0,1.0,3.0,18.477036
4,,2.0,0.0,2.0,1.0,1.0,2.0,0.0,9.907953
...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,2.0,1.0,0.0,1.0,5.0,7.383498
199996,2.0,3.0,2.0,9.0,0.0,1.0,1.0,4.0,6.058394
199997,1.0,2.0,2.0,9.0,0.0,1.0,2.0,3.0,26.890163
199998,3.0,2.0,0.0,10.0,1.0,0.0,2.0,2.0,25.769153


In [8]:
# Fit a RobustScaler on the training features, then scale those same features.
# `scaler` is kept so the identical transformation can be applied to the test set.
scaler = RobustScaler().fit(training_extra)
training_extra_scaled = scaler.transform(training_extra)
training_extra_scaled

array([[-0.5       , -0.5       ,  0.        , ...,  0.5       ,
        -1.        , -0.54055584],
       [-0.5       , -1.        ,  0.5       , ...,  0.        ,
         0.        ,  0.75715444],
       [ 1.        , -0.5       ,  0.5       , ...,  0.        ,
         0.66666667, -0.11835342],
       ...,
       [-0.5       , -1.        , -0.5       , ..., -0.5       ,
         0.66666667, -0.08530005],
       [ 0.5       , -1.        ,         nan, ..., -0.5       ,
        -0.33333333,  0.9001923 ],
       [ 1.        ,  0.5       ,  0.        , ..., -0.5       ,
        -0.66666667,  0.42134422]])

In [9]:
# Scale the test features with the scaler that was fitted on the training
# features; it is only applied here, not refitted.
test_scaled = scaler.transform(test)
test_scaled

array([[ 0.5       , -0.5       ,  0.5       , ...,  0.5       ,
         0.        ,  0.21955594],
       [ 0.        , -1.        ,  0.        , ..., -0.5       ,
         0.        , -0.37674538],
       [-1.        , -1.        , -0.5       , ...,  0.        ,
        -0.66666667, -0.52393667],
       ...,
       [-0.5       ,  0.        ,  0.5       , ...,  0.5       ,
         0.        ,  0.74134939],
       [ 0.5       ,  0.        , -0.5       , ...,  0.5       ,
        -0.33333333,  0.64729337],
       [ 1.        , -0.5       ,  0.        , ...,  0.        ,
         0.33333333,  0.93305597]])

In [10]:
# Conventional names for the model inputs: scaled features and the Price target.
X_train, X_test, y_train = training_extra_scaled, test_scaled, target

In [11]:
# Train a CatBoost regressor, continuing from a previously saved model when
# one is available, then persist it and predict on the test set.
MODEL_PATH = './outputs/catboost_extra.model'

clf = CatBoostRegressor(n_estimators=1000, learning_rate=0.01, max_depth=5, random_state=42)

# On a fresh checkout there is no saved model yet; loading unconditionally
# would make this cell fail under Restart & Run All. Warm-start only when the
# file exists, otherwise train from scratch.
if os.path.exists(MODEL_PATH):
    clf.load_model(MODEL_PATH)
    init_model = clf
else:
    init_model = None

clf.fit(X_train, y_train, init_model=init_model)

# Make sure the output directory exists before writing the model file.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
clf.save_model(MODEL_PATH)

predictions = clf.predict(X_test)
predictions

0:	learn: 38.9101361	total: 459ms	remaining: 7m 38s
1:	learn: 38.9097280	total: 767ms	remaining: 6m 22s
2:	learn: 38.9093273	total: 1.08s	remaining: 5m 58s
3:	learn: 38.9089281	total: 1.35s	remaining: 5m 37s
4:	learn: 38.9085508	total: 1.62s	remaining: 5m 22s
5:	learn: 38.9081613	total: 1.9s	remaining: 5m 15s
6:	learn: 38.9077955	total: 2.18s	remaining: 5m 9s
7:	learn: 38.9074470	total: 2.42s	remaining: 5m
8:	learn: 38.9070937	total: 2.73s	remaining: 5m
9:	learn: 38.9067227	total: 2.98s	remaining: 4m 54s
10:	learn: 38.9063598	total: 3.24s	remaining: 4m 51s
11:	learn: 38.9060108	total: 3.48s	remaining: 4m 46s
12:	learn: 38.9056673	total: 3.76s	remaining: 4m 45s
13:	learn: 38.9053300	total: 4s	remaining: 4m 41s
14:	learn: 38.9049972	total: 4.27s	remaining: 4m 40s
15:	learn: 38.9046609	total: 4.5s	remaining: 4m 37s
16:	learn: 38.9043561	total: 4.8s	remaining: 4m 37s
17:	learn: 38.9040288	total: 5.05s	remaining: 4m 35s
18:	learn: 38.9037085	total: 5.32s	remaining: 4m 34s
19:	learn: 38.9034

array([81.34845172, 82.38257043, 81.9488286 , ..., 83.25107191,
       81.56844318, 81.54201567])

In [12]:
# Score the fitted model on the data it was trained on.
# NOTE(review): presumably this is the R^2 metric per the CatBoost docs -- confirm.
# This is a training-set score, not a held-out validation score.
train_score = clf.score(X_train, y_train)
train_score

0.003542332507343593

In [13]:
# Re-read the test file to recover the `id` column (it was dropped from `test`),
# pair each id with its predicted Price, and write the submission CSV.
submission_test = pd.read_csv("./inputs/test.csv")
output = pd.DataFrame(
    {
        'id': submission_test["id"],
        'Price': predictions,
    }
)
output.to_csv('./outputs/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [14]:
!kaggle competitions submit -c playground-series-s5e2 -f ./outputs/submission.csv -m f"{clf.__class__.__name__} with {clf.get_param("n_estimators")} estimators, a learning rate of {clf.get_param("learning_rate")}, max depth of {clf.get_param("max_depth")} and a random state of {clf.get_param("random_state")}"

Successfully submitted to Backpack Prediction Challenge



  0%|          | 0.00/4.93M [00:00<?, ?B/s]
  0%|          | 16.0k/4.93M [00:00<00:54, 95.4kB/s]
 11%|#1        | 576k/4.93M [00:00<00:04, 968kB/s]  
 50%|#####     | 2.47M/4.93M [00:01<00:00, 2.91MB/s]
 55%|#####5    | 2.73M/4.93M [00:01<00:01, 1.36MB/s]
 59%|#####8    | 2.91M/4.93M [00:02<00:01, 1.39MB/s]
 75%|#######4  | 3.69M/4.93M [00:02<00:00, 2.08MB/s]
100%|##########| 4.93M/4.93M [00:03<00:00, 1.72MB/s]
