In [37]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler, MinMaxScaler
from catboost import CatBoostRegressor
import os

# Show every file that is available under the input directory.
for folder, _subdirs, names in os.walk('./inputs/'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

./inputs/sample_submission.csv
./inputs/test.csv
./inputs/train.csv
./inputs/training_extra.csv


In [38]:
# Build the full training set from the base file plus the extra training file.
train = pd.read_csv('./inputs/train.csv')
training_extra = pd.read_csv('./inputs/training_extra.csv')
df = pd.concat([train, training_extra], ignore_index=True, sort=False)

# Keep the regression target aside, then discard it together with the row
# identifier so that only feature columns remain.
target = df["Price"]
df = df.drop(columns=["Price", "id"])

# From here on `training_extra` names the combined feature frame (the same
# object as `df`), no longer just the contents of the extra file.
training_extra = df
training_extra

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.643760
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.937220
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338
...,...,...,...,...,...,...,...,...,...
3994313,Nike,Canvas,,3.0,Yes,Yes,Messenger,Blue,28.098120
3994314,Puma,Leather,Small,10.0,Yes,Yes,Tote,Blue,17.379531
3994315,Jansport,Canvas,Large,10.0,No,No,Backpack,Red,17.037708
3994316,Puma,Canvas,,2.0,No,No,Backpack,Gray,28.783339


In [39]:
# Columns that are left as they are; every other column is label-encoded.
numeric_columns = ("Weight Capacity (kg)", "Compartments")

# Replace each remaining column with integer codes, fitting a fresh
# LabelEncoder per column.
# NOTE(review): the fitted encoders are not kept, and the test frame is encoded
# by its own encoders in a later cell, so the codes only line up when both
# frames contain the same set of values in every column.
for column in [name for name in training_extra if name not in numeric_columns]:
    encoder = LabelEncoder()
    training_extra[column] = encoder.fit_transform(training_extra[column])

training_extra

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,1,1,1,7.0,1,0,2,0,11.611723
1,1,0,2,10.0,1,1,1,3,27.078537
2,4,1,2,2.0,1,0,1,5,16.643760
3,2,2,2,8.0,1,0,1,3,12.937220
4,0,0,1,1.0,1,1,1,3,17.749338
...,...,...,...,...,...,...,...,...,...
3994313,2,0,3,3.0,1,1,1,1,28.098120
3994314,3,1,2,10.0,1,1,2,1,17.379531
3994315,1,0,0,10.0,0,0,0,5,17.037708
3994316,3,0,3,2.0,0,0,0,2,28.783339


In [40]:
# Turn the highest code of each low-cardinality column back into NaN.
# Judging by the displayed frames, the label encoding stored missing values
# as the largest code of a column, and this step undoes that.
#
# Caveats of the heuristic (unchanged from the original logic):
#   * a column without missing values loses its highest real category;
#   * re-running this cell blanks the next-highest code as well.
CATEGORY_CODE_LIMIT = 8  # columns whose maximum is below this are treated as encoded

for column in training_extra:
    # Series.max() skips NaN. Sorting the unique values instead is unreliable
    # when the column already contains NaN (comparisons with NaN are always
    # False), so the "last" element could be an arbitrary value.
    top_code = training_extra[column].max()
    if top_code < CATEGORY_CODE_LIMIT:
        # mask() replaces matching entries with NaN and upcasts the column to float.
        training_extra[column] = training_extra[column].mask(training_extra[column] == top_code)

training_extra

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,1.0,1.0,1.0,7.0,1.0,0.0,2.0,0.0,11.611723
1,1.0,0.0,2.0,10.0,1.0,1.0,1.0,3.0,27.078537
2,4.0,1.0,2.0,2.0,1.0,0.0,1.0,5.0,16.643760
3,2.0,2.0,2.0,8.0,1.0,0.0,1.0,3.0,12.937220
4,0.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,17.749338
...,...,...,...,...,...,...,...,...,...
3994313,2.0,0.0,,3.0,1.0,1.0,1.0,1.0,28.098120
3994314,3.0,1.0,2.0,10.0,1.0,1.0,2.0,1.0,17.379531
3994315,1.0,0.0,0.0,10.0,0.0,0.0,0.0,5.0,17.037708
3994316,3.0,0.0,,2.0,0.0,0.0,0.0,2.0,28.783339


In [41]:
# Read the competition test set and discard the row identifier, leaving only
# the feature columns.
test = pd.read_csv("./inputs/test.csv").drop(columns="id")
test

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953
...,...,...,...,...,...,...,...,...,...
199995,Adidas,Canvas,Large,2.0,Yes,No,Messenger,Red,7.383498
199996,Nike,Polyester,Small,9.0,No,Yes,Messenger,Pink,6.058394
199997,Jansport,Nylon,Small,9.0,No,Yes,Tote,Green,26.890163
199998,Puma,Nylon,Large,10.0,Yes,No,Tote,Gray,25.769153


In [42]:
# Columns that are left as they are; every other column is label-encoded.
untouched_columns = ("Weight Capacity (kg)", "Compartments")

# Replace each remaining test column with integer codes, fitting a fresh
# LabelEncoder per column.
# NOTE(review): these encoders are fitted on the test frame itself, so the
# codes match the training frame only if both contain the same set of values
# in every column.
for column in [name for name in test if name not in untouched_columns]:
    encoder = LabelEncoder()
    test[column] = encoder.fit_transform(test[column])

test

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,3,1,2,2.0,0,0,2,3,20.671147
1,2,0,1,7.0,0,1,0,3,13.564105
2,0,0,0,9.0,0,1,1,1,11.809799
3,0,2,0,1.0,1,0,1,3,18.477036
4,5,2,0,2.0,1,1,2,0,9.907953
...,...,...,...,...,...,...,...,...,...
199995,0,0,0,2.0,1,0,1,5,7.383498
199996,2,3,2,9.0,0,1,1,4,6.058394
199997,1,2,2,9.0,0,1,2,3,26.890163
199998,3,2,0,10.0,1,0,2,2,25.769153


In [43]:
# Turn the highest code of each low-cardinality test column back into NaN.
# Judging by the displayed frames, the label encoding stored missing values
# as the largest code of a column, and this step undoes that.
#
# Caveats of the heuristic (unchanged from the original logic):
#   * a column without missing values loses its highest real category;
#   * re-running this cell blanks the next-highest code as well.
CATEGORY_CODE_LIMIT = 8  # columns whose maximum is below this are treated as encoded

for column in test:
    # Series.max() skips NaN. Sorting the unique values instead is unreliable
    # when the column already contains NaN (comparisons with NaN are always
    # False), so the "last" element could be an arbitrary value.
    top_code = test[column].max()
    if top_code < CATEGORY_CODE_LIMIT:
        # mask() replaces matching entries with NaN and upcasts the column to float.
        test[column] = test[column].mask(test[column] == top_code)

test

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,3.0,1.0,2.0,2.0,0.0,0.0,2.0,3.0,20.671147
1,2.0,0.0,1.0,7.0,0.0,1.0,0.0,3.0,13.564105
2,0.0,0.0,0.0,9.0,0.0,1.0,1.0,1.0,11.809799
3,0.0,2.0,0.0,1.0,1.0,0.0,1.0,3.0,18.477036
4,,2.0,0.0,2.0,1.0,1.0,2.0,0.0,9.907953
...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,2.0,1.0,0.0,1.0,5.0,7.383498
199996,2.0,3.0,2.0,9.0,0.0,1.0,1.0,4.0,6.058394
199997,1.0,2.0,2.0,9.0,0.0,1.0,2.0,3.0,26.890163
199998,3.0,2.0,0.0,10.0,1.0,0.0,2.0,2.0,25.769153


In [44]:
# Fit a RobustScaler on the training features and scale them in one step.
# The fitted `scaler` object stays available so other data can be transformed
# with the same statistics.
scaler = RobustScaler()
training_extra_scaled = scaler.fit_transform(training_extra)
training_extra_scaled

array([[-0.5       , -0.5       ,  0.        , ...,  0.5       ,
        -1.        , -0.54055584],
       [-0.5       , -1.        ,  0.5       , ...,  0.        ,
         0.        ,  0.75715444],
       [ 1.        , -0.5       ,  0.5       , ...,  0.        ,
         0.66666667, -0.11835342],
       ...,
       [-0.5       , -1.        , -0.5       , ..., -0.5       ,
         0.66666667, -0.08530005],
       [ 0.5       , -1.        ,         nan, ..., -0.5       ,
        -0.33333333,  0.9001923 ],
       [ 1.        ,  0.5       ,  0.        , ..., -0.5       ,
        -0.66666667,  0.42134422]])

In [45]:
# Scale the test features with the scaler that was fitted on the training
# frame; the scaler is only applied here, not re-fitted.
test_scaled = scaler.transform(test)
test_scaled

array([[ 0.5       , -0.5       ,  0.5       , ...,  0.5       ,
         0.        ,  0.21955594],
       [ 0.        , -1.        ,  0.        , ..., -0.5       ,
         0.        , -0.37674538],
       [-1.        , -1.        , -0.5       , ...,  0.        ,
        -0.66666667, -0.52393667],
       ...,
       [-0.5       ,  0.        ,  0.5       , ...,  0.5       ,
         0.        ,  0.74134939],
       [ 0.5       ,  0.        , -0.5       , ...,  0.5       ,
        -0.33333333,  0.64729337],
       [ 1.        , -0.5       ,  0.        , ...,  0.        ,
         0.33333333,  0.93305597]])

In [46]:
# Conventional aliases for the model inputs: the scaled feature matrices and
# the price target.
X_train, X_test, y_train = training_extra_scaled, test_scaled, target

In [47]:
# Checkpoint file that is read before training and overwritten afterwards.
MODEL_PATH = './outputs/catboost_extra.model'

clf = CatBoostRegressor(n_estimators=1000, learning_rate=0.01, max_depth=5, random_state=42)

if os.path.exists(MODEL_PATH):
    # Resume from the previously saved model.
    # NOTE(review): because the checkpoint is loaded, trained further and saved
    # back to the same path, the result presumably depends on how many times
    # this cell has been executed; delete the file for a clean run.
    clf.load_model(MODEL_PATH)
    clf.fit(X_train, y_train, init_model=clf)
else:
    # No checkpoint yet (fresh checkout or file removed): train from scratch
    # instead of failing inside load_model.
    clf.fit(X_train, y_train)

# Make sure the output directory exists before writing the checkpoint.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
clf.save_model(MODEL_PATH)

predictions = clf.predict(X_test)
predictions

0:	learn: 38.8788724	total: 415ms	remaining: 6m 54s
1:	learn: 38.8788624	total: 767ms	remaining: 6m 22s
2:	learn: 38.8788523	total: 1.12s	remaining: 6m 13s
3:	learn: 38.8788468	total: 1.46s	remaining: 6m 3s
4:	learn: 38.8788419	total: 1.79s	remaining: 5m 55s
5:	learn: 38.8788268	total: 2.02s	remaining: 5m 34s
6:	learn: 38.8788206	total: 2.3s	remaining: 5m 26s
7:	learn: 38.8788084	total: 2.53s	remaining: 5m 13s
8:	learn: 38.8788010	total: 2.77s	remaining: 5m 5s
9:	learn: 38.8787932	total: 3s	remaining: 4m 57s
10:	learn: 38.8787876	total: 3.25s	remaining: 4m 51s
11:	learn: 38.8787737	total: 3.52s	remaining: 4m 50s
12:	learn: 38.8787579	total: 3.75s	remaining: 4m 44s
13:	learn: 38.8787417	total: 3.99s	remaining: 4m 41s
14:	learn: 38.8787315	total: 4.26s	remaining: 4m 39s
15:	learn: 38.8787012	total: 4.51s	remaining: 4m 37s
16:	learn: 38.8786638	total: 4.8s	remaining: 4m 37s
17:	learn: 38.8786549	total: 5.07s	remaining: 4m 36s
18:	learn: 38.8786466	total: 5.33s	remaining: 4m 35s
19:	learn:

array([81.16766817, 82.372491  , 82.02632169, ..., 82.93965036,
       81.91704158, 81.52676417])

In [48]:
# Score the fitted model on the same data it was trained on; no held-out set
# is involved, so this is not an estimate of performance on unseen data.
# NOTE(review): presumably `score` returns R^2 for a regressor - confirm
# against the CatBoost documentation before interpreting the number.
train_score = clf.score(X_train, y_train)
train_score

0.0035084167962884383

In [49]:
# The `id` column was dropped from `test`, so the file is read again to pair
# each id with its prediction.
submission_test = pd.read_csv("./inputs/test.csv")
output = pd.DataFrame({'id': submission_test["id"]}).assign(Price=predictions)

# Persist the two-column submission without the pandas index.
output.to_csv('./outputs/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [50]:
!kaggle competitions submit -c playground-series-s5e2 -f ./outputs/submission.csv -m f"{clf.__class__.__name__} with {clf.get_param("n_estimators")} estimators, a learning rate of {clf.get_param("learning_rate")}, max depth of {clf.get_param("max_depth")} and a random state of {clf.get_param("random_state")}"

Successfully submitted to Backpack Prediction Challenge



  0%|          | 0.00/4.93M [00:00<?, ?B/s]
  0%|          | 16.0k/4.93M [00:00<00:52, 98.0kB/s]
 11%|#1        | 576k/4.93M [00:00<00:04, 973kB/s]  
 50%|#####     | 2.48M/4.93M [00:01<00:00, 2.92MB/s]
 56%|#####5    | 2.75M/4.93M [00:01<00:01, 1.58MB/s]
 59%|#####9    | 2.92M/4.93M [00:01<00:01, 1.44MB/s]
 62%|######2   | 3.06M/4.93M [00:02<00:01, 1.43MB/s]
 75%|#######5  | 3.70M/4.93M [00:02<00:00, 2.08MB/s]
100%|##########| 4.93M/4.93M [00:03<00:00, 1.71MB/s]
