In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import root_mean_squared_error
from sklearn.decomposition import PCA
from catboost import CatBoostRegressor
import matplotlib.pyplot as plt
import os

# Print the path of every file found under the input directory, in the
# order os.walk visits them, so the available data files are visible.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('./inputs/')
    for name in names
)
for path in input_paths:
    print(path)

./inputs/sample_submission.csv
./inputs/test.csv
./inputs/train.csv
./inputs/training_extra.csv


In [2]:
# Load the extra training data and split it into features and target.
df = pd.read_csv('./inputs/training_extra.csv')

# pop() removes "Price" from the frame and hands it back as the target series.
target = df.pop("Price")

# "id" is removed from the feature frame as well.
del df["id"]

# training_extra is the same object as df (no copy is made).
training_extra = df
training_extra

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,Under Armour,Canvas,Small,10.0,Yes,Yes,Tote,Blue,23.882052
1,Puma,Polyester,Small,4.0,No,Yes,Backpack,Green,11.869095
2,Jansport,Polyester,Small,8.0,Yes,Yes,Tote,Red,8.092302
3,Nike,Nylon,Large,7.0,No,No,Messenger,Pink,7.719581
4,Nike,Leather,Large,9.0,No,Yes,Tote,Green,22.741826
...,...,...,...,...,...,...,...,...,...
3694313,Nike,Canvas,,3.0,Yes,Yes,Messenger,Blue,28.098120
3694314,Puma,Leather,Small,10.0,Yes,Yes,Tote,Blue,17.379531
3694315,Jansport,Canvas,Large,10.0,No,No,Backpack,Red,17.037708
3694316,Puma,Canvas,,2.0,No,No,Backpack,Gray,28.783339


In [3]:
# Integer-encode every column except the two numeric ones. A new
# LabelEncoder is fit for each column, and the column is overwritten in place
# with its codes.
# NOTE(review): the encoders are not kept, so the value -> code mapping used
# here cannot be re-applied to other data later — confirm this is intended.
for column in training_extra:
    if column in ("Weight Capacity (kg)", "Compartments"):
        continue
    encoder = LabelEncoder()
    training_extra[column] = encoder.fit_transform(training_extra[column])

training_extra

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,4,0,2,10.0,1,1,2,1,23.882052
1,3,3,2,4.0,0,1,0,3,11.869095
2,1,3,2,8.0,1,1,2,5,8.092302
3,2,2,0,7.0,0,0,1,4,7.719581
4,2,1,0,9.0,0,1,2,3,22.741826
...,...,...,...,...,...,...,...,...,...
3694313,2,0,3,3.0,1,1,1,1,28.098120
3694314,3,1,2,10.0,1,1,2,1,17.379531
3694315,1,0,0,10.0,0,0,0,5,17.037708
3694316,3,0,3,2.0,0,0,0,2,28.783339


In [4]:
# For every column whose largest value is below 8, replace that largest value
# with NaN. Columns whose largest value is 8 or more are left untouched.
# NOTE(review): this presumably relies on the encoding step having assigned
# the highest code to missing values; a column without missing values would
# lose its last real category here — confirm.
for column in training_extra:
    values = training_extra[column]
    highest = sorted(values.unique())[-1]
    if highest < 8:
        training_extra[column] = np.where(values == highest, np.nan, values)

training_extra

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,4.0,0.0,2.0,10.0,1.0,1.0,2.0,1.0,23.882052
1,3.0,3.0,2.0,4.0,0.0,1.0,0.0,3.0,11.869095
2,1.0,3.0,2.0,8.0,1.0,1.0,2.0,5.0,8.092302
3,2.0,2.0,0.0,7.0,0.0,0.0,1.0,4.0,7.719581
4,2.0,1.0,0.0,9.0,0.0,1.0,2.0,3.0,22.741826
...,...,...,...,...,...,...,...,...,...
3694313,2.0,0.0,,3.0,1.0,1.0,1.0,1.0,28.098120
3694314,3.0,1.0,2.0,10.0,1.0,1.0,2.0,1.0,17.379531
3694315,1.0,0.0,0.0,10.0,0.0,0.0,0.0,5.0,17.037708
3694316,3.0,0.0,,2.0,0.0,0.0,0.0,2.0,28.783339


In [5]:
# Load the test data and drop "id", the same column that is removed from the
# training frame.
test = pd.read_csv("./inputs/test.csv").drop(columns="id")
test

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953
...,...,...,...,...,...,...,...,...,...
199995,Adidas,Canvas,Large,2.0,Yes,No,Messenger,Red,7.383498
199996,Nike,Polyester,Small,9.0,No,Yes,Messenger,Pink,6.058394
199997,Jansport,Nylon,Small,9.0,No,Yes,Tote,Green,26.890163
199998,Puma,Nylon,Large,10.0,Yes,No,Tote,Gray,25.769153


In [6]:
# Integer-encode every test column except the two numeric ones, overwriting
# each column in place with its codes.
# NOTE(review): a fresh LabelEncoder is fit on the test column itself, not
# reused from training, so the codes presumably only agree with the training
# codes when both files contain the same set of values per column — confirm.
for column in test:
    if column in ("Weight Capacity (kg)", "Compartments"):
        continue
    encoder = LabelEncoder()
    test[column] = encoder.fit_transform(test[column])

test

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,3,1,2,2.0,0,0,2,3,20.671147
1,2,0,1,7.0,0,1,0,3,13.564105
2,0,0,0,9.0,0,1,1,1,11.809799
3,0,2,0,1.0,1,0,1,3,18.477036
4,5,2,0,2.0,1,1,2,0,9.907953
...,...,...,...,...,...,...,...,...,...
199995,0,0,0,2.0,1,0,1,5,7.383498
199996,2,3,2,9.0,0,1,1,4,6.058394
199997,1,2,2,9.0,0,1,2,3,26.890163
199998,3,2,0,10.0,1,0,2,2,25.769153


In [7]:
# For every test column whose largest value is below 8, replace that largest
# value with NaN. Columns whose largest value is 8 or more are left untouched.
# NOTE(review): this presumably relies on the encoding step having assigned
# the highest code to missing values; a column without missing values would
# lose its last real category here — confirm.
for column in test:
    values = test[column]
    highest = sorted(values.unique())[-1]
    if highest < 8:
        test[column] = np.where(values == highest, np.nan, values)

test

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,3.0,1.0,2.0,2.0,0.0,0.0,2.0,3.0,20.671147
1,2.0,0.0,1.0,7.0,0.0,1.0,0.0,3.0,13.564105
2,0.0,0.0,0.0,9.0,0.0,1.0,1.0,1.0,11.809799
3,0.0,2.0,0.0,1.0,1.0,0.0,1.0,3.0,18.477036
4,,2.0,0.0,2.0,1.0,1.0,2.0,0.0,9.907953
...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,2.0,1.0,0.0,1.0,5.0,7.383498
199996,2.0,3.0,2.0,9.0,0.0,1.0,1.0,4.0,6.058394
199997,1.0,2.0,2.0,9.0,0.0,1.0,2.0,3.0,26.890163
199998,3.0,2.0,0.0,10.0,1.0,0.0,2.0,2.0,25.769153


In [8]:
# Fit a StandardScaler on the training features and transform them in one
# step. The fitted scaler stays in `scaler` so it can be applied to the test
# features afterwards.
scaler = StandardScaler()
training_extra_scaled = scaler.fit_transform(training_extra)
training_extra_scaled

array([[ 1.3969377 , -1.38749587,  1.26037488, ...,  1.22147526,
        -0.89497746,  0.84209503],
       [ 0.69710529,  1.28669539,  1.26037488, ..., -1.24377421,
         0.28323893, -0.88030638],
       [-0.70255954,  1.28669539,  1.26037488, ...,  1.22147526,
         1.46145532, -1.42181784],
       ...,
       [-0.70255954, -1.38749587, -1.21464582, ..., -1.24377421,
         1.46145532, -0.13923775],
       [ 0.69710529, -1.38749587,         nan, ..., -1.24377421,
        -0.30586927,  1.54483491],
       [ 1.3969377 ,  1.28669539,  0.02286453, ..., -1.24377421,
        -0.89497746,  0.72654855]])

In [9]:
# Apply the scaler that was fit on the training features to the test
# features; the scaler is not re-fit here.
test_scaled = scaler.transform(test)
test_scaled

array([[ 0.69710529, -0.49609878,  1.26037488, ...,  1.22147526,
         0.28323893,  0.38171976],
       [-0.00272713, -1.38749587,  0.02286453, ..., -1.24377421,
         0.28323893, -0.63727826],
       [-1.40239196, -1.38749587, -1.21464582, ..., -0.01114947,
        -0.89497746, -0.8888082 ],
       ...,
       [-0.70255954,  0.3952983 ,  1.26037488, ...,  1.22147526,
         0.28323893,  1.27339393],
       [ 0.69710529,  0.3952983 , -1.21464582, ...,  1.22147526,
        -0.30586927,  1.11266496],
       [ 1.3969377 , -0.49609878,  0.02286453, ..., -0.01114947,
         0.87234713,  1.60099445]])

In [10]:
# Give the scaled feature arrays and the target the names used for modelling.
X_train, X_test, y_train = training_extra_scaled, test_scaled, target

In [None]:
# Continue training from the previously saved CatBoost model.
#
# The checkpoint is loaded into its own estimator rather than into `clf`.
# NOTE(review): load_model() appears to replace the receiving estimator's
# stored parameters with those saved in the file, which would silently discard
# the hyper-parameters given to the constructor; passing an estimator to its
# own fit() as init_model is also not a documented pattern. Keeping the two
# objects separate avoids both problems — confirm against the CatBoost docs.
pretrained = CatBoostRegressor()
pretrained.load_model('./outputs/catboost_extra.model')

# The estimator that is actually fit carries the hyper-parameters for this run.
clf = CatBoostRegressor(n_estimators=1000, learning_rate=0.01, max_depth=5, random_state=42)

# init_model makes fit() start from the loaded model instead of from scratch.
# verbose=100 logs every 100th iteration instead of all 1000.
clf.fit(X_train, y_train, init_model=pretrained, verbose=100)

predictions = clf.predict(X_test)
predictions

TypeError: CatBoost.plot_predictions() missing 2 required positional arguments: 'data' and 'features_to_change'

In [12]:
# Re-read the raw test file to recover the "id" column that was dropped from
# `test`, attach the predictions as "Price", and write the submission file.
submission_test = pd.read_csv("./inputs/test.csv")
output = submission_test[["id"]].assign(Price=predictions)
output.to_csv('./outputs/submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
