Importing the dependencies


In [97]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder


importing the data

In [98]:
train_data = pd.read_csv(r'Datasets/train.csv')
test_data = pd.read_csv(r'Datasets/test.csv')


In [99]:
train_data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [100]:
test_data.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953


In [101]:
print('Train data shape:',train_data.shape)
print('Train labels shape:',test_data.shape)

Train data shape: (300000, 11)
Train labels shape: (200000, 10)


In [102]:
#Extra training data
train_data_extra = pd.read_csv(r'Datasets/training_extra.csv')

In [103]:
train_data_extra.head()

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,500000,Under Armour,Canvas,Small,10.0,Yes,Yes,Tote,Blue,23.882052,114.11068
1,500001,Puma,Polyester,Small,4.0,No,Yes,Backpack,Green,11.869095,129.74972
2,500002,Jansport,Polyester,Small,8.0,Yes,Yes,Tote,Red,8.092302,21.3737
3,500003,Nike,Nylon,Large,7.0,No,No,Messenger,Pink,7.719581,48.09209
4,500004,Nike,Leather,Large,9.0,No,Yes,Tote,Green,22.741826,77.32461


In [104]:
print('Extra training data:',train_data_extra.shape)

Extra training data: (3694318, 11)


Preprocessing 

In [105]:
train_data.isnull().sum()

id                         0
Brand                   9705
Material                8347
Size                    6595
Compartments               0
Laptop Compartment      7444
Waterproof              7050
Style                   7970
Color                   9950
Weight Capacity (kg)     138
Price                      0
dtype: int64

In [106]:
test_data.isnull().sum()

id                         0
Brand                   6227
Material                5613
Size                    4381
Compartments               0
Laptop Compartment      4962
Waterproof              4811
Style                   5153
Color                   6785
Weight Capacity (kg)      77
dtype: int64

In [107]:
#Let us merge both train and train_extra datas
final_data = pd.concat([train_data, train_data_extra], ignore_index=True)


In [108]:
test_data

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953
...,...,...,...,...,...,...,...,...,...,...
199995,499995,Adidas,Canvas,Large,2.0,Yes,No,Messenger,Red,7.383498
199996,499996,Nike,Polyester,Small,9.0,No,Yes,Messenger,Pink,6.058394
199997,499997,Jansport,Nylon,Small,9.0,No,Yes,Tote,Green,26.890163
199998,499998,Puma,Nylon,Large,10.0,Yes,No,Tote,Gray,25.769153


In [109]:
def impute_nulls_for_all_columns(data):
    """
    Impute missing values for all columns in the DataFrame.
    Numerical columns will be imputed with the mean,
    categorical columns will be imputed with the mode (most frequent value).
    
    Parameters:
    - train_data: pandas DataFrame containing the data
    
    Returns:
    - train_data: DataFrame with imputed values in all columns
    """
    
    for column_name in data.columns:
        # Check if the column is numeric
        if data[column_name].dtype in ['int64', 'float64']:
            # Impute with mean for numerical columns
            imputer = SimpleImputer(strategy='mean')
            data[column_name] = imputer.fit_transform(data[[column_name]]).flatten()  # Flatten to 1D
        else:
            # Impute with mode (most frequent value) for categorical columns
            imputer = SimpleImputer(strategy='most_frequent')
            data[column_name] = imputer.fit_transform(data[[column_name]]).flatten()  # Flatten to 1D
    
    return data


In [110]:
def impute_nulls_for_all_columns_test(data):
    """
    Impute missing values for all columns in the DataFrame.
    Numerical columns will be imputed with the mean,
    categorical columns will be imputed with the mode (most frequent value).
    
    Parameters:
    - train_data: pandas DataFrame containing the data
    
    Returns:
    - train_data: DataFrame with imputed values in all columns
    """
    
    for column_name in data.columns:
        # Check if the column is numeric
        if data[column_name].dtype in ['int64', 'float64']:
            # Impute with mean for numerical columns
            imputer = SimpleImputer(strategy='mean')
            data[column_name] = imputer.fit_transform(data[[column_name]]).flatten()  # Flatten to 1D
        else:
            # Impute with mode (most frequent value) for categorical columns
            imputer = SimpleImputer(strategy='most_frequent')
            data[column_name] = imputer.fit_transform(data[[column_name]]).flatten()  # Flatten to 1D
    
    return data


In [111]:
final_data = impute_nulls_for_all_columns(final_data)
test_data = impute_nulls_for_all_columns_test(test_data)

In [112]:
test_data

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000.0,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001.0,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002.0,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003.0,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004.0,Adidas,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953
...,...,...,...,...,...,...,...,...,...,...
199995,499995.0,Adidas,Canvas,Large,2.0,Yes,No,Messenger,Red,7.383498
199996,499996.0,Nike,Polyester,Small,9.0,No,Yes,Messenger,Pink,6.058394
199997,499997.0,Jansport,Nylon,Small,9.0,No,Yes,Tote,Green,26.890163
199998,499998.0,Puma,Nylon,Large,10.0,Yes,No,Tote,Gray,25.769153


In [113]:
final_data.isnull().sum()

id                      0
Brand                   0
Material                0
Size                    0
Compartments            0
Laptop Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight Capacity (kg)    0
Price                   0
dtype: int64

In [114]:
test_data

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000.0,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001.0,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002.0,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003.0,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004.0,Adidas,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953
...,...,...,...,...,...,...,...,...,...,...
199995,499995.0,Adidas,Canvas,Large,2.0,Yes,No,Messenger,Red,7.383498
199996,499996.0,Nike,Polyester,Small,9.0,No,Yes,Messenger,Pink,6.058394
199997,499997.0,Jansport,Nylon,Small,9.0,No,Yes,Tote,Green,26.890163
199998,499998.0,Puma,Nylon,Large,10.0,Yes,No,Tote,Gray,25.769153


In [115]:
#Label Encoder
label = LabelEncoder()

In [116]:
#Performing Label encoding
final_data['Brand'] = label.fit_transform(final_data['Brand'])
final_data['Material'] = label.fit_transform(final_data['Material'])
final_data['Size'] = label.fit_transform(final_data['Size'])
final_data['Laptop Compartment'] = label.fit_transform(final_data['Laptop Compartment'])
final_data['Waterproof'] = label.fit_transform(final_data['Waterproof'])
final_data['Style'] = label.fit_transform(final_data['Style'])
final_data['Color'] = label.fit_transform(final_data['Color'])

In [117]:
#Performing Label encoding
test_data['Brand'] = label.fit_transform(test_data['Brand'])
test_data['Material'] = label.fit_transform(test_data['Material'])
test_data['Size'] = label.fit_transform(test_data['Size'])
test_data['Laptop Compartment'] = label.fit_transform(test_data['Laptop Compartment'])
test_data['Waterproof'] = label.fit_transform(test_data['Waterproof'])
test_data['Style'] = label.fit_transform(test_data['Style'])
test_data['Color'] = label.fit_transform(test_data['Color'])

In [118]:
test_data.isnull().sum()

id                      0
Brand                   0
Material                0
Size                    0
Compartments            0
Laptop Compartment      0
Waterproof              0
Style                   0
Color                   0
Weight Capacity (kg)    0
dtype: int64

In [119]:
final_data

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0.0,1,1,1,7.0,1,0,2,0,11.611723,112.15875
1,1.0,1,0,2,10.0,1,1,1,3,27.078537,68.88056
2,2.0,4,1,2,2.0,1,0,1,5,16.643760,39.17320
3,3.0,2,2,2,8.0,1,0,1,3,12.937220,80.60793
4,4.0,0,0,1,1.0,1,1,1,3,17.749338,86.02312
...,...,...,...,...,...,...,...,...,...,...,...
3994313,4194313.0,2,0,1,3.0,1,1,1,1,28.098120,104.74460
3994314,4194314.0,3,1,2,10.0,1,1,2,1,17.379531,122.39043
3994315,4194315.0,1,0,0,10.0,0,0,0,5,17.037708,148.18470
3994316,4194316.0,3,0,1,2.0,0,0,0,2,28.783339,22.32269


In [120]:
test_data

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000.0,3,1,2,2.0,0,0,2,3,20.671147
1,300001.0,2,0,1,7.0,0,1,0,3,13.564105
2,300002.0,0,0,0,9.0,0,1,1,1,11.809799
3,300003.0,0,2,0,1.0,1,0,1,3,18.477036
4,300004.0,0,2,0,2.0,1,1,2,0,9.907953
...,...,...,...,...,...,...,...,...,...,...
199995,499995.0,0,0,0,2.0,1,0,1,5,7.383498
199996,499996.0,2,3,2,9.0,0,1,1,4,6.058394
199997,499997.0,1,2,2,9.0,0,1,2,3,26.890163
199998,499998.0,3,2,0,10.0,1,0,2,2,25.769153


In [121]:
test_data.drop(columns='id',inplace=True)

In [122]:
final_data

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0.0,1,1,1,7.0,1,0,2,0,11.611723,112.15875
1,1.0,1,0,2,10.0,1,1,1,3,27.078537,68.88056
2,2.0,4,1,2,2.0,1,0,1,5,16.643760,39.17320
3,3.0,2,2,2,8.0,1,0,1,3,12.937220,80.60793
4,4.0,0,0,1,1.0,1,1,1,3,17.749338,86.02312
...,...,...,...,...,...,...,...,...,...,...,...
3994313,4194313.0,2,0,1,3.0,1,1,1,1,28.098120,104.74460
3994314,4194314.0,3,1,2,10.0,1,1,2,1,17.379531,122.39043
3994315,4194315.0,1,0,0,10.0,0,0,0,5,17.037708,148.18470
3994316,4194316.0,3,0,1,2.0,0,0,0,2,28.783339,22.32269


In [123]:
"""from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse_output=False)
encoded_array = encoder.fit_transform(final_data[['Laptop Compartment', 'Waterproof', 'Style', 'Color']])

# Create a DataFrame with the encoded values
encoded_columns = encoder.get_feature_names_out(['Laptop Compartment', 'Waterproof', 'Style', 'Color'])
final_data = pd.DataFrame(encoded_array, columns=encoded_columns)"""

"from sklearn.preprocessing import OneHotEncoder\n\nencoder = OneHotEncoder(sparse_output=False)\nencoded_array = encoder.fit_transform(final_data[['Laptop Compartment', 'Waterproof', 'Style', 'Color']])\n\n# Create a DataFrame with the encoded values\nencoded_columns = encoder.get_feature_names_out(['Laptop Compartment', 'Waterproof', 'Style', 'Color'])\nfinal_data = pd.DataFrame(encoded_array, columns=encoded_columns)"

In [124]:
"final_data = pd.get_dummies(final_data, columns=['Laptop Compartment', 'Waterproof'])"

"final_data = pd.get_dummies(final_data, columns=['Laptop Compartment', 'Waterproof'])"

In [125]:
"test_data = pd.get_dummies(test_data, columns=['Laptop Compartment', 'Waterproof'])"

"test_data = pd.get_dummies(test_data, columns=['Laptop Compartment', 'Waterproof'])"

In [126]:
final_data

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0.0,1,1,1,7.0,1,0,2,0,11.611723,112.15875
1,1.0,1,0,2,10.0,1,1,1,3,27.078537,68.88056
2,2.0,4,1,2,2.0,1,0,1,5,16.643760,39.17320
3,3.0,2,2,2,8.0,1,0,1,3,12.937220,80.60793
4,4.0,0,0,1,1.0,1,1,1,3,17.749338,86.02312
...,...,...,...,...,...,...,...,...,...,...,...
3994313,4194313.0,2,0,1,3.0,1,1,1,1,28.098120,104.74460
3994314,4194314.0,3,1,2,10.0,1,1,2,1,17.379531,122.39043
3994315,4194315.0,1,0,0,10.0,0,0,0,5,17.037708,148.18470
3994316,4194316.0,3,0,1,2.0,0,0,0,2,28.783339,22.32269


In [127]:
test_data

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,3,1,2,2.0,0,0,2,3,20.671147
1,2,0,1,7.0,0,1,0,3,13.564105
2,0,0,0,9.0,0,1,1,1,11.809799
3,0,2,0,1.0,1,0,1,3,18.477036
4,0,2,0,2.0,1,1,2,0,9.907953
...,...,...,...,...,...,...,...,...,...
199995,0,0,0,2.0,1,0,1,5,7.383498
199996,2,3,2,9.0,0,1,1,4,6.058394
199997,1,2,2,9.0,0,1,2,3,26.890163
199998,3,2,0,10.0,1,0,2,2,25.769153


Splitting the training data

In [128]:
X = final_data.drop(columns=['Price','id'])
Y = final_data['Price']

In [129]:
test_data.shape

(200000, 9)

In [130]:
test_data

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,3,1,2,2.0,0,0,2,3,20.671147
1,2,0,1,7.0,0,1,0,3,13.564105
2,0,0,0,9.0,0,1,1,1,11.809799
3,0,2,0,1.0,1,0,1,3,18.477036
4,0,2,0,2.0,1,1,2,0,9.907953
...,...,...,...,...,...,...,...,...,...
199995,0,0,0,2.0,1,0,1,5,7.383498
199996,2,3,2,9.0,0,1,1,4,6.058394
199997,1,2,2,9.0,0,1,2,3,26.890163
199998,3,2,0,10.0,1,0,2,2,25.769153


Train Test Split

In [131]:
from sklearn.model_selection import train_test_split

In [132]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

Model Implementation

In [133]:
import xgboost as xgb

In [134]:
model = xgb.XGBRegressor()


In [135]:
model.fit(X_train,y_train)

In [136]:
y_pred = model.predict(X_test)

In [137]:
from sklearn.metrics import mean_squared_error
import numpy as np

In [138]:
mse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 38.85


In [139]:
y_test_pred = model.predict(test_data)

In [140]:
sample = pd.read_csv('Datasets/test.csv')

In [141]:
Final_prediction = pd.DataFrame({
    'id': sample['id'][:len(y_test_pred)],  # Align lengths
    'Price': y_test_pred
})

In [142]:
Final_prediction

Unnamed: 0,id,Price
0,300000,79.604378
1,300001,82.710045
2,300002,83.804802
3,300003,80.330254
4,300004,81.719788
...,...,...
199995,499995,80.409538
199996,499996,77.834892
199997,499997,82.097618
199998,499998,81.781395


In [143]:
Final_prediction.to_csv("Datasets/Final_Submission.csv", index=False)
