In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 

In [2]:
df = pd.read_csv("Data/train.csv.gz")

In [3]:
df.head(10)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312
5,5,Nike,Canvas,Medium,10.0,No,Yes,,Black,7.241812,20.01553
6,6,Nike,,Large,3.0,No,No,Backpack,Green,6.828123,84.805
7,7,Puma,Canvas,Small,1.0,Yes,Yes,Backpack,Blue,21.488864,27.15815
8,8,Under Armour,Polyester,Medium,8.0,Yes,No,Tote,Gray,10.20778,25.98652
9,9,Under Armour,Nylon,Medium,2.0,Yes,Yes,Messenger,Pink,15.8951,38.48741


In [4]:
df.drop("id", axis = 1, inplace = True)

In [5]:
df.head(10)

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312
5,Nike,Canvas,Medium,10.0,No,Yes,,Black,7.241812,20.01553
6,Nike,,Large,3.0,No,No,Backpack,Green,6.828123,84.805
7,Puma,Canvas,Small,1.0,Yes,Yes,Backpack,Blue,21.488864,27.15815
8,Under Armour,Polyester,Medium,8.0,Yes,No,Tote,Gray,10.20778,25.98652
9,Under Armour,Nylon,Medium,2.0,Yes,Yes,Messenger,Pink,15.8951,38.48741


In [6]:
(df.isnull().sum())/len(df)

Brand                   0.032350
Material                0.027823
Size                    0.021983
Compartments            0.000000
Laptop Compartment      0.024813
Waterproof              0.023500
Style                   0.026567
Color                   0.033167
Weight Capacity (kg)    0.000460
Price                   0.000000
dtype: float64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Brand                 290295 non-null  object 
 1   Material              291653 non-null  object 
 2   Size                  293405 non-null  object 
 3   Compartments          300000 non-null  float64
 4   Laptop Compartment    292556 non-null  object 
 5   Waterproof            292950 non-null  object 
 6   Style                 292030 non-null  object 
 7   Color                 290050 non-null  object 
 8   Weight Capacity (kg)  299862 non-null  float64
 9   Price                 300000 non-null  float64
dtypes: float64(3), object(7)
memory usage: 22.9+ MB


In [8]:
df_sample = df.sample(n=100, random_state=42)

In [9]:
X = df_sample.drop("Price", axis = 1)
y = df_sample["Price"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.80, shuffle = True, random_state = 42)

In [11]:
def segregation_cols(df):

    obj_df = df.select_dtypes(include = ["object"]).columns.tolist()
    num_cols = df.select_dtypes(include = ["int64", "float64"]).columns.tolist()
    
    
    boolean_column = [col for col in obj_df if (df[col].nunique() == 2)]
            
    obj_df = [col for col in obj_df if (col not in boolean_column)]

    return obj_df, boolean_column, num_cols
         

In [12]:
obj_df, boolean_df, num_cols = segregation_cols(X_train)

In [13]:
obj_prep = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown = "ignore"))
     ])

In [14]:
boolean_prep = Pipeline(steps =[
    ("imputer", SimpleImputer(strategy = "most_frequent")),
     ("encoder", OneHotEncoder(drop = "if_binary"))
     ])

In [15]:
num_prep = Pipeline(steps = [
    ("imputer", SimpleImputer(strategy = "median")),
    ("encoder", StandardScaler())
])

In [16]:
preprocessor = ColumnTransformer([
    ('num', num_prep, num_cols),
    ('obj', obj_prep, obj_df),
    ('boo', boolean_prep, boolean_df)
])

In [17]:
pipeline = Pipeline([
    ("preprocessor", preprocessor)
])

In [18]:
pipeline

In [19]:
X_train = pipeline.fit_transform(X_train)

In [20]:
X_train_pd = pd.DataFrame(X_train)

# Transpose to the Necessary dimensions

In [21]:
X_trainT = X_train.T
y_trainT = y_train.values.reshape(1,-1)

In [22]:
print(f"shape of X-Train: {X_trainT.shape}")
print(f"shape of y-Train: {y_trainT.shape}")

shape of X-Train: (25, 80)
shape of y-Train: (1, 80)


# Initialising Gradients

In [23]:
def initialise_gradients(n_h, n_f, n_y):
    np.random.seed(1)
    W1 = np.random.randn(n_h, n_f)*0.01
    b1 = np.zeros((n_h, 1))
    W2 = np.random.randn(n_y, n_h)*0.01
    b2 = np.zeros((n_y,1))

    parameters = {"W1": W1, "b1": b1, "W2": W2, "b2": b2}

    return parameters

# Activations

In [24]:
def Relu(Z):
    return np.maximum(0, Z)

# Derivative

In [25]:
def ReluDerivative(Z):
    return (Z>0).astype(float)

# Forward propagation

In [26]:
def forward_propation(X, parameters):
    W1, W2, b1, b2 = parameters["W1"], parameters["W2"], parameters["b1"], parameters["b2"]

    Z1 = np.dot(W1, X) + b1
    a1 = Relu(Z1)
    Z2 = np.dot(W2, a1) + b2

    cache = {"Z1": Z1, "Z2": Z2, "a1": a1}

    return Z2, cache 

# Back Propagation

In [27]:
def back_propagation(X, y, parameters, cache):

    W2 = parameters["W2"]
    Z1, Z2, a1 = cache["Z1"], cache["Z2"], cache["a1"]
    m = X.shape[1]

    dZ2 = (2/m)*(Z2 - y)
    dW2 = (1/m)* np.dot(dZ2, a1.T)
    db2 = (1/m)* np.sum(dZ2, axis = 1, keepdims = True)

    da1 = np.dot(W2.T, dZ2)
    dZ1 = da1 * ReluDerivative(Z1)
    dW1 = (1/m)*np.dot(dZ1, X.T)
    db1 = (1/m)*np.sum(dZ1, axis = 1, keepdims = True)

    gradients = {"dW1" : dW1, "db1": db1, "dW2": dW2, "db2": db2}

    return gradients 
        
    
    
    

# Cost Function

In [28]:
def cost_function(Z2, y, cache):
    m = y.shape[1]
    cost = (1/m)*np.sum((Z2 - y)**2)

    return cost
    

# Update parameters

In [29]:
def update_parameters(parameters, gradients, learning_rate):
    parameters["W1"] -= learning_rate * gradients["dW1"]
    parameters["b1"] -= learning_rate * gradients["db1"]

    parameters["W2"] -= learning_rate * gradients["dW2"]
    parameters["b2"] -= learning_rate * gradients["db2"]

    return parameters
    



    

In [30]:
def train(X, y, n_neurons, Iterations, Learning_rate):
    np.random.seed(1)
    print(Iterations)

    n_features = X.shape[0]
    output_neurons = y.shape[1]
    

    parameters = initialise_gradients(n_neurons, n_features, output_neurons)
    loss = []
    

    for i in range(Iterations):

        Z2, cache = forward_propation(X, parameters)
        cost = cost_function(Z2, y, cache)
        gradients = back_propagation(X, y, parameters, cache)

        parameters = update_parameters(parameters, gradients, Learning_rate)

        if (i%100 == 0):
            loss.append(cost)

            print(f"epoch {i} loss = {cost:.4f}")



    return parameters, loss



In [31]:
parameters, loss = train(X_trainT, y_trainT, 10, 10000, 0.01)

10000
epoch 0 loss = 640770.2485
epoch 100 loss = 97170.5800
epoch 200 loss = 96543.1244
epoch 300 loss = 96502.3330
epoch 400 loss = 96490.6482
epoch 500 loss = 96473.8746
epoch 600 loss = 96419.9669
epoch 700 loss = 96227.7213
epoch 800 loss = 95549.6122
epoch 900 loss = 93260.7986
epoch 1000 loss = 86746.6452
epoch 1100 loss = 74843.5859
epoch 1200 loss = 63232.0948
epoch 1300 loss = 55780.0802
epoch 1400 loss = 49089.6379
epoch 1500 loss = 46019.0315
epoch 1600 loss = 42325.2230
epoch 1700 loss = 37547.2601
epoch 1800 loss = 34686.0649
epoch 1900 loss = 32936.3376
epoch 2000 loss = 31746.3971
epoch 2100 loss = 29772.2560
epoch 2200 loss = 27703.9350
epoch 2300 loss = 25787.0909
epoch 2400 loss = 23457.2849
epoch 2500 loss = 21254.4940
epoch 2600 loss = 18986.2498
epoch 2700 loss = 16716.6803
epoch 2800 loss = 13775.7213
epoch 2900 loss = 11283.5343
epoch 3000 loss = 9401.8518
epoch 3100 loss = 8144.2803
epoch 3200 loss = 7263.0311
epoch 3300 loss = 6546.3294
epoch 3400 loss = 6075.

In [38]:
test_data = pd.read_csv("Data/test.csv.gz")

In [39]:
test_data

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg)
0,300000,Puma,Leather,Small,2.0,No,No,Tote,Green,20.671147
1,300001,Nike,Canvas,Medium,7.0,No,Yes,Backpack,Green,13.564105
2,300002,Adidas,Canvas,Large,9.0,No,Yes,Messenger,Blue,11.809799
3,300003,Adidas,Nylon,Large,1.0,Yes,No,Messenger,Green,18.477036
4,300004,,Nylon,Large,2.0,Yes,Yes,Tote,Black,9.907953
...,...,...,...,...,...,...,...,...,...,...
199995,499995,Adidas,Canvas,Large,2.0,Yes,No,Messenger,Red,7.383498
199996,499996,Nike,Polyester,Small,9.0,No,Yes,Messenger,Pink,6.058394
199997,499997,Jansport,Nylon,Small,9.0,No,Yes,Tote,Green,26.890163
199998,499998,Puma,Nylon,Large,10.0,Yes,No,Tote,Gray,25.769153


In [55]:
test_transformed = pipeline.transform(test_data)

In [45]:
# Perform forward propagation to get predictions
Z2_test, _ = forward_propation(test.T, parameters)  # Transpose for consistency
predictions = Z2_test.flatten()  # Convert to 1D array


In [51]:
test_ids = np.arange(300000, 300000 + test.shape[0])


In [58]:
# Ensure test_transformed has the correct shape
print(f"Shape of test_transformed before prediction: {test_transformed.shape}")

# Generate predictions
Z2, _ = forward_propation(test_transformed.T, parameters)

# Ensure correct shape
print(f"Shape of Z2 (Predictions): {Z2.shape}")

# Fix predictions shape
predictions = Z2[0, :]  # Extract only first row
print(f"Final Predictions shape: {predictions.shape}")  # Should be (200000,)

# Ensure `test_ids` has the same length
test_ids = np.arange(300000, 300000 + len(test_transformed))

# Create submission DataFrame
submission_df = pd.DataFrame({"id": test_ids, "Price": predictions})

# Save to CSV
submission_df.to_csv("submission.csv", index=False)

# Final check
print(f"Submission file created with shape: {submission_df.shape}")


Shape of test_transformed before prediction: (200000, 25)
Shape of Z2 (Predictions): (80, 200000)
Final Predictions shape: (200000,)
Submission file created with shape: (200000, 2)


In [53]:
print(f"Length of test_ids: {len(test_ids)}")
print(f"Length of predictions: {len(predictions)}")


Length of test_ids: 200000
Length of predictions: 16000000
