In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer 

In [2]:
# Load the training data; pandas infers gzip compression from the ".gz" suffix.
df = pd.read_csv("Data/train.csv.gz")

In [3]:
# Show the first ten rows as a quick sanity check of the loaded frame.
df.head(10)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312
5,5,Nike,Canvas,Medium,10.0,No,Yes,,Black,7.241812,20.01553
6,6,Nike,,Large,3.0,No,No,Backpack,Green,6.828123,84.805
7,7,Puma,Canvas,Small,1.0,Yes,Yes,Backpack,Blue,21.488864,27.15815
8,8,Under Armour,Polyester,Medium,8.0,Yes,No,Tote,Gray,10.20778,25.98652
9,9,Under Armour,Nylon,Medium,2.0,Yes,Yes,Messenger,Pink,15.8951,38.48741


In [4]:
# Drop the `id` column. Rebinding `df` (rather than inplace=True) combined with
# errors="ignore" keeps this cell idempotent: running it a second time no longer
# raises KeyError because "id" has already been removed.
df = df.drop(columns="id", errors="ignore")

In [5]:
# Display the first ten rows again, now without the `id` column.
df.head(10)

Unnamed: 0,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312
5,Nike,Canvas,Medium,10.0,No,Yes,,Black,7.241812,20.01553
6,Nike,,Large,3.0,No,No,Backpack,Green,6.828123,84.805
7,Puma,Canvas,Small,1.0,Yes,Yes,Backpack,Blue,21.488864,27.15815
8,Under Armour,Polyester,Medium,8.0,Yes,No,Tote,Gray,10.20778,25.98652
9,Under Armour,Nylon,Medium,2.0,Yes,Yes,Messenger,Pink,15.8951,38.48741


In [6]:
# Share of missing values per column: the mean of the boolean null mask
# equals (number of nulls) / (number of rows).
df.isna().mean()

Brand                   0.032350
Material                0.027823
Size                    0.021983
Compartments            0.000000
Laptop Compartment      0.024813
Waterproof              0.023500
Style                   0.026567
Color                   0.033167
Weight Capacity (kg)    0.000460
Price                   0.000000
dtype: float64

In [7]:
# Print column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300000 entries, 0 to 299999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Brand                 290295 non-null  object 
 1   Material              291653 non-null  object 
 2   Size                  293405 non-null  object 
 3   Compartments          300000 non-null  float64
 4   Laptop Compartment    292556 non-null  object 
 5   Waterproof            292950 non-null  object 
 6   Style                 292030 non-null  object 
 7   Color                 290050 non-null  object 
 8   Weight Capacity (kg)  299862 non-null  float64
 9   Price                 300000 non-null  float64
dtypes: float64(3), object(7)
memory usage: 22.9+ MB


In [8]:
# Split the frame into the Price column (y) and all remaining columns (X).
y = df["Price"]
X = df.drop(columns="Price")

In [9]:
# Shuffle and keep 80% of the rows for training; the fixed seed makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.80,
    shuffle=True,
    random_state=42,
)

In [10]:
def segregation_cols(df):
    """Split the columns of ``df`` into categorical, binary and numeric groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be grouped.

    Returns
    -------
    obj_df : list
        Labels of object-dtype columns that do NOT have exactly two distinct
        non-null values.
    boolean_column : list
        Labels of object-dtype columns with exactly two distinct non-null
        values.
    num_cols : list
        Labels of the numeric columns.
    """
    object_cols = df.select_dtypes(include=["object"]).columns.tolist()

    # "number" matches every integer/float width. The earlier
    # ["int64", "float64"] filter silently skipped int32/float32/uint columns,
    # which then belonged to none of the three groups. Timedeltas are excluded
    # explicitly because pandas matches them under np.number as well.
    num_cols = df.select_dtypes(
        include=["number"], exclude=["timedelta"]
    ).columns.tolist()

    # nunique() ignores NaN by default, so a two-valued column that also has
    # missing entries is still classified as binary.
    boolean_column = [col for col in object_cols if df[col].nunique() == 2]

    obj_df = [col for col in object_cols if col not in boolean_column]

    return obj_df, boolean_column, num_cols
         

In [11]:
# Derive the three column groups from the training split only.
obj_df, boolean_df, num_cols = segregation_cols(X_train)

In [19]:
# Categorical branch: fill missing entries with the most frequent value, then
# one-hot encode. handle_unknown="ignore" encodes categories unseen during fit
# as all zeros instead of raising.
obj_prep = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [21]:
# Binary branch: fill missing entries with the most frequent value, then
# one-hot encode; drop="if_binary" keeps a single 0/1 column for features
# with exactly two categories.
# NOTE(review): unlike obj_prep, handle_unknown is left at its default
# ("error"), so a category unseen during fit raises at transform time.
boolean_prep = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(drop="if_binary")),
    ]
)

In [22]:
# Numeric branch: fill missing entries with the column median, then
# standardise. The scaler step keeps the name "encoder" so existing
# parameter paths (e.g. "encoder__with_mean") stay valid.
num_prep = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("encoder", StandardScaler()),
    ]
)

In [24]:
# Route each column group through its own preprocessing pipeline. Columns not
# listed in any group are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_prep, num_cols),
        ("obj", obj_prep, obj_df),
        ("boo", boolean_prep, boolean_df),
    ]
)