# Automated Pre-processing with Scikit-learn Pipeline

In [128]:
from warnings import filterwarnings
filterwarnings("ignore")

### Step 1 - Read the dataset

In [129]:
import pandas as pd
# Load the Cars93 data. With keep_default_na=False, only the values listed in
# na_values (the empty string and "NA") are parsed as missing.
df = pd.read_csv(
    "Cars93.csv",
    keep_default_na=False,
    na_values=["", "NA"],
)
df.head()


Unnamed: 0,id,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Weight,Origin,Make
0,1,Acura,Integra,Small,12.9,15.9,18.8,25,31,,...,5,177,102,68,37,26.5,11.0,2705,non-USA,Acura Integra
1,2,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,...,5,195,115,71,38,30.0,15.0,3560,non-USA,Acura Legend
2,3,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,...,5,180,102,67,37,28.0,14.0,3375,non-USA,Audi 90
3,4,Audi,100,Midsize,30.8,37.7,44.6,19,26,,...,6,193,106,70,37,31.0,17.0,3405,non-USA,Audi 100
4,5,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,...,4,186,109,69,39,27.0,13.0,3640,non-USA,BMW 535i


### Step 2 - Perform basic data quality checks

In [130]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  93 non-null     int64  
 1   Manufacturer        93 non-null     object 
 2   Model               93 non-null     object 
 3   Type                93 non-null     object 
 4   Min.Price           93 non-null     float64
 5   Price               93 non-null     float64
 6   Max.Price           93 non-null     float64
 7   MPG.city            93 non-null     int64  
 8   MPG.highway         93 non-null     int64  
 9   AirBags             89 non-null     object 
 10  DriveTrain          93 non-null     object 
 11  Cylinders           93 non-null     object 
 12  EngineSize          93 non-null     float64
 13  Horsepower          93 non-null     int64  
 14  RPM                 93 non-null     int64  
 15  Rev.per.mile        93 non-null     int64  
 16  Man.trans.

In [131]:
# Per-column count of missing values, filtered to the columns that have any.
m = df.isnull().sum()
m.loc[m.gt(0)]

AirBags            4
Rear.seat.room     2
Luggage.room      11
dtype: int64

In [132]:
df.duplicated().sum()

0

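If there are any duplicate rows, we can remove them with  
`df.drop_duplicates(keep="first")`, which returns a new DataFrame that keeps only the first occurrence of each duplicated row.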

### Step 3 - Separate X(features) and Y(labels)

In [133]:
# Target: the "Weight" column, selected with a list so it stays a one-column DataFrame.
Y = df.loc[:, ["Weight"]]
# Features: everything except the target and the "id" column.
X = df.drop(["id", "Weight"], axis="columns")

Double square brackets (`df[["Weight"]]`) are used above so that Y is a DataFrame; with single square brackets (`df["Weight"]`) Y would be a Series.

In [134]:
X.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Acura,Integra,Small,12.9,15.9,18.8,25,31,,Front,...,13.2,5,177,102,68,37,26.5,11.0,non-USA,Acura Integra
1,Acura,Legend,Midsize,29.2,33.9,38.7,18,25,Driver & Passenger,Front,...,18.0,5,195,115,71,38,30.0,15.0,non-USA,Acura Legend
2,Audi,90,Compact,25.9,29.1,32.3,20,26,Driver only,Front,...,16.9,5,180,102,67,37,28.0,14.0,non-USA,Audi 90
3,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,21.1,6,193,106,70,37,31.0,17.0,non-USA,Audi 100
4,BMW,535i,Midsize,23.7,30.0,36.2,22,30,Driver only,Rear,...,21.1,4,186,109,69,39,27.0,13.0,non-USA,BMW 535i


In [135]:
Y.head()

Unnamed: 0,Weight
0,2705
1,3560
2,3375
3,3405
4,3640


### Step 4 - Pre-processing pipeline applied on X features

In [136]:
# Split the feature names by dtype: object columns are treated as categorical,
# every other column as continuous.
object_mask = X.dtypes == "object"
cat = X.columns[object_mask].tolist()
con = X.columns[~object_mask].tolist()

In [137]:
cat

['Manufacturer',
 'Model',
 'Type',
 'AirBags',
 'DriveTrain',
 'Cylinders',
 'Man.trans.avail',
 'Origin',
 'Make']

In [138]:
con

['Min.Price',
 'Price',
 'Max.Price',
 'MPG.city',
 'MPG.highway',
 'EngineSize',
 'Horsepower',
 'RPM',
 'Rev.per.mile',
 'Fuel.tank.capacity',
 'Passengers',
 'Length',
 'Wheelbase',
 'Width',
 'Turn.circle',
 'Rear.seat.room',
 'Luggage.room']

In [139]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [140]:
# Numerical pipeline: fill missing values with the column median, then standardize.
num_imputer = SimpleImputer(strategy="median")
num_scaler = StandardScaler()
num_pipe = make_pipeline(num_imputer, num_scaler)

In [141]:
# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode into a dense (non-sparse) output. Categories that were
# not seen during fit are ignored at transform time instead of raising.
cat_imputer = SimpleImputer(strategy="most_frequent")
cat_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
cat_pipe = make_pipeline(cat_imputer, cat_encoder)

In [142]:

# Combine both pipelines: the continuous columns go through num_pipe and the
# categorical columns through cat_pipe.
pre = ColumnTransformer(
    transformers=[
        ("num", num_pipe, con),
        ("cat", cat_pipe, cat),
    ]
)
# Return transformed data as a pandas DataFrame (with column names) rather than an array.
pre = pre.set_output(transform="pandas")

In [143]:
pre

In [144]:
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,-0.485787,-0.37572,-0.282465,0.471312,0.360925,-0.841022,-0.073484,1.717489,1.12953,-1.062184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.388017,1.497844,1.531409,-0.781032,-0.770514,0.515869,1.078322,0.369586,0.005661,0.409445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.008658,0.998227,0.948052,-0.423219,-0.581941,0.128186,0.540813,0.369586,-0.105713,0.072197,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,1.359872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.755752,1.091905,1.303535,-0.065407,0.172352,0.806631,1.231897,0.706562,0.430909,1.359872,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
pre

### Step 5 - Train Test Split

In [146]:
from sklearn.model_selection import train_test_split


In [147]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): X_pre was produced by pre.fit_transform(X) on the FULL dataset,
# so the imputer/scaler/encoder were fitted on rows that end up in the test set
# (data leakage). Consider splitting the raw X first, then calling
# pre.fit_transform on the training part and pre.transform on the test part.
xtrain, xtest, ytrain, ytest= train_test_split(X_pre, Y, test_size=0.20, random_state=21)

In [148]:
xtrain.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
57,1.365026,1.28967,1.185041,-0.423219,-0.016221,-0.356418,-0.265452,-0.304365,0.18791,-0.663618,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31,-1.003095,-0.979424,-0.911397,0.1135,0.172352,-0.841022,-0.323043,2.054464,0.157535,-1.062184,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62,0.606307,0.685966,0.729294,-0.781032,-0.959087,0.322027,1.116716,1.212025,-0.247462,0.716035,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.043016,-0.021825,-0.063707,-0.423219,-0.204794,0.806631,1.347077,0.87505,-0.71321,0.409445,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
51,1.985795,1.726835,1.449374,-0.781032,-0.581941,1.872759,1.27029,-1.146804,-0.996708,1.022624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [149]:
ytrain.head()

Unnamed: 0,Weight
57,2920
31,2530
62,3730
29,3490
51,4055


In [150]:
xtest.head()

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
23,-1.003095,-0.85452,-0.701753,0.1135,-0.016221,-0.453339,-0.975733,-0.809828,0.532158,-0.816912,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
86,0.203957,0.332071,0.4285,-0.781032,-1.336233,-0.259498,-0.111878,-0.472853,0.370159,0.961306,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
91,0.537333,0.332071,0.145937,-0.244313,-0.204794,-0.356418,-0.572601,0.201098,-0.237337,-0.265051,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
21,1.422504,1.039862,0.692834,-0.423219,-0.581941,0.612789,0.060893,-0.809828,-1.108083,-0.203734,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.100495,-0.073868,-0.209546,-0.959938,-0.581941,2.260442,0.502419,-1.820755,-1.988953,1.942392,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [151]:
ytest.head()

Unnamed: 0,Weight
23,2670
86,3785
91,2985
21,3570
17,3910


In [152]:
xtrain.shape

(74, 257)

In [153]:
xtest.shape

(19, 257)

### Step 6 - Build the Model

In [154]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain, ytrain)

In [155]:
model.intercept_

array([3106.87172698])

In [156]:
model.coef_

array([[-2.19818944e+00,  6.85644374e-01,  2.15605789e+00,
        -3.41046018e+01, -7.20389871e+01,  6.91516010e+01,
         8.52106983e+01, -2.06623109e+01,  5.31191732e-02,
         3.22834244e+01,  4.31514960e+01,  5.80075423e+01,
         1.59058524e+02,  8.24373258e+01,  3.43193528e+00,
        -2.65358689e+01,  9.27589211e+00, -1.77099925e+01,
         6.78028246e+01,  7.81347956e+01, -4.75387657e+01,
        -4.49280455e+01, -1.68499813e+01, -4.66247744e-03,
         1.90458111e+01,  1.12562458e+01, -3.57678823e+01,
        -7.93421818e+00, -5.20367767e+00,  1.34992132e+01,
        -1.79776046e+01, -1.18476365e+01,  2.51859382e+01,
         3.23762357e+01, -2.53771215e+01, -5.52151980e+01,
         1.03013638e+00, -9.72042217e-01,  3.64240315e+01,
        -2.79653062e+01,  4.41206116e+01,  1.39840827e+01,
        -6.45862824e+01, -5.08810480e+00,  3.45554850e+01,
         2.54617523e-03, -1.78239573e+01,  2.74973298e+01,
        -2.34062527e+00,  2.48994774e-03, -3.52520373e+0

### Step 7 : Model Evaluation

on both train data and test data

In [157]:
model.score(xtrain,ytrain)

1.0

In [158]:
model.score(xtest, ytest)

0.9330070791251123

In [159]:
y_pred_train = model.predict(xtrain)
y_pred_train[0:5]

array([[2920.],
       [2530.],
       [3730.],
       [3490.],
       [4055.]])

In [160]:
y_pred_test = model.predict(xtest)
y_pred_test[0:5]

array([[2572.88102957],
       [3616.90902415],
       [3040.73927559],
       [3332.64053289],
       [3969.97220419]])

In [161]:
ytest.head()

Unnamed: 0,Weight
23,2670
86,3785
91,2985
21,3570
17,3910


In [162]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [163]:
def evaluation_model(model, X, Y):
    """Print regression metrics for ``model`` evaluated on ``X`` against ``Y``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : feature data passed to ``model.predict``.
    Y : true target values the predictions are compared with.

    Prints MSE, RMSE (square root of the MSE), MAE, MAPE and the R2 score.
    Returns ``None``.
    """
    ypred = model.predict(X)
    mse = mean_squared_error(Y, ypred)
    rmse = mse**(1/2)
    mae = mean_absolute_error(Y, ypred)
    mape = mean_absolute_percentage_error(Y, ypred)
    r2 = r2_score(Y, ypred)
    # ".2f" = two decimals, consistent with the other metrics. The earlier
    # "2f" spec only set a minimum field width of 2 and printed six decimals.
    print(f"MSE:{mse:.2f}")
    print(f"RMSE :{rmse:.2f}")
    print(f"MAE:{mae:.2f}")
    print(f"MAPE :{mape:.2f}")
    print(f"R2_Score:{r2:.4f}")

In [164]:
evaluation_model(model, xtrain, ytrain)

MSE:0.000000
RMSE :0.00
MAE:0.00
MAPE :0.00
R2_Score:1.0000


In [165]:
evaluation_model(model, xtest, ytest)

MSE:24298.128268
RMSE :155.88
MAE:115.92
MAPE :0.04
R2_Score:0.9330


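#### Test R2 score (0.93) > 0.80, so we consider this a good model and use it for the out-of-sample data

Note: the training R2 is a perfect 1.0 while the test R2 is 0.933, so the model fits the training rows better than unseen rows (a sign of overfitting).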

### Step 8: Out of sample prediction

In [166]:
# Load the out-of-sample rows with the same missing-value rules used for
# Cars93.csv: only the empty string and "NA" are parsed as missing.
xnew = pd.read_csv(
    "sample.csv",
    keep_default_na=False,
    na_values=["", "NA"],
)
xnew.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Fuel.tank.capacity,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,15.0,6,190,106,65,37,31.0,17.0,non-USA,Audi 100
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,15.2,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,16.5,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,20.0,2,169,96,69,37,,,non-USA,Mazda RX-7
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,12.4,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox


In [167]:
pre

In [171]:
xnew_pre = pre.transform(xnew)
xnew_pre

Unnamed: 0,num__Min.Price,num__Price,num__Max.Price,num__MPG.city,num__MPG.highway,num__EngineSize,num__Horsepower,num__RPM,num__Rev.per.mile,num__Fuel.tank.capacity,...,cat__Make_Toyota Camry,cat__Make_Toyota Celica,cat__Make_Toyota Previa,cat__Make_Toyota Tercel,cat__Make_Volkswagen Corrado,cat__Make_Volkswagen Eurovan,cat__Make_Volkswagen Fox,cat__Make_Volkswagen Passat,cat__Make_Volvo 240,cat__Make_Volvo 850
0,1.571949,1.893374,2.069191,-0.602126,-0.581941,0.128186,0.540813,0.369586,0.410659,-0.510323,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.888138,-0.875337,-0.829362,0.1135,0.360925,-0.647181,-0.649388,-0.135877,0.673908,-0.449005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.428309,-0.37572,-0.318925,-0.244313,-0.016221,-0.453339,-0.649388,-0.135877,0.532158,-0.050439,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.767376,1.352122,0.966282,-0.959938,-0.770514,-1.325626,2.134145,2.054464,-0.014589,1.022624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.968608,-1.083511,-1.130155,0.471312,0.738071,-0.841022,-1.206095,0.369586,0.441034,-1.307455,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [168]:
xnew_pre.isna().sum()

num__Min.Price                  0
num__Price                      0
num__Max.Price                  0
num__MPG.city                   0
num__MPG.highway                0
num__EngineSize                 0
num__Horsepower                 0
num__RPM                        0
num__Rev.per.mile               0
num__Fuel.tank.capacity         0
num__Passengers                 0
num__Length                     0
num__Wheelbase                  0
num__Width                      0
num__Turn.circle                0
num__Rear.seat.room             0
num__Luggage.room               0
cat__Manufacturer_Chevrolet     0
cat__Manufacturer_Mazda         0
cat__Manufacturer_Pontiac       0
cat__Manufacturer_Volkswagen    0
cat__Model_Fox                  0
cat__Model_Lumina               0
cat__Model_RX-7                 0
cat__Model_Sunbird              0
cat__Type_Midsize               0
cat__Type_Small                 0
cat__Type_Sporty                0
cat__AirBags_None               0
cat__DriveTrai

In [169]:
type(model)

sklearn.linear_model._base.LinearRegression

In [172]:
preds = model.predict(xnew_pre)
preds

array([[3335.17083283],
       [2575.        ],
       [3195.        ],
       [2895.        ],
       [2240.        ]])

In [173]:
# Store the predictions in a new "Weight" column. preds is a single-column
# 2-D array, so flatten it before assigning it to one column.
xnew["Weight"] = preds.ravel()
# Round every numeric column to two decimals for display.
xnew = xnew.round(decimals=2)
xnew

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Price,Max.Price,MPG.city,MPG.highway,AirBags,DriveTrain,...,Passengers,Length,Wheelbase,Width,Turn.circle,Rear.seat.room,Luggage.room,Origin,Make,Weight
0,Audi,100,Midsize,30.8,37.7,44.6,19,26,,Front,...,6,190,106,65,37,31.0,17.0,non-USA,Audi 100,3335.17
1,Pontiac,Sunbird,Compact,9.4,11.1,12.8,23,31,,Front,...,5,181,101,66,39,25.0,13.0,USA,Pontiac Sunbird,2575.0
2,Chevrolet,Lumina,Midsize,13.4,15.9,18.4,21,29,,Front,...,6,198,108,71,40,28.5,16.0,USA,Chevrolet Lumina,3195.0
3,Mazda,RX-7,Sporty,32.5,32.5,32.5,17,25,Driver only,Rear,...,2,169,96,69,37,,,non-USA,Mazda RX-7,2895.0
4,Volkswagen,Fox,Small,8.7,9.1,9.5,25,33,,Front,...,4,163,93,63,34,26.0,10.0,non-USA,Volkswagen Fox,2240.0
