# Data Pipelines

In [1]:
import pandas as pd

# Read the used-car listings from a CSV path relative to the working directory,
# then preview the first five rows (the bare last expression is rendered as
# the cell output).
data = pd.read_csv(filepath_or_buffer='Data_Files/cars.csv')
data.head(n=5)

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [2]:
# Report the (n_rows, n_columns) of the loaded frame.
data.shape

(8128, 5)

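# Adding missing values to the DataFrame

The loaded columns have no gaps, so we blank out a random 5% of `km_driven` and 1% of `owner` to give the imputation step something to work on.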

In [3]:
import numpy as np

# Deliberately blank out a random subset of two columns so that the later
# imputation step has missing values to fill.
np.random.seed(42)  # legacy global seed: the same rows are sampled on every fresh run

# Cast before writing NaN: the preview shows km_driven as whole numbers, and
# assigning NaN into an integer column relies on silent upcasting, which recent
# pandas versions deprecate (FutureWarning) and plan to reject outright.
# This is a no-op if the column is already float64.
data['km_driven'] = data['km_driven'].astype('float64')

# 5% of the rows lose their km_driven value ...
missing_km_indices = np.random.choice(data.index, size=int(0.05 * len(data)), replace=False)
data.loc[missing_km_indices, 'km_driven'] = np.nan

# ... and an independently drawn 1% lose their owner value. A separate name is
# used so `missing_km_indices` keeps meaning what it says.
missing_owner_indices = np.random.choice(data.index, size=int(0.01 * len(data)), replace=False)
data.loc[missing_owner_indices, 'owner'] = np.nan

In [4]:
# Count the missing entries per column to confirm the injected gaps.
data.isna().sum()

brand              0
km_driven        406
fuel               0
owner             81
selling_price      0
dtype: int64

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. `selling_price` is the target and
# every other column is a feature; the fixed random_state keeps the partition
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='selling_price'),
    data['selling_price'],
    random_state=0,
    test_size=0.2,
)

In [11]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer

In [12]:
# Stage 1 - fill the gaps. Columns are addressed by position in X_train
# (brand, km_driven, fuel, owner):
#   1 = km_driven -> mean imputation (the SimpleImputer default strategy)
#   3 = owner     -> most frequent value
# ColumnTransformer emits the transformed columns first, in the order listed,
# and appends the passthrough remainder after them, so the output order is
# km_driven, owner, brand, fuel. Later stages must index against that order.
trf1 = ColumnTransformer(
    transformers=[
        ("impute_km_driven", SimpleImputer(strategy='mean'), [1]),
        ("impute_owner", SimpleImputer(strategy='most_frequent'), [3]),
    ],
    remainder='passthrough',
)


In [13]:
# Stage 2 - encode the categorical columns.
#
# The indices here refer to the OUTPUT of the imputer stage (trf1), not to the
# original frame. trf1 emits its transformed columns first and the passthrough
# remainder last, so the layout arriving at this step is
#   0 = km_driven, 1 = owner, 2 = brand, 3 = fuel
# Using the original positions ([3] and [0, 2]) would ordinal-encode fuel,
# one-hot encode the numeric km_driven column, and leave owner as a raw string.
#
#   owner (1)        -> ordinal code; categories unseen at fit time map to -1
#   brand, fuel (2,3) -> one-hot; categories unseen at fit time become all zeros
#   km_driven (0)     -> passed through unchanged, appended as the last column
#
# NOTE(review): without an explicit `categories=` list OrdinalEncoder ranks the
# owner labels in sorted (alphabetical) order - supply the intended order if the
# rank is meant to be meaningful.
trf2 = ColumnTransformer([
     ("Ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [1]),
     ("OneHot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [2, 3])
], remainder='passthrough'
)

In [14]:
# Stage 3 - rescale features with MinMaxScaler (default range [0, 1]).
# Only positional columns 0..37 are scaled; `remainder` is left at its default
# ('drop', spelled out here), so any column beyond index 37 is discarded.
# NOTE(review): 38 is hard-coded - presumably the width of the encoder output;
# confirm against the fitted encoder, since the one-hot width depends on the
# categories present in the training split.
trf3 = ColumnTransformer(
    transformers=[("Scale", MinMaxScaler(), slice(0, 38))],
    remainder='drop',
)

In [15]:
from sklearn.feature_selection import f_regression

# Stage 4 - keep the 10 highest-scoring features.
# The target (selling_price) is continuous, so a regression score function is
# used. chi2 is a classification statistic: given a continuous target it would
# treat every distinct price as its own class, which makes the ranking
# meaningless for this task.
trf4 = SelectKBest(score_func=f_regression, k=10)

In [16]:
# Stage 5 - the final estimator.
# random_state is pinned (same value as the train/test split) so that
# re-running the fit cell reproduces the same forest; without it the bootstrap
# samples and feature draws depend on the current global NumPy RNG state and
# differ from one execution of the cell to the next.
trf5 = RandomForestRegressor(random_state=0)

In [17]:
from sklearn.pipeline import Pipeline

# Chain the five stages into a single estimator. Steps run in the listed
# order: each transformer's output is the next step's input, and the last
# step is the model that is actually trained.
Pipe = Pipeline(steps=[
    ("imputer", trf1),
    ("encoder", trf2),
    ("scaler", trf3),
    ("fselector", trf4),
    ("model", trf5),
])

In [18]:
# Fit the whole pipeline on the training split: every transformer is fitted
# and applied in sequence, then the final model is trained on the result.
Pipe.fit(X_train, y_train)