In [28]:
import pandas as pd
data = pd.read_csv('Data_Files/cars.csv')
data.head()

Unnamed: 0,brand,km_driven,fuel,owner,selling_price
0,Maruti,145500,Diesel,First Owner,450000
1,Skoda,120000,Diesel,Second Owner,370000
2,Honda,140000,Petrol,Third Owner,158000
3,Hyundai,127000,Diesel,First Owner,225000
4,Maruti,120000,Petrol,First Owner,130000


In [29]:
import numpy as np
np.random.seed(42)
missing_km_indices = np.random.choice(data.index, size = int(0.05 * len(data)), replace=False)
data.loc[missing_km_indices, 'km_driven'] = np.nan
missing_km_indices = np.random.choice(data.index, size = int(0.01 * len(data)), replace=False)
data.loc[missing_km_indices, 'owner'] = np.nan

In [30]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.drop(columns=['selling_price']),
                                                    data['selling_price'], test_size = 0.2, random_state = 0)

In [31]:
data.isnull().sum()

brand              0
km_driven        406
fuel               0
owner             81
selling_price      0
dtype: int64

In [32]:
trf1 = ColumnTransformer([
                 ("impute_km_driven", SimpleImputer(), [1]),
                 ("impute_owner", SimpleImputer(strategy='most_frequent'), [3])
], remainder='passthrough')

In [33]:
trf2 = ColumnTransformer([
                ("Ordinal", OrdinalEncoder(handle_unknown='use_encoded_value',  unknown_value=-1), [3]),
                ("OneHot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [0, 2])

], remainder='passthrough')

In [34]:
trf3 = ColumnTransformer([
     ("Scale", MinMaxScaler(), slice(0, 38))
])

In [35]:
trf4 = SelectKBest(score_func=chi2, k = 10)

In [36]:
trf5 = RandomForestRegressor()

In [37]:
from sklearn.pipeline import Pipeline
Pipe = Pipeline([
     ("imputer", trf1),
     ("encoder",  trf2),
     ("scaling", trf3),
     ("fselect", trf4),
     ("model",  trf5)


])

In [38]:
Pipe.fit(X_train, y_train)

In [39]:
import seaborn as sns

# Load the Titanic dataset directly from seaborn
df = sns.load_dataset('titanic')
df.head(1)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False


In [40]:
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [41]:
ans = df.drop(columns=[
                       'survived', 'pclass', 'age', 'sibsp',
                       'parch', 'who', 'adult_male', 'alive',  
                       'alone', 'embark_town'
                       ]
              )
ans.head(5) 

Unnamed: 0,sex,fare,embarked,class,deck
0,male,7.25,S,Third,
1,female,71.2833,C,First,C
2,female,7.925,S,Third,
3,female,53.1,S,First,C
4,male,8.05,S,Third,


In [42]:
Ans = ans.filter(items=['sex', 'embarked', 'deck', 'class', 'fare'])

In [43]:
Ans.isna().sum()

sex           0
embarked      2
deck        688
class         0
fare          0
dtype: int64

In [44]:
X_train, X_test, y_train, y_test = train_test_split(Ans.drop(columns=['fare']), 
                                                    Ans['fare'], test_size=0.2,  random_state=0)
X_train


Unnamed: 0,sex,embarked,deck,class
140,female,C,,Third
439,male,S,,Second
817,male,C,,Second
378,male,C,,Third
491,male,S,,Third
...,...,...,...,...
835,female,C,E,First
192,female,S,,Third
629,male,Q,,Third
559,female,S,,Third


In [45]:
y_train

140    15.2458
439    10.5000
817    37.0042
378     4.0125
491     7.2500
        ...   
835    83.1583
192     7.8542
629     7.7333
559    17.4000
684    39.0000
Name: fare, Length: 712, dtype: float64

In [46]:
Trf1 = ColumnTransformer([
     ("imputer_embarked", SimpleImputer(strategy='most_frequent'), ['embarked']),
     ("imputer_deck", SimpleImputer(strategy='most_frequent'), ['deck']),
], remainder='passthrough')

Trf2 = ColumnTransformer([
     ("Ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), [3]),
     ("OneHot", OneHotEncoder(handle_unknown='ignore', sparse_output=False), [0, 2])
], remainder='passthrough')

Trf3 = ColumnTransformer([
     ("Scale", MinMaxScaler(), slice(0, 12))
])

Trf4 = SelectKBest(score_func=chi2, k = 5)
Trf5 = RandomForestRegressor()

Pipes = Pipeline([
     ("Imputer",Trf1),
     ("Encoder",Trf2),
     ("Scaler",Trf3),
     ("Sfeatures",Trf4),
      ("Model",Trf5),

])

In [47]:
Trf2.set_output(transform='pandas')

In [48]:
Trf2.fit_transform(X_train)

Unnamed: 0,Ordinal__class,OneHot__sex_female,OneHot__sex_male,OneHot__deck_A,OneHot__deck_B,OneHot__deck_C,OneHot__deck_D,OneHot__deck_E,OneHot__deck_F,OneHot__deck_G,OneHot__deck_nan,remainder__embarked
140,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,C
439,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,S
817,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,C
378,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,C
491,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,S
...,...,...,...,...,...,...,...,...,...,...,...,...
835,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,C
192,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,S
629,2.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Q
559,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,S


In [49]:
Pipes

In [50]:
Pipes.named_steps

{'Imputer': ColumnTransformer(remainder='passthrough',
                   transformers=[('imputer_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  ['embarked']),
                                 ('imputer_deck', SimpleImputer(strategy='C'),
                                  ['deck'])]),
 'Encoder': ColumnTransformer(remainder='passthrough',
                   transformers=[('Ordinal',
                                  OrdinalEncoder(handle_unknown='use_encoded_value',
                                                 unknown_value=-1),
                                  [3]),
                                 ('OneHot',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [0, 2])]),
 'Scaler': ColumnTransformer(transformers=[('Scale', MinMaxScaler(), slice(0, 12, None))]),
 'Sfeatures': SelectKBes

In [51]:
X_train.isna().sum()

sex           0
embarked      2
deck        550
class         0
dtype: int64