# Strukturierter ML-Workflow mit Pipelines

In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
import joblib

In [14]:
# Read the vehicle emissions table from disk and show it as the cell output.
data = pd.read_csv(
    filepath_or_buffer="data/vehicle_emissions.csv",
)
data

Unnamed: 0,Model_Year,Make,Model,Vehicle_Class,Engine_Size,Cylinders,Transmission,Fuel_Consumption_in_City(L/100 km),Fuel_Consumption_in_City_Hwy(L/100 km),Fuel_Consumption_comb(L/100km),CO2_Emissions,Smog_Level
0,2021,Acura,ILX,Compact,2.4,4,AM8,9.9,7.0,8.6,199,3
1,2021,Acura,NSX,Two-seater,3.5,6,AM9,11.1,10.8,11.0,256,3
2,2021,Acura,RDX SH-AWD,SUV: Small,2.0,4,AS10,11.0,8.6,9.9,232,6
3,2021,Acura,RDX SH-AWD A-SPEC,SUV: Small,2.0,4,AS10,11.3,9.1,10.3,242,6
4,2021,Acura,TLX SH-AWD,Compact,2.0,4,AS10,11.2,8.0,9.8,230,7
...,...,...,...,...,...,...,...,...,...,...,...,...
930,2021,Volvo,XC40 T5 AWD,SUV: Small,2.0,4,AS8,10.7,7.7,9.4,219,5
931,2021,Volvo,XC60 T5 AWD,SUV: Small,2.0,4,AS8,11.1,8.3,9.9,230,5
932,2021,Volvo,XC60 T6 AWD,SUV: Small,2.0,4,AS8,11.7,8.6,10.3,240,7
933,2021,Volvo,XC90 T5 AWD,SUV: Standard,2.0,4,AS8,11.5,8.4,10.1,236,5


In [15]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 935 entries, 0 to 934
Data columns (total 12 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Model_Year                              935 non-null    int64  
 1   Make                                    935 non-null    object 
 2   Model                                   935 non-null    object 
 3   Vehicle_Class                           935 non-null    object 
 4   Engine_Size                             935 non-null    float64
 5   Cylinders                               935 non-null    int64  
 6   Transmission                            935 non-null    object 
 7   Fuel_Consumption_in_City(L/100 km)      935 non-null    float64
 8   Fuel_Consumption_in_City_Hwy(L/100 km)  935 non-null    float64
 9   Fuel_Consumption_comb(L/100km)          935 non-null    float64
 10  CO2_Emissions                           935 non-null    int64 

In [16]:
# Separate the regression target from the remaining feature columns.
y = data["CO2_Emissions"]
X = data.drop(columns=["CO2_Emissions"])

In [17]:
# Column names grouped by dtype so that each group can get its own preprocessing.
categorical_col = X.select_dtypes(
    include=["object", "category", "bool"]
).columns.tolist()
numerical_col = X.select_dtypes(include=["number"]).columns.tolist()

In [18]:
# Numeric branch: replace missing values with the column mean, then standardise.
# The step name "inputer" is kept exactly as spelled, since step names are part
# of the pipeline's parameter paths.
numerical_steps = [
    ("inputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
]
numerical_pipeline = Pipeline(steps=numerical_steps)

In [19]:
# Categorical branch: replace missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
categorical_steps = [
    ("inputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

In [20]:
# Route each column group through its matching preprocessing branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_pipeline, numerical_col),
        ("cat", categorical_pipeline, categorical_col),
    ],
)

In [21]:
# End-to-end estimator: preprocessing followed by a random-forest regressor.
# The forest is seeded (same seed as the train/test split and the KFold used in
# this notebook) so that re-running the notebook grows the same trees and
# reproduces the same metrics.
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

In [22]:
# Hold out 20 % of the rows for evaluation; the seed fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Fit the preprocessing steps and the model on the training split only.
pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [25]:
# Predict the target for the held-out test rows.
predictions = pipeline.predict(X_test)


In [29]:
# Walk down to the fitted one-hot encoder and list the dummy columns it created.
fitted_preprocessor = pipeline.named_steps["preprocessor"]
fitted_cat_branch = fitted_preprocessor.named_transformers_["cat"]
fitted_encoder = fitted_cat_branch.named_steps["encoder"]
fitted_encoder.get_feature_names_out(categorical_col)

array(['Make_Acura', 'Make_Alfa Romeo', 'Make_Aston Martin', 'Make_Audi',
       'Make_BMW', 'Make_Bentley', 'Make_Bugatti', 'Make_Buick',
       'Make_Cadillac', 'Make_Chevrolet', 'Make_Chrysler', 'Make_Dodge',
       'Make_FIAT', 'Make_Ford', 'Make_GMC', 'Make_Genesis', 'Make_Honda',
       'Make_Hyundai', 'Make_Infiniti', 'Make_Jaguar', 'Make_Jeep',
       'Make_Kia', 'Make_Lamborghini', 'Make_Lexus', 'Make_Lincoln',
       'Make_MINI', 'Make_Maserati', 'Make_Mazda', 'Make_Mercedes-Benz',
       'Make_Mitsubishi', 'Make_Nissan', 'Make_Porsche', 'Make_Ram',
       'Make_Rolls-Royce', 'Make_Subaru', 'Make_Toyota',
       'Make_Volkswagen', 'Make_Volvo', 'Model_1500',
       'Model_1500 4X4 EcoDiesel', 'Model_1500 4X4 TRX',
       'Model_1500 4X4 eTorque', 'Model_1500 Classic',
       'Model_1500 Classic 4X4', 'Model_1500 EcoDiesel',
       'Model_1500 HFE EcoDiesel', 'Model_1500 HFE eTorque',
       'Model_1500 eTorque', 'Model_228i xDrive Gran Coupe',
       'Model_230i xDrive Coupe'

In [32]:
# Score the hold-out predictions with three regression metrics.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)  # square root brings the error back to the target's scale
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)

# The header has no placeholders, so it is a plain string rather than an f-string.
print("Model Performance:")
print(f"R2-score: {r2}")
print(f"Root Mean Square Error: {rmse}")
print(f"Mean Absolute Error: {mae}")

Model Performance:
R2-score: 0.9718827185692565
Root Mean Square Error: 10.642493902774305
Mean Absolute Error: 3.2738502673796774


In [34]:
# Persist the fitted pipeline (preprocessing + model) to disk with joblib.
# NOTE(review): the extension is spelled ".joblip" (not ".joblib"); the reload
# cell further down uses the same spelling, so rename both together if at all.
joblib.dump(pipeline, "vehicle_emission_pipeline.joblip")

['vehicle_emission_pipeline.joblip']

### Beispiel für das Neuladen

In [None]:

# Load the persisted pipeline back from disk and display it.
# joblib files are pickle-based, so only load files that come from a trusted source.
pipe = joblib.load("vehicle_emission_pipeline.joblip")
pipe
# Sketch of how new data would be scored with the reloaded pipeline:
# df_new = pd.read...
# X_new = ....
# predictions = pipe.predict(X_new)

0,1,2
,steps,"[('preprocessor', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Cross-Validation

In [37]:
from sklearn.model_selection import KFold, cross_validate

# Shuffled 5-fold splitter with a fixed seed.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Result label -> scikit-learn scorer name. The error metrics use the negated
# scorers, so larger is always better.
scoring = dict(
    r2="r2",
    mae="neg_mean_absolute_error",
    rmse="neg_root_mean_squared_error",
)

# Evaluate the whole pipeline per fold, keeping train scores alongside test scores.
cv_res = cross_validate(
    pipeline,
    X,
    y,
    scoring=scoring,
    cv=cv,
    return_train_score=True,
    n_jobs=-1,
)

In [40]:
# Mean and spread of the per-fold test R2.
fold_r2 = cv_res["test_r2"]
print(f"R2: {fold_r2.mean():.3f} ± {fold_r2.std():.3f}")

R2: 0.971 ± 0.011


## Hyperparameter-Tuning


In [47]:
from sklearn.model_selection import GridSearchCV

# Candidate forest settings; the "model__" prefix routes each parameter to the
# pipeline step named "model".
param_grid = dict(
    model__n_estimators=[300, 600],
    model__max_depth=[None, 12, 20],
    model__min_samples_split=[2, 5, 10],
)

# Exhaustive search over the grid, scored by cross-validated R2.
grid_pipeline = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="r2",
    cv=cv,
    n_jobs=-1,
)
grid_pipeline.fit(X, y)

0,1,2
,estimator,Pipeline(step...Regressor())])
,param_grid,"{'model__max_depth': [None, 12, ...], 'model__min_samples_split': [2, 5, ...], 'model__n_estimators': [300, 600]}"
,scoring,'r2'
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [48]:
# Parameter combination that achieved the best mean cross-validated score.
grid_pipeline.best_params_

{'model__max_depth': None,
 'model__min_samples_split': 2,
 'model__n_estimators': 300}

In [49]:
# Mean cross-validated score (R2, per the search's `scoring`) of the best combination.
grid_pipeline.best_score_

np.float64(0.9709348940579405)

In [50]:
# Persist the pipeline that the grid search refit with the best parameters.
joblib.dump(grid_pipeline.best_estimator_, "Best_model_pipeline.joblib")

['Best_model_pipeline.joblib']

### Validation Curve

In [52]:
from sklearn.model_selection import validation_curve

# Values tried for the forest's "n_estimators" parameter:
param_range = [50, 100, 200, 400, 800, 1000, 1200]

# Train and validation R2 for every value in param_range, using the same folds
# as `cv`.
train_scores, test_scores = validation_curve(
    pipeline,
    X,
    y,
    param_name="model__n_estimators",
    param_range=param_range,
    scoring="r2",
    cv=cv,
    n_jobs=-1,
)

# Average over the folds (axis 1) to get one score per parameter value.
mean_train_r2 = train_scores.mean(axis=1)
mean_val_r2 = test_scores.mean(axis=1)
print(f"train R2: {mean_train_r2}")
print(f"Val R2: {mean_val_r2}")

train R2: [0.99591109 0.99591497 0.9960937  0.99594421 0.9960334  0.99604474
 0.99610743]
Val R2: [0.96930424 0.97130293 0.97065426 0.97044628 0.97078092 0.97007329
 0.97053806]


In [55]:
import plotly.graph_objects as go

# Fold-wise mean and standard deviation for every n_estimators value.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
val_mean = test_scores.mean(axis=1)
val_std = test_scores.std(axis=1)

fig = go.Figure()

# One line per score set, with +/- one standard deviation drawn as error bars.
curves = [
    ("Train R2", "blue", train_mean, train_std),
    ("Validation R2", "orange", val_mean, val_std),
]
for curve_name, curve_color, curve_mean, curve_std in curves:
    fig.add_trace(go.Scatter(
        x=param_range,
        y=curve_mean,
        mode="lines+markers",
        line=dict(color=curve_color),
        error_y=dict(
            type="data",
            array=curve_std,
            visible=True
        ),
        name=curve_name
    ))

# Title and axis labels so the figure can be read on its own.
fig.update_layout(
    title="Validation curve: random forest n_estimators",
    xaxis_title="n_estimators",
    yaxis_title="R2",
)

fig