# Umgang mit fehlenden werten in sklearn

In [15]:
import pandas as pd
import numpy as np

car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [2]:
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [3]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [4]:
categorical_features = ["Make", "Colour"]
door_features = ["Doors"]
numerical_features = ["Odometer (KM)"]

In [5]:
from sklearn.impute import SimpleImputer

categorical_inputer = SimpleImputer(strategy="constant", fill_value="missing")
door_inputer = SimpleImputer(strategy="constant", fill_value=4)
numerical_inputer = SimpleImputer(strategy="mean")

In [6]:
from sklearn.compose import ColumnTransformer

inputer = ColumnTransformer([
    # Hier wird der inputer auf 2 verschiedene Spalten angewendet:
    ("cat_inputer", categorical_inputer, categorical_features),

    # Auf "Doors" Spalte einen Inputer anwenden:
    ("door_inputer", door_inputer, door_features),

    # Auf "Odometer (KM)" Spalte einen Inputer anwenden:
    ("numerical_inputer", numerical_inputer, numerical_features),
])

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
filled_X_train = inputer.fit_transform(X_train)
filled_X_test = inputer.fit_transform(X_test)

In [9]:
filled_columns = categorical_features + door_features + numerical_features
X_train_df = pd.DataFrame(filled_X_train, columns=filled_columns)
X_test_df = pd.DataFrame(filled_X_test, columns=filled_columns)

In [10]:
X_test_df.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [11]:
from sklearn.preprocessing import OneHotEncoder


# Liste von Spaltennamen, die kategoriale Werte enthalten:
categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()

# Hier wird der "OneHotEncoder" auf die Spalten in "categorical_features" angewand.
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

X_train_transformed = inputer.fit_transform(X_train_df)
X_test_transformed = inputer.fit_transform(X_test_df)

In [12]:
X_train_transformed

array([['Honda', 'White', 4.0, 71934.0],
       ['Toyota', 'Red', 4.0, 162665.0],
       ['Honda', 'White', 4.0, 42844.0],
       ...,
       ['Toyota', 'White', 4.0, 196225.0],
       ['Honda', 'Blue', 4.0, 133117.0],
       ['Honda', 'missing', 4.0, 150582.0]], shape=(760, 4), dtype=object)

In [25]:


X_train_array = X_train_transformed.toarray()
X_test_array = X_test_transformed.toarray()

ohe_columns = transformer.named_transformers_["one_hot"].get_feature_names_out()
all_feature_names = list(ohe_columns) + list(numerical_features)

X_train_df_final = pd.DataFrame(X_train_array, columns=all_feature_names)
X_test_df_final = pd.DataFrame(X_test_array,columns=all_feature_names)
X_test_df_final              

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

In [26]:
from sklearn.ensemble import RandomForestRegressor

# Modell erstellen:
model = RandomForestRegressor(random_state=42)

# Modell trainieren:
model.fit(X_train_df_final, y_train)

# Modellbewertung:
model.score(X_test_df_final, y_test)

NameError: name 'X_train_df_final' is not defined