# Datenvorbereiten für maschinelles Lernen

## 1. Daten in Features X und in Labels y aufteilen

In [1]:
import numpy as np
import pandas as pd

In [2]:
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
#Feature-Matrix erstellen:
X = heart_disease.drop("target", axis=1)
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [4]:
y = heart_disease["target"]
y

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64

## 2. Daten in Trainings- und Testdaten aufteilen

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")


Shape of X_train: (242, 13)
Shape of X_test: (61, 13)
Shape of y_train: (242,)
Shape of y_test: (61,)


In [7]:
# 80% der Daten sind Trainingsdaten für das Modell:
print(X.shape[0] * 0.8)

# 20% der Daten sind Testdaten für das Modell:
print(X.shape[0] * 0.2)

# Summe aud Trainingsdaten und Testdaten ist:
print(X.shape[0] * 0.8 + X.shape[0] * 0.2)

242.4
60.6
303.0


## 3. Nicht-numerische Werte in Zahlen umwandeln (Feature-Encoding)

In [8]:
car_sales = pd.read_csv("data/car-sales-extended.csv")
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,Nissan,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [9]:
# Datentypen prüfen
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [10]:
# Feature-Matrix
X = car_sales.drop("Price", axis=1)

# Label-Vektor
y = car_sales["Price"]

In [11]:
# In Trainingsdaten und Testdaten aufteilen:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
from sklearn.ensemble import RandomForestRegressor

try:
    # ML-Modell erstellen:
    model = RandomForestRegressor()
    # Modell trainieren:
    model.fit(X_train, y_train)
    # Model bewerten:
    model.score(X_test, y_test)
except Exception as e:
    print(f"Error creating model: {e}")

Error creating model: could not convert string to float: 'Toyota'


## 3.1 One-Hot-Encoding

In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Liste von Spaltennamen, die kategoriale Werte enthalten:
categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()

# Hier wird der "OneHotEncoder" auf die Spalten in "categorical_features" angewand.
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)], remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]], shape=(1000, 13))

In [None]:
pd.DataFrame(transformed_X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [20]:
# Neue Spaltennamen auslesen:
features_names = transformer.transformers_[0][1].get_feature_names_out()
features_names

array(['Make_BMW', 'Make_Honda', 'Make_Nissan', 'Make_Toyota',
       'Colour_Black', 'Colour_Blue', 'Colour_Green', 'Colour_Red',
       'Colour_White', 'Doors_3', 'Doors_4', 'Doors_5'], dtype=object)

In [21]:
X

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
995,Toyota,Black,35820,4
996,Nissan,White,155144,3
997,Nissan,Blue,66604,4
998,Honda,White,215883,4


In [22]:
# Nicht transformierte Feature Spaltennamen ausgegeben:
remainder_names = X.drop(categorical_features).columns
remainder_names

KeyError: "['Make', 'Colour', 'Doors'] not found in axis"

In [23]:
# "features_names" und "remainder_names" zusammenführen
all_names = np.concatenate([features_names, remainder_names])
all_names

NameError: name 'remainder_names' is not defined

In [24]:
# DataFrame erstellen mit Spaltenbezeichnungen:
df_transformed = pd.DataFrame(transformed_X, columns=all_names)
df_transformed

NameError: name 'all_names' is not defined

In [26]:
df_transformed.iloc[0]

NameError: name 'df_transformed' is not defined

In [None]:
# Erste Zeile aus dem Ausgangsdatensatz:
car_sales.iloc[0]


Make             Honda
Colour           White
Odometer (KM)    35431
Doors                4
Price            15323
Name: 0, dtype: object

In [None]:
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

model.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
model.score(X_test,y_test)

0.17990353868985776