In [1]:
import pandas as pd
import seaborn as sns


In [2]:
# Fetch seaborn's "penguins" example dataset and preview its first rows.
df = sns.load_dataset(name="penguins")
df.head(n=5)


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [3]:
# Missing values
# `sex` gets an explicit "Unknown" category. A constant fill uses no dataset
# statistics, so it is safe to apply before the train/test split.
df["sex"] = df["sex"].fillna("Unknown")

# The numeric measurements are deliberately left as NaN here. Filling them with
# the median of the *whole* dataset would leak test-set statistics into training
# and would turn the SimpleImputer(strategy="median") step of the modelling
# pipeline into a no-op. That imputer is fitted on the training split only.
numeric_cols = ["bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"]

# Show how many values per column remain to be imputed inside the pipeline.
df[numeric_cols].isna().sum()


In [4]:
# Engineer relevant features
# Shape/size ratios derived from the raw measurements.
df["bill_ratio"] = df["bill_length_mm"] / df["bill_depth_mm"]
df["mass_to_flipper"] = df["body_mass_g"] / df["flipper_length_mm"]

# Binary flags are pinned to int64: `astype(int)` is platform-dependent (int32 on
# Windows with NumPy 1.x), and dtype-based column selection must treat the flags
# the same way on every machine.
df["is_big"] = (df["body_mass_g"] > df["body_mass_g"].median()).astype("int64")

# Vectorized comparison instead of a row-wise apply. Every value other than
# "Male" (including "Unknown"/missing) maps to 0, exactly as before.
df["sex_binary"] = (df["sex"] == "Male").astype("int64")


In [5]:
# Separate the label (penguin species) from the predictor columns.
target_column = "species"
y = df[target_column]
X = df.drop(columns=target_column)


In [6]:
# Pipeline: preprocessing + random-forest classifier, scored on a held-out test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Select columns by dtype *kind* instead of exact dtype names. Matching only
# "float64"/"int64"/"object" misses e.g. int32 flags or dedicated string dtypes,
# and ColumnTransformer (remainder="drop") would then silently discard them.
# Every column is now either numeric or treated as categorical.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

# Numeric columns: fill gaps with the median learned from the training rows.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are encoded as all zeros.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", RandomForestClassifier(random_state=42))
    ]
)

# Stratify on the label so that the species proportions are the same in the
# training and the test split (the classes are not equally frequent).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 1.0


In [8]:
# Transform the entire dataset and export it as a ready-to-use ML matrix
from pathlib import Path

from sklearn.base import clone

# Fit a *copy* of the preprocessor. `preprocessor` is the very object that sits
# inside the already fitted `clf` pipeline, so refitting it here on all rows
# would silently change the model that was evaluated above.
matrix_preprocessor = clone(preprocessor)
X_processed = matrix_preprocessor.fit_transform(X)

# Convert sparse -> dense if necessary (one-hot encoding may yield a sparse matrix).
if hasattr(X_processed, "toarray"):
    X_processed = X_processed.toarray()

# Wrap in a DataFrame and keep the generated feature names, so the CSV header is
# meaningful instead of 0..n-1.
X_df = pd.DataFrame(X_processed, columns=matrix_preprocessor.get_feature_names_out())

# Save; create the target directory first because to_csv does not create it.
output_path = Path("data/processed/penguins_matrix.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
X_df.to_csv(output_path, index=False)

print("ML-Matrix gespeichert!")


ML-Matrix gespeichert!
