In [7]:
### Bibliotheken importieren
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [8]:
# Load the windmill dataset and order its rows by the "timestamp" column; the
# positional train/test split later in the notebook relies on this ordering.
# NOTE(review): no date parsing is requested, so the sort compares the column
# values exactly as read_csv inferred them. Confirm the stored format sorts
# chronologically (e.g. ISO-8601 text or numeric epochs).
df = pd.read_csv('./windmill_data.csv').sort_values("timestamp")

# Train/Test Split

In [9]:
# Positional 80/20 split: the first 80% of df's rows (in their current order)
# become the training data, the remaining rows the test data.
split_idx = int(0.8 * len(df))
train_data, test_data = df.iloc[:split_idx], df.iloc[split_idx:]

# Datenbereinigung

In [10]:
# Columns removed from every feature frame: the target itself plus two columns
# the notebook treats as unnecessary.
DROPPED_COLUMNS = ['failure_risk_30d', 'days_since_maintenance', 'turbine_id']
# A row is discarded from a split when either of these readings is missing.
REQUIRED_READINGS = ['vibration_mm_s', 'temperature_c']

# Features and target over the complete dataset (no rows are removed here).
X = df.drop(columns=DROPPED_COLUMNS)
y = df['failure_risk_30d']

# Training split: drop the non-feature columns, discard incomplete rows, then
# pick the target values for exactly the rows that survived.
X_train = train_data.drop(columns=DROPPED_COLUMNS).dropna(subset=REQUIRED_READINGS)
y_train = train_data.loc[X_train.index, 'failure_risk_30d']

# Test split: same treatment as the training split.
X_test = test_data.drop(columns=DROPPED_COLUMNS).dropna(subset=REQUIRED_READINGS)
y_test = test_data.loc[X_test.index, 'failure_risk_30d']

# Encoding and Class Balancing

In [11]:
from imblearn.over_sampling import SMOTE

# --- Column selection ---------------------------------------------------------
# "timestamp" is the key the rows are sorted and split on, not a measurement.
# Left in the feature lists it would either be one-hot encoded per distinct
# value (if loaded as text) or scaled as a number whose test-set values all lie
# at or beyond the end of the training range. It is therefore kept out of both
# lists; ColumnTransformer drops unlisted columns (default remainder='drop').
NON_FEATURE_COLUMNS = {'timestamp'}

# The column groups are read from the training frame, which is the frame the
# preprocessor is fitted on.
numerical_features = [
    col for col in X_train.select_dtypes(include=['float64', 'int64']).columns
    if col not in NON_FEATURE_COLUMNS
]
categorical_features = [
    col for col in X_train.select_dtypes(include=['object']).columns
    if col not in NON_FEATURE_COLUMNS
]

# --- Encoding -----------------------------------------------------------------
# Numeric columns are standardised, text columns one-hot encoded. Categories
# that only appear at transform time are encoded as all zeros
# (handle_unknown='ignore').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ]
)

# Fit on the training split only; the test split is transformed with the
# statistics and categories learned from the training data.
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# --- Class balancing ----------------------------------------------------------
# The un-resampled target is looked up from train_data on every run instead of
# reading the global y_train. This cell overwrites y_train with the resampled
# target below, so a second run would otherwise hand SMOTE a feature matrix and
# a target of different lengths and fail.
y_train_original = train_data.loc[X_train.index, 'failure_risk_30d']

# Oversample the training data only; the test data keeps its original class
# distribution.
# NOTE(review): SMOTE interpolates between neighbouring samples, so one-hot
# columns of synthetic rows can hold fractional values. imblearn's SMOTENC is
# meant for mixed numeric/categorical data; consider it if that matters here.
smote = SMOTE(random_state=42)

X_train_processed, y_train = smote.fit_resample(X_train_encoded, y_train_original)


# Model Training 🚂


In [12]:
## 3. Model training
# Each estimator is fitted on X_train_processed / y_train and then predicts the
# labels for X_test_processed. fit() returns the estimator itself, so creation
# and fitting are chained in a single statement.

### Logistic regression
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_processed, y_train)
y_pred_lr = lr.predict(X_test_processed)

### Decision tree
dt = DecisionTreeClassifier(random_state=42).fit(X_train_processed, y_train)
y_pred_dt = dt.predict(X_test_processed)

### k-nearest neighbours (k=5, Euclidean distance)
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean').fit(X_train_processed, y_train)
y_pred_knn = knn.predict(X_test_processed)

### Random forest (100 trees)
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_processed, y_train)
y_pred_rf = rf.predict(X_test_processed)

## 3.5 Ensemble learning
### Bagging, with the decision tree defined above as base estimator
bagging = BaggingClassifier(estimator=dt, random_state=42).fit(X_train_processed, y_train)
y_pred_bag = bagging.predict(X_test_processed)

### Gradient boosting
gb = GradientBoostingClassifier(random_state=42).fit(X_train_processed, y_train)
y_pred_gradboost = gb.predict(X_test_processed)
## 4. Modellbewertung