In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
import seaborn as sns

# Load the data.
# NOTE(review): absolute path on one developer's machine; adjust DATASET_PATH
# before running the notebook anywhere else.
DATASET_PATH = '/Users/mustafa/Documents/GitHub/ML_knife_QA/data/chiefs_knife_dataset.xlsx'
RANDOM_SEED = 42  # same value the modelling cell passes as random_state

dataset = pd.read_excel(DATASET_PATH)
index_Ra = dataset.columns.get_loc('Ra')  # position of the roughness column

# Specification limits for Ra
LOWER_SPECIFICATION_LIMIT = 0.125
UPPER_SPECIFICATION_LIMIT = 0.215

# Label every row "good" when Ra lies in [lower, upper), otherwise "bad",
# and place the label column directly after 'Ra'.
is_between_specification_bounds = (dataset['Ra'] >= LOWER_SPECIFICATION_LIMIT) & (dataset['Ra'] < UPPER_SPECIFICATION_LIMIT)
good_product_range = np.where(is_between_specification_bounds, "good", "bad")
dataset.insert(index_Ra + 1, 'Quality', good_product_range)

# Features and target. The label slice includes both end columns.
X = dataset.loc[:, 'Original_Linienanzahl':'DFT_Median_sobel_Bereich'].values
y = dataset['Quality'].values

# Data augmentation by additive Gaussian noise (mean 0, standard deviation 0.01).
# The generator is seeded so that Restart & Run All reproduces the same noisy
# copies; the unseeded global RNG gave different data on every run.
rng = np.random.default_rng(RANDOM_SEED)
noise = rng.normal(0, 0.01, X.shape)
X_augmented = X + noise
y_augmented = np.copy(y)  # labels are unchanged by the noise

# Stack originals on top of their augmented copies:
# row i and row i + len(X) describe the same sample.
X_combined = np.vstack((X, X_augmented))
y_combined = np.concatenate((y, y_augmented))



In [4]:
# Show the stacked feature matrix, then the stacked label vector.
for stacked in (X_combined, y_combined):
    print(stacked)



[[ 8.70000000e+01  1.19310345e+01  2.00870480e+01 ...  2.60000000e+01
   2.00000000e+00  2.40000000e+01]
 [ 7.70000000e+01  1.48831169e+01  2.29488273e+01 ...  2.60000000e+01
   4.00000000e+00  2.20000000e+01]
 [ 5.90000000e+01  1.96779661e+01  3.18382781e+01 ...  1.60000000e+01
   5.00000000e+00  1.10000000e+01]
 ...
 [ 9.93723333e-01  9.93468941e-01 -1.30763445e-02 ...  2.69993620e+01
   1.99309488e+00  2.50092318e+01]
 [ 1.00663969e+00  1.01341673e+00 -2.14581095e-03 ...  2.49954164e+01
   2.99579480e+00  2.20030958e+01]
 [ 9.94045386e-01  9.94401332e-01  1.31977327e-02 ...  2.79916869e+01
   4.99609517e+00  2.29972671e+01]]
['good' 'good' 'good' ... 'good' 'good' 'good']


In [None]:
# Train-test split
# X_combined is np.vstack((X, X_augmented)): row i and row i + len(X) are the
# same sample, once original and once with small noise added. Splitting the
# stacked array directly can place a sample in the test set while its
# near-identical copy is in the training set, which inflates every test metric.
# Split by original sample instead, train on the originals plus their
# augmented copies, and evaluate on untouched originals only.
n_original = len(X)
assert len(X_combined) == 2 * n_original, "expected originals stacked on top of their augmented copies"
train_idx, test_idx = train_test_split(np.arange(n_original), test_size=0.2, random_state=42)
augmented_train_idx = train_idx + n_original

X_train = np.vstack((X_combined[train_idx], X_combined[augmented_train_idx]))
y_train = np.concatenate((y_combined[train_idx], y_combined[augmented_train_idx]))
X_test, y_test = X_combined[test_idx], y_combined[test_idx]
# NOTE(review): original/augmented pairs can still end up in different folds of
# any cross-validation run on X_train, so CV scores remain optimistic; the
# held-out test metrics below are not affected.

# Feature scaling: statistics are fitted on the training data only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Baseline RandomForestClassifier (before the randomized search)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train_scaled, y_train)

# Predictions on the held-out test set
y_pred = classifier.predict(X_test_scaled)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy (Original Model): {accuracy}')
print(f'Classification Report (Original Model):\n{report}')

# Confusion matrix for the baseline model
cm_original = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm_original, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted Bad', 'Predicted Good'], yticklabels=['Actual Bad', 'Actual Good'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Original Model)')
plt.show()

# Candidate hyperparameter values sampled by the randomized search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2'],
}

# Randomized search over the random forest hyperparameters
search_settings = dict(
    n_iter=100,          # number of random combinations to try
    cv=5,                # cross-validation folds
    scoring='accuracy',  # pick the combination with the best accuracy
    n_jobs=-1,           # use every available core
    verbose=2,           # progress output
    random_state=42,     # reproducible sampling of combinations
)
random_search_classifier = RandomizedSearchCV(classifier, param_grid, **search_settings)
random_search_classifier.fit(X_train_scaled, y_train)

# Report the winning hyperparameters
print(f'Beste Hyperparameter für Classifier: {random_search_classifier.best_params_}')

# Best estimator found by the search, and its test-set predictions
best_model = random_search_classifier.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)

# Evaluate the tuned model
accuracy_best = accuracy_score(y_test, y_pred_best)
report_best = classification_report(y_test, y_pred_best)
print(f'Best Model Accuracy: {accuracy_best}')
print(f'Best Model Classification Report:\n{report_best}')

# Confusion matrix for the tuned model
cm_best = confusion_matrix(y_test, y_pred_best)
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm_best,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=['Predicted Bad', 'Predicted Good'],
    yticklabels=['Actual Bad', 'Actual Good'],
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix (Best Model)')
plt.show()

# Feature importances of the tuned model, ordered from most to least important
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1]

# Feature names
# X was selected with the label slice
# 'Original_Linienanzahl':'DFT_Median_sobel_Bereich', which includes the end
# column. A positional slice excludes its stop index, so the stop must be
# feature2 + 1; without it the last feature name is missing and
# feature_names[indices] raises IndexError.
feature1 = dataset.columns.get_loc('Original_Linienanzahl')
feature2 = dataset.columns.get_loc('DFT_Median_sobel_Bereich')
feature_names = dataset.columns[feature1:feature2 + 1]
assert len(feature_names) == len(importances), "feature names must line up with the model's features"

# Bar chart of the importances
n_features = len(importances)
plt.figure(figsize=(12, 8))
plt.title("Feature Importances")
plt.bar(range(n_features), importances[indices], align="center")
plt.xticks(range(n_features), feature_names[indices], rotation=90)
plt.xlim([-1, n_features])
plt.show()

# Tabular view of the same ranking
feature_importances = pd.DataFrame({'Feature': feature_names[indices], 'Importance': importances[indices]})
print(feature_importances)


In [8]:
# Scratch check: joining two length-1 boolean arrays yields one length-2 array.
a, b = np.array([True]), np.array([False])
for arr in (a, b):
    print(arr.shape)
v = np.concatenate((a, b))
v.shape

(1,)
(1,)


(2,)