## SGDClassifier: Training & Anwendung auf Punktwolken

**Abstract:**
In diesem Workflow wird ein Klassifikationsmodell für Punktwolkendaten mit dem SGDClassifier aus Scikit-learn und Optuna für die Hyperparameter-Optimierung trainiert. Die Punktwolkendaten werden eingelesen, vorverarbeitet und in Trainings- und Testdaten aufgeteilt. Nach erfolgreichem Training und Evaluierung wird das Modell auf neue Punktwolken angewendet, die vorhergesagten Klassen werden gespeichert und die Ergebnisse strukturiert ausgegeben. Alle Zwischen- und Endergebnisse werden zentral im definierten Output-Verzeichnis abgelegt.

In [None]:
# Training & Anwendung auf Punktwolken

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
import joblib
import pickle
import os
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import time

# ---- Paths ----
input_datei = "PW_Klass_P3A1_gesamt_normalisiert.txt"
output_path = "output"

# Result folder. The run name must match the one assigned again right before
# the Optuna step; a different value here only leaves an empty stray directory
# behind, because os.makedirs() below creates it immediately.
model_name = "v3_H_nz"
result_dir = os.path.join(output_path, f"Resultate_SGDClassifier_{model_name}")
os.makedirs(result_dir, exist_ok=True)

# The folder name has the form "Resultate_<classifier>_<model_name>".
# Split at most twice so that underscores inside the model name (e.g.
# "v3_H_nz") stay together instead of being mistaken for the classifier part.
parts2 = os.path.basename(result_dir).split("_", 2)
classifier, model_name_short = parts2[1], parts2[2]

# Wall-clock reference for the runtime printed after training.
start_time = time.time()

# 1. Read the labelled point cloud (semicolon-separated, no header row)
df = pd.read_csv(input_datei, header=None, sep=";", decimal=".")

# 1.1 Assign column names: coordinates, RGB, HSV, direction components, label
columns = (
    ["X", "Y", "Z"]
    + ["Red", "Green", "Blue"]
    + ["Hue", "Saturation", "Value"]
    + ["X_dir", "Y_dir", "Z_dir"]
    + ["Label"]
)
df.columns = columns

# 2. Label encoding: keep the categorical column, add its integer codes and
#    remember the code -> category lookup for decoding predictions later
label_categorical = df["Label"].astype("category")
df["Label"] = label_categorical
df["LabelEncoded"] = label_categorical.cat.codes
label_mapping = {
    code: category
    for code, category in enumerate(label_categorical.cat.categories)
}

# Optional: bar chart of how many points each label has
label_verteilung_png = os.path.join(output_path, "label_verteilung.png")
label_counts = df["Label"].value_counts()
label_counts.plot(kind='bar', title="Label Verteilung")
plt.tight_layout()
plt.savefig(label_verteilung_png)
plt.close()

# 3. Features and target
# Only two features are used here; extend this list to train on more columns.
feature_columns = ["Hue", "Z_dir"]
X = df[feature_columns]
y = df["LabelEncoded"]

# 3.1 Hold out 30 % of the points for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# 4. Scaling: fit on the training split only, then apply to both splits
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Modell trainieren mit Optuna und SGDClassifier
import optuna
from sklearn.model_selection import cross_val_score

# Result folder for this run; every artefact written below goes in here.
model_name = "v3_H_nz"
result_dir = os.path.join(output_path, f"Resultate_SGDClassifier_{model_name}")
os.makedirs(result_dir, exist_ok=True)

# The folder name has the form "Resultate_<classifier>_<model_name>".
# Split at most twice so that underscores inside the model name stay together;
# a plain split("_") with [-2]/[-1] would yield "H" and "nz" for "v3_H_nz"
# and drop the classifier name from the output file names.
parts2 = os.path.basename(result_dir).split("_", 2)
classifier, model_name_short = parts2[1], parts2[2]

# Optuna-Ziel-Funktion definieren
def objective(trial):
    """Optuna objective: cross-validated error of an SGDClassifier.

    Samples ``alpha`` (log-uniform in [1e-6, 1e-1]), ``loss`` and ``penalty``
    from the trial, builds a class-balanced SGDClassifier with early stopping
    and evaluates it with 3-fold cross-validation on the scaled training data.

    Args:
        trial: The Optuna trial used to draw the hyperparameters.

    Returns:
        ``1.0 - mean CV score`` (estimator's default scorer), so that a
        lower value is better and the study can minimise it.
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    alpha = trial.suggest_float("alpha", 1e-6, 1e-1, log=True)
    loss = trial.suggest_categorical("loss", ["hinge", "log_loss", "modified_huber"])
    penalty = trial.suggest_categorical("penalty", ["l2", "elasticnet"])

    clf = SGDClassifier(
        alpha=alpha,
        loss=loss,
        penalty=penalty,
        max_iter=1000,
        early_stopping=True,
        n_iter_no_change=5,
        tol=1e-3,
        random_state=42,
        verbose=0,
        class_weight="balanced",
    )
    score = cross_val_score(clf, X_train_scaled, y_train, cv=3).mean()
    return 1.0 - score

# The objective returns 1 - CV score, so the study minimises it. The sampler
# is seeded like every other random component of this script so that a rerun
# explores the same sequence of hyperparameters.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

# Save Optuna visualisations
import optuna.visualization as vis
optuna_history_png = os.path.join(result_dir, "optuna_optimization_history.png")
optuna_history_html = os.path.join(result_dir, "optuna_optimization_history.html")
optuna_param_html = os.path.join(result_dir, "optuna_param_importance.html")

# Build the history figure once and reuse it for both output formats.
fig = vis.plot_optimization_history(study)
fig.write_html(optuna_history_html)
try:
    # Static image export depends on an optional plotly backend; a missing
    # backend must not abort the run before the model has been trained and
    # saved, so the PNG is treated as best-effort.
    fig.write_image(optuna_history_png)
except (ImportError, ValueError, RuntimeError) as exc:
    print(f"Warnung: PNG-Export der Optuna-Historie übersprungen ({exc})")
vis.plot_param_importances(study).write_html(optuna_param_html)

# Train the final model with the best hyperparameters found by the study.
# All fixed settings, including class_weight="balanced", mirror the ones used
# inside the Optuna objective; otherwise the tuned alpha/loss/penalty would be
# applied to a differently configured classifier than the one they were
# selected for.
model = SGDClassifier(
    **study.best_params,
    max_iter=1000,
    early_stopping=True,
    n_iter_no_change=5,
    tol=1e-3,
    random_state=42,
    class_weight="balanced",
)
model.fit(X_train_scaled, y_train)

# Store training and test score
training_score = model.score(X_train_scaled, y_train)
test_score = model.score(X_test_scaled, y_test)
score_info = (
    f"Trainings-Score: {training_score:.4f}\n"
    f"Test-Score: {test_score:.4f}\n"
)

score_txt = os.path.join(result_dir, "training_scores.txt")
# Explicit encoding, consistent with the classification report written later.
with open(score_txt, "w", encoding="utf-8") as f:
    f.write(score_info)

# Plot the score per epoch (only if the fitted model exposes it).
# NOTE(review): SGDClassifier does not appear to provide a
# `validation_scores_` attribute, so this branch is probably never taken —
# confirm against the installed scikit-learn version.
if hasattr(model, "validation_scores_"):
    plt.plot(model.validation_scores_)
    plt.title("Validation Score pro Epoche")
    plt.xlabel("Epoche")
    plt.ylabel("Score")
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(os.path.join(result_dir, "validation_score_plot.png"))
    plt.close()

# Class names and their integer codes (category i is encoded as code i).
# Passing the codes explicitly keeps names and rows aligned even when a class
# happens to be missing from the test split or from the predictions; without
# `labels`, a missing class makes the number of names and classes disagree.
class_names = [str(name) for name in df["Label"].cat.categories]
class_codes = list(range(len(class_names)))

# Classification report
y_pred = model.predict(X_test_scaled)
report = classification_report(
    y_test,
    y_pred,
    labels=class_codes,
    target_names=class_names,
)
with open(os.path.join(result_dir, "klassifikationsbericht.txt"), "w", encoding="utf-8") as f:
    f.write(report)

# Save the confusion matrix
disp = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_pred,
    labels=class_codes,
    display_labels=class_names,
    cmap="Blues",
    xticks_rotation=45,
)
disp.figure_.savefig(os.path.join(result_dir, "confusion_matrix.png"))
# Close exactly the figure that was just saved.
plt.close(disp.figure_)

# Persist model, scaler and label mapping next to the other results
model_path = os.path.join(result_dir, "sgd_model.pkl")
scaler_path = os.path.join(result_dir, "scaler.pkl")
mapping_path = os.path.join(result_dir, "label_mapping.pkl")

joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)
with open(mapping_path, "wb") as f:
    pickle.dump(label_mapping, f)

# Report the elapsed training time as hours:minutes:seconds
elapsed = time.time() - start_time
h = elapsed // 3600
rem = elapsed % 3600
m = rem // 60
s = rem % 60
print(f"Laufzeit: {int(h):02d}:{int(m):02d}:{s:05.2f} (Std:Min:Sek)")


# -----------------
# Anwendung auf neue Punktwolke

import time
from pathlib import Path

# Restart the clock for the application phase
start_time = time.time()

# 1. Reload model, scaler and label mapping from the result folder.
#    Only load pickle files this script wrote itself — unpickling runs code.
model_path = os.path.join(result_dir, "sgd_model.pkl")
scaler_path = os.path.join(result_dir, "scaler.pkl")
mapping_path = os.path.join(result_dir, "label_mapping.pkl")

model = joblib.load(model_path)
scaler = joblib.load(scaler_path)
with open(mapping_path, "rb") as f:
    label_mapping = pickle.load(f)

# 2. Load the new (unlabelled) point cloud
input_file = Path("PW_P3A2_normalisiert.txt")
df_new = pd.read_csv(input_file, header=None, sep=";")

# 3. Derive the output name from the input file name:
#    first token = object, second token = class part ("Unknown" if absent)
file_name = input_file.stem
parts = file_name.split("_")
objekt = parts[0]
if len(parts) >= 2:
    klass = parts[1]
else:
    klass = "Unknown"
file_name_neu = "_".join([objekt, klass, classifier, model_name_short])

# 4. Column names (same layout as the training data, without the label)
df_new.columns = (
    ["X", "Y", "Z"]
    + ["Red", "Green", "Blue"]
    + ["Hue", "Saturation", "Value"]
    + ["X_dir", "Y_dir", "Z_dir"]
)

# 5. Feature selection (same columns the model was trained on)
X_new = df_new[["Hue", "Z_dir"]]

# 6. Scale with the training scaler, classify, and attach both the numeric
#    prediction and its decoded class name to the point cloud
X_new_scaled = scaler.transform(X_new)
predicted_labels = model.predict(X_new_scaled)
df_new["Predicted_Label"] = predicted_labels
df_new["Label_Decoded"] = df_new["Predicted_Label"].map(label_mapping)

# 7. Write the full point cloud including the predictions
output_txt = os.path.join(result_dir, f"{file_name_neu}.txt")
df_new.to_csv(output_txt, index=False, sep=";")
print(f"Punktwolke mit Vorhersage gespeichert als {output_txt}")

# Additionally write one file per predicted class
points_by_class = df_new.groupby("Label_Decoded")
for label_value, class_points in points_by_class:
    klass_datei = os.path.join(result_dir, f"{file_name_neu}_klasse_{label_value}.txt")
    class_points.to_csv(klass_datei, index=False, sep=";")
    print(f"Klasse '{label_value}' gespeichert als {klass_datei}")

# 8. Report the elapsed application time as hours:minutes:seconds
elapsed = time.time() - start_time
h = elapsed // 3600
rem = elapsed % 3600
m = rem // 60
s = rem % 60
print(f"Laufzeit: {int(h):02d}:{int(m):02d}:{s:05.2f} (Std:Min:Sek)")


[I 2025-05-23 13:19:28,868] A new study created in memory with name: no-name-84044718-8237-4c09-8d8c-6a273f92096a
  alpha = trial.suggest_loguniform("alpha", 1e-6, 1e-1)
[I 2025-05-23 13:20:23,975] Trial 0 finished with value: 0.45431452363583047 and parameters: {'alpha': 0.002680583242091042, 'loss': 'modified_huber', 'penalty': 'elasticnet'}. Best is trial 0 with value: 0.45431452363583047.
  alpha = trial.suggest_loguniform("alpha", 1e-6, 1e-1)
[I 2025-05-23 13:21:11,596] Trial 1 finished with value: 0.45172909443302167 and parameters: {'alpha': 0.006796202139893583, 'loss': 'log_loss', 'penalty': 'elasticnet'}. Best is trial 1 with value: 0.45172909443302167.
  alpha = trial.suggest_loguniform("alpha", 1e-6, 1e-1)
[I 2025-05-23 13:22:20,170] Trial 2 finished with value: 0.44491568668767245 and parameters: {'alpha': 0.0013722298867714744, 'loss': 'hinge', 'penalty': 'elasticnet'}. Best is trial 2 with value: 0.44491568668767245.
  alpha = trial.suggest_loguniform("alpha", 1e-6, 1e-1

Laufzeit: 00:28:17.39 (Std:Min:Sek)
Punktwolke mit Vorhersage gespeichert als output\Resultate_SGDClassifier_v3_H_nz\PW_P3A2_H_nz.txt
Klasse 'Bäume' gespeichert als output\Resultate_SGDClassifier_v3_H_nz\PW_P3A2_H_nz_klasse_Bäume.txt
Klasse 'Ground' gespeichert als output\Resultate_SGDClassifier_v3_H_nz\PW_P3A2_H_nz_klasse_Ground.txt
Laufzeit: 00:01:08.99 (Std:Min:Sek)
