### Imports

In [74]:
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from pathlib import Path


### Load Datasets

In [75]:
data_path = Path.cwd()

# Pfad zur CSV-Datei das Später erstellt wird
train = pd.read_csv(data_path/'../csv/feature_data/train_engineered.csv')
test = pd.read_csv(data_path/'../csv/feature_data/test_engineered.csv')


In [76]:
print(len(train))
print(len(test))

891
418


### Modell

### Skitlearn RandomForest

In [84]:
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from pathlib import Path


data_path = Path.cwd()

# Pfad zur CSV-Datei das Später erstellt wird
train = pd.read_csv(data_path/'../csv/feature_data/train_engineered.csv')
test = pd.read_csv(data_path/'../csv/feature_data/test_engineered.csv')
print(len(train))
print(len(test))

# Überprüfe, ob im Testdatensatz 'Survived' fehlt und setze es auf -1, um die Struktur beizubehalten
test['Survived'] = -1

# Bereinige die Daten, falls nötig (z.B. fehlende Werte)
# Für das Beispiel hier keine Bereinigung angenommen.

# train =  train.drop(['Ticket'], axis=1)
# test = test.drop(['Ticket'], axis=1)

# Feature und Label definieren
X_train = train.drop(['PassengerId', 'Survived'], axis=1)
y_train = train['Survived']

# Modell vorbereiten
model = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=7))

# Cross-Validation
cv = KFold(n_splits=10, random_state=42, shuffle=True)
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
print(f'Accuracy scores for the 10 folds: {scores}')
print(f'Average accuracy: {scores.mean()}')

# Modell mit dem gesamten Trainingsdatensatz trainieren
model.fit(X_train, y_train)

# Vorhersagen für Testdatensatz machen
X_test = test.drop(['PassengerId', 'Survived'], axis=1)
predictions = model.predict(X_test)

# Vorhersagen in den Testdatensatz einfügen und in CSV exportieren
test['Survived'] = predictions
result = test[['PassengerId', 'Survived']]
result.to_csv(data_path/'../csv/submission_test.csv', index=False)

891
418
Accuracy scores for the 10 folds: [0.83333333 0.82022472 0.82022472 0.78651685 0.85393258 0.86516854
 0.80898876 0.78651685 0.84269663 0.87640449]
Average accuracy: 0.8294007490636706


### XGBoost-Klassifikator

In [83]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from pathlib import Path


data_path = Path.cwd()

# Pfad zur CSV-Datei das Später erstellt wird
train = pd.read_csv(data_path/'../csv/feature_data/train_engineered.csv')
test = pd.read_csv(data_path/'../csv/feature_data/test_engineered.csv')
print(len(train))
print(len(test))

# Angenommen, deine Trainings- und Testdatensätze wurden bereits geladen in `train` und `test`

# Setze 'Survived' im Testdatensatz auf -1, um die Struktur beizubehalten
test['Survived'] = -1

# Entferne die 'Ticket' Spalte aus beiden Datensätzen
# train = train.drop(['Ticket'], axis=1)
# test = test.drop(['Ticket'], axis=1)

# Definiere Features und Label
X_train = train.drop(['PassengerId', 'Survived'], axis=1)
y_train = train['Survived']

# Vorbereitung des Modells
# Hier verwenden wir XGBClassifier direkt, da XGBoost seine eigene Methode zur Handhabung von Feature-Skalierung hat,
# und es ist nicht zwingend notwendig, StandardScaler in der Pipeline zu verwenden.
model = XGBClassifier(n_estimators=100, random_state=7, use_label_encoder=False, eval_metric='logloss')

# Cross-Validation
cv = KFold(n_splits=15, random_state=42, shuffle=True)
scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
print(f'Accuracy scores for the 10 folds: {scores}')
print(f'Average accuracy: {scores.mean()}')

# Trainiere das Modell mit dem gesamten Trainingsdatensatz
model.fit(X_train, y_train)

# Mache Vorhersagen für den Testdatensatz
X_test = test.drop(['PassengerId', 'Survived'], axis=1)
predictions = model.predict(X_test)

# Füge Vorhersagen in den Testdatensatz ein und exportiere in CSV
test['Survived'] = predictions
result = test[['PassengerId', 'Survived']]
result.to_csv(data_path / '../csv/submission_test.csv', index=False)


891
418
Accuracy scores for the 10 folds: [0.81666667 0.8        0.81666667 0.78333333 0.8        0.7
 0.86440678 0.83050847 0.86440678 0.79661017 0.77966102 0.83050847
 0.79661017 0.83050847 0.89830508]
Average accuracy: 0.8138794726930321


### PCA für Dimensionalitätsreduktion

In [79]:
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import pandas as pd
from pathlib import Path


data_path = Path.cwd()

# Pfad zur CSV-Datei das Später erstellt wird
train = pd.read_csv(data_path/'../csv/feature_data/train_engineered.csv')
test = pd.read_csv(data_path/'../csv/feature_data/test_engineered.csv')
print(len(train))
print(len(test))


# # Entferne die 'Ticket' Spalte (wie in Ihrem Code)
# train.drop(['Ticket'], axis=1, inplace=True)
# test.drop(['Ticket'], axis=1, inplace=True)

# Features und Label definieren
X_train = train.drop(['PassengerId', 'Survived'], axis=1)
y_train = train['Survived']

# PCA und XGBoost in einer Pipeline
pipeline = Pipeline([
    ('pca', PCA(n_components=0.95)),  # behalte 95% der Varianz
    ('classifier', XGBClassifier(n_estimators=100, random_state=7, use_label_encoder=False, eval_metric='logloss'))
])

# Cross-Validation
cv = KFold(n_splits=15, random_state=42, shuffle=True)
scores = cross_val_score(pipeline, X_train, y_train, cv=cv, scoring='accuracy')

print(f'Accuracy scores for the folds: {scores}')
print(f'Average accuracy: {scores.mean()}')

# Trainieren des Modells mit dem gesamten Trainingsdatensatz
pipeline.fit(X_train, y_train)

# Vorhersagen für den Testdatensatz
X_test = test.drop(['PassengerId', 'Survived'], axis=1)
predictions = pipeline.predict(X_test)

# Ergebnisse in CSV exportieren
test['Survived'] = predictions
result = test[['PassengerId', 'Survived']]
# result.to_csv('Pfad_für_Ihre_Ergebnisdatei.csv', index=False)  # Pfad aktualisieren


891
418
Accuracy scores for the folds: [0.65       0.61666667 0.78333333 0.7        0.51666667 0.65
 0.59322034 0.6440678  0.71186441 0.76271186 0.6779661  0.61016949
 0.6779661  0.57627119 0.79661017]
Average accuracy: 0.6645009416195857


### Hyperparameter-Tuning mit RandomizedSearchCV

In [81]:
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, KFold, cross_val_score
from xgboost import XGBClassifier
from pathlib import Path
import numpy as np

data_path = Path.cwd()

# Lade die Datensätze
train = pd.read_csv(data_path/'../csv/feature_data/train_engineered.csv')
test = pd.read_csv(data_path/'../csv/feature_data/test_engineered.csv')

# Setze 'Survived' im Testdatensatz auf -1
test['Survived'] = -1

# Definiere Features und Labels
X_train = train.drop(['PassengerId', 'Survived'], axis=1)
y_train = train['Survived']

# Hyperparameter-Tuning
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

param_dist = {
    'n_estimators': np.arange(50, 400, 50),
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'max_depth': np.arange(3, 10, 1),
    'colsample_bytree': [0.3, 0.5, 0.7, 0.9, 1.0],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1.0]
}

random_search = RandomizedSearchCV(xgb, param_distributions=param_dist, n_iter=25, scoring='accuracy', n_jobs=-1, cv=5, random_state=42)
random_search.fit(X_train, y_train)

print(f"Beste Parameter: {random_search.best_params_}")
print(f"Beste Accuracy: {random_search.best_score_}")

# Verwende die besten Parameter, um das Modell zu trainieren
model = XGBClassifier(**random_search.best_params_)
model.fit(X_train, y_train)

# Mache Vorhersagen für den Testdatensatz und exportiere Ergebnisse
X_test = test.drop(['PassengerId', 'Survived'], axis=1)
predictions = model.predict(X_test)

test['Survived'] = predictions
result = test[['PassengerId', 'Survived']]
result.to_csv(data_path / '../csv/submission_test.csv', index=False)

Beste Parameter: {'subsample': 0.6, 'n_estimators': 100, 'max_depth': 9, 'learning_rate': 0.05, 'colsample_bytree': 0.5}
Beste Accuracy: 0.8439708743958321


### Ensemble-Modell mit Stacking

In [82]:
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

import pandas as pd
from sklearn.model_selection import RandomizedSearchCV, KFold, cross_val_score
from xgboost import XGBClassifier
from pathlib import Path
import numpy as np

data_path = Path.cwd()

# Lade die Datensätze
train = pd.read_csv(data_path/'../csv/feature_data/train_engineered.csv')
test = pd.read_csv(data_path/'../csv/feature_data/test_engineered.csv')

# Setze 'Survived' im Testdatensatz auf -1
test['Survived'] = -1

# Definiere Features und Labels
X_train = train.drop(['PassengerId', 'Survived'], axis=1)
y_train = train['Survived']

# Definiere Basis-Modelle
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(**random_search.best_params_)),
]

# Definiere das Meta-Modell
meta_model = LogisticRegression()

# Erstelle den StackingClassifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# Trainiere und evaluiere das Modell mit Cross-Validation
stacking_model.fit(X_train, y_train)
stacking_scores = cross_val_score(stacking_model, X_train, y_train, cv=5, scoring='accuracy')

print(f"Accuracy des Stacking-Modells: {np.mean(stacking_scores)}")

# Vorhersagen für den Testdatensatz und exportiere Ergebnisse
predictions = stacking_model.predict(X_test)

test['Survived'] = predictions
result = test[['PassengerId', 'Survived']]
result.to_csv(data_path / '../csv/submission_test.csv', index=False)


Accuracy des Stacking-Modells: 0.8350072186303434
