In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

df = pd.read_csv('/Users/georgkaltenbrunner/python/MLDatenAnonymisiert/data/raw/Bank_Customer_Churn_Prediction.csv')

In [2]:
def test_train(dataset):
    y = dataset['churn']
    X = dataset.drop('churn', axis= 1)
    return train_test_split(X, y, test_size=0.33, random_state=42)

In [3]:
X_train, X_test, y_train, y_test = test_train(df)

In [4]:
# Feature Engineering
# Age

def age_group(df) -> pd.DataFrame:
    bins = [0, 10, 18, 30, 50, 65, float("inf")]
    labels = ["0-10", "11-18", "19-30", "31-50", "51-65", "66+"]

    df["age_group"] = pd.cut(df["age"], bins=bins, labels=labels, right=True, include_lowest=True)
    return df

# CreditSccore

def credit_score_group(df) -> pd.DataFrame:
    bins = [350, 550,  750, float("inf")]
    labels = [0, 1, 2]

    df["credit_score_group"] = pd.cut(df["credit_score"], bins=bins, labels=labels, right=True, include_lowest=True)
    return df

# estimated_salary Quartile

def estimated_salary_quartile(df) -> pd.DataFrame:
    df["salary_quartile"] = pd.qcut(df["estimated_salary"], q=4, labels=[1, 2, 3, 4])
    return df

# Tenure einteilen in "Junge Kund:innen" vs. Bestandskund:inne
def tenure_groups(df) -> pd.DataFrame:
    df["kunden_typ"] = np.where(df["tenure"] < 2, "Junge Kund:in", "Bestandskund:in")
    return df

# Balance Estimated Salary Ratio
def balance_salray_ratio(df) -> pd.DataFrame:
    df["balance_salary_ratio"] = df["balance"] / df["estimated_salary"]
    return df

# Balance Product Ratio
def balance_product_ratio(df) -> pd.DataFrame:
    df["balance_product_ratio"] = df["balance"] / df["products_number"]
    return df

# Credit Score Age Ratio
def credit_score_age_ratio(df) -> pd.DataFrame:
    df["credit_score_age_ratio"] = df["credit_score"] / df["age"]

    return df






def feature_enigneering_pipeline_all_features(df) -> pd.DataFrame:
    # Age
    df = age_group(df)
    # CreditSccore
    df = credit_score_group(df)
    # estimated_salary Quartile
    df = estimated_salary_quartile(df)
    # Tenure
    df = tenure_groups(df)
    # Balance Estimated Salary Ratio
    df = balance_salray_ratio(df)
    # Balance Product Ratio
    df = balance_product_ratio(df)
    # Credit Score Age Ratio
    df = credit_score_age_ratio(df)    
    

    return df

In [5]:
X_train_encoded = feature_enigneering_pipeline_all_features(X_train)
X_test_encoded = feature_enigneering_pipeline_all_features(X_test)

In [6]:
X_test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3300 entries, 6252 to 9401
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   customer_id             3300 non-null   int64   
 1   credit_score            3300 non-null   int64   
 2   country                 3300 non-null   object  
 3   gender                  3300 non-null   object  
 4   age                     3300 non-null   int64   
 5   tenure                  3300 non-null   int64   
 6   balance                 3300 non-null   float64 
 7   products_number         3300 non-null   int64   
 8   credit_card             3300 non-null   int64   
 9   active_member           3300 non-null   int64   
 10  estimated_salary        3300 non-null   float64 
 11  age_group               3300 non-null   category
 12  credit_score_group      3300 non-null   category
 13  salary_quartile         3300 non-null   category
 14  kunden_typ              33

In [7]:
# Logisitsche Regression

from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score

categorical_features = ['country', 'gender', 'age_group', 'credit_score_group', 'salary_quartile', 'kunden_typ']
numeric_features = ['estimated_salary', 'balance'] 



# Preprocessing definieren
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

# Pipeline mit Logistic Regression
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=10000))
])

# Trainiere das Modell
pipeline.fit(X_train, y_train)

# Vorhersage auf Testdaten
y_pred = pipeline.predict(X_test)
f1_score(y_test, y_pred, average='macro')




0.5760138622053618

In [147]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

# Merkmale definieren
categorical_features = ['country', 'gender', 'age_group', 'credit_score_group', 'salary_quartile', 'kunden_typ']
numeric_features = ['estimated_salary', 'balance'] 

# Preprocessing definieren
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

# Pipeline definieren
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=0))
])

# Hyperparameter-Raster für Grid Search
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 50, None],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2],
    'classifier__max_features': ['sqrt', 'log2']
}

# Grid Search mit F1-Score als Bewertungsmaß
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=-1, verbose=24)

# Training
grid_search.fit(X_train, y_train)

# Beste Parameter
print("Beste Parameterkombination:", grid_search.best_params_)

# Vorhersage und Bewertung
y_pred = grid_search.predict(X_test)
score = f1_score(y_test, y_pred, average='macro')
print("F1-Score (macro):", score)


Fitting 5 folds for each of 48 candidates, totalling 240 fits
[CV 2/5; 1/48] START classifier__max_depth=10, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100
[CV 4/5; 1/48] START classifier__max_depth=10, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100
[CV 1/5; 1/48] START classifier__max_depth=10, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100
[CV 5/5; 1/48] START classifier__max_depth=10, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100
[CV 3/5; 1/48] START classifier__max_depth=10, classifier__max_features=sqrt, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100
[CV 2/5; 2/48] START classifier__max_depth=10, classifier__max_features=sqrt, 

In [9]:
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

# Features definieren
categorical_features = ['country', 'gender', 'age_group', 'credit_score_group', 'salary_quartile', 'kunden_typ']
numeric_features = ['estimated_salary', 'balance']

# Preprocessing
preprocessor = ColumnTransformer(transformers=[
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ('num', StandardScaler(), numeric_features)
])

# Pipeline mit XGBoost
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0))
])

# Hyperparameter-Tuning
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [3, 6, 10],
    'classifier__learning_rate': [0.01, 0.1, 0.3],
    'classifier__subsample': [0.8, 1.0],
    'classifier__colsample_bytree': [0.8, 1.0]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_macro', n_jobs=-1)

# Trainieren
grid_search.fit(X_train, y_train)

# Beste Parameter
print("Beste Parameterkombination:", grid_search.best_params_)

# Vorhersage & Bewertung
y_pred = grid_search.predict(X_test)
score = f1_score(y_test, y_pred, average='macro')
print("F1-Score (macro):", score)


ModuleNotFoundError: No module named 'xgboost'