In [1]:
import pandas as pd
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, OneHotEncoder

## Erstelle aus `bank-churn.csv` einen DataFrame

## Data Cleaning

In [2]:
# Read the churn data and drop the three columns that must not end up as
# model features.
df = pd.read_csv("data/bank-churn.csv").drop(
    columns=["RowNumber", "Surname", "CustomerId"]
)

# One-hot encode the two string-valued columns; every other column is passed
# through unchanged.
categorical_columns = ["Geography", "Gender"]
tf = ColumnTransformer(
    transformers=[("oneHot", OneHotEncoder(), categorical_columns)],
    remainder="passthrough",
)

# Preprocessing chain: column-wise encoding first, then robust scaling of the
# complete resulting feature matrix.
pipeline = Pipeline(
    steps=[
        ("column_transformer", tf),
        ("scaler", RobustScaler()),
    ]
)

## Training
* 20% Testdaten
* `random_state=42` sofern unterstützt
* target feature: `Exited`
* Ziel: score von 0.85

In [3]:
# Separate the target column from the feature columns.
target_column = "Exited"
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing on the training rows only, then apply the fitted
# transformation to the test rows.
X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

In [4]:
# Search space for the random forest: number of trees is sampled from
# randint(50, 500), the other two hyperparameters from fixed candidate lists.
forest_param_distributions = {
    "n_estimators": randint(50, 500),
    "max_features": [1.0, "sqrt", "log2"],
    "class_weight": [None, "balanced", "balanced_subsample"],
}

# Randomized search: 25 sampled configurations, 3-fold cross-validation,
# ranked by accuracy. Both the forest and the sampler are seeded.
forest_cv = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=forest_param_distributions,
    n_iter=25,
    cv=3,
    scoring="accuracy",
    n_jobs=4,
    random_state=42,
)
forest_cv.fit(X_train, y_train)

RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(random_state=42),
                   n_iter=25, n_jobs=4,
                   param_distributions={'class_weight': [None, 'balanced',
                                                         'balanced_subsample'],
                                        'max_features': [1.0, 'sqrt', 'log2'],
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000002078FCFA100>},
                   random_state=42, scoring='accuracy')

In [5]:
# Predict the test rows with the best model found by the search, then compare
# against the true labels (rows: true class, columns: predicted class).
test_predictions = forest_cv.best_estimator_.predict(X_test)
confusion_matrix(y_test, test_predictions)

array([[1550,   57],
       [ 211,  182]], dtype=int64)

In [6]:
# Per-class precision, recall and F1 of the best model on the test rows.
# classification_report returns a multi-line string, so it is printed rather
# than shown as a bare expression (which would display the escaped repr).
print(classification_report(y_test, forest_cv.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.76      0.46      0.58       393

    accuracy                           0.87      2000
   macro avg       0.82      0.71      0.75      2000
weighted avg       0.86      0.87      0.85      2000



## Interpretiere den rechten oberen Eintrag der Confusion Matrix

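Der rechte obere Eintrag (57) sind die False Positives: Kunden, die in Wirklichkeit nicht gegangen sind, für die das Modell aber fälschlicherweise vorhergesagt hat, dass sie gegangen sind.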

## Predicte, mit welcher Wahrscheinlichkeit der Kunde aussteigen wird
Er sollte mit etwa 90%iger Wahrscheinlichkeit **bleiben**

In [7]:
# Single hypothetical customer to score; the keys are the feature column names.
customer_raw = pd.DataFrame(
    data={
        'CreditScore': 500,
        'Age': 20,
        'Tenure': 2,
        'Balance': 0,
        'NumOfProducts': 1,
        'HasCrCard': 1,
        'IsActiveMember': 1,
        'EstimatedSalary': 60_000,
        'Geography': 'Germany',
        'Gender': 'Male',
    },
    index=[0],
)

# Bring the columns into the same order as the training features before
# transforming. The preprocessing uses remainder="passthrough", and older
# scikit-learn releases reject a DataFrame whose column order differs from the
# one seen during fit. Selecting by X.columns also fails loudly (KeyError) if a
# feature was forgotten above.
customer = pipeline.transform(customer_raw[X.columns])

# One row of class probabilities; columns follow the classifier's classes_
# (0 = stays, 1 = exits).
forest_cv.best_estimator_.predict_proba(customer)

array([[0.85929648, 0.14070352]])