In [43]:
import pandas as pd
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler

## Erstelle aus `bank-churn.csv` einen DataFrame

In [44]:
df = pd.read_csv("data/bank-churn.csv")
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


## Data Cleaning

In [12]:
df.Gender.unique()

array(['Female', 'Male'], dtype=object)

In [17]:
df.drop(columns=["RowNumber","Surname"])
df["isMale"] = df.Gender.replace({"Female": 0, "Male": 1})
df = pd.get_dummies(data=df, columns=["Geography"])
df = df.drop(columns=["RowNumber","Surname","Gender","CustomerId"])
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,isMale,Geography_France,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,0,1,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,0,1,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,1,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,0,0,1


## Training
* 20% Testdaten
* `random_state=42` sofern unterstützt
* target feature: `Exited`
* Ziel: score von 0.85

In [42]:
X = df.drop(columns="Exited")
y = df.Exited

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 ,random_state=42)

transformer = ColumnTransformer(transformers=[
    ('dummies', pd.get_dummies(df, columns=["Geography"])),
], remainder='passthrough')

pipeline = Pipeline([
    ('column_transformer', transformer),
    ('scaler', RobustScaler())
])

X_train = pipeline.fit_transform(X_train)
X_test = pipeline.transform(X_test)

KeyError: "None of [Index(['Geography'], dtype='object')] are in the [columns]"

In [28]:
forest_cv = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions= {
        'n_estimators': randint(50, 500),
        'max_features': [1.0, 'sqrt', 'log2'],
        'class_weight': [None, 'balanced', 'balanced_subsample']
    },
    n_jobs=2,
    n_iter=25,
    scoring="accuracy",
    cv=3,
    random_state=42
)
forest_cv.fit(X_train, y_train)

In [29]:
confusion_matrix(y_test, forest_cv.best_estimator_.predict(X_test))

array([[1550,   57],
       [ 204,  189]], dtype=int64)

In [30]:
from sklearn.metrics import classification_report

print(classification_report(y_test, forest_cv.best_estimator_.predict(X_test)))

              precision    recall  f1-score   support

           0       0.88      0.96      0.92      1607
           1       0.77      0.48      0.59       393

    accuracy                           0.87      2000
   macro avg       0.83      0.72      0.76      2000
weighted avg       0.86      0.87      0.86      2000



## Interpretiere den rechten oberen Eintrag der Confusion Matrix

Er hat bei denen, die in wirklichkeit nicht gegangen sind gesagt, dass sie gegangen sind

## Predicte, mit welcher Wahrscheinlichkeit der Kunde aussteigen wird
Er sollte mit etwa 90%iger Wahrscheinlichkeit **bleiben**

In [39]:
customer = pd.DataFrame(data={
    'CreditScore': 500,
    'Age': 20,
    'Tenure': 2,
    'Balance': 0,
    'NumOfProducts': 1,
    'HasCrCard': 1,
    'IsActiveMember': 1,
    'EstimatedSalary': 60_000,
    'Geography': 'Germany',
    'Gender': 'Male'},
    index=[0])

customer["isMale"] = customer.Gender.replace({"Female": 0, "Male": 1})
customer = customer.drop(columns=["Gender"])

customer.head()

#mit pipeline probieren oder einfach manuell

#forest_cv.best_estimator_.predict(customer)
df.head()

Unnamed: 0,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,isMale,Geography_France,Geography_Germany,Geography_Spain
0,619,42,2,0.0,1,1,1,101348.88,1,0,1,0,0
1,608,41,1,83807.86,1,0,1,112542.58,0,0,0,0,1
2,502,42,8,159660.8,3,1,0,113931.57,1,0,1,0,0
3,699,39,1,0.0,2,0,0,93826.63,0,0,1,0,0
4,850,43,2,125510.82,1,1,1,79084.1,0,0,0,0,1
