<a href="https://colab.research.google.com/github/LeonHauch/bachelorarbeit/blob/main/BA_TableData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-specific upload helper; this cell only works inside Google Colab.
from google.colab import files


# Upload the dataset CSV into the runtime. The recorded cell output shows the
# file being saved as 'Bank Customer Churn Prediction.csv'.
uploaded = files.upload()

Saving Bank Customer Churn Prediction.csv to Bank Customer Churn Prediction.csv


In [None]:
# Import der libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler


In [None]:
# Load the dataset that was uploaded in the first cell
dataset_path = 'Bank Customer Churn Prediction.csv'
churn_data = pd.read_csv(dataset_path)


In [None]:
# Overview of columns, dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
churn_data.info()


In [None]:
# Summary statistics of the dataset columns
summary_stats = churn_data.describe()
print(summary_stats)


In [None]:
# Number of missing entries per column
missing_per_column = churn_data.isnull().sum()
print(missing_per_column)
# no missing values present

In [None]:
# Scatter plot: age vs. balance, coloured by the churn label
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(
    data=churn_data,
    x='age',
    y='balance',
    hue='churn',
    palette="viridis",
    ax=ax,
)
ax.set_xlabel('Age', fontsize=12)
ax.set_ylabel('Balance', fontsize=12)
ax.set_title('Scatter plot of Age vs. Balance colored by Churn')
plt.show()


In [None]:
# Box plot of balance per credit-card flag, split by churn (not included in the thesis)
boxplot_options = dict(
    data=churn_data,
    x='credit_card',
    y='balance',
    hue='churn',
    palette="viridis",
)
sns.boxplot(**boxplot_options)


In [None]:
# Correlation matrix
# At this point the frame has not been label-encoded yet ('country' and 'gender'
# are only encoded in a later cell), and pandas >= 2.0 raises on non-numeric
# columns in DataFrame.corr(). Restrict the computation to numeric columns so
# the cell runs on a fresh kernel regardless of the pandas version in use.
corr_matrix = churn_data.corr(numeric_only=True)
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Feature Correlation')
plt.show()


In [None]:
# Preprocessing: remove the customer ID and encode the categorical variables.

# Drop 'customer_id'. Done without inplace=True and with errors='ignore' so that
# re-running this cell does not raise a KeyError once the column is already gone.
churn_data = churn_data.drop(columns='customer_id', errors='ignore')

# Encode categorical variables as integer codes; the fitted encoders are kept so
# the codes can be mapped back to the original labels later.
# NOTE(review): on a re-run the encoders are refitted on the already-encoded
# integers, so their classes_ then hold codes instead of the original labels.
label_encoders = {}
for column in ['country', 'gender']:
    le = LabelEncoder()
    churn_data[column] = le.fit_transform(churn_data[column])
    label_encoders[column] = le


In [None]:
# Scaling: standardise every column of the encoded frame.
# NOTE(review): the scaler is fitted on the complete dataset (including the
# 'churn' column) before the train/test split happens further down, so test-set
# statistics influence the scaling -- consider fitting on the training split only.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(churn_data)
scaled_data = pd.DataFrame(
    scaled_values,
    columns=churn_data.columns,
    index=churn_data.index,
)

In [None]:
#Auswahl der Classifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, roc_auc_score


In [None]:
# Features come from the scaled frame, the target from the unscaled one
target_column = 'churn'
y = churn_data[target_column]
X = scaled_data.drop(columns=target_column)

# Hold out 20% of the rows for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.metrics import accuracy_score
# Set up the four candidate models
logreg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)
svc_rbf = SVC(kernel='rbf', random_state=42, probability=True)  # SVC with RBF kernel

# Parallel lists: model objects and their display names
classifiers = [logreg, rf, gb, svc_rbf]
classifier_names = ["Logistic Regression", "Random Forest", "Gradient Boosting", "SVC (RBF)"]

# Per-model evaluation results, keyed by display name
evaluation_metrics = {}

for clf_name, clf in zip(classifier_names, classifiers):
    # Fit on the training split and predict the held-out labels
    clf.fit(X_train, y_train)
    y_preds = clf.predict(X_test)

    # Precision / recall / F1 are read from the report entry of class '1' (churned)
    clf_report = classification_report(y_test, y_preds, output_dict=True)
    churn_report = clf_report['1']

    # Ranking scores for AUC-ROC: the SVC uses its decision function,
    # all other models use the predicted probability of class 1
    if clf_name != "SVC (RBF)":
        y_scores = clf.predict_proba(X_test)[:, 1]
    else:
        y_scores = clf.decision_function(X_test)
    auc_roc = roc_auc_score(y_test, y_scores)

    accuracy = accuracy_score(y_test, y_preds)

    # Collect everything for the comparison table
    evaluation_metrics[clf_name] = dict([
        ('Accuracy', accuracy),
        ('Precision', churn_report['precision']),
        ('Recall', churn_report['recall']),
        ('F1-score', churn_report['f1-score']),
        ('AUC-ROC', auc_roc),
    ])


In [None]:
# One row per classifier, one column per metric
metrics_by_metric = pd.DataFrame(evaluation_metrics)
evaluation_metrics_df = metrics_by_metric.T
print(evaluation_metrics_df)



In [None]:
# Gradient Boosting predictions on the test set
predicted_labels = gb.predict(X_test)

# How many test instances were classified as '1'
is_predicted_churn = predicted_labels == 1
count_predicted_1 = np.count_nonzero(is_predicted_churn)

print(f"Number of instances predicted as '1' by Gradient Boosting: {count_predicted_1}")


In [None]:
!pip install shap lime

In [None]:
# Select 5 samples from each class (seeded so the same rows are drawn on every run)
samples_churned = X_test[y_test == 1].sample(5, random_state=42)
samples_not_churned = X_test[y_test == 0].sample(5, random_state=42)

# The explanation cells below iterate over `sample_instances`, which was never
# defined anywhere in the notebook (NameError on Restart & Run All).
# Presumably it is meant to be the union of the two class samples -- verify.
sample_instances = pd.concat([samples_churned, samples_not_churned])

In [None]:
# LIME und KernelSHAP
import shap
import time
from lime.lime_tabular import LimeTabularExplainer

# 1. Initialize LIME explainer on the (scaled) training features.
#    random_state is fixed because LIME explains by random perturbation sampling;
#    without a seed the explanations differ between runs.
lime_explainer = LimeTabularExplainer(X_train.values,
                                      feature_names=X_train.columns,
                                      class_names=['Not Churned', 'Churned'],
                                      mode='classification',
                                      random_state=42)

# 2. Define the prediction function for the GB model
def predict_fn(data):
    """Return the Gradient Boosting class probabilities for ``data``.

    Thin wrapper around ``gb.predict_proba`` that is handed to the SHAP
    KernelExplainer as the model function.
    """
    return gb.predict_proba(data)

# 3. Initialize SHAP KernelExplainer
background_data = X_train.sample(100, random_state=42)  # KernelExplainer requires background data
shap_explainer = shap.KernelExplainer(predict_fn, background_data)

# Prepare the notebook for SHAP force plots (one call per cell is enough)
shap.initjs()

# LIME explanations
for index, instance in sample_instances.iterrows():
    actual_label = y_test.loc[index]
    predicted_label = gb.predict(instance.values.reshape(1, -1))[0]
    print(f"Sample instance: {index}")
    print(f"Actual label: {actual_label}, Predicted by GB: {predicted_label}")

    # LIME explanation (explain_instance explains label 1 by default)
    lime_exp = lime_explainer.explain_instance(instance.values, gb.predict_proba)
    lime_exp.show_in_notebook()

# SHAP explanations with KernelExplainer
# The force plot previously used index 0 (the 'Not Churned' output), while LIME
# and the top-feature ranking later in the notebook (shap_values_instance[1][0])
# explain class 1. Use class 1 here as well so both methods explain the same output.
# NOTE(review): this indexing assumes shap_values() returns one array per class,
# as the rest of the notebook does -- confirm against the installed SHAP version.
churn_class = 1
for index, instance in sample_instances.iterrows():
    # Compute SHAP values for the instance
    shap_values_instance = shap_explainer.shap_values(instance.values.reshape(1, -1), nsamples=100)

    # Predict the label using the Gradient Boosting model
    predicted_label = gb.predict(instance.values.reshape(1, -1))[0]
    actual_label = y_test.loc[index]

    # Display actual and predicted labels
    print(f"Instance {index}: Actual Label = {actual_label}, Predicted Label = {predicted_label}")

    # Display the force plot for the instance
    display(shap.force_plot(shap_explainer.expected_value[churn_class],
                            shap_values_instance[churn_class],
                            instance))


In [None]:
# SHAP explanations printed next to the unscaled feature values
shap.initjs()

# Explain the 'Churned' output (index 1), consistent with the top-feature ranking
# later in the notebook, which reads shap_values_instance[1][0].
# NOTE(review): assumes shap_values() returns one array per class -- confirm
# against the installed SHAP version.
churn_class = 1

for index, instance in sample_instances.iterrows():
    actual_label = y_test.loc[index]
    predicted_label = gb.predict(instance.values.reshape(1, -1))[0]

    print(f"Sample instance: {index}")
    print(f"Actual label: {actual_label}, Predicted by GB: {predicted_label}")

    # Retrieve the unscaled feature values of the same row
    original_instance = churn_data.loc[index]

    # Compute SHAP values for the instance
    shap_values_instance = shap_explainer.shap_values(instance.values.reshape(1, -1))

    # One row of per-feature SHAP values for the single explained instance.
    # The previous code zipped the feature names with the 2-D array itself, which
    # iterates over its rows and therefore never paired a feature with its own
    # scalar SHAP value.
    churn_shap_values = shap_values_instance[churn_class][0]

    # Display SHAP values alongside original feature values
    for feature_name, shap_value in zip(X_train.columns, churn_shap_values):
        actual_value = original_instance[feature_name]
        print(f"{feature_name} (Original value: {actual_value}): SHAP Value: {shap_value}")

    # Display the force plot
    display(shap.force_plot(shap_explainer.expected_value[churn_class],
                            shap_values_instance[churn_class],
                            instance))


In [None]:
# 10 true-positive instances
# Get indices where both actual and predicted labels are 'Churned'
positive_indices = y_test[(y_test == 1) & (gb.predict(X_test) == 1)].index

# Randomly select 10 of these indices. A dedicated seeded generator keeps the
# selection identical across runs (the explanation tables below depend on it),
# and the size is capped so the cell does not fail if fewer than 10 exist.
selection_rng = np.random.default_rng(42)
n_selected = min(10, len(positive_indices))
selected_indices = selection_rng.choice(positive_indices, n_selected, replace=False)

In [None]:
#SHAP RP
import pandas as pd
import numpy as np


# Top-5 SHAP features per selected instance (one list of names per instance)
shap_top_features_list = []
rank_columns = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5']

for index in selected_indices:
    instance = X_test.loc[index]
    instance_row = instance.values.reshape(1, -1)

    # SHAP values of this instance; index 1 is used as the positive class
    shap_values_instance = shap_explainer.shap_values(instance_row)
    shap_values_positive_class = shap_values_instance[1][0]

    # Rank features by absolute SHAP value and keep the 5 largest, strongest first
    ascending_order = np.argsort(np.abs(shap_values_positive_class))
    shap_top_values = ascending_order[::-1][:5]
    shap_top_feature_names = [X_train.columns[position] for position in shap_top_values]
    shap_top_features_list.append(shap_top_feature_names)

# Tabulate: one row per instance, columns ordered by decreasing influence
shap_df = pd.DataFrame(
    shap_top_features_list,
    index=selected_indices,
    columns=rank_columns,
)

display(shap_df)


In [None]:
#LIME RP
import pandas as pd
import numpy as np
import re  # Import the regular expressions module

# Top-5 LIME features per selected instance (one list of names per instance)
lime_top_features_list = []
rank_columns = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5']

# Strips digits and the characters < > = - . from LIME's condition strings,
# leaving only the feature name
condition_chars = re.compile(r'[0-9<>=\-.]')

for index in selected_indices:
    instance = X_test.loc[index]

    # LIME explanation for this instance
    lime_exp = lime_explainer.explain_instance(instance.values, gb.predict_proba)

    # Order the (condition, weight) pairs by absolute weight and keep the top 5
    weighted_conditions = lime_exp.as_list()
    lime_sorted_features = sorted(
        weighted_conditions,
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )[:5]

    lime_top_feature_names = []
    for condition, _weight in lime_sorted_features:
        lime_top_feature_names.append(condition_chars.sub('', condition).strip())
    lime_top_features_list.append(lime_top_feature_names)

# Tabulate: one row per instance, columns ordered by decreasing influence
lime_df = pd.DataFrame(
    lime_top_features_list,
    index=selected_indices,
    columns=rank_columns,
)

display(lime_df)


In [None]:
# Show the unscaled rows of churn_data that belong to the selected instances
feature_values_df = churn_data.loc[selected_indices, :]
display(feature_values_df)


In [None]:
# Zeitmessung
import time
import pandas as pd
from sklearn.svm import SVC

# Runtime comparison of LIME and KernelSHAP on an SVC model.
# Uses the existing split: X_train, X_test, y_train, y_test.

# Train the SVC classifier
svc = SVC(probability=True)  # Ensure probability=True to use predict_proba
svc.fit(X_train, y_train)

# Randomly select 10 instances; a seeded generator keeps the timed instances
# the same on every run so the measurements are comparable.
timing_rng = np.random.default_rng(42)
random_indices = timing_rng.choice(X_test.index, 10, replace=False)

# Initialize LIME and SHAP explainers for the SVC.
# NOTE: this rebinds lime_explainer / shap_explainer, which earlier cells
# created for the Gradient Boosting model.
lime_explainer = LimeTabularExplainer(X_train.values,
                                      feature_names=X_train.columns,
                                      class_names=['Not Churned', 'Churned'],
                                      mode='classification')
background_data = X_train.sample(20, random_state=42)
shap_explainer = shap.KernelExplainer(svc.predict_proba, background_data)

# Lists to store execution times
lime_times = []
shap_times = []

# Compute LIME and SHAP explanations for each instance and store execution times.
# time.perf_counter() is used instead of time.time(): it is the clock intended
# for measuring short durations and is unaffected by system clock adjustments.
for index in random_indices:
    instance = X_test.loc[index]

    # LIME explanation and timing
    start_time = time.perf_counter()
    lime_exp = lime_explainer.explain_instance(instance.values, svc.predict_proba, num_features=15)
    lime_times.append(time.perf_counter() - start_time)

    # SHAP explanation and timing
    start_time = time.perf_counter()
    shap_values_instance = shap_explainer.shap_values(instance.values.reshape(1, -1))
    shap_times.append(time.perf_counter() - start_time)

# Sum the execution times across all instances
total_lime_time = sum(lime_times)
total_shap_time = sum(shap_times)

# Create a DataFrame to display results
results_df = pd.DataFrame({
    'XAI Methode': ['LIME', 'SHAP'],
    'Ausführungszeit in Sekunden': [total_lime_time, total_shap_time]
})

display(results_df)


In [None]:
# Erstellung des Plots, um Overfitting zu visualisieren; unabhänig von LIME und SHAP
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Illustration of overfitting on synthetic data (independent of LIME and SHAP).
# The synthetic arrays get their own *_demo names: the original cell reused
# X, y, X_train, X_test, y_train and y_test and thereby overwrote the churn
# data, so any earlier cell re-run afterwards would silently use this toy data.

# Generate synthetic data: noisy sine curve on [0, 5)
np.random.seed(0)
X_demo = np.sort(5 * np.random.rand(80, 1), axis=0)
y_demo = np.sin(X_demo).ravel() + np.random.normal(0, 0.1, X_demo.shape[0])

# Split into training and test sets (first 60 sorted points train, last 20 test)
X_demo_train, X_demo_test = X_demo[:60], X_demo[60:]
y_demo_train, y_demo_test = y_demo[:60], y_demo[60:]

# Lower-degree polynomial model (degree 4)
degree_simple = 4
model1 = make_pipeline(PolynomialFeatures(degree_simple), LinearRegression())
model1.fit(X_demo_train, y_demo_train)
y_pred1 = model1.predict(X_demo_test)

# Higher-degree polynomial model (degree 6)
degree_complex = 6
model2 = make_pipeline(PolynomialFeatures(degree_complex), LinearRegression())
model2.fit(X_demo_train, y_demo_train)
y_pred2 = model2.predict(X_demo_test)

# Visualization
X_plot = np.linspace(0, 5, 100).reshape(-1, 1)

plt.figure(figsize=(12, 5))

# Model without overfitting.
# The legend now reports the degree that was actually fitted; it was previously
# hard-coded to 2 (and 15 below) although the models use degrees 4 and 6.
plt.subplot(1, 2, 1)
plt.scatter(X_demo_train, y_demo_train, color='blue', s=20, marker='o', label="training points")
plt.scatter(X_demo_test, y_demo_test, color='red', s=20, marker='x', label="test points")
plt.plot(X_plot, model1.predict(X_plot), color='green', linewidth=2, label="degree %d" % degree_simple)
plt.title("Model ohne Overfitting")
plt.legend(loc='lower left')

# Model with overfitting
plt.subplot(1, 2, 2)
plt.scatter(X_demo_train, y_demo_train, color='blue', s=20, marker='o', label="training points")
plt.scatter(X_demo_test, y_demo_test, color='red', s=20, marker='x', label="test points")
plt.plot(X_plot, model2.predict(X_plot), color='green', linewidth=2, label="degree %d" % degree_complex)
plt.title("Model mit Overfitting")
plt.legend(loc='lower left')

plt.tight_layout()
plt.show()
