In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('/content/Bank Customer Churn Prediction.csv')

In [3]:
df.head(10)

Unnamed: 0,customer_id,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0
5,15574012,645,Spain,Male,44,8,113755.78,2,1,0,149756.71,1
6,15592531,822,France,Male,50,7,0.0,2,1,1,10062.8,0
7,15656148,376,Germany,Female,29,4,115046.74,4,1,0,119346.88,1
8,15792365,501,France,Male,44,4,142051.07,2,0,1,74940.5,0
9,15592389,684,France,Male,27,2,134603.88,1,1,1,71725.73,0


In [21]:
data = df.drop([  'gender','country'], axis=1)
data.head()

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,43,2,125510.82,1,1,1,79084.1,0


In [22]:
data.duplicated().sum()

0

In [23]:
data.isnull().sum()

customer_id         0
credit_score        0
age                 0
tenure              0
balance             0
products_number     0
credit_card         0
active_member       0
estimated_salary    0
churn               0
dtype: int64

In [24]:
data.shape

(10000, 10)

In [25]:
threshold_balance = data["balance"].mean()
threshold_active = data["active_member"].mean()




def create_purchase_label(row):
    if row['active_member'] > threshold_active and row['balance'] > threshold_balance:
        return 1
    else:
        return 0

In [26]:
data['PurchaseLabel'] = data.apply(create_purchase_label, axis=1)

In [27]:
print(data[['balance', 'active_member', 'PurchaseLabel']])

        balance  active_member  PurchaseLabel
0          0.00              1              0
1      83807.86              1              1
2     159660.80              0              0
3          0.00              0              0
4     125510.82              1              1
...         ...            ...            ...
9995       0.00              0              0
9996   57369.61              1              0
9997       0.00              1              0
9998   75075.31              0              0
9999  130142.79              0              0

[10000 rows x 3 columns]


In [28]:
(data['PurchaseLabel']).sum()

3016

In [29]:
y = data['PurchaseLabel']
x = data.drop(['PurchaseLabel'] , axis=1 )

In [30]:
x.head()

Unnamed: 0,customer_id,credit_score,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,churn
0,15634602,619,42,2,0.0,1,1,1,101348.88,1
1,15647311,608,41,1,83807.86,1,0,1,112542.58,0
2,15619304,502,42,8,159660.8,3,1,0,113931.57,1
3,15701354,699,39,1,0.0,2,0,0,93826.63,0
4,15737888,850,43,2,125510.82,1,1,1,79084.1,0


In [31]:
y.head()

0    0
1    1
2    0
3    0
4    1
Name: PurchaseLabel, dtype: int64

In [33]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.35, random_state=42)


In [34]:
clf = DecisionTreeClassifier(random_state=42)


In [35]:
clf.fit(X_train, y_train)

In [36]:
y_pred = clf.predict(X_test)

In [37]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9997142857142857


In [38]:
print("Classification Report:\n", classification_report(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      2426
           1       1.00      1.00      1.00      1074

    accuracy                           1.00      3500
   macro avg       1.00      1.00      1.00      3500
weighted avg       1.00      1.00      1.00      3500



In [39]:
from sklearn.tree import export_graphviz
import graphviz

dot_data = export_graphviz(clf, out_file=None, feature_names=x.columns, class_names=["No", "Yes"], filled=True, rounded=True)
graph = graphviz.Source(dot_data)
graph.render("decision_tree")

'decision_tree.pdf'