In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

df = pd.read_csv("../data/customer_churn.csv")
df

Unnamed: 0,Call Failure,Complains,Subscription Length,Charge Amount,Seconds of Use,Frequency of use,Frequency of SMS,Distinct Called Numbers,Age Group,Tariff Plan,Status,Age,Customer Value,Churn
0,8,0,38,0,4370,71,5,17,3,1,1,30,197.640,0
1,0,0,39,0,318,5,7,4,2,1,2,25,46.035,0
2,10,0,37,0,2453,60,359,24,3,1,1,30,1536.520,0
3,10,0,38,0,4198,66,1,35,1,1,1,15,240.020,0
4,3,0,38,0,2393,58,2,33,1,1,1,15,145.805,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3145,21,0,19,2,6697,147,92,44,2,2,1,25,721.980,0
3146,17,0,17,1,9237,177,80,42,5,1,1,55,261.210,0
3147,13,0,18,4,3157,51,38,21,3,1,1,30,280.320,0
3148,7,0,11,2,4695,46,222,12,3,1,1,30,1077.640,0


There are two columns with binary data that are represented as `1` and `2`, the first transformation will be to change it into `1` and `0`,
as discussed in the EDA the `status` is totally determined by `churn` with the asumptions mentioned in EDA, so it is a column that can be droped.

In [2]:
df.drop(columns=["Status", "Age Group"], inplace=True)

# String join is used instead of replace due to some columns having more than one blank space between words.
cols = ["_".join(column.split()).lower() for column in df.columns]

df.columns = cols

`age_group` and `age` in this case are essentially the same, so one of them can be dropped

In [3]:
df

Unnamed: 0,call_failure,complains,subscription_length,charge_amount,seconds_of_use,frequency_of_use,frequency_of_sms,distinct_called_numbers,tariff_plan,age,customer_value,churn
0,8,0,38,0,4370,71,5,17,1,30,197.640,0
1,0,0,39,0,318,5,7,4,1,25,46.035,0
2,10,0,37,0,2453,60,359,24,1,30,1536.520,0
3,10,0,38,0,4198,66,1,35,1,15,240.020,0
4,3,0,38,0,2393,58,2,33,1,15,145.805,0
...,...,...,...,...,...,...,...,...,...,...,...,...
3145,21,0,19,2,6697,147,92,44,2,25,721.980,0
3146,17,0,17,1,9237,177,80,42,1,55,261.210,0
3147,13,0,18,4,3157,51,38,21,1,30,280.320,0
3148,7,0,11,2,4695,46,222,12,1,30,1077.640,0


In [4]:
df["tariff_plan"] = df["tariff_plan"].replace(2, 0)

In [5]:
df["use_per_month"] = df.seconds_of_use / df.subscription_length
df["calls_per_month"] = df.frequency_of_use / df.subscription_length
df["sms_per_month"] = df.frequency_of_sms / df.subscription_length
df["dist_nums_per_month"] = df.distinct_called_numbers / df.subscription_length

df.drop(
    columns=[
        "seconds_of_use",
        "frequency_of_use",
        "frequency_of_sms",
        "distinct_called_numbers",
    ],
    inplace=True,
)
df

Unnamed: 0,call_failure,complains,subscription_length,charge_amount,tariff_plan,age,customer_value,churn,use_per_month,calls_per_month,sms_per_month,dist_nums_per_month
0,8,0,38,0,1,30,197.640,0,115.000000,1.868421,0.131579,0.447368
1,0,0,39,0,1,25,46.035,0,8.153846,0.128205,0.179487,0.102564
2,10,0,37,0,1,30,1536.520,0,66.297297,1.621622,9.702703,0.648649
3,10,0,38,0,1,15,240.020,0,110.473684,1.736842,0.026316,0.921053
4,3,0,38,0,1,15,145.805,0,62.973684,1.526316,0.052632,0.868421
...,...,...,...,...,...,...,...,...,...,...,...,...
3145,21,0,19,2,0,25,721.980,0,352.473684,7.736842,4.842105,2.315789
3146,17,0,17,1,1,55,261.210,0,543.352941,10.411765,4.705882,2.470588
3147,13,0,18,4,1,30,280.320,0,175.388889,2.833333,2.111111,1.166667
3148,7,0,11,2,1,30,1077.640,0,426.818182,4.181818,20.181818,1.090909


In [6]:
from sklearn.model_selection import train_test_split

X = df.drop(["churn"], axis=1)
y = df["churn"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=6
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1764, random_state=6
)

In [7]:
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(RobustScaler())

columns_to_scale = [
    "call_failure",
    "subscription_length",
    "age",
    "customer_value",
    "use_per_month",
    "calls_per_month",
    "sms_per_month",
    "dist_nums_per_month",
]

X_train_scaled = pd.DataFrame(
    pipeline.fit_transform(X_train[columns_to_scale]), columns=columns_to_scale
)
X_train_scaled = X_train_scaled.combine_first(
    X_train.reset_index()
)  # Add the other columns back


X_test_scaled = pd.DataFrame(
    pipeline.transform(X_test[columns_to_scale]), columns=columns_to_scale
)
X_test_scaled = X_test_scaled.combine_first(
    X_test.reset_index()
)  # Add the other columns back

X_val_scaled = pd.DataFrame(
    pipeline.transform(X_val[columns_to_scale]), columns=columns_to_scale
)
X_val_scaled = X_val_scaled.combine_first(
    X_val.reset_index()
)  # Add the other columns back

In [8]:
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train_scaled, y_train)

In [9]:
from sklearn.metrics import accuracy_score

predictions = model.predict(X_val_scaled)
print("Accuracy: " + str(accuracy_score(y_val, predictions)))

Accuracy: 0.9408033826638478


In [10]:
# Define the model
model_2 = XGBClassifier(random_state=0, n_estimators=500, learning_rate=0.05)

# Fit the model
model_2.fit(X_train_scaled, y_train)
# Get predictions
predictions_2 = model_2.predict(X_val_scaled)

# Calculate MAE
acc_2 = accuracy_score(y_val, predictions_2)

# Uncomment to print MAE
print("Accuracy: ", acc_2)

Accuracy: 0.9513742071881607
