In [7]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global RNG so the synthetic data set is reproducible.
# The draws below must stay in this order, otherwise the generated values change.
np.random.seed(42)

# Synthetic customer features: one uniform random integer per customer.
# The upper bound of randint is exclusive (e.g. ages fall in 18..69).
n_customers = 1000
ages = np.random.randint(low=18, high=70, size=n_customers)
incomes = np.random.randint(low=20000, high=120000, size=n_customers)
last_purchase_days_ago = np.random.randint(low=0, high=365, size=n_customers)
total_spent = np.random.randint(low=100, high=10000, size=n_customers)
purchases_amount = np.random.randint(low=1, high=50, size=n_customers)

# Target label: 1 = the customer left (churned), 0 = the customer stayed.
# A 30% churn rate is assumed (the figure was suggested by ChatGPT).
# The labels are sampled independently of the feature columns above.
churn = np.random.choice([0, 1], size=n_customers, p=[0.7, 0.3])

# Assemble all columns into one frame; Customer_ID is a 1-based running number.
df = pd.DataFrame(
    {
        'Customer_ID': range(1, n_customers + 1),
        'Age': ages,
        'Annual_Income': incomes,
        'Last_Purchase_Days_Ago': last_purchase_days_ago,
        'Total_Spent': total_spent,
        'Purchases_Amount': purchases_amount,
        'Churn': churn,
    }
)
df.head()

Unnamed: 0,Customer_ID,Age,Annual_Income,Last_Purchase_Days_Ago,Total_Spent,Purchases_Amount,Churn
0,1,56,54674,105,9535,10,1
1,2,69,55854,304,5098,36,0
2,3,46,66271,359,262,9,1
3,4,32,93688,316,2074,27,0
4,5,60,58518,122,3391,34,1


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

# Separate the predictor columns from the target; Customer_ID is not used as a feature.
feature_columns = [
    'Age',
    'Annual_Income',
    'Last_Purchase_Days_Ago',
    'Total_Spent',
    'Purchases_Amount',
]
x = df[feature_columns]
y = df['Churn']

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a logistic-regression classifier on the training rows.
model = LogisticRegression(random_state=42)
model.fit(x_train, y_train)

# Predict churn labels for the held-out rows.
y_prediction = model.predict(x_test)

# Score the predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_prediction)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_prediction)

print(f"Model Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")

Model Accuracy: 0.7
Confusion Matrix:
[[140   0]
 [ 60   0]]


In [21]:
# Positions (within the test set) of the first five misclassified samples.
mismatch_mask = y_test != y_prediction
wrong_index = np.where(mismatch_mask)[0][:5]

# Gather the feature rows, true labels and predicted labels at those positions.
wrong_samples = x_test.iloc[wrong_index]
wrong_labels = y_test.iloc[wrong_index]
wrong_predictions = y_prediction[wrong_index]

# Print each misclassified sample: its original row label, true vs. predicted class,
# and the feature values, followed by a blank separator line.
for position, (sample_id, true_label, predicted_label) in enumerate(
    zip(wrong_samples.index, wrong_labels, wrong_predictions)
):
    print(f"Sample {sample_id}:")
    print(f"True Label: {true_label}, Predicted: {predicted_label}")
    print(wrong_samples.iloc[position])
    print()
    

Sample 521:
True Label: 1, Predicted: 0
Age                          22
Annual_Income             33350
Last_Purchase_Days_Ago      153
Total_Spent                4254
Purchases_Amount              5
Name: 521, dtype: int64

Sample 740:
True Label: 1, Predicted: 0
Age                          38
Annual_Income             70106
Last_Purchase_Days_Ago      291
Total_Spent                1909
Purchases_Amount             44
Name: 740, dtype: int64

Sample 411:
True Label: 1, Predicted: 0
Age                          59
Annual_Income             21279
Last_Purchase_Days_Ago       84
Total_Spent                4300
Purchases_Amount             15
Name: 411, dtype: int64

Sample 811:
True Label: 1, Predicted: 0
Age                          28
Annual_Income             51966
Last_Purchase_Days_Ago       94
Total_Spent                1851
Purchases_Amount             23
Name: 811, dtype: int64

Sample 76:
True Label: 1, Predicted: 0
Age                          61
Annual_Income             501