In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [16]:
# Load the historical churn data (assuming you have a CSV file)
data = pd.read_csv('Customer_data.csv')

In [17]:
data.head()

Unnamed: 0,customer_id,first_name,last_name,age,email,country,postal_code,purchase_history,monthly_payment,contract_length,data_usage,customer_service_rating,Churn
0,1,Flem,Boultwood,65,fboultwood0@imgur.com,Brazil,76420-000,iPhone,497.51,24,85.12,2,True
1,2,Ciel,Hearse,44,chearse1@xinhuanet.com,France,38509 CEDEX,Google Pixel,332.1,1,15.92,2,True
2,3,Loren,Astley,53,lastley2@lycos.com,Peru,,Google Pixel,390.71,16,14.68,5,False
3,4,Larine,Ferfulle,19,lferfulle3@wp.com,Indonesia,,Google Pixel,541.86,21,50.64,5,False
4,5,Chrissy,Jerrans,34,cjerrans4@wikia.com,China,,iPhone,652.51,12,76.28,5,False


In [18]:
# handle missing values
data.dropna(inplace = True)

# checking on data structure
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 468 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   customer_id              468 non-null    int64  
 1   first_name               468 non-null    object 
 2   last_name                468 non-null    object 
 3   age                      468 non-null    int64  
 4   email                    468 non-null    object 
 5   country                  468 non-null    object 
 6   postal_code              468 non-null    object 
 7   purchase_history         468 non-null    object 
 8   monthly_payment          468 non-null    float64
 9   contract_length          468 non-null    int64  
 10  data_usage               468 non-null    float64
 11  customer_service_rating  468 non-null    int64  
 12  Churn                    468 non-null    bool   
dtypes: bool(1), float64(2), int64(4), object(6)
memory usage: 48.0+ KB


In [19]:
#dropping columns which are not of importance 
data = data.drop(['customer_id','first_name','last_name','email','postal_code'], axis = 1)

In [20]:
#encoding categorical variable using one-hot coding
encoded_data = pd.get_dummies(data[['country','purchase_history','Churn']],drop_first = True).astype(int)


In [21]:
#creating separate dta frame for float and int columns
int_df = data[['age','monthly_payment','contract_length','data_usage','customer_service_rating']]

In [22]:
# combining 2 data sets into 1 data frame along columns
final_df = pd.concat([int_df, encoded_data],axis = 1)       

In [23]:
final_df.head()
final_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 468 entries, 0 to 999
Data columns (total 50 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   age                              468 non-null    int64  
 1   monthly_payment                  468 non-null    float64
 2   contract_length                  468 non-null    int64  
 3   data_usage                       468 non-null    float64
 4   customer_service_rating          468 non-null    int64  
 5   Churn                            468 non-null    int32  
 6   country_Argentina                468 non-null    int32  
 7   country_Australia                468 non-null    int32  
 8   country_Austria                  468 non-null    int32  
 9   country_Bangladesh               468 non-null    int32  
 10  country_Brazil                   468 non-null    int32  
 11  country_Bulgaria                 468 non-null    int32  
 12  country_Canada             

In [24]:
# Assuming 'Churn' is the target variable
X = final_df.drop('Churn', axis=1)
y = final_df['Churn']

In [25]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
# Model selection and training
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

In [27]:
# Model evaluation
y_pred = clf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)

Accuracy: 0.4787234042553192
Precision: 0.4406779661016949
Recall: 0.6190476190476191
F1 Score: 0.5148514851485149
ROC AUC Score: 0.4697802197802198


In [28]:
# Deployment: Save the trained model for future predictions
import joblib
joblib.dump(clf, 'churn_model.pkl')


['churn_model.pkl']