### Churn Analysis on Subscription Data

In [1]:
# Import Libs
from datetime import datetime

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [31]:
# Read the dataset from the CSV file into a DataFrame
file_path = "student_data.csv"  # location of the dataset
df = pd.read_csv(filepath_or_buffer=file_path)

In [36]:
def check_df(dataframe, head=5):
    """Print a quick structural overview of a DataFrame.

    Sections printed, in order: shape, column dtypes, first ``head`` rows,
    last ``head`` rows, per-column null counts, and selected quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The frame to summarize.
    head : int, default 5
        Number of rows shown in both the head and the tail sections.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(head))
    print("##################### Tail #####################")
    print(dataframe.tail(head))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print("##################### Quantiles #####################")
    # Quantiles are only computed for numeric columns: calling quantile() on a
    # frame that still contains object (string) columns raises a TypeError on
    # recent pandas versions, where numeric_only defaults to False.
    numeric_part = dataframe.select_dtypes(include="number")
    if numeric_part.shape[1] == 0:
        print("No numeric columns")
    else:
        print(numeric_part.quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

In [37]:
# Count missing values per column.
# isnull() is an alias of isna(), so a single call is enough; only the last
# expression of a cell is displayed anyway.
df.isna().sum()

student_id               0
join_date                0
membership_type          0
total_classes            0
avg_classes_per_mo       0
last_class_date          0
no_show_count            0
monthly_payment          0
outstanding_balance      0
tenure_days              0
time_since_last_class    0
dtype: int64

In [33]:
# Feature Engineering
# Convert both date columns to datetime values
for date_column in ("join_date", "last_class_date"):
    df[date_column] = pd.to_datetime(df[date_column])

# Fixed reference date standing in for "today" in this example
current_date = datetime(2024, 12, 31)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   student_id           199 non-null    int64         
 1   join_date            199 non-null    datetime64[ns]
 2   membership_type      199 non-null    object        
 3   total_classes        199 non-null    int64         
 4   avg_classes_per_mo   199 non-null    int64         
 5   last_class_date      199 non-null    datetime64[ns]
 6   no_show_count        199 non-null    int64         
 7   monthly_payment      199 non-null    int64         
 8   outstanding_balance  199 non-null    object        
dtypes: datetime64[ns](2), int64(5), object(2)
memory usage: 14.1+ KB


In [38]:
# Derive elapsed-day features relative to current_date:
#   tenure_days           -> days since join_date
#   time_since_last_class -> days since last_class_date
elapsed_features = {
    "tenure_days": "join_date",
    "time_since_last_class": "last_class_date",
}
for feature_name, source_column in elapsed_features.items():
    df[feature_name] = (current_date - df[source_column]).dt.days
df.head()



Unnamed: 0,student_id,join_date,membership_type,total_classes,avg_classes_per_mo,last_class_date,no_show_count,monthly_payment,outstanding_balance,tenure_days,time_since_last_class
0,674339,2023-10-29,Monthly,207,2,2024-12-20,13,97,Yes,429,11
1,525480,2022-10-04,Drop-in,170,10,2024-08-26,26,90,No,819,127
2,209650,2021-04-19,Monthly,71,2,2024-10-28,26,77,Yes,1352,64
3,886849,2024-01-31,Annual,67,11,2024-07-27,4,101,Yes,335,157
4,762112,2024-06-25,Monthly,72,9,2024-09-07,21,77,Yes,189,115


In [39]:
# Preprocess the data
# Integer-encode the membership type labels (fit and transform as separate steps)
le_member = LabelEncoder()
le_member.fit(df["membership_type"])
df["membership_type_encoder"] = le_member.transform(df["membership_type"])

# Encode the outstanding balance flag: "Yes" -> 1, "No" -> 0
# (.map yields NaN for any value that is not a key of the mapping)
balance_codes = {"Yes": 1, "No": 0}
df["outstanding_balance_encoded"] = df["outstanding_balance"].map(balance_codes)

df.head()

Unnamed: 0,student_id,join_date,membership_type,total_classes,avg_classes_per_mo,last_class_date,no_show_count,monthly_payment,outstanding_balance,tenure_days,time_since_last_class,membership_type_encoder,outstanding_balance_encoded
0,674339,2023-10-29,Monthly,207,2,2024-12-20,13,97,Yes,429,11,2,1
1,525480,2022-10-04,Drop-in,170,10,2024-08-26,26,90,No,819,127,1,0
2,209650,2021-04-19,Monthly,71,2,2024-10-28,26,77,Yes,1352,64,2,1
3,886849,2024-01-31,Annual,67,11,2024-07-27,4,101,Yes,335,157,0,1
4,762112,2024-06-25,Monthly,72,9,2024-09-07,21,77,Yes,189,115,2,1


In [40]:
# Select the columns used as model features
feature_columns = [
    "tenure_days",
    "time_since_last_class",
    "total_classes",
    "avg_classes_per_mo",
    "no_show_count",
    "monthly_payment",
    "membership_type_encoder",
    "outstanding_balance_encoded",
]
X = df[feature_columns]

In [41]:
# Create a target variable - the dataset has no churn column.
# NOTE: these labels are a synthetic placeholder for demonstration only:
# 1 on even row positions, 0 on odd ones. They are not derived from any
# feature in the data.
df["churn"] = [(position + 1) % 2 for position in range(len(df))]
y = df["churn"]

In [42]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [14]:
# Fit a random forest on the training split (seeded for reproducibility)
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
model  # display the fitted estimator as the cell output

In [25]:
# Evaluate the model on the held-out test split
y_pred = model.predict(X_test)

# Scores for the ROC-AUC: second column of predict_proba
# (assumes model.classes_ is [0, 1], so column 1 is the churn=1 class)
y_proba = model.predict_proba(X_test)[:, 1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
# roc_auc_score is imported with the other sklearn.metrics names in the
# import cell at the top of the notebook rather than mid-notebook.
print("ROC-AUC Score:", roc_auc_score(y_test, y_proba))


Accuracy: 0.6
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.67      0.60        18
           1       0.67      0.55      0.60        22

    accuracy                           0.60        40
   macro avg       0.61      0.61      0.60        40
weighted avg       0.61      0.60      0.60        40

ROC-AUC Score: 0.5883838383838385


In [21]:
# Make predictions on new data
# Derive the encoded inputs from the fitted encoder / the Yes-No mapping
# instead of hard-coding integer codes, so the example row stays valid even
# if the set or order of membership labels changes.
new_membership_code = le_member.transform(["Drop-in"])[0]
new_balance_code = {"Yes": 1, "No": 0}["No"]

new_data = pd.DataFrame({
    "tenure_days": [365],
    "time_since_last_class": [30],
    "total_classes": [50],
    "avg_classes_per_mo": [1],
    "no_show_count": [10],
    "monthly_payment": [100],
    "membership_type_encoder": [new_membership_code],
    "outstanding_balance_encoded": [new_balance_code],
})
prediction = model.predict(new_data)
print("Churn Prediction for new data:", prediction[0])

Churn Prediction for new data: 0


   tenure_days  time_since_last_class  total_classes  avg_classes_per_mo  \
0          365                     30             50                   1   

   no_show_count  monthly_payment  membership_type_encoder  \
0             10              100                        1   

   outstanding_balance_encoded  
0                            0  
