In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from datetime import datetime

In [6]:
data = pd.DataFrame({
    'UserID': [1, 2, 3, 1, 2, 3],
    'SessionID': ['S1', 'S2', 'S3', 'S4', 'S5', 'S6'],
    'SessionStart': ['2024-10-01 10:00', '2024-10-01 11:00', '2024-10-01 12:00',
                     '2024-10-02 09:00', '2024-10-02 10:00', '2024-10-02 11:00'],
    'SessionEnd': ['2024-10-01 10:30', '2024-10-01 11:15', '2024-10-01 12:45',
                   '2024-10-02 09:20', '2024-10-02 10:35', '2024-10-02 11:25'],
    'ActivityType': ['PageView', 'PageView', 'Login', 'ButtonClick', 'PageView', 'ButtonClick'],
    'DeviceType': ['Mobile', 'Tablet', 'Mobile', 'Mobile', 'Tablet', 'Mobile'],
    'Location': ['USA', 'UK', 'Canada', 'USA', 'UK', 'Canada'],
    'SignupDate': ['2024-09-01', '2024-09-10', '2024-08-15', '2024-09-01', '2024-09-10', '2024-08-15']
})

# Convert date columns to datetime

In [8]:
data['SessionStart'] = pd.to_datetime(data['SessionStart'])
data['SessionEnd'] = pd.to_datetime(data['SessionEnd'])
data['SignupDate'] = pd.to_datetime(data['SignupDate'])

In [11]:
# Calculate session duration in minutes
data['SessionDuration'] = ( data['SessionEnd'] - data['SessionStart']).dt.total_seconds() / 60

# Calculate Day since signup
data['DaysSinceSignup'] = (data['SessionStart'] - data['SignupDate']).dt.days
                           


In [12]:
data.head()

Unnamed: 0,UserID,SessionID,SessionStart,SessionEnd,ActivityType,DeviceType,Location,SignupDate,SessionDuration,DaysSinceSignup
0,1,S1,2024-10-01 10:00:00,2024-10-01 10:30:00,PageView,Mobile,USA,2024-09-01,30.0,30
1,2,S2,2024-10-01 11:00:00,2024-10-01 11:15:00,PageView,Tablet,UK,2024-09-10,15.0,21
2,3,S3,2024-10-01 12:00:00,2024-10-01 12:45:00,Login,Mobile,Canada,2024-08-15,45.0,47
3,1,S4,2024-10-02 09:00:00,2024-10-02 09:20:00,ButtonClick,Mobile,USA,2024-09-01,20.0,31
4,2,S5,2024-10-02 10:00:00,2024-10-02 10:35:00,PageView,Tablet,UK,2024-09-10,35.0,22


In [19]:
# Average session duration per user
data['AvgSessionDuration'] = data.groupby('UserID')['SessionDuration'].transform('mean')

#Total session per user
data['TotalSessions'] = data.groupby('UserID')['SessionID'].transform('count')

# Days since last session (sssuming 'SessionStart' is sorted by date)
data['LastSession'] = data.groupby('UserID')['SessionStart'].transform('max')
data['DaysSinceLastSession'] = (datetime.now() - data['LastSession']).dt.days

In [20]:
# Create the churn label: if a user hasn't logged in for 30 days, they're considered churned
data['Churn'] = (data['DaysSinceLastSession'] > 30).astype(int)


In [21]:
# Drop duplicate rows (one per user)
user_data = data[['UserID', 'AvgSessionDuration', 
                  'TotalSessions', 'DaysSinceSignup', 
                  'DaysSinceLastSession', 'Churn']].drop_duplicates()

In [22]:
user_data.head()

Unnamed: 0,UserID,AvgSessionDuration,TotalSessions,DaysSinceSignup,DaysSinceLastSession,Churn
0,1,25.0,2,30,42,1
1,2,25.0,2,21,42,1
2,3,35.0,2,47,42,1
3,1,25.0,2,31,42,1
4,2,25.0,2,22,42,1


# Model Selection & Traning

###### We will split the data into training and testing sets and train a Random Forest classifier for churn prediction.

In [28]:
X = user_data[['AvgSessionDuration', 'TotalSessions', 'DaysSinceSignup', 'DaysSinceLastSession']]
y = user_data['Churn']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)

In [30]:
# Train a Random Forest classifier
model = RandomForestClassifier(n_estimators = 100, random_state = 42)
model.fit(X_train, y_train)

# Model Evaluation

#### Let’s evaluate the model using classification metrics like accuracy, precision, recall, and F1-score.

In [34]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluation metrics
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         2

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2

Confusion Matrix:
[[2]]


# Predictions

###### Now use the trained model to predict chrun for new users.

In [35]:
new_user_data = pd.DataFrame({'AvgSessionDuration': [30, 10], 'TotalSessions': [5, 1], 'DaysSinceSignup': [20, 45], 'DaysSinceLastSession': [10, 60]})

In [36]:
# predict churn for new users
new_predictions = model.predict(new_user_data)
print("Churn Predictions for New Users:", new_predictions)

Churn Predictions for New Users: [1 1]
