In [None]:
# Download the dataset by its Google Drive file ID using the `gdown` CLI
# (presumably the CSV that is read in the loading cell below — TODO confirm).
!gdown 1qsjT1vfjKohCjTkU03CevOKraHJ3hlGc

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, roc_curve, auc, make_scorer


import warnings
# NOTE(review): this silences *every* warning for the rest of the notebook, including
# pandas FutureWarning / SettingWithCopyWarning and numpy RuntimeWarning (e.g. invalid
# values passed to log). Consider narrowing the filter to specific categories so that
# data-quality and deprecation problems stay visible.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw challenge data into `df`.
# NOTE(review): the absolute `/content/...` path is hard-coded (looks like a Colab working
# directory — confirm); adjust it when running the notebook elsewhere.
df = pd.read_csv('/content/2023_senior_data_scientist_challenge.csv')

# **Basic Preprocessing**

In [None]:
# Inspect column dtypes and non-null counts of the raw data: some columns have the
# wrong data type and some contain missing values that need to be filled.
df.info()

In [None]:
# Datetime conversion: parse each timestamp column and truncate it to day resolution
# (the start of the calendar day) by round-tripping through a daily period.
timestamp_columns = [
    'REGISTRATION_AT',
    'LAST_VISIT_AT',
    'LAST_POST_AT',
    'LAST_LIKE_RECEIVED_AT',
    'LAST_COMMENT_RECEIVED_AT',
    'LAST_LIKE_GIVEN_AT',
    'LAST_COMMENT_WRITTEN_AT',
]
for column_name in timestamp_columns:
    parsed = pd.to_datetime(df[column_name])
    df[column_name] = parsed.dt.to_period('D').dt.to_timestamp()

In [None]:
# Fill missing "last interaction" dates with the user's registration date, i.e. the
# earliest date we have for that user.
# The result is assigned back to the column instead of calling
# `df.COL.fillna(..., inplace=True)`: that form mutates an intermediate Series and does
# not update `df` when pandas Copy-on-Write is active.
for date_col in ['LAST_LIKE_RECEIVED_AT', 'LAST_COMMENT_RECEIVED_AT',
                 'LAST_LIKE_GIVEN_AT', 'LAST_COMMENT_WRITTEN_AT']:
    df[date_col] = df[date_col].fillna(df['REGISTRATION_AT'])

# A missing count is treated as 0.
for count_col in ['TOTAL_LIKES_GIVEN', 'TOTAL_COMMENTS_WRITTEN']:
    df[count_col] = df[count_col].fillna(0)

# **EDA**

In [None]:
# Re-check dtypes and non-null counts after the datetime conversion and missing-value fills.
df.info()

In [None]:
# Preview the first rows of the preprocessed frame.
df.head()

In [None]:
# Summary statistics for every column (`include='all'`), with datetime columns
# summarised like numeric ones.
# Observation: most of the minimum dates are 2022-03-01.
# NOTE(review): the `datetime_is_numeric` keyword only exists in pandas 1.1-1.5 and was
# removed in pandas 2.0 — confirm the pandas version this notebook is meant to run on.
df.describe(datetime_is_numeric=True, include = 'all')

In [None]:
# Class counts of the original CHURNED label plus a quick histogram of it; the cell's
# output is the tuple (value counts, histogram axes).
# Observation: the two classes are perfectly balanced.
df['CHURNED'].value_counts(), df['CHURNED'].astype(int).hist(bins=3,figsize=(5,5))

In [None]:
# Spearman correlation heatmap on a label-encoded copy of the data (`df` is left untouched).
df2 = df.copy()
# One encoder instance is re-fit for every column, so after the loop `le` only holds the
# classes of the last encoded column.
le = LabelEncoder()
# NOTE(review): the datetime columns were passed through to_period/to_timestamp earlier;
# confirm they are still tz-aware ('datetime64[ns, UTC]'), otherwise this selector will not
# match them and `numeric_only=True` below will leave them out of the heatmap.
for col in df2.select_dtypes(include=['object', 'bool', 'datetime64[ns, UTC]']).columns:
    df2[col] = le.fit_transform(df2[col]).astype('float64')
plt.figure(figsize=(6, 6))  # Adjust size here
sns.heatmap(df2.corr(method='spearman',numeric_only=True), annot=True, fmt='.2f')
plt.show()

# **Feature Engineering**

**Feature Creation**

In [None]:
# Reference "current" date for all recency features: the most recent LAST_VISIT_AT
# found in the dataset (not the wall-clock date).
max_dataset_date = df.LAST_VISIT_AT.max()
max_dataset_date

In [None]:
# 1./2. Time-based features: whole days between the dataset's reference date
# (`max_dataset_date`) and each timestamp column. DAYS_SINCE_REGISTRATION is the
# customer's lifetime.
days_since_sources = {
    'DAYS_SINCE_LAST_COMMENT_WRITTEN': 'LAST_COMMENT_WRITTEN_AT',
    'DAYS_SINCE_LAST_POST': 'LAST_POST_AT',
    'DAYS_SINCE_LAST_VISIT': 'LAST_VISIT_AT',
    'DAYS_SINCE_LAST_LIKE_GIVEN': 'LAST_LIKE_GIVEN_AT',
    'DAYS_SINCE_LAST_LIKE_RECEIVED': 'LAST_LIKE_RECEIVED_AT',
    'DAYS_SINCE_LAST_COMMENT_RECEIVED': 'LAST_COMMENT_RECEIVED_AT',
    'DAYS_SINCE_REGISTRATION': 'REGISTRATION_AT',
}
for feature_name, source_col in days_since_sources.items():
    df[feature_name] = (max_dataset_date - df[source_col]).dt.days

# Days passed since last activity (Recency): the latest of the user's own actions.
# The intermediate timestamp is kept in a local variable rather than being parked in the
# DAYS_SINCE_LAST_ACTIVITY column, so that column only ever holds day counts.
last_activity_at = df[['LAST_VISIT_AT', 'LAST_POST_AT',
                       'LAST_LIKE_GIVEN_AT', 'LAST_COMMENT_WRITTEN_AT']].max(axis=1)
df['DAYS_SINCE_LAST_ACTIVITY'] = (max_dataset_date - last_activity_at).dt.days

# Denominators with 0 replaced by 1 to avoid division by zero.
visit_count_safe = df['TOTAL_VISIT_COUNT'].replace(0, 1)

# 3. Average Visit Frequency (Frequency): days since registration per visit.
df['AVERAGE_VISIT_FREQUENCY'] = df['DAYS_SINCE_REGISTRATION'] / visit_count_safe

# 4. Post to Visit Ratio
df['POST_TO_VISIT_RATIO'] = df['TOTAL_POST_COUNT'] / visit_count_safe

# 5. Like to Comment Ratio (Given)
df['LIKE_TO_COMMENT_RATIO_GIVEN'] = df['TOTAL_LIKES_GIVEN'] / df['TOTAL_COMMENTS_WRITTEN'].replace(0, 1)

# 6. Like to Comment Ratio (Received)
df['LIKE_TO_COMMENT_RATIO_RECEIVED'] = df['TOTAL_LIKES_RECEIVED'] / df['TOTAL_COMMENTS_RECEIVED'].replace(0, 1)

# 7. Engagement Score
# Each user's metrics are divided by that user's own largest metric and then summed,
# i.e. score = sum(metrics) / max(metrics); a row whose metrics are all 0 scores 0.
# This is the vectorised equivalent of the former row-wise `apply(lambda x: x / x.max(), axis=1)`.
# NOTE(review): this is a per-row normalisation, not a per-column one. If column-wise
# normalisation (x / column max) was intended, the ENGAGEMENT_SCORE < 1.2 threshold used
# for labelling must be re-calibrated at the same time.
engagement_metrics = ['TOTAL_VISIT_COUNT', 'TOTAL_POST_COUNT', 'TOTAL_LIKES_GIVEN', 'TOTAL_COMMENTS_WRITTEN']
engagement_values = df[engagement_metrics]
df['ENGAGEMENT_SCORE'] = engagement_values.div(engagement_values.max(axis=1), axis=0).sum(axis=1)



---


#####################################

---



**Label Creation**: We want to find users who have not churned yet but are at risk of churning. A user is labelled as at risk when either of the following holds:
1. The user shows low engagement on the app (`ENGAGEMENT_SCORE` below 1.2).
2. The user's last visit was more than 30 days ago, i.e. roughly halfway to the 62-day churn threshold.

In [None]:
# We already know that users with DAYS_SINCE_LAST_VISIT > 62 have churned, so no model is
# needed to detect them. We are only concerned with two groups:
#   1. Users who have not churned.
#   2. Users at risk of churning.
# So the churned users are removed from the data first.
# `.copy()` makes `df` an independent frame, so the label assignment below writes to `df`
# itself and does not trigger a SettingWithCopyWarning on a filtered slice.
df = df[~df.CHURNED].copy()

# Re-purpose CHURNED for the remaining users: True now means "at risk of churning".
# (Every remaining row has CHURNED == False, so no extra `~df.CHURNED` mask is needed.)
at_risk = (
    (df.DAYS_SINCE_LAST_VISIT > 30)      # half way to churn
    | (df['ENGAGEMENT_SCORE'] < 1.2)     # low engagement
)
df.loc[at_risk, 'CHURNED'] = True
# Alternative criterion that is currently disabled (activity decreasing on app):
#   df.AVERAGE_VISIT_FREQUENCY < df.DAYS_SINCE_LAST_VISIT

In [None]:
# Class balance after relabelling: True = at risk of churning, False = not at risk.
df.CHURNED.value_counts()

In [None]:
# Balance the classes a little by down-sampling the CHURNED == True rows to at most 7000
# randomly chosen rows; all CHURNED == False rows are kept.
# NOTE(review): the previous comment on this cell spoke of keeping 12000 rows of the
# not-churn class, which did not match the code (7000 rows of the CHURNED == True class).
# The code's behaviour is kept — confirm which class and size were intended.
to_remove = df[df.CHURNED]
# Fixed seed for reproducibility; `min` guards against the class having fewer than 7000
# rows, in which case `sample` would raise.
to_keep = to_remove.sample(n=min(7000, len(to_remove)), random_state=42)
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the equivalent.
df = pd.concat([df.drop(to_remove.index), to_keep])



---


#####################################

---



Further EDA using aggregated features.

In [None]:
# Column dtypes and non-null counts after feature engineering, relabelling and re-sampling.
df.info()

In [None]:
# Spearman correlation heatmap including the engineered features, computed on a
# label-encoded copy (`df` is left untouched).
df2 = df.copy()
# One encoder instance is re-fit for every column, so after the loop `le` only holds the
# classes of the last encoded column.
le = LabelEncoder()
# Only object and bool columns are encoded here; datetime columns are not selected and
# are therefore left to `numeric_only=True` below.
for col in df2.select_dtypes(include=['object', 'bool']).columns:
    df2[col] = le.fit_transform(df2[col]).astype('float64')
plt.figure(figsize=(15, 15))  # Adjust size here
sns.heatmap(df2.corr(method='spearman',numeric_only=True), annot=True, fmt='.2f')
plt.show()

Removing some features for one of the following reasons:
1. No contribution to the label.
2. Collinearity with other features.
3. Already aggregated into new features.

In [None]:
# Identifier, raw timestamp columns and two engineered features that are no longer needed.
# `errors='ignore'` means a name that is not present is skipped silently, which also makes
# this cell safe to re-run.
columns_to_drop = [
    'USER_ID',
    'REGISTRATION_AT',
    'LAST_VISIT_AT',
    'LAST_POST_AT',
    'LAST_LIKE_GIVEN_AT',
    'LAST_LIKE_RECEIVED_AT',
    'LAST_COMMENT_RECEIVED_AT',
    'LAST_COMMENT_WRITTEN_AT',
    'AVERAGE_VISIT_FREQUENCY',
    'DAYS_SINCE_LAST_ACTIVITY',
]
df = df.drop(columns=columns_to_drop, errors='ignore')

Let's see how the data distribution looks now (focusing on outliers).

In [None]:
# Summary statistics of the remaining columns before any scaling.
df.describe()

In [None]:
# Plotting box plots for each feature column (label excluded) to visually identify outliers
df2 = df.drop(columns='CHURNED')

# Keep the usual 4x5 layout, but add rows when there are more than 20 features:
# with a fixed 4x5 grid, plt.subplot raises ValueError on the 21st column.
n_plot_cols = 5
n_plot_rows = max(4, -(-len(df2.columns) // n_plot_cols))  # ceiling division

plt.figure(figsize=(20, 15))
for i, column in enumerate(df2.columns, 1):
    plt.subplot(n_plot_rows, n_plot_cols, i)
    sns.boxplot(y=df2[column])
    plt.title(column)
plt.tight_layout()

In [None]:
# Preview the frame as it goes into scaling.
df.head()

Our dataset contains a lot of outliers. We have 3 options:
1. Remove the outliers.
2. Keep a random sample of the outliers to keep their representation intact.
3. Normalize using Robust Scaling, then apply a log transformation to the features.

We will go with the 3rd option -> Normalize using Robust Scaling then log transformation of features.

In [None]:

# Initialize RobustScaler
robust_scaler = RobustScaler()

# Identify columns to scale: every column except the label
columns_to_scale =[col for col in df.columns if col != 'CHURNED']

# Fit the scaler on, and overwrite, all feature columns of `df` in place.
# NOTE(review): the scaler is fit on the full dataset here, before the train/validation
# split performed later in the notebook, so validation rows influence the scaling
# statistics. Consider fitting on the training split only.
df[columns_to_scale] = robust_scaler.fit_transform(df[columns_to_scale])

In [None]:
# Summary statistics after robust scaling.
df.describe()

In [None]:
# Identify skewed columns
skewed_columns = ['TOTAL_POST_COUNT','TOTAL_LIKES_RECEIVED', 'TOTAL_COMMENTS_RECEIVED', 'TOTAL_LIKES_GIVEN', 'TOTAL_COMMENTS_WRITTEN', 'POST_TO_VISIT_RATIO', 'LIKE_TO_COMMENT_RATIO_GIVEN','LIKE_TO_COMMENT_RATIO_RECEIVED']

# Apply a signed log transformation: sign(x) * log1p(|x|).
# These columns have already been robust-scaled (median-centred), so they can contain
# negative values. A plain np.log1p(x) yields -inf at x == -1 and NaN below it (and the
# RuntimeWarning is hidden by the global warnings filter). The signed form is identical
# to log1p for x >= 0, is defined for every real x, and preserves the ordering of values.
for col in skewed_columns:
    df[col] = np.sign(df[col]) * np.log1p(df[col].abs())

In [None]:
# Summary statistics after the log transformation of the skewed columns.
df.describe()

In [None]:
# Plotting box plots for each feature column (label excluded) after scaling and the log
# transformation, to visually check how the outliers were handled
df2 = df.drop(columns='CHURNED')
# df2 = df2[(df2.TOTAL_POST_COUNT	< 20) & (df2.TOTAL_LIKES_RECEIVED	< 20) & (df2.TOTAL_LIKES_GIVEN	< 20) & (df2.TOTAL_COMMENTS_RECEIVED	< 20) & (df2.TOTAL_COMMENTS_WRITTEN	< 20)]

# Keep the usual 4x5 layout, but add rows when there are more than 20 features:
# with a fixed 4x5 grid, plt.subplot raises ValueError on the 21st column.
n_plot_cols = 5
n_plot_rows = max(4, -(-len(df2.columns) // n_plot_cols))  # ceiling division

plt.figure(figsize=(20, 15))
for i, column in enumerate(df2.columns, 1):
    plt.subplot(n_plot_rows, n_plot_cols, i)
    sns.boxplot(y=df2[column])
    plt.title(column)
plt.tight_layout()

The outliers are handled pretty well and we are ready for the training!!

# **Train and evaluate models**

Let's separate label and feature sets

In [None]:
# Separate the label from the features and hold out 20% of the rows for validation.
#
# ENGAGEMENT_SCORE is dropped from the features: it is part of the label criteria, so it
# correlates too directly with the label, and including it lets the model reach an
# unbelievable 100% on every score.
# NOTE(review): DAYS_SINCE_LAST_VISIT is also part of the label criteria but is still kept
# as a feature; to remove it as well, add it to the list below.
features = df.drop(columns=['CHURNED', 'ENGAGEMENT_SCORE'])
labels = df['CHURNED'].astype(int)

X_train, X_val, y_train, y_val = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# # Define parameter grid
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [10, 20, 30],
#     'min_samples_split': [2, 5, 10],
#     'random_state': [42]
# }


# # Create a Random Forest Classifier
# rf_clf = RandomForestClassifier()

# # Initialize GridSearchCV
# grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid,
#                            scoring=make_scorer(accuracy_score), cv=5)

# # Fit GridSearchCV
# grid_search.fit(X_train, y_train)

# # Get best parameters and estimator
# best_params = grid_search.best_params_
# best_estimator = grid_search.best_estimator_

# print("Best Parameters:", best_params)

In [None]:
# Initialize the random forest classifier (training happens in the next cell).
# The hyperparameter values lie within the commented-out grid-search grid above;
# presumably they are its best result — TODO confirm.
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=2, random_state=42)

In [None]:
# Train the classifier on the training split.
rf_clf.fit(X_train, y_train)

In [None]:
# Make predictions on the validation split
# y_pred: hard class predictions.
y_pred = rf_clf.predict(X_val)
# y_pred_proba: predicted probability of the second class (column 1), which is the
# positive class 1 given that the labels were cast to 0/1 integers.
y_pred_proba = rf_clf.predict_proba(X_val)[:,1]

In [None]:
# Evaluate the model on the validation split.
# Threshold-based metrics use the hard predictions; ROC AUC uses the class-1 probabilities.
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
roc_auc = roc_auc_score(y_val, y_pred_proba)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)

report_lines = [
    f'Validation Accuracy: {accuracy}',
    f'F1 Score: {f1}',
    f'ROC AUC Score: {roc_auc}',
    f'Precision: {precision}',
    f'Recall: {recall}',
]
for report_line in report_lines:
    print(report_line)

In [None]:
# ROC curve on the validation split: false/true positive rates over all thresholds,
# and the area under that curve.
fpr, tpr, _ = roc_curve(y_val, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Draw the curve together with the diagonal of a random classifier.
lw = 2
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
ax.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc='lower right')
plt.show()
