In [2]:
# Data Manipulation and Handling
import polars as pl
import pandas as pd
import numpy as np
import psycopg2

# DB Credentials
from dotenv import load_dotenv
import os
import sys
from sqlalchemy import create_engine

# Machine Learning Libraries
import torch
import xgboost as xgb
import lightgbm as lgb
# from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Gradient Boosting Libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Lifecycle Management
import mlflow
import mlflow.sklearn

# Distributed Computing
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier as SparkRFClassifier

# Model Interpretability
import shap

# Hyperparameter Optimization
import optuna

# Automated Feature Engineering
import featuretools as ft

# Add parent directory to sys.path
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
if parent_dir not in sys.path:
    sys.path.insert(0, parent_dir)

# Custom Modules
from fetch_data_hook import fetch_sql_code, fetch_sql_file

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.


In [3]:
churn_df = fetch_sql_code('''
WITH temp1 AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY timestamp) AS rn,
        timestamp::date - ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY timestamp)::int AS streak_id
    FROM
        equity_value_data
),
temp2 AS (
    SELECT
        user_id,
        MIN(timestamp::date) AS start_streak_date,
        MAX(timestamp::date) AS end_streak_date,
        COUNT(*) AS duration_of_above10_streak
    FROM
        temp1
    GROUP BY
        user_id, streak_id
),
temp3 AS (
    SELECT
        *,
        LAG(end_streak_date) OVER (PARTITION BY user_id ORDER BY start_streak_date ASC) AS prev_above10_streak_date,
        start_streak_date - LAG(end_streak_date) OVER (PARTITION BY user_id ORDER BY start_streak_date ASC) AS duration_between_above10_streaks
    FROM
        temp2
)
SELECT distinct user_id
FROM temp3
WHERE duration_between_above10_streaks >= 28
''')
churn_df
churn_users = set(churn_df['user_id'].tolist())

df = fetch_sql_code('''
select * from features_data
''')

df['churn_flag'] = df['user_id'].apply(lambda x: 1 if x in churn_users else 0 )
df
num_cols = ['time_spent', 'first_deposit_amount']


Unnamed: 0,risk_tolerance,investment_experience,liquidity_needs,platform,time_spent,instrument_type_first_traded,first_deposit_amount,time_horizon,user_id,churn_flag
0,high_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,33.129417,stock,40.0,med_time_horizon,895044c23edc821881e87da749c01034,0
1,med_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,16.573517,stock,200.0,short_time_horizon,458b1d95441ced242949deefe8e4b638,0
2,med_risk_tolerance,limited_investment_exp,very_important_liq_need,iOS,10.008367,stock,25.0,long_time_horizon,c7936f653d293479e034865db9bb932f,0
3,med_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,1.031633,stock,100.0,short_time_horizon,b255d4bd6c9ba194d3a350b3e76c6393,0
4,high_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,8.187250,stock,20.0,long_time_horizon,4a168225e89375b8de605cbc0977ae91,0
...,...,...,...,...,...,...,...,...,...,...
5579,high_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,8.339283,stock,300.0,long_time_horizon,03880c726d8a4e5db006afe4119ad974,0
5580,med_risk_tolerance,limited_investment_exp,somewhat_important_liq_need,iOS,7.241383,stock,100.0,short_time_horizon,ae8315109657f44852b24c6bca4decd6,1
5581,med_risk_tolerance,no_investment_exp,very_important_liq_need,both,22.967167,stock,50.0,short_time_horizon,f29c174989f9737058fe808fcf264135,0
5582,med_risk_tolerance,limited_investment_exp,somewhat_important_liq_need,iOS,10.338417,stock,100.0,long_time_horizon,24843497d1de88b2e7233f694436cb3a,0


## EDA

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5584 entries, 0 to 5583
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   risk_tolerance                5584 non-null   object 
 1   investment_experience         5584 non-null   object 
 2   liquidity_needs               5584 non-null   object 
 3   platform                      5584 non-null   object 
 4   time_spent                    5584 non-null   float64
 5   instrument_type_first_traded  5584 non-null   object 
 6   first_deposit_amount          5584 non-null   float64
 7   time_horizon                  5584 non-null   object 
 8   user_id                       5584 non-null   object 
 9   churn_flag                    5584 non-null   int64  
dtypes: float64(2), int64(1), object(7)
memory usage: 436.4+ KB


In [5]:
df.describe()

Unnamed: 0,time_spent,first_deposit_amount,churn_flag
count,5584.0,5584.0,5584.0
mean,34.509706,633.566805,0.049964
std,155.080551,2118.323263,0.21789
min,0.0,0.0,0.0
25%,2.848908,50.0,0.0
50%,13.474708,100.0,0.0
75%,33.823829,500.0,0.0
max,8788.32945,50000.0,1.0


In [6]:
df.isnull().sum()

risk_tolerance                  0
investment_experience           0
liquidity_needs                 0
platform                        0
time_spent                      0
instrument_type_first_traded    0
first_deposit_amount            0
time_horizon                    0
user_id                         0
churn_flag                      0
dtype: int64

In [None]:
'''
To further enhance the exploratory data analysis (EDA) from a statistical perspective—especially given that you're analyzing churn with a small percentage (~5%)—we can expand our analysis to include advanced statistical techniques and hypothesis testing. Here's what I would suggest adding as a skilled statistician with a PhD and data science expertise:

1. Descriptive Statistics & Distribution Fitting
Before diving into modeling, it's crucial to understand the exact nature of the distributions for numerical features. If any feature deviates significantly from a normal distribution, we might consider applying transformations like log, square root, or exponential.
Checking for Normality:
Use Q-Q plots and Shapiro-Wilk tests to assess whether the numerical features follow a normal distribution.

2. Hypothesis Testing
T-Tests and Mann-Whitney U Test for comparing means between churn and non-churn groups:
For normally distributed data, we can use independent t-tests to compare the means between churn and non-churn users.
For non-normally distributed data, the Mann-Whitney U test is more appropriate.

3. Bivariate and Multivariate Hypothesis Testing
Chi-Square Test: For testing relationships between categorical variables like platform, risk_tolerance, and churn.
ANOVA: If we have more than two groups (e.g., risk tolerance levels), we can use ANOVA to check for significant differences between group means.
T-TEST: ?

4. Correlation and Multicollinearity Analysis
We should also examine the multicollinearity of our features, especially if we are considering logistic regression or other linear models.

Variance Inflation Factor (VIF) to detect multicollinearity:

5. Advanced Distribution Analysis: Fitting data to other distributions (e.g., Poisson, Exponential, etc.)
You can fit various distributions to see which one best describes your numerical data, especially when not normally distributed.

6. Multivariate Analysis and Interactions
Interaction Terms: To capture relationships between variables and churn, you can explore interaction effects.

7. Outlier Detection and Handling
Using Z-scores or IQR for outlier detection.

8. Final Thoughts on Transformations
Based on EDA, we may need to transform features for better model performance:

Log Transformation: For highly skewed data, log transformation can normalize distributions.
Box-Cox Transformation: Helps normalize data and is more flexible than log transformations.
The decision to apply transformations should be based on how well the feature distribution aligns with the assumptions of the chosen models (e.g., logistic regression assumes normality).

Summary of Additions:
Normality Checks: Q-Q plots and Shapiro-Wilk tests for normality.
Hypothesis Testing: T-tests, Mann-Whitney U, Chi-Square, and ANOVA to assess differences between churn and non-churn groups.
Transformation Decisions: Log, Box-Cox transformations based on distribution analysis.
Correlation and Multicollinearity: VIF and correlation heatmaps.
Advanced Distribution Fitting: Fit Poisson or other distributions as necessary.
Outlier Detection: Using Z-scores or IQR.
'''

###  Descriptive Statistics & Distribution Fitting | Checking for Normality

In [3]:
# Use Q-Q plots and Shapiro-Wilk tests to assess whether the numerical features follow a normal distribution.
# Q-Q Plot for normality check
import scipy.stats as stats
import matplotlib.pyplot as plt

for col in num_cols:
    plt.figure(figsize=(8, 6))
    stats.probplot(df[col], dist="norm", plot=plt)
    plt.title(f'Q-Q plot for {col}')
    plt.show()

# Shapiro-Wilk test
for col in num_cols:
    stat, p_value = stats.shapiro(df[col])
    print(f'Shapiro-Wilk Test for {col}: p-value = {p_value}')


In [4]:
# If the p-value is below a certain threshold (e.g., 0.05), the feature is not normally distributed,
# and we might consider applying transformations like log or Box-Cox transformations.
df['log_first_deposit'] = np.log1p(df['first_deposit_amount'])  # Log transform
df['boxcox_first_deposit'], _ = stats.boxcox(df['first_deposit_amount'] + 1)  # Apply Box-Cox transformation

### Hypothesis Testing | T-Tests and Mann-Whitney U Test

In [5]:
# T-Tests and Mann-Whitney U Test for comparing means between churn and non-churn groups:
# For normally distributed data, we can use independent t-tests to compare the means between churn and non-churn users.
# For non-normally distributed data, the Mann-Whitney U test is more appropriate.

# T-Test for normally distributed variables
for col in num_cols:
    churned = df[df['churn_flag'] == 1][col]
    not_churned = df[df['churn_flag'] == 0][col]
    t_stat, p_value = stats.ttest_ind(churned, not_churned, equal_var=False)
    print(f'T-Test for {col}: p-value = {p_value}')

# Mann-Whitney U Test for non-normally distributed variables
for col in num_cols:
    churned = df[df['churn_flag'] == 1][col]
    not_churned = df[df['churn_flag'] == 0][col]
    u_stat, p_value = stats.mannwhitneyu(churned, not_churned)
    print(f'Mann-Whitney U Test for {col}: p-value = {p_value}')


### Bivariate and Multivariate Hypothesis Testing | Chi-Square Test & ANOVA

In [None]:
# Chi-Square Test: For testing relationships between categorical variables like platform, risk_tolerance, and churn.
# Chi-Square test for categorical variables
from scipy.stats import chi2_contingency

for col in cat_cols:
    contingency_table = pd.crosstab(df[col], df['churn_flag'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f'Chi-Square Test for {col}: p-value = {p}')

#ANOVA: If we have more than two groups (e.g., risk tolerance levels), we can use ANOVA to check for significant differences between group means.
# ANOVA test for multiple groups
from statsmodels.formula.api import ols
import statsmodels.api as sm

model = ols('first_deposit_amount ~ C(risk_tolerance)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)

### Correlation and Multicollinearity Analysis | VIF

In [None]:
# We should also examine the multicollinearity of our features, especially if we are considering logistic
# regression or other linear models.

# Variance Inflation Factor (VIF) to detect multicollinearity:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculating VIF for numerical features
vif_data = pd.DataFrame()
vif_data['Feature'] = num_cols
vif_data['VIF'] = [variance_inflation_factor(df[num_cols].values, i) for i in range(len(num_cols))]
print(vif_data)


### Advanced Distribution Analysis: Fitting data to other distributions (e.g., Poisson, Exponentia

In [9]:
# You can fit various distributions to see which one best describes your numerical data,
# especially when not normally distributed.
# Fit a Poisson distribution for count-like data
from scipy.stats import poisson

poisson_fit = poisson.fit(df['time_spent'])
sns.histplot(df['time_spent'], kde=False, stat="density")
plt.plot(np.arange(0, max(df['time_spent']), 1), poisson.pmf(np.arange(0, max(df['time_spent']), 1), *poisson_fit))
plt.title('Poisson Distribution Fit for Time Spent')
plt.show()


### Multivariate Analysis | Interactions

In [None]:
# Interaction Terms: To capture relationships between variables and churn, you can explore interaction effects.
# Creating interaction terms
df['time_spent_platform_interaction'] = df['time_spent'] * df['platform']

# Exploring interaction using pairplot
sns.pairplot(df[['time_spent', 'first_deposit_amount', 'platform', 'churn_flag']], hue='churn_flag')
plt.show()


### Outlier Detection and Handling | ZSCORE & IQR

In [12]:
# Detecting outliers using Z-scores
from scipy import stats

z_scores = np.abs(stats.zscore(df[num_cols]))
outliers = (z_scores > 3).sum(axis=0)  # Count of outliers per column
print(f'Number of outliers per column: {outliers}')


# Need to fill in IAR using BoxPLot

In [None]:
'''

Based on EDA, we may need to transform features for better model performance:

Log Transformation: For highly skewed data, log transformation can normalize distributions.
Box-Cox Transformation: Helps normalize data and is more flexible than log transformations.
The decision to apply transformations should be based on how well the feature distribution aligns with the assumptions of the chosen models (e.g., logistic regression assumes normality).

Summary of Additions:
Normality Checks: Q-Q plots and Shapiro-Wilk tests for normality.
Hypothesis Testing: T-tests, Mann-Whitney U, Chi-Square, and ANOVA to assess differences between churn and non-churn groups.
Transformation Decisions: Log, Box-Cox transformations based on distribution analysis.
Correlation and Multicollinearity: VIF and correlation heatmaps.
Advanced Distribution Fitting: Fit Poisson or other distributions as necessary.
Outlier Detection: Using Z-scores or IQR.
By adding these methods, we ensure a comprehensive statistical and data science-driven EDA, leading to better understanding of the data structure and
subsequent decisions for model selection and feature engineering. Would you like to implement any specific sections first, or would you like the full Jupyter notebook with these enhancements?'''