In [1]:
# Data Manipulation and Handling
import polars as pl
import pandas as pd
import numpy as np
import psycopg2

# DB Credentials
from dotenv import load_dotenv
import os
import sys
from sqlalchemy import create_engine

# Machine Learning Libraries
import torch
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Gradient Boosting Libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Lifecycle Management
import mlflow
import mlflow.sklearn

# Distributed Computing
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier as SparkRFClassifier

# Model Interpretability
import shap

# Hyperparameter Optimization
import optuna

# Automated Feature Engineering
import featuretools as ft

# Expose the directory one level above the working directory to the import
# system, so modules that live next to the notebook folder can be imported.
current_dir = os.getcwd()
# abspath() resolves a relative path against the working directory, so this is
# the normalised "<cwd>/.." location.
parent_dir = os.path.abspath(os.pardir)
if parent_dir not in sys.path:
    # Prepend so this location is searched before the existing entries.
    sys.path.insert(0, parent_dir)

# Custom Modules
from fetch_data_hook import fetch_sql_code, fetch_sql_file

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.


## Step 1: Load the Data

In [2]:
# Query identifiers used by this notebook. In the visible cells only `test_sql`
# is consumed (it is passed to fetch_sql_file).
# NOTE(review): presumably each value is the base name of a .sql file that
# fetch_sql_file resolves -- confirm against fetch_data_hook.
merge_sql           = "merged_data"
equity_value_data   = "equity_value_data"
chatgpt_sql_4o      = "chatGPT-model-4o"
chatgpt_sql_o1      = "chatGPT-model-o1"
chatgpt_sql_o1_q1   = "chatGPT-model-o1-Q1"
percent_churned_sql = "not_senior_enough"
test_sql            = "test"

In [13]:
# Fetch the result of the query identified by `test_sql` through the
# project-local helper fetch_sql_file.
# The rendered output below shows the columns user_id, min_date, max_date,
# duration_of_above10_streak, last_streak_date and duration_between_above10_streaks,
# and the same user_id can appear on more than one row.
# NOTE(review): presumably the helper returns a pandas DataFrame -- confirm in fetch_data_hook.
user_churned_df = fetch_sql_file(test_sql)
# Bare last expression: rendered by the notebook as the cell output.
user_churned_df

Unnamed: 0,user_id,min_date,max_date,duration_of_above10_streak,last_streak_date,duration_between_above10_streaks
0,00440034cc4152bfb01b30f5c381c4e3,2017-05-18,2017-05-19,2,2017-02-27,80
1,005d630a68b4ab3a2f4cd49d9a87c50d,2017-03-07,2017-03-10,4,2016-11-04,123
2,028367ff3cbcc04c2afc2ce3336c00e2,2016-12-19,2016-12-23,5,2016-11-14,35
3,028367ff3cbcc04c2afc2ce3336c00e2,2017-03-09,2017-03-09,1,2017-02-03,34
4,0423b88554cedaa7efd8dd4c81774cce,2016-12-30,2016-12-30,1,2016-10-26,65
...,...,...,...,...,...,...
319,fdc54af66d1190dec81b95b4a2965634,2017-07-18,2017-07-21,4,2017-05-12,67
320,ff0ae95285c43e3a5af84860bffaa544,2017-06-23,2017-06-23,1,2017-04-03,81
321,ff377467d4e28b425266a8b2c8b2f5c7,2016-11-28,2016-12-02,5,2016-09-29,60
322,ff7610fdd7ac5cbfa0b17aca53af5db4,2017-07-03,2017-07-03,1,2017-01-12,172


In [14]:
# Ids of the users returned by the previous query. user_churned_df can list the
# same user on several rows, so this tuple may contain repeats.
user_id = tuple(user_churned_df['user_id'])

# Render the IN (...) list explicitly instead of interpolating the tuple itself.
# Python's tuple repr is not SQL: a single id renders as ('abc',) with a trailing
# comma and an empty tuple renders as (), both of which the database rejects.
# Repeated ids are dropped (first-seen order kept) and every id is emitted as a
# single-quoted literal with embedded single quotes doubled.
# If there are no ids the list is just NULL, which matches no row, so the query
# returns an empty result rather than failing.
# NOTE(review): the SQL is still assembled as a string; if fetch_sql_code can
# accept bound parameters, prefer that -- its signature is not visible here.
user_id_literals = ["'" + str(uid).replace("'", "''") + "'" for uid in dict.fromkeys(user_id)]
user_id_in_list = ', '.join(user_id_literals) or 'NULL'

# Per user: earliest and latest date present in equity_value_data. The aggregate
# columns are left unaliased, so they come back named `min` and `max`.
churned_user_activity_df = fetch_sql_code(f''' select user_id, min(timestamp::date), max(timestamp::date) from equity_value_data where user_id in ({user_id_in_list}) group by user_id''')
churned_user_activity_df

Unnamed: 0,user_id,min,max
0,08dcfa90e0fee20f81520d54b57699d3,2016-08-18,2017-08-17
1,d84a73b2e29b6e9cdf3f8fc18363bd63,2016-08-22,2017-08-18
2,ed336912962777f3d9815e4af435bc6a,2016-09-02,2017-07-25
3,b8343c09ec9bca0b3dd62790f2314742,2016-11-03,2017-08-18
4,570847a47a671f85e20d621db8473c43,2016-10-04,2017-06-27
...,...,...,...
274,20936e45b85da1ec29305d15aecc4d7f,2017-02-07,2017-08-18
275,1d65de8d8c9041ddc097654d2b9009f4,2016-11-25,2017-03-03
276,83dc7d45915184fbc6dab8d55209aee4,2016-08-17,2017-08-16
277,5b947006854fcc89e8d274c451e61723,2016-08-22,2017-06-21


In [19]:
# Spot check: show the activity row for one specific user id.
churned_user_activity_df.loc[churned_user_activity_df['user_id'].eq('005d630a68b4ab3a2f4cd49d9a87c50d')]

Unnamed: 0,user_id,min,max
208,005d630a68b4ab3a2f4cd49d9a87c50d,2016-08-18,2017-08-17


In [17]:
# Left-join the per-user activity columns onto user_churned_df by user_id.
# how='left' keeps every row of user_churned_df, matched or not; the suffixes
# only take effect for non-key column names present in both frames.
testdf = user_churned_df.merge(
    churned_user_activity_df,
    on='user_id',
    how='left',
    suffixes=('_left', '_right'),
)
testdf

Unnamed: 0,user_id,min_date,max_date,duration_of_above10_streak,last_streak_date,duration_between_above10_streaks,min,max
0,00440034cc4152bfb01b30f5c381c4e3,2017-05-18,2017-05-19,2,2017-02-27,80,2016-12-30,2017-08-16
1,005d630a68b4ab3a2f4cd49d9a87c50d,2017-03-07,2017-03-10,4,2016-11-04,123,2016-08-18,2017-08-17
2,028367ff3cbcc04c2afc2ce3336c00e2,2016-12-19,2016-12-23,5,2016-11-14,35,2016-11-14,2017-03-09
3,028367ff3cbcc04c2afc2ce3336c00e2,2017-03-09,2017-03-09,1,2017-02-03,34,2016-11-14,2017-03-09
4,0423b88554cedaa7efd8dd4c81774cce,2016-12-30,2016-12-30,1,2016-10-26,65,2016-10-21,2017-08-18
...,...,...,...,...,...,...,...,...
319,fdc54af66d1190dec81b95b4a2965634,2017-07-18,2017-07-21,4,2017-05-12,67,2016-08-19,2017-08-18
320,ff0ae95285c43e3a5af84860bffaa544,2017-06-23,2017-06-23,1,2017-04-03,81,2016-10-14,2017-08-18
321,ff377467d4e28b425266a8b2c8b2f5c7,2016-11-28,2016-12-02,5,2016-09-29,60,2016-09-23,2017-08-18
322,ff7610fdd7ac5cbfa0b17aca53af5db4,2017-07-03,2017-07-03,1,2017-01-12,172,2016-11-08,2017-07-28
