In [7]:
# Data Manipulation and Handling
import polars as pl
import pandas as pd
import numpy as np
import psycopg2

# Environment, Paths and DB Connection
from dotenv import load_dotenv
import os
import sys
from sqlalchemy import create_engine

# Machine Learning Libraries
import torch
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Gradient Boosting Libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Lifecycle Management
import mlflow
import mlflow.sklearn

# Distributed Computing
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier as SparkRFClassifier

# Model Interpretability
import shap

# Hyperparameter Optimization
import optuna

# Automated Feature Engineering
import featuretools as ft

# Make modules that live one level above the working directory importable.
# The parent directory is prepended to the import search path, and only when
# it is not already listed, so re-running this cell does not add duplicates.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
if sys.path.count(parent_dir) == 0:
    sys.path.insert(0, parent_dir)

# Custom Modules
from fetch_data_hook import fetch_sql_code, fetch_sql_file

## Step 1: Load the Data

In [8]:
# Names of the SQL resources referenced in this notebook.
# NOTE(review): only `test_sql` is passed to fetch_sql_file in the visible
# cells; the others are presumably resolved the same way — confirm.
merge_sql = "merged_data"
equity_value_data = "equity_value_data"

# Judging by the names, these look like query variants written with different
# ChatGPT models — TODO confirm provenance.
chatgpt_sql_4o = "chatGPT-model-4o"
chatgpt_sql_o1 = "chatGPT-model-o1"
chatgpt_sql_o1_q1 = "chatGPT-model-o1-Q1"

percent_churned_sql = "not_senior_enough"
test_sql = "test"

In [9]:
# Run the SQL resource named by `test_sql` through the project helper and keep
# the result as `df`.
# NOTE(review): the return type is not visible here; a later cell calls
# df['user_id'].nunique(), so this is presumably a pandas DataFrame — confirm.
df = fetch_sql_file(test_sql)
# Bare last expression so the notebook renders the result as the cell output.
df

Unnamed: 0,user_id,min_date,max_date,duration_of_above10_streak,last_streak_date,duration_between_above10_streaks
0,00440034cc4152bfb01b30f5c381c4e3,2017-05-18,2017-05-19,2,2017-02-27,80
1,005d630a68b4ab3a2f4cd49d9a87c50d,2017-03-07,2017-03-10,4,2016-11-04,123
2,028367ff3cbcc04c2afc2ce3336c00e2,2016-12-19,2016-12-23,5,2016-11-14,35
3,028367ff3cbcc04c2afc2ce3336c00e2,2017-03-09,2017-03-09,1,2017-02-03,34
4,0423b88554cedaa7efd8dd4c81774cce,2016-12-30,2016-12-30,1,2016-10-26,65
...,...,...,...,...,...,...
319,fdc54af66d1190dec81b95b4a2965634,2017-07-18,2017-07-21,4,2017-05-12,67
320,ff0ae95285c43e3a5af84860bffaa544,2017-06-23,2017-06-23,1,2017-04-03,81
321,ff377467d4e28b425266a8b2c8b2f5c7,2016-11-28,2016-12-02,5,2016-09-29,60
322,ff7610fdd7ac5cbfa0b17aca53af5db4,2017-07-03,2017-07-03,1,2017-01-12,172


In [15]:
# First and last calendar date per user in equity_value_data.
# The aggregates are not aliased in the query, so the result columns come back
# under the database's default names.
user_date_query = (
    " select user_id, min(timestamp::date), max(timestamp::date)"
    " from equity_value_data group by user_id"
)
user_date_df = fetch_sql_code(user_date_query)
user_date_df

Unnamed: 0,user_id,min,max
0,f74b540369a47ce39453bd605341a51a,2016-08-18,2017-08-17
1,19f12f242ca451d01387be0ddd0b78f7,2017-01-05,2017-08-18
2,d7ed4648b55362e5bc8e8eb274596ca8,2016-08-18,2017-08-17
3,314f750f411a3d3aca5dec67444f0144,2016-08-17,2017-08-16
4,9cec28a468177c38ff2151732e5f4eee,2016-08-17,2017-08-16
...,...,...,...
5579,e69ba45aa1400f081cec0b0fbc607efb,2016-11-16,2017-08-18
5580,677dd17cbd42ae77591276b8e437af6f,2016-08-19,2017-03-17
5581,0e33746f6717e8d3c163f0d8b24d9a47,2016-08-18,2017-08-17
5582,ce424776bb28137abd140bb011d5eae3,2016-08-17,2017-08-16


In [14]:
# Count distinct users rather than rows: the same user_id can appear on more
# than one row of `df`.
df["user_id"].nunique()

279