In [1]:
# Data Manipulation and Handling
import polars as pl
import pandas as pd
import numpy as np
import psycopg2

# DB Credentials
from dotenv import load_dotenv
import os
import sys
from sqlalchemy import create_engine

# Machine Learning Libraries
import torch
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Gradient Boosting Libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Lifecycle Management
import mlflow
import mlflow.sklearn

# Distributed Computing
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier as SparkRFClassifier

# Model Interpretability
import shap

# Hyperparameter Optimization
import optuna

# Automated Feature Engineering
import featuretools as ft

# Put the directory one level above the current working directory at the front
# of the import search path, unless it is already listed there.
current_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(current_dir, os.pardir))
parent_already_on_path = parent_dir in sys.path
if not parent_already_on_path:
    # Index 0 so this directory is searched before every other entry.
    sys.path.insert(0, parent_dir)

# Custom Modules
from fetch_data_hook import fetch_sql_code, fetch_sql_file

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.


In [44]:
# Load every features_data row whose first_deposit_amount is below 10.
low_first_deposit_sql = '''select * from features_data where first_deposit_amount< 10'''
poor_ppl_df = fetch_sql_code(low_first_deposit_sql)
poor_ppl_df
# Open question: does sum(time_spent) mean anything here? Could it be used as
# an engineered feature?

Unnamed: 0,risk_tolerance,investment_experience,liquidity_needs,platform,time_spent,instrument_type_first_traded,first_deposit_amount,time_horizon,user_id
0,low_risk_tolerance,good_investment_exp,very_important_liq_need,Android,30.710083,stock,5.0,long_time_horizon,f4cac8c1544b723ddedc9bdba81c64c0
1,high_risk_tolerance,no_investment_exp,very_important_liq_need,Android,56.187617,stock,5.0,med_time_horizon,f59175fe38f9f5e98853efe6e9caefaa
2,high_risk_tolerance,limited_investment_exp,very_important_liq_need,Android,4.300000,stock,5.0,long_time_horizon,28e81aeb3bd9524fdbd8cd381293517e
3,med_risk_tolerance,limited_investment_exp,very_important_liq_need,iOS,36.269117,stock,1.0,short_time_horizon,b9fcf3097935ed35f2a0e8f8184f8778
4,high_risk_tolerance,no_investment_exp,very_important_liq_need,iOS,11.018200,stock,4.0,long_time_horizon,d39a9f2ae92927dbf89bb3e5cecea195
...,...,...,...,...,...,...,...,...,...
268,med_risk_tolerance,no_investment_exp,very_important_liq_need,iOS,8.406917,stock,7.0,short_time_horizon,acd9b3c452a7cbdf63acb029b8c1a25c
269,high_risk_tolerance,limited_investment_exp,very_important_liq_need,iOS,92.083867,stock,5.0,med_time_horizon,36312e9a92e1d7b925bc8c97cbf5ca86
270,high_risk_tolerance,no_investment_exp,somewhat_important_liq_need,both,26.066200,stock,5.0,short_time_horizon,5208a5b04683563636817f6fc05f2001
271,med_risk_tolerance,no_investment_exp,very_important_liq_need,iOS,0.000000,stock,5.0,med_time_horizon,03f51748bb1fb47d5ab2c0426f043d81


In [61]:
# Build one row per (user_id, streak) from equity_value_data.
#   temp1: numbers each user's rows by timestamp and derives streak_id as
#          (date - row number); rows on consecutive calendar dates end up
#          with the same streak_id.
#   temp2: collapses each (user_id, streak_id) group into its first date,
#          last date and row count.
#   final: adds the previous streak's end date (LAG) and the gap between that
#          date and the current streak's start.
# NOTE(review): the query applies no balance filter, so the "above10" naming
# presumably relies on equity_value_data only holding above-$10 days -- TODO confirm.
# NOTE(review): the 3-day gaps in the saved output look like weekends; confirm
# whether weekend gaps are meant to split a streak.
streaks_sql = '''
WITH temp1 AS (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY timestamp) AS rn,
        timestamp::date - ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY timestamp)::int AS streak_id
    FROM
        equity_value_data
),
temp2 AS (
    SELECT
        user_id,
        MIN(timestamp::date) AS start_streak_date,
        MAX(timestamp::date) AS end_streak_date,
        COUNT(*) AS duration_of_above10_streak
    FROM
        temp1
    GROUP BY
        user_id, streak_id
)
SELECT
    *,
    LAG(end_streak_date) OVER (PARTITION BY user_id ORDER BY start_streak_date ASC) AS prev_above10_streak_date,
    start_streak_date - LAG(end_streak_date) OVER (PARTITION BY user_id ORDER BY start_streak_date ASC) AS duration_between_above10_streaks
FROM
    temp2
'''
streaks_df = fetch_sql_code(streaks_sql)
streaks_df

Unnamed: 0,user_id,start_streak_date,end_streak_date,duration_of_above10_streak,prev_above10_streak_date,duration_between_above10_streaks
0,0012db34aa7b083f5714e7831195e54d,2016-08-18,2016-08-19,2,,
1,0012db34aa7b083f5714e7831195e54d,2016-08-22,2016-08-26,5,2016-08-19,3.0
2,0012db34aa7b083f5714e7831195e54d,2016-08-29,2016-09-02,5,2016-08-26,3.0
3,0012db34aa7b083f5714e7831195e54d,2016-09-06,2016-09-09,4,2016-09-02,4.0
4,0012db34aa7b083f5714e7831195e54d,2016-09-12,2016-09-16,5,2016-09-09,3.0
...,...,...,...,...,...,...
245095,ffc1e622f3a0b2666f09a6dcb7f27918,2017-07-17,2017-07-21,5,2017-07-14,3.0
245096,ffc1e622f3a0b2666f09a6dcb7f27918,2017-07-24,2017-07-28,5,2017-07-21,3.0
245097,ffc1e622f3a0b2666f09a6dcb7f27918,2017-07-31,2017-08-04,5,2017-07-28,3.0
245098,ffc1e622f3a0b2666f09a6dcb7f27918,2017-08-07,2017-08-11,5,2017-08-04,3.0


In [57]:
# Author's observation: 41 of 239 users had a 28+ day stretch under $10 happen 2x.
# Idea to explore: can these users be "clustered"?
repeat_under10_sql = 'test2.5'
df = fetch_sql_file(repeat_under10_sql)
df

Unnamed: 0,user_id,count
0,028367ff3cbcc04c2afc2ce3336c00e2,2
1,0423b88554cedaa7efd8dd4c81774cce,2
2,09bc5486de3a64be6dcf3a8c76bbe06a,2
3,0e28a976de6797ffa6de7cd2e9677d61,2
4,14b44db6ff01a66a284d6d4a62975706,3
5,16e5aa4d2433e921992eff4125fdcdab,2
6,19c5ffe69d42129a131da5ba77dd110c,3
7,1ff89680e3e3596c67cea1519810967c,2
8,25db7f38515e4f486547705bb3fd283b,2
9,28099dd28b095346298166c41b5981c1,2


In [50]:
# user_churned_df: for the users who had a balance of less than $10, the last
# date range in which the user had a balance of over $10.
#
# This load must stay active: the filter cell that follows reads
# `user_churned_df`, so with the load commented out a Restart Kernel -> Run All
# fails there with NameError.
#
# Column notes (author's descriptions):
#   user_id: users who churned, i.e. users who had a balance of less than $10
#       at one point in time
#   start_streak_date / end_streak_date: bounds of the last date range where
#       the user had a balance of over $10
#   duration_of_above10_streak: total number of days in that last streak with
#       a balance of over $10
#   duration_between_above10_streaks: range in days between the last times the
#       user had a balance of over $10
#   last_streak_date: last time the user had a balance of over $10
#   user_behavior_first_login / user_behavior_last_login: the user's first and
#       last time logging into Robinhood
# NOTE(review): the saved output of this cell shows prev_above10_streak_date,
# first_above10_streak and last_above10_streak, and does not show
# last_streak_date or the user_behavior_* columns -- confirm the notes against
# the current 'test2' SQL file.
test2_sql = 'test2'
user_churned_df = fetch_sql_file(test2_sql)
user_churned_df

Unnamed: 0,user_id,start_streak_date,end_streak_date,duration_of_above10_streak,prev_above10_streak_date,duration_between_above10_streaks,first_above10_streak,last_above10_streak
0,00440034cc4152bfb01b30f5c381c4e3,2017-05-18,2017-05-19,2,2017-02-27,80,2016-12-30,2017-08-16
1,005d630a68b4ab3a2f4cd49d9a87c50d,2017-03-07,2017-03-10,4,2016-11-04,123,2016-08-18,2017-08-17
2,028367ff3cbcc04c2afc2ce3336c00e2,2017-03-09,2017-03-09,1,2017-02-03,34,2016-11-14,2017-03-09
3,0423b88554cedaa7efd8dd4c81774cce,2017-07-24,2017-07-28,5,2017-03-14,132,2016-10-21,2017-08-18
4,062ea0ff3b7fc36ae471968aced1f4a1,2017-02-01,2017-02-03,3,2016-09-15,139,2016-08-18,2017-03-31
...,...,...,...,...,...,...,...,...
274,fdc54af66d1190dec81b95b4a2965634,2017-07-18,2017-07-21,4,2017-05-12,67,2016-08-19,2017-08-18
275,ff0ae95285c43e3a5af84860bffaa544,2017-06-23,2017-06-23,1,2017-04-03,81,2016-10-14,2017-08-18
276,ff377467d4e28b425266a8b2c8b2f5c7,2016-11-28,2016-12-02,5,2016-09-29,60,2016-09-23,2017-08-18
277,ff7610fdd7ac5cbfa0b17aca53af5db4,2017-07-03,2017-07-03,1,2017-01-12,172,2016-11-08,2017-07-28


In [52]:
# Keep only the rows whose start_streak_date equals last_above10_streak.
starts_on_last_above10 = (
    user_churned_df['start_streak_date'] == user_churned_df['last_above10_streak']
)
user_churned_df.loc[starts_on_last_above10]

Unnamed: 0,user_id,start_streak_date,end_streak_date,duration_of_above10_streak,prev_above10_streak_date,duration_between_above10_streaks,first_above10_streak,last_above10_streak
2,028367ff3cbcc04c2afc2ce3336c00e2,2017-03-09,2017-03-09,1,2017-02-03,34,2016-11-14,2017-03-09
12,0b05f8d83e97ac57867fe87a2437b21f,2017-02-02,2017-02-02,1,2016-12-14,50,2016-09-30,2017-02-02
231,d5c6a5a87d3f3dd8ae2e17260ac6fbcc,2017-08-18,2017-08-18,1,2017-04-21,119,2016-08-22,2017-08-18
