In [58]:
# Data Manipulation and Handling
import polars as pl
import pandas as pd
import numpy as np
import psycopg2

# DB Credentials
from dotenv import load_dotenv
import os

# Machine Learning Libraries
import torch
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Gradient Boosting Libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Lifecycle Management
import mlflow
import mlflow.sklearn

# Distributed Computing
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier as SparkRFClassifier

# Model Interpretability
import shap

# Hyperparameter Optimization
import optuna

# Automated Feature Engineering
import featuretools as ft

%load_ext dotenv
%dotenv
BASE_DIR = '../SQL/'


The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [59]:
def etl_query(query:str):
    connection = None
    cursor = None

    try:
        connection = psycopg2.connect(
            host=os.environ['host'],
            database=os.environ['database'],
            user=os.environ['user'],
            password=os.environ['password'],
            port=os.environ['port']
        )
        cursor = connection.cursor()
        cursor.execute(query)

        col_names = [desc[0] for desc in cursor.description]
        return pd.DataFrame(cursor.fetchall(), columns=col_names)

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    finally:
        if cursor:
            cursor.close()
        if connection:
            connection.close()

def read_sql(name: str):
    with open(BASE_DIR + name + '.sql', 'r') as file:
        sql_query = file.read()

    return etl_query(sql_query)

## Step 1: Load the Data

In [60]:
equity_data = read_sql('equity_value_data')
features_data = read_sql('features_data')

In [61]:
print(equity_data.dtypes)


timestamp        object
close_equity    float64
user_id          object
dtype: object


In [62]:
equity_data['timestamp'] = pd.to_datetime(equity_data['timestamp'])

In [63]:
print(equity_data.dtypes)


timestamp       datetime64[ns, UTC]
close_equity                float64
user_id                      object
dtype: object


## Step 2: Generate the Complete Calendar

In [64]:
# Get the min and max dates from the equity data
min_date = equity_data['timestamp'].min()
max_date = equity_data['timestamp'].max()

# Create a date range of all calendar dates
all_dates = pd.date_range(start=min_date, end=max_date, freq='D')


## Step 3: Identify Market Open and Closed Days

In [65]:
# Determine market open days (dates with any data)
market_open_days = equity_data['timestamp'].dt.date.unique()

# Create a DataFrame for all dates with market status
calendar_df = pd.DataFrame({'date': all_dates})
calendar_df['market_status'] = calendar_df['date'].dt.date.apply(
    lambda x: 'open' if x in market_open_days else 'closed'
)

## Step 4: Prepare User-Date Data

In [66]:
# Get the list of all users
user_ids = equity_data['user_id'].unique()

# Create a MultiIndex with all users and all dates
user_date_index = pd.MultiIndex.from_product(
    [user_ids, all_dates], names=['user_id', 'date']
)

# Create a DataFrame with this index
user_date_df = pd.DataFrame(index=user_date_index).reset_index()

# Merge with the calendar DataFrame to get market status
user_date_df = user_date_df.merge(calendar_df, on='date', how='left')


In [67]:
equity_data

Unnamed: 0,timestamp,close_equity,user_id
0,2016-11-16 00:00:00+00:00,48.16,bcef4fa9b0bdf22bcf7deae708decf03
1,2016-11-17 00:00:00+00:00,48.16,bcef4fa9b0bdf22bcf7deae708decf03
2,2016-11-18 00:00:00+00:00,48.16,bcef4fa9b0bdf22bcf7deae708decf03
3,2016-11-21 00:00:00+00:00,48.16,bcef4fa9b0bdf22bcf7deae708decf03
4,2016-11-22 00:00:00+00:00,48.16,bcef4fa9b0bdf22bcf7deae708decf03
...,...,...,...
2099153,2017-05-25 00:00:00+00:00,1575.85,4dd8735b40b35b9d81fbf9fe4114c100
2099154,2017-05-26 00:00:00+00:00,1644.11,4dd8735b40b35b9d81fbf9fe4114c100
2099155,2017-05-30 00:00:00+00:00,1902.85,4dd8735b40b35b9d81fbf9fe4114c100
2099156,2017-05-31 00:00:00+00:00,1952.10,4dd8735b40b35b9d81fbf9fe4114c100


In [53]:
equity_data['date'] = pd.to_datetime(equity_data['date'])

In [57]:
equity_data

Unnamed: 0,user_id,date,close_equity
0,bcef4fa9b0bdf22bcf7deae708decf03,2016-11-16,48.16
1,bcef4fa9b0bdf22bcf7deae708decf03,2016-11-17,48.16
2,bcef4fa9b0bdf22bcf7deae708decf03,2016-11-18,48.16
3,bcef4fa9b0bdf22bcf7deae708decf03,2016-11-21,48.16
4,bcef4fa9b0bdf22bcf7deae708decf03,2016-11-22,48.16
...,...,...,...
2099153,4dd8735b40b35b9d81fbf9fe4114c100,2017-05-25,1575.85
2099154,4dd8735b40b35b9d81fbf9fe4114c100,2017-05-26,1644.11
2099155,4dd8735b40b35b9d81fbf9fe4114c100,2017-05-30,1902.85
2099156,4dd8735b40b35b9d81fbf9fe4114c100,2017-05-31,1952.10


In [54]:
print(equity_data.dtypes)


user_id                 object
date            datetime64[ns]
close_equity           float64
dtype: object


## Step 5: Merge Equity Data

In [55]:
# Prepare equity data by setting the index
equity_data['date'] = equity_data['date'].dt.date
equity_data = equity_data[['user_id', 'date', 'close_equity']]

# Merge user-date DataFrame with equity data
user_date_df = user_date_df.merge(
    equity_data, on=['user_id', 'date'], how='left'
)


ValueError: You are trying to merge on datetime64[ns, UTC] and object columns. If you wish to proceed you should use pd.concat