In [82]:
# Data Manipulation and Handling
import polars as pl
import pandas as pd
import numpy as np
import psycopg2

# DB Credentials
from dotenv import load_dotenv
import os

# Machine Learning Libraries
import torch
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Gradient Boosting Libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Lifecycle Management
import mlflow
import mlflow.sklearn

# Distributed Computing
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier as SparkRFClassifier

# Model Interpretability
import shap

# Hyperparameter Optimization
import optuna

# Automated Feature Engineering
import featuretools as ft

# Load the python-dotenv IPython extension, then run its %dotenv magic so that
# variables from a .env file become available through os.environ.
# NOTE(review): presumably the .env file defines host, database, user, password
# and port, which query_to_dataframe reads from os.environ — confirm.
%load_ext dotenv
%dotenv


The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [86]:
def query_to_dataframe(query, params=None):
    """Run a SQL query through psycopg2 and return the rows as a pandas DataFrame.

    Connection settings are read from the environment variables ``host``,
    ``database``, ``user``, ``password`` and ``port``.

    Parameters
    ----------
    query : str
        SQL text to execute. Prefer ``%s`` / ``%(name)s`` placeholders combined
        with ``params`` over formatting values into the string yourself.
    params : sequence or mapping, optional
        Values bound to the placeholders in ``query``; forwarded unchanged to
        ``cursor.execute``. The default ``None`` executes ``query`` as-is.

    Returns
    -------
    pandas.DataFrame or None
        One column per result column, named after ``cursor.description``.
        ``None`` when any step fails (missing environment variable, connection
        error, SQL error, ...); the error is printed rather than raised, so
        callers should check for ``None`` before using the result.
    """
    connection = None
    cursor = None

    try:
        connection = psycopg2.connect(
            host=os.environ['host'],
            database=os.environ['database'],
            user=os.environ['user'],
            password=os.environ['password'],
            port=os.environ['port']
        )

        cursor = connection.cursor()
        # Let the driver bind the values instead of interpolating them into the
        # SQL text; with params=None this is the same call as execute(query).
        cursor.execute(query, params)

        # The first item of each description entry is the column name.
        col_names = [desc[0] for desc in cursor.description]
        return pd.DataFrame(cursor.fetchall(), columns=col_names)

    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"An unexpected error occurred: {e}")
        return None

    finally:
        # Runs on both the success and the failure path, so the cursor and the
        # connection are always released.
        if cursor:
            cursor.close()
        if connection:
            connection.close()

In [87]:
# Pull each user's profile features from ds.features_data together with their
# equity rows from ds.equity_value_data (one output row per user/equity row pair).
# The LEFT JOIN keeps users that have no equity rows, so user_id is taken from
# the features side (f): e.user_id would be NULL for those unmatched users.
sql_query = '''
    SELECT
    f.user_id,
    risk_tolerance,
    investment_experience,
    time_horizon,
    platform,
    time_spent,
    first_deposit_amount,
    instrument_type_first_traded,
    e.close_equity,
    e.timestamp
    FROM ds.features_data f
    LEFT JOIN ds.equity_value_data e on f.user_id=e.user_id;
'''

# query_to_dataframe prints the error and returns None on failure, in which
# case nothing is displayed below.
df = query_to_dataframe(sql_query)
df

Unnamed: 0,user_id,risk_tolerance,investment_experience,time_horizon,platform,time_spent,first_deposit_amount,instrument_type_first_traded,close_equity,timestamp
0,44b859254eafb23c246a93275f4bff62,high_risk_tolerance,good_investment_exp,short_time_horizon,iOS,0.0,6000.0,stock,11926.722,2017-01-09T00:00:00Z
1,44b859254eafb23c246a93275f4bff62,high_risk_tolerance,good_investment_exp,short_time_horizon,iOS,0.0,6000.0,stock,11849.876,2017-01-10T00:00:00Z
2,44b859254eafb23c246a93275f4bff62,high_risk_tolerance,good_investment_exp,short_time_horizon,iOS,0.0,6000.0,stock,12185.460,2017-01-11T00:00:00Z
3,44b859254eafb23c246a93275f4bff62,high_risk_tolerance,good_investment_exp,short_time_horizon,iOS,0.0,6000.0,stock,12313.460,2017-01-12T00:00:00Z
4,44b859254eafb23c246a93275f4bff62,high_risk_tolerance,good_investment_exp,short_time_horizon,iOS,0.0,6000.0,stock,13736.420,2017-01-13T00:00:00Z
...,...,...,...,...,...,...,...,...,...,...
2099153,44b859254eafb23c246a93275f4bff62,high_risk_tolerance,good_investment_exp,short_time_horizon,iOS,0.0,6000.0,stock,11371.120,2016-12-30T00:00:00Z
2099154,44b859254eafb23c246a93275f4bff62,high_risk_tolerance,good_investment_exp,short_time_horizon,iOS,0.0,6000.0,stock,11433.540,2017-01-03T00:00:00Z
2099155,44b859254eafb23c246a93275f4bff62,high_risk_tolerance,good_investment_exp,short_time_horizon,iOS,0.0,6000.0,stock,11458.572,2017-01-04T00:00:00Z
2099156,44b859254eafb23c246a93275f4bff62,high_risk_tolerance,good_investment_exp,short_time_horizon,iOS,0.0,6000.0,stock,12469.776,2017-01-05T00:00:00Z
