In [1]:
# Data Manipulation and Handling
import polars as pl
import pandas as pd
import numpy as np
import psycopg2

# DB Credentials
from dotenv import load_dotenv
import os

# Machine Learning Libraries
import torch
import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Handling Imbalanced Data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

# Gradient Boosting Libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Model Lifecycle Management
import mlflow
import mlflow.sklearn

# Distributed Computing
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier as SparkRFClassifier

# Model Interpretability
import shap

# Hyperparameter Optimization
import optuna

# Automated Feature Engineering
import featuretools as ft

# Load the dotenv IPython extension and run its %dotenv magic.
# NOTE(review): presumably this reads a local .env file into os.environ so the
# host/database/user/password/port keys read by etl_query are populated - confirm.
%load_ext dotenv
%dotenv
# Prefix that read_sql puts in front of "<name>.sql". The path is built by plain
# string concatenation, so the trailing slash is required.
BASE_DIR = '../SQL/'


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
'PYARROW_IGNORE_TIMEZONE' environment variable was not set. It is required to set this environment variable to '1' in both driver and executor sides if you use pyarrow>=2.0.0. pandas-on-Spark will set it for you but it does not work if there is a Spark context already launched.


In [2]:
def etl_query(query: str) -> pd.DataFrame:
    """Run ``query`` against the configured PostgreSQL database and return the rows.

    A new connection is opened for every call using the ``host``, ``database``,
    ``user``, ``password`` and ``port`` entries of ``os.environ``; the cursor and
    the connection are always closed before the function returns or raises.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.

    Returns:
        A DataFrame with one column per entry of ``cursor.description`` and one
        row per fetched record. If the statement produces no result set
        (``cursor.description`` is ``None``), an empty DataFrame is returned.

    Raises:
        KeyError: if one of the connection settings is missing from ``os.environ``.
        Exception: anything raised while connecting or executing propagates to
            the caller instead of being printed and replaced by ``None``, so a
            failed query stops the notebook at the cell that caused it.
    """
    connection = None
    cursor = None

    try:
        connection = psycopg2.connect(
            host=os.environ['host'],
            database=os.environ['database'],
            user=os.environ['user'],
            password=os.environ['password'],
            port=os.environ['port']
        )
        cursor = connection.cursor()
        cursor.execute(query)

        # Statements without a result set leave description as None; iterating
        # it would fail with an unrelated "NoneType is not iterable" error.
        if cursor.description is None:
            return pd.DataFrame()

        # The first field of each description entry is the column name.
        col_names = [desc[0] for desc in cursor.description]
        return pd.DataFrame(cursor.fetchall(), columns=col_names)

    finally:
        # Release database resources on success and on failure alike.
        if cursor is not None:
            cursor.close()
        if connection is not None:
            connection.close()

def read_sql(name: str):
    """Execute the SQL stored in ``BASE_DIR + name + '.sql'``.

    Args:
        name: File name of the query without the ``.sql`` extension.

    Returns:
        Whatever ``etl_query`` returns for the text of that file.
    """
    sql_path = BASE_DIR + name + '.sql'

    # Read the whole statement first so the file is closed before the query runs.
    with open(sql_path) as sql_file:
        statement = sql_file.read()

    return etl_query(statement)

In [4]:
# Base names of the available query files; read_sql prepends BASE_DIR and
# appends '.sql' to build the path.
merge_sql = 'merged_data'
chatgpt_sql_4o = 'chatGPT-model-4o'
chatgpt_sql_o1 = 'chatGPT-model-o1'
chatgpt_sql_o1_q1='chatGPT-model-o1-Q1'
# Only the o1-Q1 query is executed in this cell; the other names are defined
# but not used here.
df = read_sql(chatgpt_sql_o1_q1)
# Bare last expression: the notebook renders df as the cell output.
df