In [None]:
!pip install openai pandas python-dotenv
!pip install xgboost lightgbm
!pip install optuna
!pip install optuna-integration[lightgbm]

In [None]:
# --- Import libraries ---
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

import xgboost as xgb
import lightgbm as lgb

import openai
from dotenv import load_dotenv
import os
import datetime
import logging
from google.colab import drive

# Hyperparameter tuning
import optuna
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from optuna.integration import LightGBMPruningCallback
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [None]:
# Mount Google Drive so the dataset CSV can be read and written
# (Colab shows an interactive authorization prompt).
drive.mount('/content/drive')

# Location of the synthetic dataset on the mounted drive.
FILE_PATH = '/content/drive/MyDrive/fintechSoc_synthetic_data.csv'

# When True, new samples are generated even if data already exists (slower).
add_more_data = False

# Holds the loaded / generated DataFrame; filled in by the cells below.
GLOBAL_DF = None

# --- Load environment variables from a .env file (if one is present) ---
load_dotenv()

# SECURITY: the API key must come from the environment only. Never embed a key
# in the notebook: anything committed or shared here is effectively public.
# Any key that was previously hard-coded in this cell should be treated as
# compromised and revoked in the OpenAI dashboard.
api_key = os.getenv("OPENAI_API_KEY")

# Fail fast on a missing *or empty* key instead of sending unauthenticated requests.
if not api_key:
    raise ValueError("API key not found. Make sure OPENAI_API_KEY is defined in your .env file.")

# Initialize OpenAI client (adjust according to the API version if needed)
client = openai.OpenAI(api_key=api_key)

# Route INFO-and-above log records to an append-mode log file,
# each line stamped with time and severity.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='synthetic_data_generation.log',
    filemode='a',
)

# Announce start-up both in the log file and in the cell output.
_startup_message = "Started synthetic data generation script"
logging.info(_startup_message)
print(_startup_message)

# Chat model used for every generation request.
MODEL = 'gpt-3.5-turbo'

# Number of samples to generate for each stage
samples_per_stage = 200

# Prompts for stages 0-4, in stage order.
_base_stage_prompts = [
    "Explain a complex financial concept to a beginner.",
    "Provide a quiz question related to investments.",
    "Describe risk diversification in a simple way.",
    "Encourage students to ask financial questions.",
    "Adjust the lesson plan based on the student's understanding.",
]

# Stage 5 recaps the earlier prompts (trailing periods dropped, comma-joined)
# and then asks for a new, related topic.
_previous_topics = ", ".join(text.rstrip(".") for text in _base_stage_prompts)

# Mapping of stage number -> prompt sent to the model.
stage_prompts = dict(enumerate(_base_stage_prompts))
stage_prompts[5] = (
    f"Previous topics: {_previous_topics}. "
    "Based on the previous questions, come up with a different and similar financial topic."
)

In [None]:
# Reuse a previously saved dataset when one exists; otherwise start from an
# empty frame so the generation cell below knows it has work to do.
if not os.path.isfile(FILE_PATH):
    print("No existing CSV – starting fresh.")
    GLOBAL_DF = pd.DataFrame()  # empty placeholder
else:
    print("Found existing CSV – loading it …")
    GLOBAL_DF = pd.read_csv(FILE_PATH)
    loaded_rows = len(GLOBAL_DF)
    print(f"loaded successfully, with {loaded_rows} rows of data.")

## Data Generation: Logic for generating new synthetic data (normally only run when no data exists yet)

In [None]:
def generate_synthetic_data():
    """
    Generate one model response per (stage, sample) pair and append the
    results to the global DataFrame GLOBAL_DF.

    For every prompt in ``stage_prompts`` the chat model ``MODEL`` is queried
    ``samples_per_stage`` times. A failed request is logged and skipped, so the
    run is best-effort. Successful rows are collected in a plain list and
    merged into GLOBAL_DF in a single operation, which avoids the quadratic
    cost of re-copying the frame on every iteration.

    Side effects:
        Rebinds the module-level GLOBAL_DF. Rows gathered so far are still
        merged if the loop is interrupted (e.g. KeyboardInterrupt).
    """
    global GLOBAL_DF

    columns = [
        "stage", "sample", "prompt",
        "generated_response", "model_used", "timestamp"
    ]
    records = []

    try:
        for stage, prompt in stage_prompts.items():
            for sample_number in range(1, samples_per_stage + 1):
                logging.info(f"Generating sample {sample_number} for Stage {stage}…")
                print(f"Generating sample {sample_number} for Stage {stage}…")

                try:
                    response = client.chat.completions.create(
                        model=MODEL,
                        messages=[{"role": "system", "content": prompt}],
                    )
                    chatgpt_output = response.choices[0].message.content
                except Exception as e:
                    # Best-effort: record the failure and move on to the next sample.
                    logging.error(f"Error generating sample {sample_number} for stage {stage}: {e}")
                    print(f"Error generating sample {sample_number} for stage {stage}: {e}")
                    continue

                records.append({
                    "stage": stage,
                    "sample": sample_number,
                    "prompt": prompt,
                    "generated_response": chatgpt_output,
                    "model_used": MODEL,
                    "timestamp": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                })
    finally:
        # Merge once, even on interruption, so already-paid-for responses are kept.
        if records:
            new_rows = pd.DataFrame(records, columns=columns)
            if GLOBAL_DF is None or GLOBAL_DF.empty:
                # Nothing to preserve; avoids concatenating onto an empty frame.
                GLOBAL_DF = new_rows
            else:
                GLOBAL_DF = pd.concat([GLOBAL_DF, new_rows], ignore_index=True)
        elif GLOBAL_DF is None:
            # Keep the documented schema available even when nothing was generated.
            GLOBAL_DF = pd.DataFrame(columns=columns)

    logging.info("Data generation complete — GLOBAL_DF updated in place.")
    print("Data generation complete — GLOBAL_DF updated in place.")

## Data Saving: Generates new data only if `GLOBAL_DF` is empty, then saves it to the CSV. To generate additional data on top of an existing dataset, set `add_more_data = True` in the configuration cell.

In [None]:
# Generate data when nothing has been loaded yet, or when extra data was
# explicitly requested via `add_more_data`.
if GLOBAL_DF is None or GLOBAL_DF.empty or add_more_data:
    print("Generating fresh synthetic data…")

    # Results are accumulated in the module-level GLOBAL_DF.
    generate_synthetic_data()

    if GLOBAL_DF is None or GLOBAL_DF.empty:
        # Every request failed. Writing an empty, column-less CSV here would
        # make the next run's `pd.read_csv` raise EmptyDataError, so skip it.
        print("No rows were generated – nothing saved. Check synthetic_data_generation.log for errors.")
    else:
        # Persist the full frame in one overwrite (simpler and safer than appending).
        GLOBAL_DF.to_csv(
            FILE_PATH,
            mode='w',
            index=False,
            header=True
        )
        print(f"Successfully saved to {FILE_PATH} a total of {len(GLOBAL_DF)} rows of data")
else:
    print(f"GLOBAL_DF has {len(GLOBAL_DF)} rows – skipping generation.")

## Main logic of code

In [None]:
# Work on a cleaned view of the global frame. Rows with a missing response or
# label (possible after the CSV round trip) would crash the vectorizer or the
# stratified split, so they are dropped here without touching GLOBAL_DF.
df = GLOBAL_DF.dropna(subset=["generated_response", "stage"]).reset_index(drop=True)

# --- Feature and Label extraction ---
X_text = df["generated_response"].astype(str)
y = df["stage"].astype(int)  # the classifiers expect integer class labels

# --- Train/Test split on the RAW text ---
# Splitting before vectorizing keeps test documents out of the TF-IDF
# vocabulary and IDF statistics (no train/test leakage).
X_text_train, X_text_test, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

# --- Convert text to numerical features using TF-IDF (fitted on train only) ---
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(X_text_train)
X_test = vectorizer.transform(X_text_test)

# Full feature matrix in the same (train-fitted) feature space, kept for the
# cells further down that reference `X` directly.
X = vectorizer.transform(X_text)


In [None]:
def train_and_evaluate_models(hyperparams_xgb=None, hyperparams_lgb=None):
    """
    Train XGBoost and LightGBM classifiers and evaluate them on the hold-out split.

    The models are fitted on the notebook-level ``X_train`` / ``y_train`` and
    scored on ``X_test`` / ``y_test`` (the TF-IDF split built in an earlier
    cell). For each model a top-10 feature-importance bar chart is shown and
    the accuracy and classification report are printed.

    Parameters:
        hyperparams_xgb (dict | None): Keyword arguments for ``xgb.XGBClassifier``.
            Defaults to ``{"eval_metric": "mlogloss", "random_state": 42}``.
        hyperparams_lgb (dict | None): Keyword arguments for ``lgb.LGBMClassifier``.
            Defaults to ``{"random_state": 42}``.

    Returns:
        dict: ``{"XGBoost": {...}, "LightGBM": {...}}`` where each value holds
        ``"accuracy"`` (float) and ``"report"`` (classification report as a dict).
    """
    # --- Set default hyperparameters if none are provided ---
    if hyperparams_xgb is None:
        # `use_label_encoder` is intentionally omitted: XGBoost deprecated it in
        # 1.6 and later removed it, so passing it only triggers warnings.
        hyperparams_xgb = {"eval_metric": "mlogloss", "random_state": 42}
    if hyperparams_lgb is None:
        hyperparams_lgb = {"random_state": 42}

    def plot_feature_importance(model, model_name, top_n=10):
        """Show a horizontal bar chart of the model's `top_n` most important TF-IDF terms."""
        ranked = (
            pd.DataFrame({
                "feature": vectorizer.get_feature_names_out(),
                "importance": model.feature_importances_,
            })
            .sort_values(by="importance", ascending=False)
            .head(top_n)
        )

        plt.figure(figsize=(8, 5))
        plt.barh(ranked["feature"], ranked["importance"])
        plt.xlabel("Importance")
        plt.title(f"Top {top_n} Features - {model_name}")
        plt.gca().invert_yaxis()  # most important term at the top
        plt.tight_layout()
        plt.show()

    # Insertion order (XGBoost, then LightGBM) drives training, plotting and printing.
    models = {
        "XGBoost": xgb.XGBClassifier(**hyperparams_xgb),
        "LightGBM": lgb.LGBMClassifier(**hyperparams_lgb),
    }

    # --- Train both models and predict on the test split ---
    predictions = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        predictions[model_name] = model.predict(X_test)

    # --- Plot top TF-IDF feature importances for both models ---
    for model_name, model in models.items():
        plot_feature_importance(model, model_name)

    # --- Evaluate, print, and collect metrics ---
    metrics = {}
    for model_name, preds in predictions.items():
        accuracy = accuracy_score(y_test, preds)

        print(f"\n=== {model_name} Evaluation ===")
        print("Accuracy: {:.4f}".format(accuracy))
        print(classification_report(y_test, preds))

        metrics[model_name] = {
            "accuracy": accuracy,
            "report": classification_report(y_test, preds, output_dict=True),
        }

    return metrics

def compare_models(hyperparams_xgb=None, hyperparams_lgb=None):
    """
    Train/evaluate both models via train_and_evaluate_models and print which
    one achieved the higher accuracy.

    Parameters:
        hyperparams_xgb (dict): Hyperparameters forwarded for the XGBoost model.
        hyperparams_lgb (dict): Hyperparameters forwarded for the LightGBM model.

    Returns:
        dict: The metrics dictionary produced by train_and_evaluate_models,
        returned unchanged.
    """
    results = train_and_evaluate_models(hyperparams_xgb, hyperparams_lgb)

    acc_xgb = results["XGBoost"]["accuracy"]
    acc_lgb = results["LightGBM"]["accuracy"]

    print("\n=== Model Comparison ===")
    print(f"XGBoost Accuracy: {acc_xgb:.4f}")
    print(f"LightGBM Accuracy: {acc_lgb:.4f}")

    # Pick the verdict line; a tie falls through to the last branch.
    if acc_lgb > acc_xgb:
        verdict = "LightGBM performs better based on accuracy."
    elif acc_xgb > acc_lgb:
        verdict = "XGBoost performs better based on accuracy."
    else:
        verdict = "Both models perform equally based on accuracy."
    print(verdict)

    return results

In [None]:
# Run the comparison with default hyperparameters. This trains both models,
# shows their feature-importance plots, prints the evaluation reports and the
# accuracy comparison; the returned metrics dict is displayed as the cell output.
compare_models()

# Alternatively, to experiment with different hyperparameters, pass one dict
# per model (XGBoost first, LightGBM second):
# xgb_params = {"use_label_encoder": False, "eval_metric": "mlogloss", "random_state": 42, "max_depth": 5}
# lgb_params = {"random_state": 42, "num_leaves": 31}
# compare_models(xgb_params, lgb_params)

## Below are the functions used for hyperparameter tuning

In [None]:
def objective_lgb(trial):
    """
    Optuna objective for LightGBM: returns the macro one-vs-rest ROC AUC on a
    validation split (higher is better, so use ``direction="maximize"``).

    The validation data is carved out of the notebook-level training split
    (``X_train`` / ``y_train``) so the hold-out test set is never used for
    hyperparameter selection.

    Parameters:
        trial (optuna.trial.Trial): Trial used to sample hyperparameters and
            to report intermediate scores for pruning.

    Returns:
        float: Macro-averaged one-vs-rest ROC AUC on the validation split.

    Raises:
        optuna.TrialPruned: Raised by the pruning callback when the study's
            pruner decides the trial is unpromising.
    """
    # 1) Suggest hyperparameter values
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        # LightGBM ignores `subsample` unless bagging is switched on via a
        # non-zero frequency, so enable it for the sampled value to matter.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }

    # 2) Train/validation split, taken from the training portion only
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # 3) Pruning callback so unpromising trials stop early.
    #    `auc_mu` is LightGBM's multiclass AUC: it is valid for this multi-class
    #    target and is higher-is-better, matching a maximizing study (the
    #    callback rejects a metric whose direction disagrees with the study).
    monitor_metric = "auc_mu"
    pruning_cb = LightGBMPruningCallback(trial, monitor_metric)

    # 4) Fit model, monitoring a single metric so early stopping is unambiguous
    model = LGBMClassifier(**params, metric=monitor_metric, random_state=42)

    model.fit(
        X_fit, y_fit,
        eval_set=[(X_val, y_val)],
        callbacks=[
            pruning_cb,                          # reports to Optuna / may prune
            early_stopping(stopping_rounds=30),
            log_evaluation(period=10)            # logs eval metric every 10 rounds
        ]
    )

    # 5) Return validation AUC (macro one-vs-rest over all classes)
    preds = model.predict_proba(X_val)

    return roc_auc_score(
        y_val,
        preds,
        multi_class='ovr',
        average='macro'
    )

In [None]:
def objective_xgb(trial):
    """
    Optuna objective for XGBoost: mean macro one-vs-rest ROC AUC over a
    3-fold stratified cross-validation (higher is better).

    Cross-validation runs on the notebook-level training split
    (``X_train`` / ``y_train``) so the hold-out test set stays unseen during
    hyperparameter selection.

    Parameters:
        trial (optuna.trial.Trial): Trial used to sample hyperparameters.

    Returns:
        float: Mean cross-validated ``roc_auc_ovr`` score.
    """
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1e-1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'tree_method': 'hist',   # fast histogram-based split
        'random_state': 42
    }

    clf = XGBClassifier(**params)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # The plain 'roc_auc' scorer only supports binary targets; on this
    # multi-class problem every fold would fail and the mean would be NaN.
    # 'roc_auc_ovr' is the multi-class (one-vs-rest, macro-averaged) variant.
    scores = cross_val_score(
        clf, X_train, y_train, cv=cv, scoring='roc_auc_ovr', n_jobs=-1
    )
    return scores.mean()

In [None]:
# The objective returns ROC AUC, so the study maximizes it.
study = optuna.create_study(
    direction="maximize",
    # Seeded so the sequence of suggested hyperparameters is reproducible
    # across kernel restarts (the wall-clock timeout can still cut a run short).
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.MedianPruner()  # stops bad trials early
)
# Stop after 50 trials or one hour, whichever comes first.
study.optimize(objective_lgb, n_trials=50, timeout=3600)

best_trial = study.best_trial

print("Best trial:")
print("  Value: ", best_trial.value)
print("  Params: ")
for key, val in best_trial.params.items():
    print(f"    {key}: {val}")