In [None]:
import sys
import os
import numpy as np
import pandas as pd
from pathlib import Path

In [None]:
# Locate the project root: start at the working directory and climb one level
# at a time until a directory that contains a "src" folder is found.
current_dir = os.getcwd()
project_root = current_dir
while True:
    if os.path.isdir(os.path.join(project_root, "src")):
        break
    parent_dir = os.path.dirname(project_root)
    if parent_dir == project_root:
        # dirname() handed back its own input: there is no higher level left to try.
        raise FileNotFoundError(
            "Could not find the 'src' directory. Ensure this script is within the project structure."
        )
    project_root = parent_dir

# Put the root at the front of sys.path so `src` can be imported as a package;
# the membership check keeps re-runs of this cell from adding duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# From here on the root is a Path so later cells can compose paths with `/`.
project_root = Path(project_root)

In [None]:
# All cleaned splits are resolved from the project root, never from the
# current working directory: <project_root>/data/processed/<file>.parquet
PROCESSED_DATA_DIR = project_root.joinpath("data", "processed")
TRAIN_CLEAN_PATH = PROCESSED_DATA_DIR.joinpath("clean_train_tweets.parquet")
TEST_CLEAN_PATH = PROCESSED_DATA_DIR.joinpath("clean_test_tweets.parquet")
VAL_CLEAN_PATH = PROCESSED_DATA_DIR.joinpath("clean_val_tweets.parquet")

In [None]:
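# Project-local helpers; importable because the project root was inserted into sys.path above.
# NOTE(review): 'src.model.model' is singular while the other imports use 'src.models' — confirm the module path.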
from src.models.utils import model_metrics
from src.model.model import create_lgbm_model, create_hyperparameter_search
from src.models.embeddings.emb_data import get_data_embeddings

import pandas as pd
import joblib as jb

In [None]:
# Read the cleaned train / test / validation frames from their parquet files
# (same order as before: train, then test, then validation).
train_labels, test_labels, val_labels = (
    pd.read_parquet(split_path)
    for split_path in (TRAIN_CLEAN_PATH, TEST_CLEAN_PATH, VAL_CLEAN_PATH)
)

In [None]:
# Targets: the 'Label' column of each split, taken out of the frame via `.values`.
y_train, y_test, y_val = (
    train_labels['Label'].values,
    test_labels['Label'].values,
    val_labels['Label'].values,
)

In [None]:
### Load embeddings ###
# NOTE(review): the unpacking assumes the loader returns three items ordered
# (train, test, val) — confirm against get_data_embeddings().
(
    emb_train,
    emb_test,
    emb_val,
) = get_data_embeddings()

In [None]:
# Build the base estimator, then wrap it in the project's hyperparameter search.
# The search budget is named so it is easy to spot and tune.
N_SEARCH_ITER = 15

lbgm = create_lgbm_model()
random_search = create_hyperparameter_search(
    estimator=lbgm,
    n_iter=N_SEARCH_ITER,
)

In [None]:
# Run the hyperparameter search on the training embeddings and labels.
# The fit happens inside joblib's 'threading' backend context.
with jb.parallel_backend('threading'):
    random_search.fit(emb_train, y_train)

In [None]:
# Evaluate the best estimator found by the search on the held-out splits.
best_lgb = random_search.best_estimator_

# --- test split ---
pred_test = best_lgb.predict(emb_test)
metrics_test = model_metrics(y_test, pred_test)

# --- validation split ---
pred_val = best_lgb.predict(emb_val)
metrics_val = model_metrics(y_val, pred_val)

In [None]:
# Export model
# Derive the destination from the discovered project root instead of a
# machine-specific absolute path, so the notebook works on any checkout and
# stays consistent with the other project-root-relative paths.
model_path = project_root / "src" / "models" / "predictor" / "lgbm_sentiment_predictor.joblib"

# Create the target folder if it is missing; a no-op when it already exists.
model_path.parent.mkdir(parents=True, exist_ok=True)

# Save
jb.dump(best_lgb, model_path)