# IMPORT

In [43]:
import os
import re
import json
import joblib
import asyncio
import pandas as pd
import numpy as np

from tqdm import tqdm
from transformers import pipeline

from sklearn import metrics
from collections import Counter
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.metrics import f1_score, accuracy_score


# LOAD DATA

In [2]:
data_save_dir = r"F:\UNIVERSITY\Project\Sentiment-Analysis-Airflow\Financial-Sentiment-Analysis\project_2_training\data"
train_path = os.path.join(data_save_dir, "train.csv")
val_path = os.path.join(data_save_dir, "val.csv")
test_path = os.path.join(data_save_dir, "test.csv")

In [3]:
train_df = pd.read_csv(train_path)
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)

In [4]:
X_train, y_train = train_df["title"].values, train_df["label"].values
X_val, y_val = val_df["title"].values, val_df["label"].values
X_test, y_test = test_df["title"].values, test_df["label"].values

# TRAINING WITH SCIKIT LEARN

## *Training*

In [19]:
vectorizer = TfidfVectorizer()
X_train_transform = vectorizer.fit_transform(X_train)
X_val_transform = vectorizer.transform(X_val)
X_test_transform = vectorizer.transform(X_test)

In [39]:
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5],
    'max_features': ["sqrt", "log2"],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False]
}
random_search = RandomizedSearchCV(
    RandomForestClassifier(),
    param_grid
)
random_search.fit(X_train_transform, y_train)
print(random_search.best_estimator_)

RandomForestClassifier(bootstrap=False, max_features='log2', n_estimators=200)


In [40]:
best_model = random_search.best_estimator_

In [41]:
pred = best_model.predict(X_test_transform)

In [48]:
# f1_score(y_test, pred, average="micro")
accuracy_score(y_test, pred)

0.7093821510297483

## *Save Model*

In [44]:
model_save_dir = r"F:\UNIVERSITY\Project\Sentiment-Analysis-Airflow\Financial-Sentiment-Analysis\project_2_training\save"
os.makedirs(model_save_dir, exist_ok=True)

# Save the model
model_path = os.path.join(model_save_dir, "random_forest_model.pkl")
joblib.dump(best_model, model_path)

print(f"Model saved to: {model_path}")

Model saved to: F:\UNIVERSITY\Project\Sentiment-Analysis-Airflow\Financial-Sentiment-Analysis\project_2_training\save\random_forest_model.pkl


# TRAINING WITH FINBERT

## *Training*