## Dependencies

In [274]:
import os
import sys

%load_ext autoreload
%autoreload 2

# Put the parent of the current working directory on sys.path so that
# modules living one level above the notebook folder can be imported.
parent_dir = os.path.dirname(os.getcwd())

already_registered = parent_dir in sys.path
if already_registered:
    print("Parent dir already exists in sys path")
else:
    sys.path.append(parent_dir)
    print("Parent dir has been added to sys path.")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Parent dir already exists in sys path


In [275]:
from classes import InvestingComScraper
import pandas as pd
import feedparser
import re
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AdamW
import torch
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from datasets import Dataset
from datasets import concatenate_datasets
from torch.utils.data import DataLoader, WeightedRandomSampler
from torch.utils.data import Dataset
from transformers import DataCollatorWithPadding

In [276]:
# Print the runtime environment (library versions, CUDA status, install
# location) followed by the full PyTorch build configuration.
environment_report = (
    ("Torch version", torch.__version__),
    ("CUDA : ", torch.version.cuda),
    ("Cuda available : ", torch.cuda.is_available()),
    ("Torch location : ", torch.__file__),
    ("Numpy : ", np.__version__),
)
for caption, value in environment_report:
    print(caption, value)
print(torch.__config__.show())

Torch version 2.1.2+cpu
CUDA :  None
Cuda available :  False
Torch location :  d:\CodingHenry\nlp_major_move_id\venv\Lib\site-packages\torch\__init__.py
Numpy :  1.26.4
PyTorch built with:
  - C++ Version: 199711
  - MSVC 192930151
  - Intel(R) Math Kernel Library Version 2020.0.2 Product Build 20200624 for Intel(R) 64 architecture applications
  - Intel(R) MKL-DNN v3.1.1 (Git Hash 64f6bcbcbab628e96f33a62c3e975f8535a7bde4)
  - OpenMP 2019
  - LAPACK is enabled (usually provided by MKL)
  - CPU capability usage: AVX2
  - Build settings: BLAS_INFO=mkl, BUILD_TYPE=Release, CXX_COMPILER=C:/actions-runner/_work/pytorch/pytorch/builder/windows/tmp_bin/sccache-cl.exe, CXX_FLAGS=/DWIN32 /D_WINDOWS /GR /EHsc /bigobj /FS -DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO -DLIBKINETO_NOCUPTI -DLIBKINETO_NOROCTRACER -DUSE_FBGEMM -DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE /utf-8 /wd4624 /wd4068 /wd4067 /wd4267 /wd4661 /wd4717 /wd4244 /wd4804 /wd4273, LAPACK_INFO=mkl, PERF_WITH_AVX=1, PERF_WITH_AVX2=1, PE

## Data Preprocessing

In [292]:
# Load the partially labeled headlines and normalize the schema to the
# `text` / `labels` column names used by the rest of the notebook.
raw_news = pd.read_csv("../data/raw/stock_news_partial_labeled.csv")
first_column = raw_news.columns[0]  # presumably a saved index column - TODO confirm
df = (
    raw_news
    .drop(columns=first_column)
    .rename(columns={"headline": "text", "Importance": "labels"})
)
# Strip stray whitespace around label strings; missing labels stay missing.
df["labels"] = df["labels"].str.strip()

In [293]:
# Partition the rows by whether a label is present; copies are taken so the
# two frames can be modified independently of `df`.
has_label = df["labels"].notna()
labeled_df = df.loc[has_label].copy()
unlabeled_df = df.loc[~has_label].copy()

In [294]:
# Learn the string -> integer class mapping from the labeled rows, then
# replace the string labels with their integer ids.
label_encoder = LabelEncoder()
label_encoder.fit(labeled_df["labels"])
labeled_df["labels"] = label_encoder.transform(labeled_df["labels"])

In [295]:
# Hold out 20% of the labeled rows for evaluation. The split is stratified on
# the label so both parts keep the same class proportions, and seeded so it
# is repeatable.
split_options = dict(
    test_size=0.2,
    random_state=42,
    stratify=labeled_df["labels"],
)
train_df, test_df = train_test_split(labeled_df, **split_options)

In [296]:
def clean_text(text):
    """Collapse all whitespace in a headline to single spaces.

    Newlines, tabs and runs of spaces become one space, and leading/trailing
    whitespace is removed.

    Args:
        text: The headline. Normally a ``str``; missing values (``None`` or a
            pandas/NumPy NaN) and other scalars are tolerated so that
            ``Series.apply(clean_text)`` does not fail on an incomplete CSV.

    Returns:
        The normalized string. Missing values yield ``""``; other non-string
        scalars are converted with ``str()`` before normalization.
    """
    if not isinstance(text, str):
        # pandas represents a missing headline as a float NaN, which has no
        # string methods; map it to an empty string instead of raising.
        if text is None or pd.isna(text):
            return ""
        text = str(text)
    # str.split() with no argument splits on any whitespace (including "\n")
    # and drops empty pieces, so no separate newline replacement is needed.
    return " ".join(text.split())

In [297]:
# Normalize whitespace in the headline column of every split.
for frame in (train_df, test_df, unlabeled_df):
    frame["text"] = frame["text"].apply(clean_text)

In [298]:
# Sanity-check the label encoding: show which ids occur in the training
# split, how often, and which class name each id stands for.
print("=== Checking Label Encoding ===")
encoded_train_labels = train_df["labels"]
print("Unique labels in train_df:", encoded_train_labels.unique())
print("Label counts:", encoded_train_labels.value_counts().sort_index())

known_classes = label_encoder.classes_
print("Label encoder classes:", known_classes)
print("Label mapping:", dict(zip(known_classes, range(len(known_classes)))))

# Show one training row so its encoded label can be compared against its text
sample_row = train_df.iloc[0]
print(f"\nSample check - Text: '{sample_row['text'][:50]}...', Label: {sample_row['labels']}")

=== Checking Label Encoding ===
Unique labels in train_df: [0 2 1]
Label counts: labels
0     95
1    218
2     86
Name: count, dtype: int64
Label encoder classes: ['major' 'minor' 'neutral']
Label mapping: {'major': 0, 'minor': 1, 'neutral': 2}

Sample check - Text: '3 Top Growth Stocks That Are a Fantastic Long-Term...', Label: 0


In [299]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The tokenizer must come from the same checkpoint as the model that is
# fine-tuned later in this notebook ("yiyanghkust/finbert-tone"). Token ids
# produced with the "bert-base-uncased" vocabulary would not correspond to
# that model's embedding table.
TOKENIZER_CHECKPOINT = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)



In [300]:
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: Frame with a ``text`` column and, when ``labeled`` is true, a
            ``labels`` column. Its index is reset to 0..n-1 on construction.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=..., return_tensors="pt")`` that
            returns a mapping with ``input_ids`` and ``attention_mask``.
        labeled: Whether each item should carry a ``labels`` tensor.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, df, tokenizer, labeled=True, max_length=128):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.labeled = labeled
        self.max_length = max_length

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return the tokenized row at position ``idx`` as a dict of tensors."""
        row = self.df.iloc[idx]

        tokenized = self.tokenizer(
            row["text"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # squeeze(0) removes the leading batch dimension so that the default
        # collate function can stack items into a batch again.
        item = {
            field: tokenized[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }

        if not self.labeled:
            return item

        item["labels"] = torch.tensor(row["labels"], dtype=torch.long)
        return item

In [301]:
# Wrap each split in a NewsDataset; only the labeled splits emit a "labels" tensor.
train_dataset = NewsDataset(df=train_df, tokenizer=tokenizer, labeled=True)
test_dataset = NewsDataset(df=test_df, tokenizer=tokenizer, labeled=True)
unlabeled_dataset = NewsDataset(df=unlabeled_df, tokenizer=tokenizer, labeled=False)

In [302]:
# Inverse-frequency class weights: every sample of a class gets weight
# 1 / (number of samples in that class), so each class has the same total
# probability mass when the sampler draws with replacement.
labels_np = train_df["labels"].values
class_counts = np.bincount(labels_np)
class_weights = 1.0 / class_counts

print("Class counts:", class_counts)
print("Class weights:", class_weights)

sample_weights = class_weights[labels_np]

# Create weighted sampler.
# A dedicated, seeded generator makes the order of drawn samples repeatable
# across runs, in line with the seeded train/test split (random_state=42).
sampler_generator = torch.Generator().manual_seed(42)
sampler = WeightedRandomSampler(
    weights=torch.as_tensor(sample_weights, dtype=torch.double),
    num_samples=len(sample_weights),
    replacement=True,
    generator=sampler_generator,
)

# Create DataLoaders
# `sampler` and `shuffle` are mutually exclusive in DataLoader, so the
# training loader relies on the sampler alone for ordering.
labeled_train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    sampler=sampler
)

labeled_test_loader = DataLoader(
    test_dataset,
    batch_size=16,
    shuffle=False
)

# Kept unshuffled: generate_pseudo_labels maps predictions back to rows of
# unlabeled_df by running position.
unlabeled_loader = DataLoader(
    unlabeled_dataset,
    batch_size=128,
    shuffle=False
)

Class counts: [ 95 218  86]
Class weights: [0.01052632 0.00458716 0.01162791]


In [303]:
# Pull a single batch from the training loader and show its structure.
print("\n=== Testing DataLoader ===")
batch = next(iter(labeled_train_loader))
print("Train batch keys:", batch.keys())
for field in ("input_ids", "attention_mask", "labels"):
    print(f"{field} shape:", batch[field].shape)
print("Sample labels:", batch["labels"][:5])


=== Testing DataLoader ===
Train batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids shape: torch.Size([32, 128])
attention_mask shape: torch.Size([32, 128])
labels shape: torch.Size([32])
Sample labels: tensor([0, 2, 2, 1, 2])


### Pseudo-Labeling by Fine-Tuning FinBERT

In [304]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, WeightedRandomSampler, ConcatDataset
from transformers import AutoModelForSequenceClassification, AdamW
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np
import pandas as pd

from transformers import AutoConfig

FINBERT_CHECKPOINT = "yiyanghkust/finbert-tone"

config = AutoConfig.from_pretrained(FINBERT_CHECKPOINT)

print("Original FinBERT labels:")
print(f"  num_labels: {config.num_labels}")
print(f"  id2label: {config.id2label}")
print(f"  label2id: {config.label2id}")

# Derive the head size and the label names from the fitted encoder instead of
# hard-coding them. The checkpoint's own names describe sentiment classes,
# which are not the classes this notebook predicts, so they are overridden
# in the model config.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(id2label)

model = AutoModelForSequenceClassification.from_pretrained(
    FINBERT_CHECKPOINT,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

print("\nYour initialized model:")
print(f"  num_labels: {model.num_labels}")
print(f"  classifier output size: {model.classifier.out_features}")
print(f"  Expected: 3 (for your major/minor/neutral classes)")

# Freeze the whole network, then re-enable gradients for the classification
# head only, so that fine-tuning updates just the final layer.
for param in model.parameters():
    param.requires_grad = False

for param in model.classifier.parameters():
    param.requires_grad = True

# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the deprecated class's defaults because
# PyTorch's own defaults differ (eps=1e-8, weight_decay=0.01).
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=2e-5, eps=1e-6, weight_decay=0.0)

Original FinBERT labels:
  num_labels: 3
  id2label: {0: 'Neutral', 1: 'Positive', 2: 'Negative'}
  label2id: {'Positive': 1, 'Negative': 2, 'Neutral': 0}

Your initialized model:
  num_labels: 3
  classifier output size: 3
  Expected: 3 (for your major/minor/neutral classes)




In [305]:
def train_per_epoch(model, loader, optimizer):
    """Run one optimization pass over ``loader``.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors).
        optimizer: Optimizer stepped once per batch.

    Returns:
        ``(train_loss, train_acc)``: the mean loss per sample over the epoch
        (each batch loss weighted by its batch size, the same convention as
        ``evaluate``) and the accuracy of the predictions made during the pass.
    """
    model.train()
    total_loss = 0.0
    seen = 0
    all_preds = []
    all_labels = []

    progress = tqdm(loader, desc="Training", leave=True)
    for batch in progress:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device).long()

        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Weight by batch size so that a smaller final batch does not count
        # as much as a full one in the epoch average.
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        seen += batch_size

        preds = outputs.logits.argmax(dim=-1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

        # Use our own sample counter for the running average: the progress
        # bar's `.n` is not guaranteed to advance on every iteration.
        progress.set_postfix({"loss": total_loss / seen})

    train_loss = total_loss / seen
    train_acc = accuracy_score(all_labels, all_preds)
    return train_loss, train_acc

def evaluate(model, loader):
    """Score ``model`` on ``loader`` without updating any weights.

    Prints a per-class classification report as a side effect.

    Returns:
        ``(eval_loss, acc, f1)``: the mean loss per sample, the accuracy and
        the weighted-average F1 score over all batches.
    """
    model.eval()
    loss_sum = 0
    n_samples = 0
    predicted = []
    expected = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating", leave=True):
            labels = batch["labels"].to(device).long()
            outputs = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                labels=labels,
            )

            # outputs.loss is a batch mean; scale it back to a sum so that the
            # final division produces a per-sample average.
            n_in_batch = labels.size(0)
            loss_sum += outputs.loss.item() * n_in_batch
            n_samples += n_in_batch

            predicted.extend(outputs.logits.argmax(dim=-1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    eval_loss = loss_sum / n_samples
    acc = accuracy_score(expected, predicted)
    f1 = f1_score(expected, predicted, average="weighted")
    print(classification_report(expected, predicted, digits=4))
    return eval_loss, acc, f1

def train(model, optimizer, train_loader, test_loader, epochs, train_accs, eval_accs, train_losses, eval_losses):
    """Train for ``epochs`` epochs, evaluating on ``test_loader`` after each.

    The four history lists are extended in place with one entry per epoch;
    nothing is returned.
    """
    for epoch_number in range(1, epochs + 1):
        print(f"Epoch {epoch_number}/{epochs}")

        epoch_loss, epoch_acc = train_per_epoch(model=model, loader=train_loader, optimizer=optimizer)
        print(f"Train Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.4f}")

        # The F1 score returned by evaluate() is not tracked here.
        held_out_loss, held_out_acc, _ = evaluate(model, test_loader)
        print(f"Validation Loss: {held_out_loss:.4f}, Validation Acc: {held_out_acc:.4f}\n")

        for history, value in (
            (train_accs, epoch_acc),
            (train_losses, epoch_loss),
            (eval_accs, held_out_acc),
            (eval_losses, held_out_loss),
        ):
            history.append(value)

def generate_pseudo_labels(model, loader, original_df, threshold=0.90):
    """
    Predict labels for unlabeled batches and keep only the confident ones.

    Args:
        model: trained model; called with input_ids and attention_mask
        loader: DataLoader for unlabeled data; it must iterate in a fixed
            (unshuffled) order because results are reported by position
        original_df: DataFrame the loader was built from (not read here)
        threshold: minimum softmax probability of the predicted class for a
            sample to be kept

    Returns:
        (pseudo_indices, pseudo_labels): positions of the kept samples in
        loader order and their predicted class ids, both as lists of ints.
    """
    model.eval()
    pseudo_indices, pseudo_labels = [], []
    offset = 0  # position of the current batch's first sample in the loader

    with torch.no_grad():
        for batch in tqdm(loader, desc="Generating pseudo-labels"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
            confidence, preds = F.softmax(logits, dim=-1).max(dim=-1)

            # Positions inside the batch whose top probability clears the
            # threshold, in ascending order.
            keep = torch.nonzero(confidence >= threshold).flatten()
            pseudo_indices.extend((keep + offset).tolist())
            pseudo_labels.extend(preds[keep].tolist())

            offset += input_ids.size(0)

    return pseudo_indices, pseudo_labels

def pseudo_labeling(model, optimizer, labeled_dataset, unlabeled_df, unlabeled_loader, 
                   test_loader, epochs, train_acc, eval_acc, train_loss, eval_loss, 
                   current_train_loader, threshold=0.90):
    """
    Run one pseudo-labeling round: train, then label confident unlabeled rows.

    The model is trained on ``current_train_loader`` for ``epochs`` epochs
    (the four history lists are extended in place), after which predictions
    on ``unlabeled_loader`` with confidence >= ``threshold`` are turned into
    labels for the matching rows of ``unlabeled_df``.

    Returns:
        Two different shapes, which callers must tell apart:
        - ``(combined_dataset, pseudo_df)`` when at least one pseudo-label was
          produced; ``combined_dataset`` is ``labeled_dataset`` followed by
          the newly labeled rows, ``pseudo_df`` holds those rows.
        - ``labeled_dataset`` alone (not a tuple) when none was produced.
    """
    print("\n=== Training on labeled data ===")
    train(
        model=model,
        optimizer=optimizer,
        train_loader=current_train_loader,
        test_loader=test_loader,
        epochs=epochs,
        train_accs=train_acc,
        eval_accs=eval_acc,
        train_losses=train_loss,
        eval_losses=eval_loss,
    )

    print("\n=== Generating pseudo labels ===")
    pseudo_indices, pseudo_labels = generate_pseudo_labels(
        model=model,
        loader=unlabeled_loader,
        original_df=unlabeled_df,
        threshold=threshold,
    )

    print(f"Generated {len(pseudo_labels)} pseudo-labels from {len(unlabeled_df)} unlabeled samples")

    if not pseudo_labels:
        print("No pseudo-labels generated. Returning original dataset.")
        return labeled_dataset

    # pseudo_indices are positions in loader order, hence positional selection.
    pseudo_df = unlabeled_df.iloc[pseudo_indices].assign(labels=pseudo_labels)

    # Uses the notebook-level `tokenizer`.
    pseudo_dataset = NewsDataset(pseudo_df, tokenizer, labeled=True)
    combined_dataset = ConcatDataset([labeled_dataset, pseudo_dataset])

    return combined_dataset, pseudo_df

In [None]:
# Tunable settings for the pseudo-labeling procedure.
NUM_ROUNDS = 3
EPOCHS_PER_ROUND = 5
CONFIDENCE_THRESHOLD = 0.90

train_acc_history = []
train_loss_history = []
eval_acc_history = []
eval_loss_history = []

combined_dataset = train_dataset
current_train_loader = labeled_train_loader

# Pool of rows that have not received a pseudo-label yet. It shrinks after
# every round so that a row is never added to the training set twice.
remaining_unlabeled_df = unlabeled_df
remaining_unlabeled_loader = unlabeled_loader

all_pseudo_labeled_dfs = []

for round_num in range(NUM_ROUNDS):
    print(f"\n{'='*50}")
    print(f"PSEUDO-LABELING ROUND {round_num + 1}/{NUM_ROUNDS}")
    print(f"{'='*50}")
    
    # Perform pseudo-labeling
    result = pseudo_labeling(
        model=model,
        optimizer=optimizer,
        labeled_dataset=combined_dataset,
        unlabeled_df=remaining_unlabeled_df,
        unlabeled_loader=remaining_unlabeled_loader,
        test_loader=labeled_test_loader,
        epochs=EPOCHS_PER_ROUND,
        train_acc=train_acc_history,
        train_loss=train_loss_history,
        eval_acc=eval_acc_history,
        eval_loss=eval_loss_history,
        current_train_loader=current_train_loader,
        threshold=CONFIDENCE_THRESHOLD
    )
    
    # pseudo_labeling returns a (dataset, frame) tuple when it produced new
    # labels and the unchanged dataset otherwise.
    if isinstance(result, tuple):
        combined_dataset, pseudo_df = result
        all_pseudo_labeled_dfs.append(pseudo_df)
    else:
        combined_dataset = result
        print("No new pseudo-labels added.")
        break
    
    # Remove the rows that were just labeled from the pool and rebuild the
    # loader. pseudo_df keeps the index labels of the frame it was selected
    # from, so dropping by index removes exactly those rows (assumes the
    # index of unlabeled_df is unique, as with a default read_csv index).
    remaining_unlabeled_df = remaining_unlabeled_df.drop(index=pseudo_df.index)
    remaining_unlabeled_loader = DataLoader(
        NewsDataset(remaining_unlabeled_df, tokenizer, labeled=False),
        batch_size=128,
        shuffle=False
    )
    
    # Rebuild the per-sample weights. The row order of combined_df (train
    # rows, then each round's pseudo-labeled rows) mirrors the item order of
    # combined_dataset, so weight i belongs to dataset item i.
    combined_df_list = [train_df] + all_pseudo_labeled_dfs
    combined_df = pd.concat(combined_df_list, ignore_index=True)
    
    combined_labels_np = combined_df["labels"].values
    class_counts = np.bincount(combined_labels_np)
    class_weights = 1.0 / class_counts
    sample_weights = class_weights[combined_labels_np]
    
    print(f"\nCombined dataset size: {len(combined_df)}")
    print(f"Class distribution: {class_counts}")
    
    sampler = WeightedRandomSampler(
        weights=torch.DoubleTensor(sample_weights),
        num_samples=len(sample_weights),
        replacement=True
    )
    
    current_train_loader = DataLoader(
        combined_dataset,
        batch_size=32,
        sampler=sampler
    )

print("\n=== Pseudo-Labeling Complete ===")
print(f"Final training set size: {len(combined_dataset)}")


PSEUDO-LABELING ROUND 1/3

=== Training on labeled data ===
Epoch 1/5


Training:   0%|          | 0/13 [00:01<?, ?it/s]


KeyboardInterrupt: 