Imports

In [None]:
# Data Ingestion
import feedparser
import newspaper
import pandas as pd
from datetime import datetime
import time

# Tokenization & Embeddings
import torch
import numpy as np
from langchain_community.vectorstores import FAISS  # For storing and retrieving embeddings using the FAISS library
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline

# PCA
from sklearn.decomposition import PCA

# Clustering
from sklearn.mixture import GaussianMixture
import joblib # For saving models

# Cluster Interpretation

# Identifier training
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler


# Clustifier training

ModuleNotFoundError: No module named 'langchain.text_splitter'

Data Ingestion

In [None]:
def _entry_published_iso(entry):
    """Return the entry's publication time as a naive ISO-8601 string, or None if the feed gave none."""
    parsed = getattr(entry, 'published_parsed', None) or getattr(entry, 'updated_parsed', None)
    if parsed is None:
        return None
    # Build the datetime straight from the parsed fields. Going through
    # time.mktime()/fromtimestamp() would reinterpret them in the local
    # timezone and can shift the value by an hour when DST is in effect.
    return datetime(*parsed[:6]).isoformat()


def scrape_full_articles_from_rss(rss_url, max_articles=50, source='ECB'):
    """
    Scrape an RSS feed and download the full text of every linked article.

    Parameters
    ----------
    rss_url : str
        URL of the RSS feed to parse with feedparser.
    max_articles : int, default 50
        Maximum number of feed entries to visit (entries that fail still count).
    source : str, default 'ECB'
        Label written to the 'source' column of every row, so the same
        function can be reused for other feeds.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed article with the columns
        'source', 'title', 'link', 'date' and 'full_text'. The columns are
        present even when nothing could be scraped. 'date' is None when the
        feed entry carries no parsed publication/update time.

    Notes
    -----
    Scraping is best-effort: any error on a single entry is printed and that
    entry is skipped; the function itself does not raise for a bad article.
    """
    print(f"--- Starting scrape for RSS: {rss_url} ---")

    feed = feedparser.parse(rss_url)
    articles_data = []

    # Iterate through entries, limiting the number of articles
    for i, entry in enumerate(feed.entries):
        if i >= max_articles:
            print(f"Reached max_articles limit of {max_articles}.")
            break

        # Resolve the link before the try block so the error message below
        # can never raise on its own when an entry has no link.
        link = getattr(entry, 'link', None)

        try:
            if not link:
                raise ValueError("RSS entry has no link")

            # 1. Basic metadata from the RSS entry
            title = getattr(entry, 'title', '') or ''
            published_date = _entry_published_iso(entry)

            # 2. Use newspaper3k to get the full article text
            article = newspaper.Article(link)
            article.download()
            article.parse()

            # Only keep articles for which text was actually extracted
            if article.text:
                articles_data.append({
                    'source': source,
                    'title': title,
                    'link': link,
                    'date': published_date,
                    'full_text': article.text
                })
                print(f"Successfully scraped: {title[:50]}...")

        except Exception as e:
            # Skip if an article link is broken or parsing fails
            print(f"Error scraping article at {link}: {e}")
            continue

    # Fix the column set so downstream cells see the same schema even when
    # no article was scraped.
    return pd.DataFrame(
        articles_data,
        columns=['source', 'title', 'link', 'date', 'full_text'],
    )

# --- Central bank RSS feeds ---
# Only the ECB press-release feed is configured here; feeds for other central
# banks (FED, BoE, BoJ press releases/statements) still need to be looked up
# and verified before they are added.
ECB_RSS_URL = "https://www.ecb.europa.eu/press/pr/date/html/index.rss"

# --- EXECUTION ---
# Scrape a small sample of the feed into a DataFrame.
ecb_df = scrape_full_articles_from_rss(rss_url=ECB_RSS_URL, max_articles=10)
# Uncomment to inspect the result:
# print(ecb_df.head())
# print(f"\nTotal articles scraped: {len(ecb_df)}")

Tokenization & Embeddings

In [None]:
# Checkpoint used to generate embeddings. It is loaded with AutoModel below
# (not AutoModelForSequenceClassification), so the encoder's hidden states are
# returned instead of sentiment-classification outputs.
EMBEDDING_MODEL_NAME = "ProsusAI/finbert"

def get_finbert_sentence_embeddings(texts, model_name=EMBEDDING_MODEL_NAME):
    """
    Embed each text with the given checkpoint using its first-token ([CLS]) vector.

    Parameters
    ----------
    texts : iterable of str
        Texts to embed. Each text is truncated to the first 512 tokens;
        anything beyond that is ignored.
    model_name : str, default EMBEDDING_MODEL_NAME
        Hugging Face checkpoint name passed to AutoTokenizer/AutoModel.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(texts), hidden_size). For empty input the result
        has shape (0, hidden_size) so downstream code still sees a 2-D array.
    """
    # 1. Load Tokenizer and Model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)

    # Use the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    print(f"\n--- Generating FinBERT Embeddings on {device} ---")

    embeddings = []

    with torch.no_grad():
        for text in texts:
            # 2. Tokenize the text. Only truncation is needed: each text is
            # its own batch of one, so padding to 512 tokens would just add
            # masked positions the model then has to process.
            inputs = tokenizer(
                text,
                return_tensors="pt",
                truncation=True,
                max_length=512
            ).to(device)

            # 3. Run the encoder; last_hidden_state holds one vector per token
            outputs = model(**inputs)

            # 4. Take batch item 0, token 0 ([CLS]) as the sentence embedding.
            # Explicit indexing avoids squeeze() collapsing unexpected axes.
            cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()
            embeddings.append(cls_embedding)

    if not embeddings:
        # Keep the result 2-D for empty input
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    return np.array(embeddings)

# Embed the scraped articles, but only when the ingestion step produced rows.
if not ecb_df.empty:
    # Use the full_text column from the DataFrame created in the Data Ingestion step
    texts_to_embed = ecb_df['full_text'].tolist()

    # One embedding vector per article
    finbert_embeddings = get_finbert_sentence_embeddings(texts_to_embed)

    # Store the vectors on the DataFrame; the PCA/clustering cell looks for
    # this 'finbert_embedding' column.
    ecb_df['finbert_embedding'] = list(finbert_embeddings)

    print("\n--- Embeddings Generated ---")
    print(f"Shape of Embeddings: {finbert_embeddings.shape}")
    print("DataFrame with embeddings ready for PCA/Clustering.")
else:
    print("No articles to process. Please check data acquisition step.")

PCA

In [None]:
# Code
def unsupervised_pipeline(embeddings, n_components=100, n_clusters=5):
    """
    Scale the embeddings, reduce them with PCA and cluster them with a GMM.

    Parameters
    ----------
    embeddings : array-like of shape (n_samples, n_features)
        Embedding matrix, one row per document.
    n_components : int, default 100
        Requested number of PCA components. An integer larger than
        min(n_samples, n_features) is reduced to that limit, because PCA
        cannot produce more components than that.
    n_clusters : int, default 5
        Requested number of GMM components. Reduced to n_samples when there
        are fewer samples than clusters.

    Returns
    -------
    tuple
        (gmm_labels, gmm, reduced_data, pca): the cluster label per row, the
        fitted GaussianMixture, the PCA-reduced data and the fitted PCA.

    Raises
    ------
    ValueError
        If `embeddings` is not a non-empty 2-D array.

    Side effects
    ------------
    Writes the fitted PCA model to 'pca_model.pkl' in the working directory.
    """
    print("\n--- Starting Unsupervised Pipeline (PCA & GMM) ---")

    embeddings = np.asarray(embeddings)
    if embeddings.ndim != 2 or embeddings.shape[0] == 0:
        raise ValueError(
            f"embeddings must be a non-empty 2-D array, got shape {embeddings.shape}"
        )
    n_samples, n_features = embeddings.shape

    # PCA accepts at most min(n_samples, n_features) components. Only clamp
    # integer requests so other n_components forms are passed through untouched.
    max_components = min(n_samples, n_features)
    if isinstance(n_components, (int, np.integer)) and n_components > max_components:
        print(
            f"Requested n_components={n_components} exceeds "
            f"min(n_samples, n_features)={max_components}; using {max_components}."
        )
        n_components = max_components

    # A mixture cannot be fitted with more components than samples.
    if n_clusters > n_samples:
        print(
            f"Requested n_clusters={n_clusters} exceeds n_samples={n_samples}; "
            f"using {n_samples}."
        )
        n_clusters = n_samples

    # 1. Scaling (Important for PCA/Clustering algorithms)
    scaler = StandardScaler()
    scaled_embeddings = scaler.fit_transform(embeddings)
    print(f"Scaled Embeddings Shape: {scaled_embeddings.shape}")

    # 2. Dimensionality Reduction (PCA)
    pca = PCA(n_components=n_components, random_state=42)
    reduced_data = pca.fit_transform(scaled_embeddings)
    print(f"PCA Reduced Data Shape: {reduced_data.shape}")

    # Optional: Save PCA model for interpretation/reproducibility
    joblib.dump(pca, 'pca_model.pkl')

    # 3. Clustering (Gaussian Mixture Model - GMM)
    # The number of clusters (n_clusters) should be determined via elbow method,
    # silhouette score, or business logic.
    gmm = GaussianMixture(n_components=n_clusters, random_state=42, covariance_type='full')
    gmm_labels = gmm.fit_predict(reduced_data)

    print(f"Clustering Complete. Found {n_clusters} clusters.")

    return gmm_labels, gmm, reduced_data, pca

# --- EXECUTION ---

# Run PCA + GMM only when there are rows and the embedding column exists
if not ecb_df.empty and 'finbert_embedding' in ecb_df.columns:

    # Combine the per-row embedding vectors into one NumPy array
    embedding_matrix = np.stack(ecb_df['finbert_embedding'].to_numpy())

    # Scaling, dimensionality reduction and clustering in one call
    # (n_clusters=5 is an example value)
    cluster_labels, gmm_model, reduced_data, pca_model = unsupervised_pipeline(
        embedding_matrix, n_components=100, n_clusters=5
    )

    # Attach each article's cluster to the original DataFrame
    ecb_df['cluster_label'] = cluster_labels

    # Show how many articles landed in each cluster
    print("\nCluster Distribution:")
    print(ecb_df['cluster_label'].value_counts())
    print("\nYour data is now clustered and ready for the interpretation (xAI) and the final supervised classification step.")

Clustering

In [None]:
# Code
# probs = gmm.predict_proba(X)

Cluster Interpretation

In [None]:
# code

Identifier training

In [None]:
# Code
# Fits a logistic regression on six tabular features to predict the
# 'expensive' column, then predicts on a held-out test split.
# NOTE(review): `df` and these feature names are not defined in the visible
# part of this notebook (the earlier cells build `ecb_df`) — presumably a
# template carried over from another analysis; confirm the intended input
# before running this cell.
# Select features for the model
features = ['rooms', 
            'area', 
            'luxurious', 
            'pop_dens', 
            'mean_taxable_income', 
            'dist_supermarket']
X = df[features]
y = df['expensive']

# Split the dataset (20% held out for testing, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features: the scaler is fitted on the training split only
# and the same transformation is then applied to the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Display the model coefficients
# (uses the first row of coef_ — presumably the target is binary; confirm)
coefficients = pd.DataFrame({
    'Feature': features,
    'Coefficient': model.coef_[0]
})
# NOTE(review): Jupyter only renders the last expression of a cell, so this
# bare expression in the middle of the cell displays nothing; move it to its
# own cell or call display(coefficients) if the table should be shown.
coefficients

# Predict on the test set
y_pred = model.predict(X_test_scaled)

Clustifier training

In [None]:
# code

In [None]:
## Footer: print basic information about the runtime environment
import os
import platform
import socket
from platform import python_version
from datetime import datetime

banner = '-----------------------------------'
timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(banner)
print(os.name.upper())                                # OS family name, upper-cased
print(platform.system(), '|', platform.release())     # OS name and release
print('Datetime:', timestamp)                         # local time of this run
print('Python Version:', python_version())
print(banner)