In [1]:
# 01_setup_environment_enhanced.ipynb
# Purpose: Enhanced setup environment with support for both JP Morgan and HSBC analysis
# Banks: JP Morgan (JPM) and HSBC
# Quarters: Q1 2025, Q2 2025
# Models: FinBERT (yiyanghkust), FinBERT (ProsusAI), DistilRoBERTa, CardiffNLP (Twitter-RoBERTa)

import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import random
import json

# Constants
SEED = 42
PROJECT_NAME = "CAM_DS_AI_Project_Enhanced"
BANKS = ["jpm", "hsbc"]
QUARTERS = ["q1_2025", "q2_2025"]

# Enhanced model configurations
MODELS = {
    "finbert_yiyanghkust": "yiyanghkust/finbert-tone",
    "finbert_prosusai": "ProsusAI/finbert",
    "distilroberta": "j-hartmann/emotion-english-distilroberta-base",
    "cardiffnlp_roberta": "cardiffnlp/twitter-roberta-base-sentiment-latest"
}

# Set random seeds
random.seed(SEED)
np.random.seed(SEED)

print(f"Enhanced environment setup for {PROJECT_NAME}")
print(f"Target banks: {', '.join([bank.upper() for bank in BANKS])}")
print(f"Quarters: {', '.join(QUARTERS)}")
print(f"Models: {len(MODELS)} sentiment analysis models")

Enhanced environment setup for CAM_DS_AI_Project_Enhanced
Target banks: JPM, HSBC
Quarters: q1_2025, q2_2025
Models: 4 sentiment analysis models


In [2]:
## Install Required Packages - Enhanced Version

# Core ML packages
!pip install -q pandas numpy matplotlib seaborn scikit-learn
!pip install -q transformers torch datasets
!pip install -q plotly wordcloud textblob
!pip install -q bertopic umap-learn hdbscan

# Enhanced sentiment analysis packages
!pip install -q vaderSentiment textstat
!pip install -q sentence-transformers
!pip install -q torch-audio torchaudio --extra-index-url https://download.pytorch.org/whl/cpu

# Additional analysis packages
!pip install -q optuna hyperopt
!pip install -q shap lime

print("✅ Enhanced packages installed")


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.2/239.2 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m50.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: Could not find a version that satisfies the requirement torch-audio (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for torch-audio[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lime (setup.

In [3]:
## Setup Enhanced Folder Structure

def create_enhanced_folder_structure():
    """Create enhanced project folder structure for both banks."""

    from google.colab import drive
    drive.mount("/content/drive")

    drive_base = Path("/content/drive/MyDrive/CAM_DS_AI_Project_Enhanced")
    colab_base = Path("/content/cam_ds_ai_enhanced")

    locations = {
        "drive": drive_base,
        "colab": colab_base
    }

    # Enhanced folder structure for both banks
    folders_to_create = []

    for bank in BANKS:
        base_folders = [
            f"data/raw/{bank}",
            f"data/clean/{bank}",
            f"data/processed/{bank}",
            f"data/model_ready/{bank}",
            f"data/manual_validation/{bank}",
            f"results/sentiment/{bank}",
            f"results/comparison/{bank}",
            f"results/finetuning/{bank}",
            f"models/finbert_yiyanghkust/{bank}",
            f"models/finbert_prosusai/{bank}",
            f"models/distilroberta/{bank}",
            f"models/cardiffnlp/{bank}",
            f"models/finetuned/{bank}",
            f"outputs/visualizations/{bank}",
            f"outputs/reports/{bank}",
            f"outputs/analysis/{bank}"
        ]
        folders_to_create.extend(base_folders)

    # Global folders
    global_folders = [
        "notebooks",
        "configs",
        "logs",
        "experiments",
        "utils"
    ]
    folders_to_create.extend(global_folders)

    for location_name, base_path in locations.items():
        print(f"\nCreating enhanced folders in {location_name}: {base_path}")

        for folder in folders_to_create:
            folder_path = base_path / folder
            folder_path.mkdir(parents=True, exist_ok=True)

        print(f"  ✅ Created {len(folders_to_create)} folders")

        # Create enhanced README
        readme_path = base_path / "README.md"
        with open(readme_path, "w") as f:
            f.write(f"# {PROJECT_NAME}\n")
            f.write(f"Banks: {', '.join([bank.upper() for bank in BANKS])}\n")
            f.write(f"Quarters: {', '.join(QUARTERS)}\n")
            f.write(f"Models: {', '.join(MODELS.keys())}\n")
            f.write(f"Created: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write("## Enhanced Features\n")
            f.write("- Multi-bank analysis (JPM & HSBC)\n")
            f.write("- 4 state-of-the-art sentiment models\n")
            f.write("- Manual validation integration\n")
            f.write("- Model fine-tuning capabilities\n")
            f.write("- Comprehensive model comparison\n")
            f.write("- Advanced visualization suite\n")

        print(f"  ✅ Enhanced README.md created")

    return locations

# Create enhanced folder structure
locations = create_enhanced_folder_structure()


Mounted at /content/drive

Creating enhanced folders in drive: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced
  ✅ Created 37 folders
  ✅ Enhanced README.md created

Creating enhanced folders in colab: /content/cam_ds_ai_enhanced
  ✅ Created 37 folders
  ✅ Enhanced README.md created


In [4]:
## Define Enhanced Global Paths

drive_base = locations["drive"]
colab_base = locations["colab"]

# Enhanced path configurations for both banks
enhanced_paths = {}

for bank in BANKS:
    enhanced_paths[bank] = {
        "raw_data": drive_base / f"data/raw/{bank}",
        "clean_data": drive_base / f"data/clean/{bank}",
        "processed_data": drive_base / f"data/processed/{bank}",
        "model_ready": drive_base / f"data/model_ready/{bank}",
        "manual_validation": drive_base / f"data/manual_validation/{bank}",
        "results_sentiment": drive_base / f"results/sentiment/{bank}",
        "results_comparison": drive_base / f"results/comparison/{bank}",
        "results_finetuning": drive_base / f"results/finetuning/{bank}",
        "models": drive_base / f"models",
        "viz": drive_base / f"outputs/visualizations/{bank}",
        "reports": drive_base / f"outputs/reports/{bank}",
        "analysis": drive_base / f"outputs/analysis/{bank}"
    }

# Global paths
global_paths = {
    "configs": drive_base / "configs",
    "logs": drive_base / "logs",
    "experiments": drive_base / "experiments",
    "utils": drive_base / "utils",
    "colab_working": colab_base
}

print("\n" + "="*60)
print("ENHANCED FOLDER STRUCTURE CREATED")
print("="*60)
print(f"Primary storage: {drive_base}")
print(f"Working storage: {colab_base}")
print(f"Banks configured: {len(BANKS)}")
print(f"Models configured: {len(MODELS)}")



ENHANCED FOLDER STRUCTURE CREATED
Primary storage: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced
Working storage: /content/cam_ds_ai_enhanced
Banks configured: 2
Models configured: 4


In [5]:
## Enhanced Helper Functions
from typing import Dict

def save_to_multiple_locations_enhanced(df, filename, bank_code, locations_dict=None):
    """Enhanced save function with bank-specific paths."""
    if locations_dict is None:
        locations_dict = {
            "drive": enhanced_paths[bank_code]["results_sentiment"],
            "colab": colab_base / f"results/sentiment/{bank_code}"
        }

    saved_paths = []
    for location_name, base_path in locations_dict.items():
        file_path = base_path / filename
        file_path.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(file_path, index=False)
        saved_paths.append(str(file_path))
        print(f"✅ Saved to {location_name}: {file_path}")

    return saved_paths

def read_csv_safe_enhanced(path: Path, encoding='utf-8') -> pd.DataFrame:
    """Enhanced CSV reading with multiple encoding attempts."""
    encodings_to_try = [encoding, 'latin-1', 'cp1252', 'iso-8859-1']

    for enc in encodings_to_try:
        try:
            df = pd.read_csv(path, encoding=enc)
            print(f"✅ Loaded {path.name}: {len(df):,} rows × {len(df.columns)} cols (encoding: {enc})")
            return df
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"❌ Error reading {path.name}: {str(e)}")
            break

    return None

def log_dataframe_info_enhanced(df, name, bank_code=None):
    """Enhanced DataFrame logging with bank context."""
    bank_prefix = f"[{bank_code.upper()}] " if bank_code else ""
    print(f"\n{bank_prefix}{name} Info:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    print(f"  Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    # Enhanced missing value analysis
    missing = df.isnull().sum()
    if missing.any():
        print(f"  Missing values: {missing[missing > 0].to_dict()}")
    else:
        print("  ✅ No missing values")

    # Data types summary
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    text_cols = df.select_dtypes(include=['object']).columns.tolist()

    if numeric_cols:
        print(f"  Numeric columns: {len(numeric_cols)}")
    if text_cols:
        print(f"  Text columns: {len(text_cols)}")

def setup_model_paths(model_name: str, bank_code: str) -> Dict:
    """Setup model-specific paths for enhanced analysis."""
    model_key = model_name.replace("/", "_").replace("-", "_")

    return {
        "model_cache": enhanced_paths[bank_code]["models"] / model_key,
        "results": enhanced_paths[bank_code]["results_sentiment"] / model_key,
        "finetuned": enhanced_paths[bank_code]["results_finetuning"] / model_key
    }


In [6]:
## Enhanced Data URLs Configuration

# Enhanced data URLs for both banks
data_urls = {
    "jpm": {
        "q1_2025": "https://drive.google.com/file/d/1-ZiLrQVebxuNVw3FwenZ3Sn5FoAVVarH/view?usp=drive_link",
        "q2_2025": "https://drive.google.com/file/d/1go14liFL_hOWZOQfLYDSwgScM-pjVanw/view?usp=drive_link",
        "manual_labels": "https://drive.google.com/file/d/1NO75zFKvXQ7KT0dPoC95p46YGlTVkC-a/view?usp=drive_link"
    },
    "hsbc": {
        "q1_2025": "https://drive.google.com/file/d/1lv0CeRxIX75YV63Fkh_93mXzlg-8U5lI/view?usp=drive_link",
        "q2_2025": "https://drive.google.com/file/d/1_pbFTGPUDSFbdaKoFgcCxOwl4WAMire4/view?usp=drive_link",
        "manual_labels": "https://drive.google.com/file/d/1tTegea0gPIdvMsWuv03b-bgRgRZ4uKYt/view?usp=drive_link"
    }
}

In [7]:
## Save Enhanced Configuration

enhanced_config = {
    "SEED": SEED,
    "PROJECT_NAME": PROJECT_NAME,
    "BANKS": BANKS,
    "QUARTERS": QUARTERS,
    "MODELS": MODELS,
    "drive_base": str(drive_base),
    "colab_base": str(colab_base),
    "data_urls": data_urls,
    "enhanced_paths": {bank: {k: str(v) for k, v in paths.items()}
                      for bank, paths in enhanced_paths.items()},
    "global_paths": {k: str(v) for k, v in global_paths.items()},
    "timestamp": pd.Timestamp.now().isoformat()
}

# Save enhanced configuration
config_path = drive_base / "configs" / "enhanced_config.json"
config_path.parent.mkdir(parents=True, exist_ok=True)
with open(config_path, "w") as f:
    json.dump(enhanced_config, f, indent=2)

print(f"✅ Enhanced configuration saved to: {config_path}")


✅ Enhanced configuration saved to: /content/drive/MyDrive/CAM_DS_AI_Project_Enhanced/configs/enhanced_config.json


In [8]:
## Model Verification and Setup

def verify_model_accessibility():
    """Verify that all required models are accessible."""
    print("\n" + "="*50)
    print("VERIFYING MODEL ACCESSIBILITY")
    print("="*50)

    try:
        from transformers import AutoTokenizer, AutoModelForSequenceClassification

        verification_results = {}

        for model_key, model_name in MODELS.items():
            try:
                print(f"Testing {model_key}: {model_name}")

                # Test tokenizer loading
                tokenizer = AutoTokenizer.from_pretrained(model_name)

                # Test model info (without loading full model to save memory)
                model_info = AutoModelForSequenceClassification.from_pretrained(
                    model_name,
                    return_dict=True,
                    output_attentions=False,
                    output_hidden_states=False
                )

                verification_results[model_key] = {
                    "status": "accessible",
                    "tokenizer_vocab_size": len(tokenizer.vocab),
                    "model_name": model_name
                }

                print(f"  ✅ {model_key} - Accessible")

                # Clean up memory
                del tokenizer, model_info

            except Exception as e:
                verification_results[model_key] = {
                    "status": "error",
                    "error": str(e),
                    "model_name": model_name
                }
                print(f"  ❌ {model_key} - Error: {str(e)}")

        # Save verification results
        verification_path = global_paths["logs"] / "model_verification.json"
        verification_path.parent.mkdir(parents=True, exist_ok=True)
        with open(verification_path, "w") as f:
            json.dump(verification_results, f, indent=2)

        accessible_models = sum(1 for r in verification_results.values() if r["status"] == "accessible")
        print(f"\nModel Verification Complete: {accessible_models}/{len(MODELS)} models accessible")

        return verification_results

    except ImportError as e:
        print(f"❌ Cannot verify models - missing dependencies: {e}")
        return {}

# Run model verification
model_verification = verify_model_accessibility()

print("\n" + "="*60)
print("ENHANCED ENVIRONMENT SETUP COMPLETE")
print("="*60)
print("Next steps:")
print("1. Run notebook 02_load_data_enhanced.ipynb to download datasets for both banks")
print("2. Run notebook 03_clean_preprocess_enhanced.ipynb to clean the data")
print("3. Run notebook 03b_manual_validation.ipynb for manual label validation")
print("4. Run notebook 04_sentiment_analysis.ipynb for 4-model analysis")
print("5. Run notebook 04b_model_finetuning.ipynb for fine-tuning")
print("6. Run notebook 05_model_comparison.ipynb for comparative analysis")
print("7. Run notebook 06_results_visualization.ipynb for final results")

print(f"\n📊 Enhanced Setup Summary:")
print(f"  Banks: {len(BANKS)} ({', '.join([b.upper() for b in BANKS])})")
print(f"  Quarters: {len(QUARTERS)} ({', '.join(QUARTERS)})")
print(f"  Models: {len(MODELS)} sentiment analysis models")
# Remove or comment out the problematic line:
# print(f"  Folders: {len(folders_to_create)} organized directories")
print(f"  Accessible models: {sum(1 for r in model_verification.values() if r.get('status') == 'accessible')}/{len(MODELS)}")



VERIFYING MODEL ACCESSIBILITY
Testing finbert_yiyanghkust: yiyanghkust/finbert-tone


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

  ✅ finbert_yiyanghkust - Accessible
Testing finbert_prosusai: ProsusAI/finbert


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  ✅ finbert_prosusai - Accessible
Testing distilroberta: j-hartmann/emotion-english-distilroberta-base


tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

  ✅ distilroberta - Accessible
Testing cardiffnlp_roberta: cardiffnlp/twitter-roberta-base-sentiment-latest


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  ✅ cardiffnlp_roberta - Accessible

Model Verification Complete: 4/4 models accessible

ENHANCED ENVIRONMENT SETUP COMPLETE
Next steps:
1. Run notebook 02_load_data_enhanced.ipynb to download datasets for both banks
2. Run notebook 03_clean_preprocess_enhanced.ipynb to clean the data
3. Run notebook 03b_manual_validation.ipynb for manual label validation
4. Run notebook 04_sentiment_analysis.ipynb for 4-model analysis
5. Run notebook 04b_model_finetuning.ipynb for fine-tuning
6. Run notebook 05_model_comparison.ipynb for comparative analysis
7. Run notebook 06_results_visualization.ipynb for final results

📊 Enhanced Setup Summary:
  Banks: 2 (JPM, HSBC)
  Quarters: 2 (q1_2025, q2_2025)
  Models: 4 sentiment analysis models
  Accessible models: 4/4
