In [None]:
# 01_setup_environment.ipynb
# Purpose: Setup environment, install packages, create folder structure
# Bank: JP Morgan (JPM)
# Quarter: Q1 2025, Q2 2025

# Setup (run first)
import os
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import random

# Project-wide constants
SEED = 42
PROJECT_NAME = "CAM_DS_AI_Project"
BANK_CODE = "jpm"
QUARTERS = ["q1_2025", "q2_2025"]

# Seed the stdlib and NumPy global random generators with the same value
np.random.seed(SEED)
random.seed(SEED)

# Echo the active configuration
print("Environment setup for " + PROJECT_NAME)
print("Target bank: " + BANK_CODE.upper())
print("Quarters: " + ", ".join(QUARTERS))

Environment setup for CAM_DS_AI_Project
Target bank: JPM
Quarters: q1_2025, q2_2025


In [None]:
## Install Required Packages

# Install core packages
!pip install -q pandas numpy matplotlib seaborn scikit-learn
!pip install -q transformers torch
!pip install -q datasets
!pip install -q plotly
!pip install -q wordcloud
!pip install -q textblob
!pip install -q bertopic
!pip install -q umap-learn
!pip install -q hdbscan

print("✓ Core packages installed")

# Install sentiment analysis packages
!pip install -q vaderSentiment
!pip install -q textstat

print("✓ Sentiment analysis packages installed")


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/153.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m81.9/153.0 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.0/153.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h✓ Core packages installed
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m239.2/239.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25h✓ Sentiment analysis packages installed


In [None]:
## Setup Folder Structure

def create_folder_structure():
    """Create the project folder tree on Google Drive and on the Colab local disk.

    Mounts Google Drive at ``/content/drive``, creates the same set of
    sub-folders under both base directories, and writes a small ``README.md``
    into each base directory that does not have one yet.

    Returns:
        dict: ``{"drive": Path, "colab": Path}`` holding the two base
        directories.

    Raises:
        ImportError: If ``google.colab`` cannot be imported.
    """
    # Location A: Google Drive (primary drive)
    from google.colab import drive
    drive.mount("/content/drive")

    drive_base = Path("/content/drive/MyDrive") / PROJECT_NAME

    # Location B: local Google Colab drive
    colab_base = Path("/content") / PROJECT_NAME.lower()

    # Both locations receive an identical folder layout
    locations = {
        "drive": drive_base,
        "colab": colab_base
    }

    # Bank-specific folders are derived from BANK_CODE so the layout follows
    # the notebook configuration instead of a repeated literal.
    folders_to_create = [
        f"data/raw/{BANK_CODE}",
        f"data/clean/{BANK_CODE}",
        f"data/processed/{BANK_CODE}",
        f"data/model_ready/{BANK_CODE}",
        f"results/sentiment/{BANK_CODE}",
        f"results/comparison/{BANK_CODE}",
        f"models/finbert_tone/{BANK_CODE}",
        f"models/prosus_finbert/{BANK_CODE}",
        "notebooks",
        f"outputs/visualizations/{BANK_CODE}",
        f"outputs/reports/{BANK_CODE}"
    ]

    for location_name, base_path in locations.items():
        print(f"\nCreating folders in {location_name}: {base_path}")

        for folder in folders_to_create:
            folder_path = base_path / folder
            # exist_ok makes re-running the cell harmless
            folder_path.mkdir(parents=True, exist_ok=True)
            print(f"  ✓ {folder}")

        # Write the README only once: rewriting it on every run would reset
        # its "Created:" line to the time of the latest re-run.
        readme_path = base_path / "README.md"
        if readme_path.exists():
            print("  ✓ README.md already exists (kept)")
            continue

        created_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(f"# {PROJECT_NAME}\n")
            f.write("Bank: JP Morgan (JPM)\n")
            f.write("Quarters: Q1 2025, Q2 2025\n")
            f.write(f"Created: {created_at}\n")

        print("  ✓ README.md created")

    return locations

# Create folder structure (mounts Google Drive as a side effect)
locations = create_folder_structure()

## Define Global Paths

# Base directories: Google Drive (primary) and Colab local disk (working)
drive_base = locations["drive"]
colab_base = locations["colab"]

# Bank-specific sub-folders use BANK_CODE rather than a repeated "jpm" literal
# so every path follows the notebook configuration.

# Data paths
raw_data_path = drive_base / "data" / "raw" / BANK_CODE
clean_data_path = drive_base / "data" / "clean" / BANK_CODE
processed_data_path = drive_base / "data" / "processed" / BANK_CODE
model_ready_path = drive_base / "data" / "model_ready" / BANK_CODE

# Results paths
results_sentiment_path = drive_base / "results" / "sentiment" / BANK_CODE
results_comparison_path = drive_base / "results" / "comparison" / BANK_CODE

# Model paths (one sub-folder per model lives underneath)
models_path = drive_base / "models"

# Output paths
viz_path = drive_base / "outputs" / "visualizations" / BANK_CODE
reports_path = drive_base / "outputs" / "reports" / BANK_CODE

# Colab working paths (for easy access)
colab_data_path = colab_base / "data"
colab_results_path = colab_base / "results"

print("\n" + "="*60)
print("FOLDER STRUCTURE CREATED SUCCESSFULLY")
print("="*60)
print(f"Primary storage: {drive_base}")
print(f"Working storage: {colab_base}")
print(f"Raw data: {raw_data_path}")
print(f"Results: {results_sentiment_path}")


Mounted at /content/drive

Creating folders in drive: /content/drive/MyDrive/CAM_DS_AI_Project
  ✓ data/raw/jpm
  ✓ data/clean/jpm
  ✓ data/processed/jpm
  ✓ data/model_ready/jpm
  ✓ results/sentiment/jpm
  ✓ results/comparison/jpm
  ✓ models/finbert_tone/jpm
  ✓ models/prosus_finbert/jpm
  ✓ notebooks
  ✓ outputs/visualizations/jpm
  ✓ outputs/reports/jpm
  ✓ README.md created

Creating folders in colab: /content/cam_ds_ai_project
  ✓ data/raw/jpm
  ✓ data/clean/jpm
  ✓ data/processed/jpm
  ✓ data/model_ready/jpm
  ✓ results/sentiment/jpm
  ✓ results/comparison/jpm
  ✓ models/finbert_tone/jpm
  ✓ models/prosus_finbert/jpm
  ✓ notebooks
  ✓ outputs/visualizations/jpm
  ✓ outputs/reports/jpm
  ✓ README.md created

FOLDER STRUCTURE CREATED SUCCESSFULLY
Primary storage: /content/drive/MyDrive/CAM_DS_AI_Project
Working storage: /content/cam_ds_ai_project
Raw data: /content/drive/MyDrive/CAM_DS_AI_Project/data/raw/jpm
Results: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm


In [None]:
## Helper Functions

def save_to_multiple_locations(df, filename, locations_dict=None):
    """Write ``df`` as CSV (without the index) under every base directory given.

    Args:
        df: DataFrame to write.
        filename: Path of the file relative to each base directory; missing
            parent folders are created.
        locations_dict: Mapping of a label to a base ``Path``. When omitted,
            the module-level ``drive_base`` and ``colab_base`` are used.

    Returns:
        list[str]: The full path of every file written, in iteration order.
    """
    targets = locations_dict
    if targets is None:
        targets = {"drive": drive_base, "colab": colab_base}

    written = []
    for label, root in targets.items():
        destination = root / filename
        # The relative filename may contain sub-folders that do not exist yet
        destination.parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(destination, index=False)
        written.append(str(destination))
        print(f"✓ Saved to {label}: {destination}")

    return written

def read_csv_safe(path: "str | Path") -> pd.DataFrame:
    """Read a CSV file into a DataFrame and print its name and dimensions.

    Args:
        path: Location of the CSV file, as a ``Path`` or a plain string.

    Returns:
        pd.DataFrame: The parsed file contents.

    Raises:
        Any exception raised by ``pd.read_csv`` (for example
        ``FileNotFoundError``) propagates to the caller unchanged.
    """
    df = pd.read_csv(path)
    # Derive the display name via Path so a plain string argument, which
    # pd.read_csv accepts, no longer fails on `.name` after the load.
    file_name = Path(path).name
    print(f"Loaded {file_name}: {len(df):,} rows × {len(df.columns)} cols")
    return df

def log_dataframe_info(df, name):
    """Print a short summary of ``df`` labelled with ``name``.

    The summary lists the shape, the column names, the deep memory footprint
    in MB, and the per-column count of missing values (or a note that there
    are none). Nothing is returned.
    """
    print(f"\n{name} Info:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {list(df.columns)}")
    megabytes = df.memory_usage(deep=True).sum() / 1024**2
    print(f"  Memory usage: {megabytes:.2f} MB")

    # Report only the columns that actually contain nulls
    null_counts = df.isnull().sum()
    columns_with_nulls = null_counts[null_counts > 0]
    if columns_with_nulls.empty:
        print("  No missing values")
        return
    print(f"  Missing values: {columns_with_nulls.to_dict()}")


In [None]:
## Data URLs

import json

# Google Drive links for the JP Morgan Q1 and Q2 2025 datasets, keyed by quarter
data_urls = dict(
    q1_2025="https://drive.google.com/file/d/1-ZiLrQVebxuNVw3FwenZ3Sn5FoAVVarH/view?usp=drive_link",
    q2_2025="https://drive.google.com/file/d/1go14liFL_hOWZOQfLYDSwgScM-pjVanw/view?usp=drive_link",
)

# Completion banner followed by the list of follow-up notebooks
print(f"\n{'=' * 60}")
print("ENVIRONMENT SETUP COMPLETE")
print(f"{'=' * 60}")
print("\n".join([
    "Next steps:",
    "1. Run notebook 02_load_data.ipynb to download and load datasets",
    "2. Run notebook 03_clean_preprocess.ipynb to clean the data",
    "3. Run notebook 04_sentiment_analysis.ipynb for model analysis",
    "4. Run notebook 05_model_comparison.ipynb for comparative analysis",
    "5. Run notebook 06_results_visualization.ipynb for final results",
]))

# Persist the configuration as JSON on Drive; paths are stored as strings
# because Path objects are not JSON-serialisable.
config = dict(
    SEED=SEED,
    PROJECT_NAME=PROJECT_NAME,
    BANK_CODE=BANK_CODE,
    QUARTERS=QUARTERS,
    drive_base=str(drive_base),
    colab_base=str(colab_base),
    data_urls=data_urls,
)

config_path = drive_base / "config.json"
with config_path.open("w") as f:
    f.write(json.dumps(config, indent=2))

print(f"✓ Configuration saved to: {config_path}")


ENVIRONMENT SETUP COMPLETE
Next steps:
1. Run notebook 02_load_data.ipynb to download and load datasets
2. Run notebook 03_clean_preprocess.ipynb to clean the data
3. Run notebook 04_sentiment_analysis.ipynb for model analysis
4. Run notebook 05_model_comparison.ipynb for comparative analysis
5. Run notebook 06_results_visualization.ipynb for final results
✓ Configuration saved to: /content/drive/MyDrive/CAM_DS_AI_Project/config.json
