In [3]:
# 02_load_data.ipynb
# Purpose: Load JPM Q1 and Q2 2025 earnings call transcript data
# Input: Google Drive CSV files
# Output: raw_jpm_q1_2025_df, raw_jpm_q2_2025_df, raw_jpm_multi_2025_df

## Import Libraries

import io
import json
import re
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import requests
from google.colab import drive

# Location A: Google Drive (Primary drive)
# `drive` is already imported with the other libraries at the top of this
# cell, so it is not imported a second time here.
drive.mount("/content/drive")


# Load configuration
# The project configuration is a JSON file on the mounted Drive. It is read
# with an explicit UTF-8 encoding so the result does not depend on the
# runtime's default locale.
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project/config.json")
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Values pulled from the config and used by this cell and the cells below.
SEED = config["SEED"]
BANK_CODE = config["BANK_CODE"]
drive_base = Path(config["drive_base"])
colab_base = Path(config["colab_base"])

print(f"Loading data for bank: {BANK_CODE.upper()}")


Mounted at /content/drive
Loading data for bank: JPM


In [4]:
## Define Paths

# Raw JPM data lives in two mirrored directories: one under the Drive base
# and one under the Colab base (both bases come from the loaded config).
raw_data_path, colab_data_path = (
    base / "data/raw/jpm" for base in (drive_base, colab_base)
)

# Create both directories, including missing parents; existing ones are kept.
for data_dir in (raw_data_path, colab_data_path):
    data_dir.mkdir(parents=True, exist_ok=True)

In [5]:
## Helper Functions

def extract_file_id_from_drive_url(url):
    """Extract the file ID from a Google Drive sharing URL.

    Two link shapes are recognised:

    * ``https://drive.google.com/file/d/<ID>/view?...`` (also without the
      trailing ``/view`` segment, e.g. ``.../file/d/<ID>?usp=sharing``)
    * ``https://drive.google.com/open?id=<ID>`` and
      ``https://drive.google.com/uc?export=download&id=<ID>``

    Args:
        url: The sharing URL. Non-string values are treated as "no URL".

    Returns:
        The file ID as a string, or ``None`` when ``url`` is not a
        drive.google.com link or no ID can be found in it.
    """
    if not isinstance(url, str) or "drive.google.com" not in url:
        return None

    # The ID is matched as a run of URL-safe characters so that a query string
    # or fragment directly after it ("<ID>?usp=sharing") is not swallowed.
    id_pattern = r"([A-Za-z0-9_-]+)"
    match = (
        re.search(r"/file/d/" + id_pattern, url)
        or re.search(r"[?&]id=" + id_pattern, url)
    )
    return match.group(1) if match else None

def download_from_drive(file_id, filename, timeout=30):
    """Download a file from Google Drive and store it in both raw-data folders.

    The file is fetched once over HTTP and the same bytes are written to
    ``raw_data_path / filename`` and ``colab_data_path / filename``.

    Args:
        file_id: Google Drive file ID used to build the download URL.
        filename: Name to give the saved file in both target directories.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        ``(drive_file_path, colab_file_path)`` on success, or ``(None, None)``
        if anything goes wrong (the error is printed, not raised).
    """
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"

    try:
        response = requests.get(download_url, timeout=timeout)
        response.raise_for_status()

        # NOTE(review): for files that are not publicly shared (or that need a
        # confirmation step) Drive appears to answer 200 with an HTML page
        # rather than the file. Refuse to save that page under a .csv name.
        content_type = response.headers.get("Content-Type", "")
        if "text/html" in content_type.lower():
            raise ValueError(
                "received an HTML page instead of the file "
                "(check the sharing permissions of the Drive link)"
            )

        # Save to both locations
        drive_file_path = raw_data_path / filename
        colab_file_path = colab_data_path / filename

        for target_path in (drive_file_path, colab_file_path):
            with open(target_path, 'wb') as f:
                f.write(response.content)

        print(f"✓ Downloaded {filename}")
        print(f"  Drive: {drive_file_path}")
        print(f"  Colab: {colab_file_path}")

        return drive_file_path, colab_file_path

    except Exception as e:
        # Deliberately broad: callers rely on the (None, None) result rather
        # than on an exception to detect a failed download.
        print(f"❌ Error downloading {filename}: {str(e)}")
        return None, None

def read_csv_safe(path: Path, encoding='utf-8') -> Optional[pd.DataFrame]:
    """Read a CSV file, retrying with fallback encodings on decode errors.

    Args:
        path: Location of the CSV file (a ``Path`` or a path string).
        encoding: Encoding to try first.

    Returns:
        The loaded DataFrame, or ``None`` if the file could not be read with
        any of the attempted encodings or another error occurred (the error is
        printed, not raised).
    """
    path = Path(path)

    # Order matters: 'latin-1' maps every possible byte, so it never raises
    # UnicodeDecodeError and must come last. Trying 'cp1252' first lets
    # Windows-exported files (curly quotes, dashes) decode to the right
    # characters. 'iso-8859-1' is just another name for 'latin-1'.
    # dict.fromkeys removes duplicates while keeping the order.
    encodings_to_try = list(dict.fromkeys([encoding, 'cp1252', 'latin-1']))

    for enc in encodings_to_try:
        try:
            df = pd.read_csv(path, encoding=enc)
            print(f"✓ Loaded {path.name}: {len(df):,} rows × {len(df.columns)} cols (encoding: {enc})")
            return df
        except UnicodeDecodeError:
            # Wrong guess for this file; move on to the next encoding.
            continue
        except Exception as e:
            # Any other failure (missing file, malformed CSV, ...) will not be
            # fixed by a different encoding, so stop trying.
            print(f"❌ Error reading {path.name}: {str(e)}")
            break

    return None

In [6]:
## Download Data Files

data_urls = config["data_urls"]

# Download Q1 2025 data
print("Downloading Q1 2025 data...")
q1_file_id = extract_file_id_from_drive_url(data_urls["q1_2025"])
q1_filename = "jpm-1q25-earnings-call-transcript_qa.csv"

# Bind the path variables up front: the loading cell below tests
# `q1_drive_path`, which would raise NameError if the file ID could not be
# extracted and the variables had never been assigned.
q1_drive_path, q1_colab_path = None, None
if q1_file_id:
    q1_drive_path, q1_colab_path = download_from_drive(q1_file_id, q1_filename)
else:
    print("❌ Could not extract file ID from Q1 URL")

# Download Q2 2025 data
print("\nDownloading Q2 2025 data...")
q2_file_id = extract_file_id_from_drive_url(data_urls["q2_2025"])
q2_filename = "jpm-2q25-earnings-call-transcript_qa.csv"

# Same reasoning as for Q1: keep the names defined even when nothing is
# downloaded.
q2_drive_path, q2_colab_path = None, None
if q2_file_id:
    q2_drive_path, q2_colab_path = download_from_drive(q2_file_id, q2_filename)
else:
    print("❌ Could not extract file ID from Q2 URL")


Downloading Q1 2025 data...
✓ Downloaded jpm-1q25-earnings-call-transcript_qa.csv
  Drive: /content/drive/MyDrive/CAM_DS_AI_Project/data/raw/jpm/jpm-1q25-earnings-call-transcript_qa.csv
  Colab: /content/cam_ds_ai_project/data/raw/jpm/jpm-1q25-earnings-call-transcript_qa.csv

Downloading Q2 2025 data...
✓ Downloaded jpm-2q25-earnings-call-transcript_qa.csv
  Drive: /content/drive/MyDrive/CAM_DS_AI_Project/data/raw/jpm/jpm-2q25-earnings-call-transcript_qa.csv
  Colab: /content/cam_ds_ai_project/data/raw/jpm/jpm-2q25-earnings-call-transcript_qa.csv


In [7]:
## Load Datasets

print("\n" + "=" * 50)
print("LOADING DATASETS")
print("=" * 50)

# Q1 2025: read the downloaded Drive copy; a missing path (failed download)
# or a missing file leaves the frame as None.
if not q1_drive_path or not q1_drive_path.exists():
    print("❌ Q1 2025 file not found")
    raw_jpm_q1_2025_df = None
else:
    raw_jpm_q1_2025_df = read_csv_safe(q1_drive_path)

# Q2 2025: same handling as Q1.
if not q2_drive_path or not q2_drive_path.exists():
    print("❌ Q2 2025 file not found")
    raw_jpm_q2_2025_df = None
else:
    raw_jpm_q2_2025_df = read_csv_safe(q2_drive_path)



LOADING DATASETS
✓ Loaded jpm-1q25-earnings-call-transcript_qa.csv: 112 rows × 10 cols (encoding: utf-8)
✓ Loaded jpm-2q25-earnings-call-transcript_qa.csv: 149 rows × 10 cols (encoding: utf-8)


In [8]:
## Initial Data Exploration

def explore_dataset(df, quarter_name, speaker_col=None, text_col=None):
    """Print an overview of a transcript DataFrame's structure and content.

    Reports shape, column names, dtypes, missing values, memory usage, the
    first three rows, a per-speaker row count and text-length statistics.

    Args:
        df: DataFrame to describe; ``None`` is reported and skipped.
        quarter_name: Label used in the printed headings.
        speaker_col: Column holding the speaker. When omitted, the first of
            ``'speaker'`` / ``'speaker_name'`` present in ``df`` is used.
        text_col: Column holding the spoken text. When omitted, the first of
            ``'text'`` / ``'content'`` present in ``df`` is used.

    Returns:
        None. All results are printed.
    """
    if df is None:
        print(f"❌ {quarter_name} dataset is None")
        return

    print(f"\n📊 {quarter_name} DATASET EXPLORATION")
    print("-" * 40)

    # Basic info
    print(f"Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Data types
    print(f"\nData types:")
    for col, dtype in df.dtypes.items():
        print(f"  {col}: {dtype}")

    # Missing values (only columns that actually have some are listed)
    missing = df.isnull().sum()
    if missing.any():
        print(f"\nMissing values:")
        for col, count in missing[missing > 0].items():
            pct = (count / len(df)) * 100
            print(f"  {col}: {count} ({pct:.1f}%)")
    else:
        print("\n✓ No missing values")

    # Memory usage
    memory_mb = df.memory_usage(deep=True).sum() / 1024**2
    print(f"\nMemory usage: {memory_mb:.2f} MB")

    # Sample data
    print(f"\nFirst 3 rows:")
    print(df.head(3).to_string())

    # The transcript files name these columns 'speaker_name' and 'content';
    # the shorter names are kept as the first candidates so frames that use
    # them behave as before.
    if speaker_col is None:
        speaker_col = next(
            (name for name in ('speaker', 'speaker_name') if name in df.columns),
            None,
        )
    if text_col is None:
        text_col = next(
            (name for name in ('text', 'content') if name in df.columns),
            None,
        )

    # Unique value counts for key columns
    if speaker_col is not None and speaker_col in df.columns:
        print(f"\nSpeaker distribution:")
        speaker_counts = df[speaker_col].value_counts()
        for speaker, count in speaker_counts.items():
            print(f"  {speaker}: {count}")

    if text_col is not None and text_col in df.columns:
        text_lengths = df[text_col].str.len()
        print(f"\nText length stats:")
        print(f"  Mean: {text_lengths.mean():.0f} chars")
        print(f"  Median: {text_lengths.median():.0f} chars")
        print(f"  Min: {text_lengths.min()} chars")
        print(f"  Max: {text_lengths.max()} chars")

# Run the exploration report for each quarter, Q1 first.
for quarter_label, quarter_df in (
    ("Q1 2025", raw_jpm_q1_2025_df),
    ("Q2 2025", raw_jpm_q2_2025_df),
):
    explore_dataset(quarter_df, quarter_label)



📊 Q1 2025 DATASET EXPLORATION
----------------------------------------
Shape: (112, 10)
Columns: ['section', 'question_number', 'answer_number', 'speaker_name', 'role', 'company', 'content', 'year', 'quarter', 'is_pleasantry']

Data types:
  section: object
  question_number: float64
  answer_number: float64
  speaker_name: object
  role: object
  company: object
  content: object
  year: int64
  quarter: object
  is_pleasantry: bool

Missing values:
  question_number: 1 (0.9%)
  answer_number: 23 (20.5%)

Memory usage: 0.13 MB

First 3 rows:
        section  question_number  answer_number   speaker_name                     role              company                                                                                                                                                                                                                                                                                                                                                      

In [9]:
## Create Combined Dataset

if raw_jpm_q1_2025_df is None or raw_jpm_q2_2025_df is None:
    # Both quarters are required; without them there is nothing to combine.
    print("❌ Cannot create combined dataset - missing data")
    raw_jpm_multi_2025_df = None
else:
    print("\n" + "=" * 50)
    print("CREATING COMBINED DATASET")
    print("=" * 50)

    # Tag each quarter on a copy so the raw frames stay untouched.
    # NOTE(review): if a raw frame already has a 'quarter' column, its values
    # are replaced by this label rather than a new column being added -
    # confirm that is intended.
    q1_df_with_quarter = raw_jpm_q1_2025_df.assign(quarter='q1_2025')
    q2_df_with_quarter = raw_jpm_q2_2025_df.assign(quarter='q2_2025')

    # Compare the column sets of the two raw frames before stacking them.
    q1_cols = set(raw_jpm_q1_2025_df.columns)
    q2_cols = set(raw_jpm_q2_2025_df.columns)

    print(f"Q1 columns: {q1_cols}")
    print(f"Q2 columns: {q2_cols}")
    print(f"Common columns: {q1_cols & q2_cols}")

    if q1_cols != q2_cols:
        print("⚠️  Column mismatch detected")
        print(f"Q1 only: {q1_cols - q2_cols}")
        print(f"Q2 only: {q2_cols - q1_cols}")

    # Stack Q1 on top of Q2 with a fresh 0..n-1 index.
    frames_to_stack = [q1_df_with_quarter, q2_df_with_quarter]
    raw_jpm_multi_2025_df = pd.concat(frames_to_stack, ignore_index=True, sort=False)

    print(f"✓ Combined dataset created: {raw_jpm_multi_2025_df.shape}")

    # Row count per quarter label, as a sanity check on the stacking.
    quarter_dist = raw_jpm_multi_2025_df['quarter'].value_counts()
    print(f"Quarter distribution: {quarter_dist.to_dict()}")



CREATING COMBINED DATASET
Q1 columns: {'question_number', 'role', 'year', 'section', 'company', 'is_pleasantry', 'quarter', 'content', 'answer_number', 'speaker_name'}
Q2 columns: {'question_number', 'role', 'year', 'section', 'company', 'is_pleasantry', 'quarter', 'content', 'answer_number', 'speaker_name'}
Common columns: {'question_number', 'role', 'year', 'section', 'company', 'is_pleasantry', 'quarter', 'content', 'answer_number', 'speaker_name'}
✓ Combined dataset created: (261, 10)
Quarter distribution: {'q2_2025': 149, 'q1_2025': 112}


In [10]:
## Save Raw Data

def save_dataset(df, filename, description):
    """Write a DataFrame as CSV to both the Drive and the Colab raw-data folder.

    Args:
        df: DataFrame to write; ``None`` is reported and nothing is saved.
        filename: CSV file name used in both target directories.
        description: Human-readable label used in the printed messages.

    Returns:
        ``(drive_path, colab_path)`` after both files are written, or ``None``
        when ``df`` is ``None``.
    """
    if df is None:
        print(f"❌ Cannot save {description} - dataset is None")
        return

    print(f"Saving {description}...")

    def _write_csv(target_dir, location_label):
        # Write without the index column and report where the file went.
        target = target_dir / filename
        df.to_csv(target, index=False)
        print(f"  ✓ {location_label}: {target}")
        return target

    # Drive copy first, then the Colab copy.
    drive_path = _write_csv(raw_data_path, "Drive")
    colab_path = _write_csv(colab_data_path, "Colab")

    return drive_path, colab_path

print("\n" + "="*50)
print("SAVING DATASETS")
print("="*50)

# Save the individual quarter datasets and the combined dataset.
# The calls run inside a loop so that the last call's returned
# (drive_path, colab_path) tuple is not echoed as the cell's output.
for frame_to_save, csv_name, label in (
    (raw_jpm_q1_2025_df, "raw_jpm_q1_2025.csv", "Q1 2025 raw data"),
    (raw_jpm_q2_2025_df, "raw_jpm_q2_2025.csv", "Q2 2025 raw data"),
    (raw_jpm_multi_2025_df, "raw_jpm_multi_2025.csv", "Combined 2025 raw data"),
):
    save_dataset(frame_to_save, csv_name, label)



SAVING DATASETS
Saving Q1 2025 raw data...
  ✓ Drive: /content/drive/MyDrive/CAM_DS_AI_Project/data/raw/jpm/raw_jpm_q1_2025.csv
  ✓ Colab: /content/cam_ds_ai_project/data/raw/jpm/raw_jpm_q1_2025.csv
Saving Q2 2025 raw data...
  ✓ Drive: /content/drive/MyDrive/CAM_DS_AI_Project/data/raw/jpm/raw_jpm_q2_2025.csv
  ✓ Colab: /content/cam_ds_ai_project/data/raw/jpm/raw_jpm_q2_2025.csv
Saving Combined 2025 raw data...
  ✓ Drive: /content/drive/MyDrive/CAM_DS_AI_Project/data/raw/jpm/raw_jpm_multi_2025.csv
  ✓ Colab: /content/cam_ds_ai_project/data/raw/jpm/raw_jpm_multi_2025.csv


(PosixPath('/content/drive/MyDrive/CAM_DS_AI_Project/data/raw/jpm/raw_jpm_multi_2025.csv'),
 PosixPath('/content/cam_ds_ai_project/data/raw/jpm/raw_jpm_multi_2025.csv'))

In [11]:
## Data Loading Summary

print("\n" + "=" * 60)
print("DATA LOADING SUMMARY")
print("=" * 60)

# Frames to report on, keyed by the label shown in the summary.
summary_frames = {
    "Q1 2025": raw_jpm_q1_2025_df,
    "Q2 2025": raw_jpm_q2_2025_df,
    "Combined": raw_jpm_multi_2025_df,
}

# One "label: shape" line per frame that was actually loaded.
datasets_loaded = [
    f"{label}: {frame.shape}"
    for label, frame in summary_frames.items()
    if frame is not None
]

print("Datasets loaded:")
for dataset in datasets_loaded:
    print(f"  ✓ {dataset}")

# Deep memory footprint in bytes, summed over the loaded frames.
total_memory = sum(
    frame.memory_usage(deep=True).sum()
    for frame in summary_frames.values()
    if frame is not None
)

print(f"Total memory usage: {total_memory / 1024**2:.2f} MB")

print("\nNext step: Run 03_clean_preprocess.ipynb to clean and preprocess the data")

## Export Variables for Next Notebook

# Registry source: variable name -> (CSV file name, frame).
registry_sources = {
    "raw_jpm_q1_2025_df": ("raw_jpm_q1_2025.csv", raw_jpm_q1_2025_df),
    "raw_jpm_q2_2025_df": ("raw_jpm_q2_2025.csv", raw_jpm_q2_2025_df),
    "raw_jpm_multi_2025_df": ("raw_jpm_multi_2025.csv", raw_jpm_multi_2025_df),
}

# For each dataset record its CSV path on Drive, its shape (None when the
# frame is missing) and whether it was loaded.
data_registry = {
    variable_name: {
        "path": str(raw_data_path / csv_name),
        "shape": frame.shape if frame is not None else None,
        "loaded": frame is not None,
    }
    for variable_name, (csv_name, frame) in registry_sources.items()
}

# Save registry as indented JSON under the Drive base directory.
registry_path = drive_base / "data_registry.json"
with open(registry_path, "w") as f:
    json.dump(data_registry, f, indent=2)

print(f"✓ Data registry saved to: {registry_path}")


DATA LOADING SUMMARY
Datasets loaded:
  ✓ Q1 2025: (112, 10)
  ✓ Q2 2025: (149, 10)
  ✓ Combined: (261, 10)
Total memory usage: 0.52 MB

Next step: Run 03_clean_preprocess.ipynb to clean and preprocess the data
✓ Data registry saved to: /content/drive/MyDrive/CAM_DS_AI_Project/data_registry.json
