In [72]:
# 03b_manual_validation.ipynb
# Purpose: Load and validate manually labeled sentiment data
# Input: sentiment_sentence_jpm_multi_2025.csv from Google Drive
# Output: Validated manual labels for fine-tuning

## Import Libraries

import pandas as pd
import numpy as np
import json
import requests
import io
import csv
import tempfile
import os
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import warnings
warnings.filterwarnings('ignore')

# Statistical analysis
from scipy import stats
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Location A: Google Drive (Primary drive)
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

# Load configuration
config_path = Path("/content/drive/MyDrive/CAM_DS_AI_Project/config.json")
with open(config_path, "r") as f:
    config = json.load(f)

SEED = config["SEED"]
BANK_CODE = config["BANK_CODE"]
drive_base = Path(config["drive_base"])
colab_base = Path(config["colab_base"])

print(f"Manual validation for bank: {BANK_CODE.upper()}")



Mounted at /content/drive
Manual validation for bank: JPM


In [73]:
## Define Paths

processed_data_path = drive_base / "data/processed/jpm"
manual_validation_path = drive_base / "data/manual_validation/jpm"
results_sentiment_path = drive_base / "results/sentiment/jpm"

# Ensure directories exist
manual_validation_path.mkdir(parents=True, exist_ok=True)

## Download Manually Labeled Data

def extract_file_id_from_drive_url(url):
    """Extract file ID from Google Drive sharing URL."""
    if "drive.google.com" in url and "/file/d/" in url:
        return url.split("/file/d/")[1].split("/")[0]
    return None

def download_manual_labels_from_drive():
    """Download manually labeled dataset from Google Drive."""
    # URL provided by user
    manual_labels_url = "https://drive.google.com/file/d/1aiqBl0Xll6eFgjXKlixSmVfIslA39bnU/view?usp=drive_link"


    file_id = extract_file_id_from_drive_url(manual_labels_url)
    if not file_id:
        print("Error: Could not extract file ID from URL")
        return None

    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
    filename = "sentiment_sentence_jpm_multi_2025.csv"

    try:
        print(f"Downloading manually labeled data from Google Drive...")
        response = requests.get(download_url)
        response.raise_for_status()

        # Save to manual validation directory
        manual_file_path = manual_validation_path / filename
        with open(manual_file_path, 'wb') as f:
            f.write(response.content)

        # Also save to results directory for consistency
        results_file_path = results_sentiment_path / filename
        with open(results_file_path, 'wb') as f:
            f.write(response.content)

        print(f"Downloaded {filename}")
        print(f"  Manual validation: {manual_file_path}")
        print(f"  Results: {results_file_path}")

        return manual_file_path

    except Exception as e:
        print(f"Error downloading manual labels: {str(e)}")
        return None

# Download the manually labeled data
manual_labels_path = download_manual_labels_from_drive()




Downloading manually labeled data from Google Drive...
Downloaded sentiment_sentence_jpm_multi_2025.csv
  Manual validation: /content/drive/MyDrive/CAM_DS_AI_Project/data/manual_validation/jpm/sentiment_sentence_jpm_multi_2025.csv
  Results: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/sentiment_sentence_jpm_multi_2025.csv


In [74]:

## Load and Validate Manual Labels

def load_manual_labels(file_path: Path) -> pd.DataFrame:
    """Load manually labeled data with robust error handling for malformed CSV."""
    if not file_path or not file_path.exists():
        print("Error: Manual labels file not found")
        return None

    try:
        # Try multiple approaches to handle malformed CSV
        print("Attempting to load CSV with robust parsing...")

        # Approach 1: Standard loading with error handling
        try:
            df = pd.read_csv(file_path, encoding='utf-8')
            print(f"Loaded manual labels (standard method): {df.shape}")
            return df
        except pd.errors.ParserError as e:
            print(f"Standard parsing failed: {e}")

        # Approach 2: Skip bad lines
        try:
            print("Trying with error_bad_lines=False...")
            df = pd.read_csv(file_path, encoding='utf-8', on_bad_lines='skip')
            print(f"Loaded manual labels (skipped bad lines): {df.shape}")
            return df
        except Exception as e:
            print(f"Skip bad lines failed: {e}")

        # Approach 3: Use different separator or quoting
        try:
            print("Trying with flexible quoting...")
            df = pd.read_csv(file_path, encoding='utf-8', quoting=csv.QUOTE_ALL, on_bad_lines='skip')
            print(f"Loaded manual labels (flexible quoting): {df.shape}")
            return df
        except Exception as e:
            print(f"Flexible quoting failed: {e}")

        # Approach 4: Read as text and clean manually
        try:
            print("Attempting manual cleanup...")
            with open(file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            print(f"File has {len(lines)} lines")

            # Check problematic line 20
            if len(lines) > 20:
                print(f"Line 20 preview: {lines[19][:200]}...")

            # Try to identify the expected number of columns from header
            header_line = lines[0].strip()
            expected_columns = len(header_line.split(','))
            print(f"Expected columns from header: {expected_columns}")

            # Clean lines - remove lines with too many commas
            cleaned_lines = []
            for i, line in enumerate(lines):
                if i == 0:  # Keep header
                    cleaned_lines.append(line)
                else:
                    # Count commas (rough field count estimation)
                    field_count = line.count(',') + 1
                    if field_count <= expected_columns * 1.5:  # Allow some tolerance
                        cleaned_lines.append(line)
                    else:
                        print(f"Skipping line {i+1} with {field_count} fields")

            # Write cleaned data to temporary file
            with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.csv', encoding='utf-8') as tmp_file:
                tmp_file.writelines(cleaned_lines)
                tmp_path = tmp_file.name

            # Load cleaned file
            df = pd.read_csv(tmp_path, encoding='utf-8')

            # Clean up temp file
            os.unlink(tmp_path)

            print(f"Loaded manual labels (manual cleanup): {df.shape}")
            return df

        except Exception as e:
            print(f"Manual cleanup failed: {e}")

        # Approach 5: Last resort - try different encodings
        encodings = ['utf-8', 'latin-1', 'cp1252', 'iso-8859-1']
        for encoding in encodings:
            try:
                print(f"Trying encoding: {encoding}")
                df = pd.read_csv(file_path, encoding=encoding, on_bad_lines='skip')
                print(f"Loaded manual labels (encoding {encoding}): {df.shape}")
                return df
            except Exception as e:
                print(f"Encoding {encoding} failed: {e}")
                continue

        print("All loading approaches failed")
        return None

    except Exception as e:
        print(f"Error loading manual labels: {str(e)}")
        return None

def validate_manual_labels(df: pd.DataFrame) -> Dict:
    """Validate the structure and quality of manual labels."""
    if df is None:
        return {}

    print("\n" + "="*50)
    print("MANUAL LABELS VALIDATION")
    print("="*50)

    validation_report = {
        'total_records': len(df),
        'manually_labeled_count': 0,
        'label_distribution': {},
        'confidence_stats': {},
        'annotator_info': {},
        'data_quality': {}
    }

    # Check for required columns
    required_columns = ['human_label', 'human_confidence', 'annotation_notes', 'annotator_id']
    missing_columns = [col for col in required_columns if col not in df.columns]

    if missing_columns:
        print(f"Warning: Missing required columns: {missing_columns}")
        validation_report['missing_columns'] = missing_columns

        # If no manual label columns at all, see what we have
        print(f"Available columns: {list(df.columns)}")

        # Check for alternative column names
        alternative_mappings = {
            'label': 'human_label',
            'sentiment': 'human_label',
            'manual_label': 'human_label',
            'confidence': 'human_confidence',
            'manual_confidence': 'human_confidence',
            'notes': 'annotation_notes',
            'comments': 'annotation_notes',
            'annotator': 'annotator_id',
            'user_id': 'annotator_id'
        }

        found_alternatives = {}
        for alt_name, standard_name in alternative_mappings.items():
            if alt_name in df.columns and standard_name in missing_columns:
                found_alternatives[standard_name] = alt_name
                print(f"Found alternative column: '{alt_name}' for '{standard_name}'")

        # Rename columns if alternatives found
        if found_alternatives:
            df = df.rename(columns={v: k for k, v in found_alternatives.items()})
            missing_columns = [col for col in required_columns if col not in df.columns]
            print(f"After renaming, missing columns: {missing_columns}")

        if missing_columns:
            return validation_report

    # Count manually labeled records
    manually_labeled_mask = df['human_label'].notna() & (df['human_label'] != '')
    manually_labeled_df = df[manually_labeled_mask].copy()
    validation_report['manually_labeled_count'] = len(manually_labeled_df)

    print(f"Total records: {len(df):,}")
    print(f"Manually labeled: {len(manually_labeled_df):,} ({len(manually_labeled_df)/len(df)*100:.1f}%)")

    if len(manually_labeled_df) == 0:
        print("Warning: No manually labeled records found")
        return validation_report

    # Analyze label distribution
    label_dist = manually_labeled_df['human_label'].value_counts()
    validation_report['label_distribution'] = label_dist.to_dict()

    print(f"\nLabel Distribution:")
    for label, count in label_dist.items():
        pct = (count / len(manually_labeled_df)) * 100
        print(f"  {label}: {count} ({pct:.1f}%)")

    # Analyze confidence scores
    if 'human_confidence' in manually_labeled_df.columns:
        confidence_scores = manually_labeled_df['human_confidence'].dropna()
        if len(confidence_scores) > 0:
            validation_report['confidence_stats'] = {
                'mean': confidence_scores.mean(),
                'std': confidence_scores.std(),
                'min': confidence_scores.min(),
                'max': confidence_scores.max(),
                'median': confidence_scores.median()
            }

            print(f"\nConfidence Scores:")
            print(f"  Mean: {confidence_scores.mean():.2f}")
            print(f"  Std: {confidence_scores.std():.2f}")
            print(f"  Range: {confidence_scores.min():.2f} - {confidence_scores.max():.2f}")

    # Analyze annotator information
    if 'annotator_id' in manually_labeled_df.columns:
        annotator_dist = manually_labeled_df['annotator_id'].value_counts()
        validation_report['annotator_info'] = {
            'unique_annotators': len(annotator_dist),
            'annotations_per_annotator': annotator_dist.to_dict()
        }

        print(f"\nAnnotator Information:")
        print(f"  Unique annotators: {len(annotator_dist)}")
        for annotator, count in annotator_dist.items():
            print(f"  {annotator}: {count} annotations")

    # Data quality checks
    quality_issues = []

    # Check for missing confidence scores
    missing_confidence = manually_labeled_df['human_confidence'].isna().sum()
    if missing_confidence > 0:
        quality_issues.append(f"Missing confidence scores: {missing_confidence}")

    # Check for very low confidence annotations
    if 'human_confidence' in manually_labeled_df.columns:
        low_confidence = (manually_labeled_df['human_confidence'] < 0.5).sum()
        if low_confidence > 0:
            quality_issues.append(f"Low confidence annotations (<0.5): {low_confidence}")

    # Check for missing annotation notes
    missing_notes = manually_labeled_df['annotation_notes'].isna().sum()
    if missing_notes > 0:
        quality_issues.append(f"Missing annotation notes: {missing_notes}")

    validation_report['data_quality']['issues'] = quality_issues

    if quality_issues:
        print(f"\nData Quality Issues:")
        for issue in quality_issues:
            print(f"  Warning: {issue}")
    else:
        print(f"\nNo data quality issues detected")

    return validation_report

# Load and validate manual labels
manual_labels_df = load_manual_labels(manual_labels_path)
validation_report = validate_manual_labels(manual_labels_df)



Attempting to load CSV with robust parsing...
Loaded manual labels (standard method): (1110, 41)

MANUAL LABELS VALIDATION
Total records: 1,110
Manually labeled: 200 (18.0%)

Label Distribution:
  neutral: 126 (63.0%)
  positive: 44 (22.0%)
  negative: 29 (14.5%)
  poistive: 1 (0.5%)

Confidence Scores:
  Mean: 3.69
  Std: 0.66
  Range: 1.00 - 5.00

Annotator Information:
  Unique annotators: 1
  Annotator01: 200 annotations

Data Quality Issues:


In [75]:
## Compare Manual Labels with Model Predictions

def compare_manual_vs_model_predictions(df: pd.DataFrame) -> Dict:
    """Compare manual labels with existing model predictions."""
    if df is None:
        return {}

    print("\n" + "="*50)
    print("MANUAL VS MODEL COMPARISON")
    print("="*50)

    # Filter to manually labeled records
    manually_labeled_mask = df['human_label'].notna() & (df['human_label'] != '')
    manual_df = df[manually_labeled_mask].copy()

    if len(manual_df) == 0:
        print("No manually labeled records for comparison")
        return {}

    comparison_results = {}

    # Compare with FinBERT-tone predictions
    if 'finbert_tone_label' in manual_df.columns:
        finbert_comparison = analyze_model_vs_manual(
            manual_df, 'finbert_tone_label', 'human_label', 'FinBERT-tone'
        )
        comparison_results['finbert_tone'] = finbert_comparison

    # Compare with ProsusAI predictions
    if 'prosus_label' in manual_df.columns:
        prosus_comparison = analyze_model_vs_manual(
            manual_df, 'prosus_label', 'human_label', 'ProsusAI'
        )
        comparison_results['prosus'] = prosus_comparison

    return comparison_results

def analyze_model_vs_manual(df: pd.DataFrame, model_col: str, manual_col: str, model_name: str) -> Dict:
    """Analyze agreement between model predictions and manual labels."""

    # Filter valid comparisons
    valid_mask = df[model_col].notna() & df[manual_col].notna()
    comparison_df = df[valid_mask].copy()

    if len(comparison_df) == 0:
        return {'error': f'No valid comparisons for {model_name}'}

    print(f"\n{model_name} vs Manual Labels ({len(comparison_df)} comparisons)")
    print("-" * 40)

    # Calculate agreement
    agreement = (comparison_df[model_col] == comparison_df[manual_col]).mean()
    print(f"Overall Agreement: {agreement:.3f}")

    # Classification report
    try:
        report = classification_report(
            comparison_df[manual_col],
            comparison_df[model_col],
            output_dict=True,
            zero_division=0
        )

        print(f"Precision: {report['weighted avg']['precision']:.3f}")
        print(f"Recall: {report['weighted avg']['recall']:.3f}")
        print(f"F1-Score: {report['weighted avg']['f1-score']:.3f}")

    except Exception as e:
        print(f"Could not generate classification report: {e}")
        report = {}

    # Confusion matrix
    try:
        labels = sorted(list(set(comparison_df[manual_col].unique()) | set(comparison_df[model_col].unique())))
        cm = confusion_matrix(comparison_df[manual_col], comparison_df[model_col], labels=labels)

        print(f"\nConfusion Matrix:")
        print(f"Manual\\Model: {' '.join([f'{l:>8}' for l in labels])}")
        for i, manual_label in enumerate(labels):
            row_str = f"{manual_label:>12}: {' '.join([f'{cm[i,j]:>8}' for j in range(len(labels))])}"
            print(row_str)

    except Exception as e:
        print(f"Could not generate confusion matrix: {e}")
        cm = None

    # Agreement by confidence level
    confidence_analysis = {}
    if 'human_confidence' in comparison_df.columns:
        # High confidence vs low confidence agreement
        high_conf_mask = comparison_df['human_confidence'] >= 0.8
        if high_conf_mask.sum() > 0:
            high_conf_agreement = (
                comparison_df.loc[high_conf_mask, model_col] ==
                comparison_df.loc[high_conf_mask, manual_col]
            ).mean()
            confidence_analysis['high_confidence_agreement'] = high_conf_agreement

        low_conf_mask = comparison_df['human_confidence'] < 0.6
        if low_conf_mask.sum() > 0:
            low_conf_agreement = (
                comparison_df.loc[low_conf_mask, model_col] ==
                comparison_df.loc[low_conf_mask, manual_col]
            ).mean()
            confidence_analysis['low_confidence_agreement'] = low_conf_agreement

        print(f"\nAgreement by Confidence:")
        if 'high_confidence_agreement' in confidence_analysis:
            print(f"  High confidence (≥0.8): {confidence_analysis['high_confidence_agreement']:.3f}")
        if 'low_confidence_agreement' in confidence_analysis:
            print(f"  Low confidence (<0.6): {confidence_analysis['low_confidence_agreement']:.3f}")

    return {
        'total_comparisons': len(comparison_df),
        'agreement_rate': agreement,
        'classification_report': report,
        'confusion_matrix': cm.tolist() if cm is not None else None,
        'confusion_matrix_labels': labels if cm is not None else None,
        'confidence_analysis': confidence_analysis
    }

# Compare manual labels with model predictions
if manual_labels_df is not None:
    model_comparison_results = compare_manual_vs_model_predictions(manual_labels_df)




MANUAL VS MODEL COMPARISON

FinBERT-tone vs Manual Labels (200 comparisons)
----------------------------------------
Overall Agreement: 0.895
Precision: 0.897
Recall: 0.895
F1-Score: 0.894

Confusion Matrix:
Manual\Model: negative  neutral poistive positive
    negative:       28        1        0        0
     neutral:        9      113        0        4
    poistive:        0        1        0        0
    positive:        0        6        0       38

Agreement by Confidence:
  High confidence (≥0.8): 0.895

ProsusAI vs Manual Labels (200 comparisons)
----------------------------------------
Overall Agreement: 0.855
Precision: 0.855
Recall: 0.855
F1-Score: 0.853

Confusion Matrix:
Manual\Model: negative  neutral poistive positive
    negative:       20        8        0        1
     neutral:        4      111        0       11
    poistive:        0        0        0        1
    positive:        0        4        0       40

Agreement by Confidence:
  High confidence (≥0.8): 0.85

In [76]:
## Prepare Training/Validation Split

def prepare_manual_data_for_training(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame, Dict]:
    """Prepare manually labeled data for model fine-tuning."""
    if df is None:
        return None, None, {}

    print("\n" + "="*50)
    print("PREPARING DATA FOR FINE-TUNING")
    print("="*50)

    # Filter to manually labeled records
    manually_labeled_mask = df['human_label'].notna() & (df['human_label'] != '')
    manual_df = df[manually_labeled_mask].copy()

    if len(manual_df) == 0:
        print("No manually labeled data available for training")
        return None, None, {}

    print(f"Total manually labeled records: {len(manual_df)}")

    # Stratified split by label and confidence
    from sklearn.model_selection import train_test_split

    # Create stratification groups based on label and confidence level
    if 'human_confidence' in manual_df.columns:
        manual_df['confidence_group'] = pd.cut(
            manual_df['human_confidence'].fillna(0.5),
            bins=[0, 0.6, 0.8, 1.0],
            labels=['low', 'medium', 'high'],
            include_lowest=True
        )

        manual_df['stratify_group'] = manual_df['human_label'].astype(str) + '_' + manual_df['confidence_group'].astype(str)
        stratify_col = 'stratify_group'
    else:
        stratify_col = 'human_label'

    # Split data (80/20 for training/validation)
    try:
        train_df, val_df = train_test_split(
            manual_df,
            test_size=0.2,
            random_state=SEED,
            stratify=manual_df[stratify_col]
        )

        print(f"Training set: {len(train_df)} records")
        print(f"Validation set: {len(val_df)} records")

    except ValueError as e:
        # If stratification fails (too few samples), do random split
        print(f"Stratification failed ({e}), using random split")
        train_df, val_df = train_test_split(
            manual_df,
            test_size=0.2,
            random_state=SEED
        )

        print(f"Training set: {len(train_df)} records")
        print(f"Validation set: {len(val_df)} records")

    # Analyze splits
    split_analysis = {
        'total_manual_labels': len(manual_df),
        'train_size': len(train_df),
        'val_size': len(val_df),
        'train_label_dist': train_df['human_label'].value_counts().to_dict(),
        'val_label_dist': val_df['human_label'].value_counts().to_dict()
    }

    print(f"\nTraining Label Distribution:")
    for label, count in split_analysis['train_label_dist'].items():
        pct = (count / len(train_df)) * 100
        print(f"  {label}: {count} ({pct:.1f}%)")

    print(f"\nValidation Label Distribution:")
    for label, count in split_analysis['val_label_dist'].items():
        pct = (count / len(val_df)) * 100
        print(f"  {label}: {count} ({pct:.1f}%)")

    return train_df, val_df, split_analysis

# Prepare data for training
if manual_labels_df is not None:
    train_df, val_df, split_analysis = prepare_manual_data_for_training(manual_labels_df)



PREPARING DATA FOR FINE-TUNING
Total manually labeled records: 200
Stratification failed (The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.), using random split
Training set: 160 records
Validation set: 40 records

Training Label Distribution:
  neutral: 105 (65.6%)
  positive: 32 (20.0%)
  negative: 22 (13.8%)
  poistive: 1 (0.6%)

Validation Label Distribution:
  neutral: 21 (52.5%)
  positive: 12 (30.0%)
  negative: 7 (17.5%)


In [77]:
## Quality Assessment and Recommendations

def assess_manual_data_quality(df: pd.DataFrame, validation_report: Dict, comparison_results: Dict) -> Dict:
    """Assess overall quality and provide recommendations."""
    print("\n" + "="*60)
    print("MANUAL DATA QUALITY ASSESSMENT")
    print("="*60)

    assessment = {
        'overall_quality': 'unknown',
        'recommendations': [],
        'fine_tuning_readiness': False,
        'key_metrics': {}
    }

    if df is None or not validation_report:
        assessment['recommendations'].append("Manual labeling data not available")
        return assessment

    # Extract key metrics
    manually_labeled_count = validation_report.get('manually_labeled_count', 0)
    total_records = validation_report.get('total_records', 0)

    assessment['key_metrics'] = {
        'manual_label_coverage': manually_labeled_count / total_records if total_records > 0 else 0,
        'manual_label_count': manually_labeled_count,
        'label_distribution': validation_report.get('label_distribution', {}),
        'avg_confidence': validation_report.get('confidence_stats', {}).get('mean', 0)
    }

    # Assess data quantity
    if manually_labeled_count < 50:
        assessment['recommendations'].append("Very small dataset - consider active learning or data augmentation")
        quality_score = 1
    elif manually_labeled_count < 100:
        assessment['recommendations'].append("Small dataset - fine-tuning may have limited impact")
        quality_score = 2
    elif manually_labeled_count < 500:
        assessment['recommendations'].append("Moderate dataset size - suitable for fine-tuning")
        quality_score = 3
    else:
        assessment['recommendations'].append("Good dataset size for fine-tuning")
        quality_score = 4

    # Assess label balance
    label_dist = validation_report.get('label_distribution', {})
    if label_dist:
        label_counts = list(label_dist.values())
        min_count = min(label_counts)
        max_count = max(label_counts)
        balance_ratio = min_count / max_count if max_count > 0 else 0

        if balance_ratio < 0.1:
            assessment['recommendations'].append("Highly imbalanced labels - consider resampling techniques")
            quality_score = min(quality_score, 2)
        elif balance_ratio < 0.3:
            assessment['recommendations'].append("Moderately imbalanced labels - use class weights in training")
            quality_score = min(quality_score, 3)
        else:
            assessment['recommendations'].append("Well-balanced label distribution")

    # Assess confidence scores
    confidence_stats = validation_report.get('confidence_stats', {})
    if confidence_stats:
        avg_confidence = confidence_stats.get('mean', 0)
        if avg_confidence < 0.6:
            assessment['recommendations'].append("Low average confidence - review annotation quality")
            quality_score = min(quality_score, 2)
        elif avg_confidence > 0.8:
            assessment['recommendations'].append("High confidence annotations")
        else:
            assessment['recommendations'].append("Moderate confidence annotations")

    # Assess model agreement
    if comparison_results:
        model_agreements = []
        for model_name, results in comparison_results.items():
            if isinstance(results, dict) and 'agreement_rate' in results:
                agreement = results['agreement_rate']
                model_agreements.append(agreement)

                if agreement < 0.5:
                    assessment['recommendations'].append(f"Low agreement with {model_name} ({agreement:.2f}) - manual labels may be needed")
                elif agreement < 0.7:
                    assessment['recommendations'].append(f"Moderate agreement with {model_name} ({agreement:.2f}) - good for fine-tuning")
                else:
                    assessment['recommendations'].append(f"High agreement with {model_name} ({agreement:.2f}) - models already performing well")

        if model_agreements:
            avg_agreement = np.mean(model_agreements)
            assessment['key_metrics']['avg_model_agreement'] = avg_agreement

    # Overall quality assessment
    if quality_score >= 4:
        assessment['overall_quality'] = 'excellent'
        assessment['fine_tuning_readiness'] = True
    elif quality_score >= 3:
        assessment['overall_quality'] = 'good'
        assessment['fine_tuning_readiness'] = True
    elif quality_score >= 2:
        assessment['overall_quality'] = 'fair'
        assessment['fine_tuning_readiness'] = True
        assessment['recommendations'].append("Proceed with caution - may need data augmentation")
    else:
        assessment['overall_quality'] = 'poor'
        assessment['fine_tuning_readiness'] = False
        assessment['recommendations'].append("Not recommended for fine-tuning without more data")

    # Print assessment
    print(f"Overall Quality: {assessment['overall_quality'].upper()}")
    print(f"Fine-tuning Ready: {'Yes' if assessment['fine_tuning_readiness'] else 'No'}")
    print(f"\nKey Metrics:")
    for metric, value in assessment['key_metrics'].items():
        if isinstance(value, float):
            print(f"  {metric}: {value:.3f}")
        else:
            print(f"  {metric}: {value}")

    print(f"\nRecommendations:")
    for rec in assessment['recommendations']:
        print(f"  {rec}")

    return assessment

# Assess overall quality
if manual_labels_df is not None:
    quality_assessment = assess_manual_data_quality(
        manual_labels_df, validation_report, model_comparison_results
    )




MANUAL DATA QUALITY ASSESSMENT
Overall Quality: FAIR
Fine-tuning Ready: Yes

Key Metrics:
  manual_label_coverage: 0.180
  manual_label_count: 200
  label_distribution: {'neutral': 126, 'positive': 44, 'negative': 29, 'poistive': 1}
  avg_confidence: 3.695
  avg_model_agreement: 0.875

Recommendations:
  Moderate dataset size - suitable for fine-tuning
  Highly imbalanced labels - consider resampling techniques
  High confidence annotations
  High agreement with finbert_tone (0.90) - models already performing well
  High agreement with prosus (0.85) - models already performing well
  Proceed with caution - may need data augmentation


In [78]:
## Save Validation Results

def save_validation_results():
    """Save all validation results and prepared data."""
    print("\n" + "="*50)
    print("SAVING VALIDATION RESULTS")
    print("="*50)

    # Save validation report
    validation_report_path = manual_validation_path / "manual_validation_report.json"
    with open(validation_report_path, 'w') as f:
        json.dump(validation_report, f, indent=2, default=str)
    print(f"Validation report: {validation_report_path}")

    # Save model comparison results
    if 'model_comparison_results' in locals():
        comparison_path = manual_validation_path / "manual_vs_model_comparison.json"
        with open(comparison_path, 'w') as f:
            json.dump(model_comparison_results, f, indent=2, default=str)
        print(f"Model comparison: {comparison_path}")

    # Save quality assessment
    if 'quality_assessment' in locals():
        quality_path = manual_validation_path / "data_quality_assessment.json"
        with open(quality_path, 'w') as f:
            json.dump(quality_assessment, f, indent=2, default=str)
        print(f"Quality assessment: {quality_path}")

    # Save training/validation splits
    if 'train_df' in locals() and train_df is not None:
        train_path = manual_validation_path / "train_manual_labels.csv"
        train_df.to_csv(train_path, index=False)
        print(f"Training data: {train_path} ({len(train_df)} records)")

    if 'val_df' in locals() and val_df is not None:
        val_path = manual_validation_path / "val_manual_labels.csv"
        val_df.to_csv(val_path, index=False)
        print(f"Validation data: {val_path} ({len(val_df)} records)")

    # Save full manual dataset to results directory for next notebooks
    if manual_labels_df is not None:
        full_manual_path = results_sentiment_path / "sentiment_sentence_jpm_multi_2025_validated.csv"
        manual_labels_df.to_csv(full_manual_path, index=False)
        print(f"Full validated dataset: {full_manual_path}")

# Save all results
save_validation_results()



SAVING VALIDATION RESULTS
Validation report: /content/drive/MyDrive/CAM_DS_AI_Project/data/manual_validation/jpm/manual_validation_report.json
Full validated dataset: /content/drive/MyDrive/CAM_DS_AI_Project/results/sentiment/jpm/sentiment_sentence_jpm_multi_2025_validated.csv


In [79]:

## Summary and Next Steps

print("\n" + "="*60)
print("MANUAL VALIDATION COMPLETE")
print("="*60)

if manual_labels_df is not None:
    manually_labeled_count = validation_report.get('manually_labeled_count', 0)
    total_records = validation_report.get('total_records', 0)
    coverage = manually_labeled_count / total_records if total_records > 0 else 0

    print(f"Summary:")
    print(f"  Total records: {total_records:,}")
    print(f"  Manually labeled: {manually_labeled_count:,} ({coverage:.1%})")

    if 'quality_assessment' in locals():
        print(f"  Data quality: {quality_assessment['overall_quality']}")
        print(f"  Fine-tuning ready: {'Yes' if quality_assessment['fine_tuning_readiness'] else 'No'}")

    if 'train_df' in locals() and train_df is not None:
        print(f"  Training samples: {len(train_df)}")
        print(f"  Validation samples: {len(val_df)}")

    print(f"\nFiles created:")
    print(f"  Manual validation data: {manual_validation_path}")
    print(f"  Training/validation splits: Ready for fine-tuning")

    print(f"\nNext steps:")
    if quality_assessment.get('fine_tuning_readiness', False):
        print(f"  Proceed to 04b_model_finetuning.ipynb")
        print(f"  Use prepared train/val splits for fine-tuning")
    else:
        print(f"  Consider collecting more manual labels")
        print(f"  Review data quality issues before fine-tuning")

    print(f"  Continue with enhanced sentiment analysis in 04_sentiment_analysis_jpm_enhanced.ipynb")

else:
    print("Manual validation failed - no data available")
    print("   Check Google Drive URL and file permissions")
    print("   Ensure manual labeling is complete")

print(f"\nManual validation process complete!")






MANUAL VALIDATION COMPLETE
Summary:
  Total records: 1,110
  Manually labeled: 200 (18.0%)
  Data quality: fair
  Fine-tuning ready: Yes
  Training samples: 160
  Validation samples: 40

Files created:
  Manual validation data: /content/drive/MyDrive/CAM_DS_AI_Project/data/manual_validation/jpm
  Training/validation splits: Ready for fine-tuning

Next steps:
  Proceed to 04b_model_finetuning.ipynb
  Use prepared train/val splits for fine-tuning
  Continue with enhanced sentiment analysis in 04_sentiment_analysis_jpm_enhanced.ipynb

Manual validation process complete!
