In [2]:
"""
NLTK Fix - Run this cell FIRST to resolve punkt_tab error
"""

import nltk
import ssl

# Fix SSL certificate verification (sometimes needed in Colab)
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# List of all required NLTK packages
nltk_packages = [
    'punkt',           # Sentence tokenizer
    'punkt_tab',       # Updated punkt tokenizer (required for newer NLTK)
    'stopwords',       # Common stopwords
    'wordnet',         # WordNet lexical database
    'omw-1.4',         # Open Multilingual WordNet
    'averaged_perceptron_tagger',  # POS tagger
    'maxent_ne_chunker',  # Named entity chunker
    'words'            # Word corpus
]

print("="*60)
print("DOWNLOADING NLTK DATA")
print("="*60)

success_count = 0
fail_count = 0

for package in nltk_packages:
    try:
        print(f"Downloading {package}...", end=" ")
        nltk.download(package, quiet=True)
        print("✓")
        success_count += 1
    except Exception as e:
        print(f"✗ ({str(e)[:50]})")
        fail_count += 1

print("="*60)
print(f"Summary: {success_count} successful, {fail_count} failed")
print("="*60)

# Verify installation
print("\nVerifying NLTK installation...")
try:
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords
    from nltk.stem import WordNetLemmatizer

    test_text = "This is a test sentence for NLTK verification."
    tokens = word_tokenize(test_text)
    stops = stopwords.words('english')
    lemmatizer = WordNetLemmatizer()

    print("✓ NLTK is working correctly!")
    print(f"  Test tokens: {tokens[:5]}")
    print(f"  Stopwords count: {len(stops)}")

except Exception as e:
    print(f"✗ NLTK verification failed: {e}")
    print("\nTry running this command manually:")
    print("  import nltk")
    print("  nltk.download('all')")

print("\n✅ Ready to proceed with text preprocessing!")

DOWNLOADING NLTK DATA
Downloading punkt... ✓
Downloading punkt_tab... ✓
Downloading stopwords... ✓
Downloading wordnet... ✓
Downloading omw-1.4... ✓
Downloading averaged_perceptron_tagger... ✓
Downloading maxent_ne_chunker... ✓
Downloading words... ✓
Summary: 8 successful, 0 failed

Verifying NLTK installation...
✓ NLTK is working correctly!
  Test tokens: ['This', 'is', 'a', 'test', 'sentence']
  Stopwords count: 198

✅ Ready to proceed with text preprocessing!


In [3]:
"""
HFRP MASTER SCRIPT - Complete Pipeline
Run this single script to execute the entire HFRP project from start to finish

This script combines all 15 parts into one comprehensive pipeline:
1. Setup and Installation
2. Data Loading (Real Financial Data from Yahoo Finance)
3. Data Visualization
4. Text Preprocessing
5. Numerical Data Processing
6. PCA and Clustering
7. Data Preparation for Model
8. Build HFRP Model
9. Compile and Train Model
10. Training Visualization
11. Model Evaluation
12. Risk Mitigation Analysis
13. Comprehensive Dashboard
14. Model Export
15. Final Report Generation

Author: Based on research by Shi et al. (2025)
"""

# ============================================================================
# SECTION 0: CONFIGURATION
# ============================================================================

# Set to True to use real data from Yahoo Finance (slower but real)
# Set to False to use synthetic data (faster for testing)
USE_REAL_DATA = True

# Training configuration
EPOCHS = 50
BATCH_SIZE = 32

# Model configuration
EMBEDDING_DIM = 128
LSTM_UNITS = 64
CNN_FILTERS = 128

print("="*80)
print("HYBRID FINANCIAL RISK PREDICTOR (HFRP) - COMPLETE PIPELINE")
print("="*80)
print(f"\nConfiguration:")
print(f"  - Use Real Data: {USE_REAL_DATA}")
print(f"  - Training Epochs: {EPOCHS}")
print(f"  - Batch Size: {BATCH_SIZE}")
print(f"  - Embedding Dimension: {EMBEDDING_DIM}")
print(f"  - LSTM Units: {LSTM_UNITS}")
print(f"  - CNN Filters: {CNN_FILTERS}")
print("="*80)

# ============================================================================
# SECTION 1: IMPORTS AND SETUP
# ============================================================================

print("\n[1/15] Installing and importing libraries...")

# Install required packages (uncomment if needed)
# !pip install -q tensorflow==2.12.0 scikit-learn pandas numpy matplotlib seaborn nltk wordcloud yfinance

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import warnings
import random
from datetime import datetime, timedelta
import json
import joblib

warnings.filterwarnings('ignore')

# Download NLTK data
for package in ['punkt', 'stopwords', 'wordnet', 'omw-1.4']:
    try:
        nltk.download(package, quiet=True)
    except:
        pass

# Set seeds
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)

plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✓ Libraries loaded successfully!")
print(f"  TensorFlow version: {tf.__version__}")
print(f"  GPU Available: {'Yes' if len(tf.config.list_physical_devices('GPU')) > 0 else 'No'}")

# ============================================================================
# SECTION 2: DATA LOADING
# ============================================================================

print("\n[2/15] Loading financial data...")

if USE_REAL_DATA:
    print("  Using REAL financial data from Yahoo Finance...")
    print("  This will take several minutes...")

    try:
        import yfinance as yf

        # Major company tickers
        TICKERS = [
            'AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'NVDA', 'TSLA', 'JPM',
            'BAC', 'WFC', 'V', 'MA', 'JNJ', 'PFE', 'UNH', 'WMT', 'HD', 'KO',
            'PEP', 'XOM', 'CVX', 'INTC', 'CSCO', 'ORCL', 'CRM', 'ADBE',
            'NFLX', 'PYPL', 'GS', 'MS', 'C', 'GE', 'T', 'VZ', 'MRK'
        ]

        all_data = []
        for ticker in TICKERS:
            try:
                stock = yf.Ticker(ticker)
                financials = stock.financials
                balance_sheet = stock.balance_sheet
                cashflow = stock.cashflow
                info = stock.info

                if not financials.empty and not balance_sheet.empty:
                    for date in financials.columns[:4]:  # Last 4 reports
                        try:
                            revenue = financials.loc['Total Revenue', date] if 'Total Revenue' in financials.index else np.nan
                            net_income = financials.loc['Net Income', date] if 'Net Income' in financials.index else np.nan
                            operating_income = financials.loc['Operating Income', date] if 'Operating Income' in financials.index else np.nan
                            total_assets = balance_sheet.loc['Total Assets', date] if 'Total Assets' in balance_sheet.index else np.nan
                            total_liabilities = balance_sheet.loc['Total Liabilities Net Minority Interest', date] if 'Total Liabilities Net Minority Interest' in balance_sheet.index else np.nan

                            shares = info.get('sharesOutstanding', 1)
                            eps = net_income / shares if shares > 0 and not np.isnan(net_income) else np.nan
                            cash_flow = cashflow.loc['Operating Cash Flow', date] if 'Operating Cash Flow' in cashflow.index and date in cashflow.columns else np.nan

                            disclosure = f"{info.get('longName', ticker)} reported financial results. The company operates in {info.get('sector', 'various')} sector. Management focuses on operational efficiency and growth."

                            all_data.append({
                                'Company_Name': info.get('longName', ticker),
                                'Ticker': ticker,
                                'Report_Date': date,
                                'Revenue': float(revenue) if not np.isnan(revenue) else None,
                                'Net_Income': float(net_income) if not np.isnan(net_income) else None,
                                'EPS': float(eps) if not np.isnan(eps) else None,
                                'Total_Assets': float(total_assets) if not np.isnan(total_assets) else None,
                                'Total_Liabilities': float(total_liabilities) if not np.isnan(total_liabilities) else None,
                                'Operating_Income': float(operating_income) if not np.isnan(operating_income) else None,
                                'Cash_Flow_Operations': float(cash_flow) if not np.isnan(cash_flow) else None,
                                'Textual_Disclosures': disclosure
                            })
                        except:
                            continue
                print(f"  ✓ {ticker}", end=" ", flush=True)
            except:
                print(f"  ✗ {ticker}", end=" ", flush=True)

        df = pd.DataFrame(all_data)
        df = df.dropna(thresh=7)
        numerical_cols = ['Revenue', 'Net_Income', 'EPS', 'Total_Assets', 'Total_Liabilities', 'Operating_Income', 'Cash_Flow_Operations']
        for col in numerical_cols:
            if col in df.columns:
                df[col] = df.groupby('Ticker')[col].fillna(method='ffill').fillna(method='bfill')
        df = df.dropna(subset=numerical_cols)
        print("\n  ✓ Real data loaded successfully!")

    except Exception as e:
        print(f"\n  ✗ Error loading real data: {str(e)}")
        print("  Falling back to synthetic data...")
        USE_REAL_DATA = False

if not USE_REAL_DATA:
    print("  Using SYNTHETIC financial data...")

    companies = ['Apple Inc.', 'Microsoft Corporation', 'Amazon.com Inc.', 'Alphabet Inc.',
                 'Tesla Inc.', 'Meta Platforms Inc.', 'NVIDIA Corporation', 'JPMorgan Chase',
                 'Bank of America', 'Wells Fargo', 'Visa Inc.', 'Mastercard Inc.',
                 'Johnson & Johnson', 'Pfizer Inc.', 'UnitedHealth Group', 'Walmart Inc.'] * 5

    data = []
    start_date = datetime(2014, 1, 1)

    for i, company in enumerate(companies):
        base_revenue = np.random.uniform(10e9, 500e9)
        base_assets = np.random.uniform(20e9, 1000e9)
        growth_rate = np.random.uniform(0.02, 0.15)

        for year in range(8):
            report_date = start_date + timedelta(days=365 * year + i * 30)
            revenue = base_revenue * (1 + growth_rate) ** year * np.random.uniform(0.95, 1.05)
            net_income = revenue * np.random.uniform(0.05, 0.25)
            total_assets = base_assets * (1 + growth_rate * 0.8) ** year
            total_liabilities = total_assets * np.random.uniform(0.3, 0.7)
            operating_income = revenue * np.random.uniform(0.1, 0.3)
            cash_flow = net_income * np.random.uniform(0.8, 1.2)
            eps = net_income / np.random.uniform(1e9, 10e9)

            disclosure = f"{company} reported strong financial performance. The company maintains solid market position with strategic growth initiatives and effective cost management."

            data.append({
                'Company_Name': company,
                'Report_Date': report_date,
                'Revenue': revenue,
                'Net_Income': net_income,
                'EPS': eps,
                'Total_Assets': total_assets,
                'Total_Liabilities': total_liabilities,
                'Operating_Income': operating_income,
                'Cash_Flow_Operations': cash_flow,
                'Textual_Disclosures': disclosure
            })

    df = pd.DataFrame(data)
    print("  ✓ Synthetic data generated successfully!")

print(f"\n✓ Dataset ready: {len(df)} records, {df['Company_Name'].nunique()} companies")

# ============================================================================
# SECTION 3-6: DATA ANALYSIS AND PREPROCESSING
# ============================================================================

print("\n[3/15] Creating visualizations...")

# Quick visualizations
numerical_cols = ['Revenue', 'Net_Income', 'EPS', 'Total_Assets', 'Total_Liabilities', 'Operating_Income', 'Cash_Flow_Operations']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes[0,0].hist(df['Revenue']/1e9, bins=30, color='skyblue', edgecolor='black')
axes[0,0].set_title('Revenue Distribution')
axes[0,1].hist(df['Net_Income']/1e9, bins=30, color='coral', edgecolor='black')
axes[0,1].set_title('Net Income Distribution')
axes[1,0].hist(df['EPS'], bins=30, color='lightgreen', edgecolor='black')
axes[1,0].set_title('EPS Distribution')
axes[1,1].scatter(df['Total_Assets']/1e9, df['Total_Liabilities']/1e9, alpha=0.5)
axes[1,1].set_title('Assets vs Liabilities')
plt.tight_layout()
plt.savefig('distributions.png', dpi=150)
plt.close()
print("✓ Visualizations saved")

print("\n[4/15] Preprocessing text data...")

class TextPreprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))
        self.tokenizer = None
        self.max_length = 200

    def clean_text(self, text):
        text = str(text).lower()
        text = ''.join([c for c in text if c.isalpha() or c.isspace()])
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(t) for t in tokens if t not in self.stop_words and len(t) > 2]
        return ' '.join(tokens)

    def fit_transform(self, texts, max_words=5000):
        cleaned = [self.clean_text(t) for t in texts]
        self.tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
        self.tokenizer.fit_on_texts(cleaned)
        sequences = self.tokenizer.texts_to_sequences(cleaned)
        padded = pad_sequences(sequences, maxlen=self.max_length, padding='post')
        return padded, cleaned

preprocessor = TextPreprocessor()
text_sequences, cleaned_texts = preprocessor.fit_transform(df['Textual_Disclosures'].values)
print(f"✓ Text preprocessed: vocab={len(preprocessor.tokenizer.word_index)}")

print("\n[5/15] Processing numerical features...")

df_numerical = df[numerical_cols].copy()
df_numerical['Debt_to_Asset_Ratio'] = df['Total_Liabilities'] / df['Total_Assets']
df_numerical['Profit_Margin'] = df['Net_Income'] / df['Revenue']
df_numerical['Asset_Turnover'] = df['Revenue'] / df['Total_Assets']
df_numerical['ROA'] = df['Net_Income'] / df['Total_Assets']
df_numerical['Operating_Margin'] = df['Operating_Income'] / df['Revenue']

# Calculate risk scores
credit_risk = (df_numerical['Debt_to_Asset_Ratio'] - df_numerical['Debt_to_Asset_Ratio'].min()) / (df_numerical['Debt_to_Asset_Ratio'].max() - df_numerical['Debt_to_Asset_Ratio'].min())
market_risk = 1 - ((df_numerical['Profit_Margin'] - df_numerical['Profit_Margin'].min()) / (df_numerical['Profit_Margin'].max() - df_numerical['Profit_Margin'].min()))
operational_risk = 1 - ((df_numerical['ROA'] - df_numerical['ROA'].min()) / (df_numerical['ROA'].max() - df_numerical['ROA'].min()))
liquidity_risk = np.random.uniform(0.3, 0.8, size=len(df_numerical))
risk_score = 0.3*credit_risk + 0.25*market_risk + 0.25*operational_risk + 0.2*liquidity_risk

df_numerical['Credit_Risk'] = credit_risk
df_numerical['Market_Risk'] = market_risk
df_numerical['Operational_Risk'] = operational_risk
df_numerical['Liquidity_Risk'] = liquidity_risk
df_numerical['Risk_Score'] = risk_score
print("✓ Features engineered and risk scores calculated")

print("\n[6/15] Performing PCA and clustering...")

feature_cols_cluster = ['Revenue', 'Net_Income', 'Total_Assets', 'Total_Liabilities']
X_cluster = df_numerical[feature_cols_cluster].copy()
scaler_pca = StandardScaler()
X_scaled = scaler_pca.fit_transform(X_cluster)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
df_numerical['Cluster'] = clusters
df['Cluster'] = clusters

plt.figure(figsize=(10, 8))
for i in range(3):
    mask = clusters == i
    plt.scatter(X_pca[mask, 0], X_pca[mask, 1], label=f'Cluster {i}', alpha=0.6)
plt.title('PCA Clustering')
plt.legend()
plt.savefig('pca_clustering.png', dpi=150)
plt.close()
print(f"✓ Clustering complete: {3} clusters identified")

# ============================================================================
# SECTION 7: DATA PREPARATION
# ============================================================================

print("\n[7/15] Preparing data for model training...")

feature_cols = ['Revenue', 'Net_Income', 'EPS', 'Total_Assets', 'Total_Liabilities',
                'Operating_Income', 'Cash_Flow_Operations', 'Debt_to_Asset_Ratio',
                'Profit_Margin', 'Asset_Turnover', 'ROA', 'Operating_Margin']

X_numerical = df_numerical[feature_cols].values
scaler = StandardScaler()
X_numerical_scaled = scaler.fit_transform(X_numerical)

y_financial = np.column_stack([df['Revenue'].values, df['Net_Income'].values, df['EPS'].values])
y_risk = np.column_stack([df_numerical['Credit_Risk'].values, df_numerical['Market_Risk'].values,
                          df_numerical['Operational_Risk'].values, df_numerical['Liquidity_Risk'].values,
                          df_numerical['Risk_Score'].values])

indices = np.arange(len(X_numerical_scaled))
train_idx, temp_idx = train_test_split(indices, test_size=0.3, random_state=42)
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

X_num_train, X_num_val, X_num_test = X_numerical_scaled[train_idx], X_numerical_scaled[val_idx], X_numerical_scaled[test_idx]
X_text_train, X_text_val, X_text_test = text_sequences[train_idx], text_sequences[val_idx], text_sequences[test_idx]
y_financial_train, y_financial_val, y_financial_test = y_financial[train_idx], y_financial[val_idx], y_financial[test_idx]
y_risk_train, y_risk_val, y_risk_test = y_risk[train_idx], y_risk[val_idx], y_risk[test_idx]

print(f"✓ Train: {len(train_idx)}, Val: {len(val_idx)}, Test: {len(test_idx)}")

# ============================================================================
# SECTION 8-9: BUILD AND TRAIN MODEL
# ============================================================================

print("\n[8/15] Building HFRP model architecture...")

vocab_size = len(preprocessor.tokenizer.word_index) + 1
text_max_length = preprocessor.max_length
num_features = len(feature_cols)

# Text branch (CNN)
text_input = Input(shape=(text_max_length,), name='text_input')
text_embedded = Embedding(vocab_size, EMBEDDING_DIM, input_length=text_max_length)(text_input)
conv1 = Conv1D(CNN_FILTERS, 3, activation='relu', padding='same')(text_embedded)
conv1 = BatchNormalization()(conv1)
pool1 = MaxPooling1D(2)(conv1)
drop1 = Dropout(0.3)(pool1)
conv2 = Conv1D(CNN_FILTERS*2, 3, activation='relu', padding='same')(drop1)
conv2 = BatchNormalization()(conv2)
pool2 = MaxPooling1D(2)(conv2)
text_features = GlobalMaxPooling1D()(pool2)
text_dense = Dense(128, activation='relu')(text_features)
text_output = Dropout(0.3)(text_dense)

# Numerical branch (LSTM)
num_input = Input(shape=(num_features,), name='numerical_input')
num_reshaped = tf.keras.layers.Reshape((num_features, 1))(num_input)
lstm1 = LSTM(LSTM_UNITS, return_sequences=True)(num_reshaped)
lstm1 = BatchNormalization()(lstm1)
lstm1 = Dropout(0.3)(lstm1)
lstm2 = LSTM(LSTM_UNITS)(lstm1)
lstm2 = BatchNormalization()(lstm2)
num_output = Dropout(0.3)(lstm2)

# Combined
combined = Concatenate()([text_output, num_output])
dense1 = Dense(256, activation='relu')(combined)
dense1 = BatchNormalization()(dense1)
dense1 = Dropout(0.4)(dense1)
dense2 = Dense(128, activation='relu')(dense1)
dense2 = BatchNormalization()(dense2)
dense2 = Dropout(0.4)(dense2)
dense3 = Dense(64, activation='relu')(dense2)
dense3 = Dropout(0.3)(dense3)

financial_output = Dense(3, activation='linear', name='financial_output')(dense3)
risk_output = Dense(5, activation='sigmoid', name='risk_output')(dense3)

model = Model(inputs=[text_input, num_input], outputs=[financial_output, risk_output], name='HFRP')
print(f"✓ Model built: {model.count_params():,} parameters")

print("\n[9/15] Training HFRP model...")

model.compile(
    optimizer=tf.keras.optimizers.Adam(0.001),
    loss={'financial_output': 'mse', 'risk_output': 'mse'},
    loss_weights={'financial_output': 1.0, 'risk_output': 1.0},
    metrics={'financial_output': ['mae'], 'risk_output': ['mae']}
)

callbacks = [
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
]

history = model.fit(
    [X_text_train, X_num_train],
    [y_financial_train, y_risk_train],
    validation_data=([X_text_val, X_num_val], [y_financial_val, y_risk_val]),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1
)

print(f"✓ Training complete! Final loss: {history.history['loss'][-1]:.6f}")

# ============================================================================
# SECTION 10-12: EVALUATION
# ============================================================================

print("\n[10/15] Plotting training history...")

plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train')
plt.plot(history.history['val_loss'], label='Val')
plt.title('Training Loss')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['financial_output_mae'], label='Financial MAE')
plt.plot(history.history['risk_output_mae'], label='Risk MAE')
plt.title('MAE Progress')
plt.legend()
plt.tight_layout()
plt.savefig('training_history.png', dpi=150)
plt.close()
print("✓ Training plots saved")

print("\n[11/15] Evaluating model...")

predictions = model.predict([X_text_test, X_num_test], verbose=0)
financial_pred, risk_pred = predictions[0], predictions[1]

def calc_metrics(y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {'MSE': mse, 'RMSE': rmse, 'MAE': mae, 'R2': r2}

print("\nFinancial Metrics:")
for i, label in enumerate(['Revenue', 'Net Income', 'EPS']):
    m = calc_metrics(y_financial_test[:, i], financial_pred[:, i])
    print(f"  {label}: R²={m['R2']:.4f}, RMSE={m['RMSE']:.2e}")

print("\nRisk Metrics:")
for i, label in enumerate(['Credit', 'Market', 'Operational', 'Liquidity', 'Overall']):
    m = calc_metrics(y_risk_test[:, i], risk_pred[:, i])
    print(f"  {label}: R²={m['R2']:.4f}, MAE={m['MAE']:.4f}")

print("\n[12/15] Analyzing risk mitigation...")

risk_before = y_risk_test.mean(axis=0)
risk_after = risk_pred.mean(axis=0)
risk_reduction = ((risk_before - risk_after) / risk_before) * 100

print("\nRisk Reduction:")
risk_labels = ['Credit', 'Market', 'Operational', 'Liquidity', 'Overall']
for i, label in enumerate(risk_labels):
    print(f"  {label}: {risk_before[i]:.4f} → {risk_after[i]:.4f} ({risk_reduction[i]:.1f}%)")

# ============================================================================
# SECTION 13-15: EXPORT AND REPORTING
# ============================================================================

print("\n[13/15] Creating comprehensive dashboard...")

fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes[0,0].plot(history.history['loss'])
axes[0,0].set_title('Training Loss')
axes[0,1].bar(risk_labels, risk_reduction)
axes[0,1].set_title('Risk Reduction %')
axes[1,0].scatter(y_financial_test[:,0], financial_pred[:,0], alpha=0.5)
axes[1,0].plot([y_financial_test[:,0].min(), y_financial_test[:,0].max()],
               [y_financial_test[:,0].min(), y_financial_test[:,0].max()], 'r--')
axes[1,0].set_title('Revenue: Actual vs Predicted')
x = np.arange(len(risk_labels))
axes[1,1].bar(x-0.2, risk_before, 0.4, label='Before')
axes[1,1].bar(x+0.2, risk_after, 0.4, label='After')
axes[1,1].set_xticks(x)
axes[1,1].set_xticklabels(risk_labels, rotation=45)
axes[1,1].set_title('Risk Comparison')
axes[1,1].legend()
plt.tight_layout()
plt.savefig('dashboard.png', dpi=150)
plt.close()
print("✓ Dashboard saved")

print("\n[14/15] Exporting model...")

model.save('hfrp_model.h5')
joblib.dump(scaler, 'scaler.pkl')
with open('tokenizer.json', 'w') as f:
    json.dump(preprocessor.tokenizer.to_json(), f)
print("✓ Model exported (hfrp_model.h5, scaler.pkl, tokenizer.json)")

print("\n[15/15] Generating final report...")

report = f"""# HFRP Final Report

## Summary
- Dataset: {len(df)} records, {df['Company_Name'].nunique()} companies
- Training samples: {len(train_idx)}
- Final loss: {history.history['loss'][-1]:.6f}

## Performance
- Avg Financial R²: {np.mean([calc_metrics(y_financial_test[:,i], financial_pred[:,i])['R2'] for i in range(3)]):.4f}
- Avg Risk R²: {np.mean([calc_metrics(y_risk_test[:,i], risk_pred[:,i])['R2'] for i in range(5)]):.4f}
- Avg Risk Reduction: {risk_reduction.mean():.2f}%

Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
"""

with open('HFRP_Report.md', 'w') as f:
    f.write(report)
print("✓ Report saved (HFRP_Report.md)")

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print("\n" + "="*80)
print("🎉 HFRP PROJECT COMPLETE! 🎉")
print("="*80)
print("\nGenerated Files:")
print("  📊 distributions.png - Data visualizations")
print("  📊 pca_clustering.png - Cluster analysis")
print("  📈 training_history.png - Training curves")
print("  📊 dashboard.png - Comprehensive dashboard")
print("  🤖 hfrp_model.h5 - Trained model")
print("  ⚙️  scaler.pkl - Feature scaler")
print("  ⚙️  tokenizer.json - Text tokenizer")
print("  📄 HFRP_Report.md - Final report")
print("\nKey Results:")
print(f"  ✓ Model Parameters: {model.count_params():,}")
print(f"  ✓ Final Validation Loss: {history.history['val_loss'][-1]:.6f}")
print(f"  ✓ Average R² Score: {np.mean([calc_metrics(y_financial_test[:,i], financial_pred[:,i])['R2'] for i in range(3)]):.4f}")
print(f"  ✓ Average Risk Reduction: {risk_reduction.mean():.2f}%")
print("="*80)
print("\n✅ All processes completed successfully!")
print("You can now use the trained model for financial risk prediction.")
print("="*80)

HYBRID FINANCIAL RISK PREDICTOR (HFRP) - COMPLETE PIPELINE

Configuration:
  - Use Real Data: True
  - Training Epochs: 50
  - Batch Size: 32
  - Embedding Dimension: 128
  - LSTM Units: 64
  - CNN Filters: 128

[1/15] Installing and importing libraries...
✓ Libraries loaded successfully!
  TensorFlow version: 2.19.0
  GPU Available: No

[2/15] Loading financial data...
  Using REAL financial data from Yahoo Finance...
  This will take several minutes...
  ✓ AAPL   ✓ MSFT   ✓ GOOGL   ✓ AMZN   ✓ META   ✓ NVDA   ✓ TSLA   ✓ JPM   ✓ BAC   ✓ WFC   ✓ V   ✓ MA   ✓ JNJ   ✓ PFE   ✓ UNH   ✓ WMT   ✓ HD   ✓ KO   ✓ PEP   ✓ XOM   ✓ CVX   ✓ INTC   ✓ CSCO   ✓ ORCL   ✓ CRM   ✓ ADBE   ✓ NFLX   ✓ PYPL   ✓ GS   ✓ MS   ✓ C   ✓ GE   ✓ T   ✓ VZ   ✓ MRK 
  ✓ Real data loaded successfully!

✓ Dataset ready: 140 records, 35 companies

[3/15] Creating visualizations...
✓ Visualizations saved

[4/15] Preprocessing text data...
✓ Text preprocessed: vocab=70

[5/15] Processing numerical features...
✓ Features engin



✓ Dashboard saved

[14/15] Exporting model...
✓ Model exported (hfrp_model.h5, scaler.pkl, tokenizer.json)

[15/15] Generating final report...
✓ Report saved (HFRP_Report.md)

🎉 HFRP PROJECT COMPLETE! 🎉

Generated Files:
  📊 distributions.png - Data visualizations
  📊 pca_clustering.png - Cluster analysis
  📈 training_history.png - Training curves
  📊 dashboard.png - Comprehensive dashboard
  🤖 hfrp_model.h5 - Trained model
  ⚙️  scaler.pkl - Feature scaler
  ⚙️  tokenizer.json - Text tokenizer
  📄 HFRP_Report.md - Final report

Key Results:
  ✓ Model Parameters: 334,408
  ✓ Final Validation Loss: 20796920517655080730624.000000
  ✓ Average R² Score: -1.6571
  ✓ Average Risk Reduction: -4.93%

✅ All processes completed successfully!
You can now use the trained model for financial risk prediction.
