In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records training/validation accuracy and loss after each epoch
class PlotLearningCallback(Callback):
    """Keras callback that accumulates accuracy and loss values epoch by epoch.

    The constructor stores the run's identifying metadata, makes sure
    ``plot_dir`` exists, and initialises four history lists. ``on_epoch_end``
    appends the current epoch's metrics to those lists.
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        # Metadata describing which run this callback belongs to.
        self.model_type, self.text_col = model_type, text_col
        self.label_col, self.layer_num = label_col, layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One value is appended to each list per finished epoch.
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; a metric absent from ``logs`` is stored as 0."""
        metrics = {} if logs is None else logs

        # Accuracy may be reported under either the long or the short key.
        train_accuracy = metrics.get('accuracy', metrics.get('acc', 0))
        val_accuracy = metrics.get('val_accuracy', metrics.get('val_acc', 0))

        self.train_acc.append(train_accuracy)
        self.val_acc.append(val_accuracy)
        self.train_loss.append(metrics.get('loss', 0))
        self.val_loss.append(metrics.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'OpenAI_MLP_XGB/visualizations_mlp/COP'
        self.xgb_viz_dir = 'OpenAI_MLP_XGB/visualizations_xgb/COP'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs(self.xgb_viz_dir, exist_ok=True)
        os.makedirs('OpenAI_MLP_XGB/visualizations_summary/COP', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert embeddings to numpy arrays
        self.data['Title_embedding'] = self.data['Title_embedding_vector'].apply(parse_embedding)
        self.data['Fulltext_embedding'] = self.data['Full_text_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_title_embedding = self.data['Title_embedding'].iloc[0]
        sample_fulltext_embedding = self.data['Fulltext_embedding'].iloc[0]
        
        print(f"Sample Title embedding shape: {sample_title_embedding.shape}")
        print(f"Sample Fulltext embedding shape: {sample_fulltext_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data[text_col].values)
        X_val = np.stack(val_data[text_col].values)
        X_test = np.stack(test_data[text_col].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Create and compile an MLP binary classifier for document-level embeddings.

        The network is Dense(512) -> Dense(256) -> Dense(128) with ReLU
        activations, batch normalisation after each hidden layer, dropout (0.3)
        after the first two, and a single sigmoid output unit. It is compiled
        with Adam (learning rate 5e-5), binary cross-entropy loss and accuracy
        as the tracked metric.

        Args:
            input_shape: Shape of one input sample, e.g. (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        model = Sequential([
            # Declare the input explicitly instead of passing the deprecated
            # `input_shape=` argument to the first Dense layer.
            Input(shape=input_shape),

            # First hidden block
            Dense(512, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),

            # Second hidden block
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),

            # Third hidden block (no dropout before the output)
            Dense(128, activation='relu'),
            BatchNormalization(),

            # Output layer
            Dense(1, activation='sigmoid')
        ])

        model.compile(
            optimizer=Adam(learning_rate=0.00005),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        print("MLP model created")
        return model
    
    def get_xgb_parameters(self):
        """
        Get XGBoost parameters and parameter tuning space optimized for high-dimensional embeddings.
        Uses a single parameter set for both Title and Full text embeddings since they have the same dimension (1536).
        
        Returns:
            Dictionary of base parameters and parameter grid for tuning
        """
        # Setup XGBoost base parameters optimized for high-dimensional embeddings
        base_params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'n_estimators': 100,
            'max_depth': 3,
            'learning_rate': 0.03,
            'subsample': 0.7,          # Row subsampling to prevent overfitting
            'colsample_bytree': 0.7,   # Column subsampling to handle high dimensionality
            'min_child_weight': 3,     # Prevents overfitting on high-dimensional embeddings
            'reg_alpha': 0.5,          # L1 regularization
            'reg_lambda': 1.0,         # L2 regularization
            'random_state': 42
        }
        
        # Parameter grid for tuning
        param_grid = {
            'n_estimators': [50, 100, 150],
            'max_depth': [2, 3, 4],
            'learning_rate': [0.01, 0.03, 0.05],
            'min_child_weight': [2, 3, 5],
            'subsample': [0.6, 0.7, 0.8],
            'colsample_bytree': [0.6, 0.7, 0.8],
            'reg_alpha': [0.1, 0.5, 1.0],
            'reg_lambda': [1.0, 2.0, 3.0]
        }
        
        return base_params, param_grid
    
    def create_xgb_model(self):
        """
        Build an ``xgb.XGBClassifier`` from the defaults of ``get_xgb_parameters``.

        Returns:
            Unfitted XGBoost classifier
        """
        defaults, _unused_grid = self.get_xgb_parameters()

        classifier_kwargs = {
            **defaults,
            # NOTE(review): intended to silence a label-encoder deprecation
            # warning; whether it is still needed depends on the installed
            # xgboost version -- confirm.
            'use_label_encoder': False,
            # Explicitly set eval metric (same value the defaults already carry)
            'eval_metric': 'auc',
        }
        classifier = xgb.XGBClassifier(**classifier_kwargs)

        print("Created XGBoost model")
        return classifier
    
    def train_xgb_model(self, model, X_train, y_train, X_val, y_val, early_stopping_rounds=10):
        """
        Train XGBoost model with early stopping using scikit-learn API.
        
        Args:
            model: XGBoost model
            X_train, y_train: Training data
            X_val, y_val: Validation data
            early_stopping_rounds: Early stopping patience
        
        Returns:
            Trained model, training history
        """
        # Set up evaluation set for early stopping
        eval_set = [(X_train, y_train), (X_val, y_val)]
        
        # Train the model with the scikit-learn interface
        model.fit(
            X_train, y_train,
            eval_set=eval_set,
            eval_metric='auc',
            early_stopping_rounds=early_stopping_rounds,
            verbose=True
        )
    
        # Get the evaluation results
        evals_result = {
            'train': {'error': [], 'auc': []},
            'validation': {'error': [], 'auc': []}
        }
    
        # Extract evaluation results if available
        if hasattr(model, 'evals_result'):
            results = model.evals_result()
            if results:
                # Map the results to our expected format
                for i, metric in enumerate(['error', 'auc']):
                    if f'validation_{i}' in results:
                        validation_key = f'validation_{i}'
                        if 'auc' in results[validation_key]:
                            evals_result['validation']['auc'] = results[validation_key]['auc']
                        if 'error' in results[validation_key]:
                            evals_result['validation']['error'] = results[validation_key]['error']
                
                    if f'train_{i}' in results:
                        train_key = f'train_{i}'
                        if 'auc' in results[train_key]:
                            evals_result['train']['auc'] = results[train_key]['auc']
                        if 'error' in results[train_key]:
                            evals_result['train']['error'] = results[train_key]['error']
    
        return model, evals_result
    
    def plot_xgb_learning_curves(self, evals_result, text_col, label_col, layer_num):
        """
        Draw the XGBoost error and AUC curves side by side and save them as a PNG.

        Args:
            evals_result: Dict with 'train' and 'validation' entries, each
                holding 'error' and 'auc' sequences (one value per boosting round)
            text_col: Text column used
            label_col: Label column used
            layer_num: Layer number
        """
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        run_label = f'({display_text}, {label_col}, Layer {layer_num})'

        plt.figure(figsize=(15, 6))

        # Left panel: error, right panel: AUC.
        panels = (('error', 'Error'), ('auc', 'AUC'))
        for position, (metric, quantity) in enumerate(panels, start=1):
            plt.subplot(1, 2, position)
            plt.plot(evals_result['train'][metric], label=f'Training {quantity}', color='blue')
            plt.plot(evals_result['validation'][metric], label=f'Validation {quantity}', color='orange')
            plt.title(f'XGBoost {quantity} Curves {run_label}')
            plt.xlabel('Boosting Rounds')
            plt.ylabel(quantity)
            plt.legend()
            plt.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()

        # The file name encodes text source, label column and layer.
        file_stem = f"xgb_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}"
        plt.savefig(f"{self.xgb_viz_dir}/{file_stem}.png")
        plt.close()
    
    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Draw the final accuracy and loss curves of a finished training run and save them.

        Args:
            history: Training history; its ``history`` attribute (if present)
                is read as a dict of per-epoch metric lists
            model_type: Model type (MLP)
            text_col: Text column used
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        display_text = 'Title' if 'Title' in text_col else 'Full text'

        figure, (accuracy_axis, loss_axis) = plt.subplots(1, 2, figsize=(15, 5))

        # An object without a `history` attribute is treated as "no metrics recorded".
        records = getattr(history, 'history', {})

        # Accuracy can be stored under the long or the short key name.
        train_acc_key = 'accuracy' if 'accuracy' in records else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in records else 'val_acc'

        panels = (
            (accuracy_axis, train_acc_key, val_acc_key, 'Accuracy', 'lower right'),
            (loss_axis, 'loss', 'val_loss', 'Loss', 'upper right'),
        )
        for axis, train_key, val_key, quantity, legend_spot in panels:
            # Curves are drawn only when both the training and validation series exist.
            if train_key in records and val_key in records:
                axis.plot(records[train_key], label=f'Training {quantity}', color='blue')
                axis.plot(records[val_key], label=f'Validation {quantity}', color='orange')
            axis.set_title(f'Final {quantity} Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})')
            axis.set_xlabel('Epochs')
            axis.set_ylabel(quantity)
            axis.legend(loc=legend_spot)
            if axis is accuracy_axis:
                # Accuracy is shown on a fixed 0-1 scale.
                axis.set_ylim([0, 1])
            axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_{model_type.lower()}_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.
        
        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP' or 'XGBoost')
        """
        # Store results
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")
        
        visualization_dir = self.mlp_viz_dir if model_type == 'MLP' else self.xgb_viz_dir
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            if model_type == 'MLP':
                # MLP model training
                model = self.create_mlp_model((1536,))
                print(f"Created MLP model for {display_text} ({label_col})")
                model.summary()
                
                # Setup callbacks
                plot_callback = PlotLearningCallback(model_type, display_text, label_col, i+1, visualization_dir)
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=15,
                    restore_best_weights=True,
                    verbose=1
                )
                model_checkpoint = ModelCheckpoint(
                    filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{i+1}.weights.h5",
                    monitor='val_loss',
                    save_weights_only=True,
                    save_best_only=True,
                    verbose=1
                )
                
                # Train model
                print(f"Training MLP model...")
                batch_size = 32  # Larger batch size for embeddings
                history = model.fit(
                    X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                    batch_size=batch_size,
                    callbacks=[early_stopping, model_checkpoint, plot_callback],
                    verbose=1
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict(X_test)
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create final learning curve visualization
                self.plot_final_learning_curves(history, model_type, text_col, label_col, i+1, visualization_dir)
                
                # Store training history
                training_history = history.history
                
            else:  # XGBoost
                # XGBoost model training
                model = self.create_xgb_model()
                print(f"Training XGBoost model...")
                
                # Train model with early stopping
                model, evals_result = self.train_xgb_model(
                    model,
                    X_train, y_train,
                    X_val, y_val,
                    early_stopping_rounds=10
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict_proba(X_test)[:, 1]
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Plot learning curves
                self.plot_xgb_learning_curves(evals_result, text_col, label_col, i+1)
                
                # Store training history
                training_history = evals_result
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': training_history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text inputs, label columns, and model types."""
        # Define all combinations
        embedding_cols = ['Title_embedding', 'Fulltext_embedding']
        label_cols = ['S_label', 'L_label']
        model_types = ['MLP', 'XGBoost']
        
        # Run analysis for each combination
        for model_type in model_types:
            for embedding_col in embedding_cols:
                for label_col in label_cols:
                    self.train_and_evaluate_model(embedding_col, label_col, model_type)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        # Create a summary table for easier comparison
        summary_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)
            
            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")
            
            # Print layer-specific results
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
            
            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            })
        
        # Create summary visualizations
        self.create_summary_visualization(summary_data)
        
        # Clean up memory
        gc.collect()
        
        return self
    
    def create_summary_visualization(self, summary_data):
        """Create a summary visualization comparing all model combinations."""
        if not summary_data:
            print("No data available for summary visualization")
            return
        
        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)
        
        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)
        
        # Split by model type
        mlp_data = df[df['Model'] == 'MLP']
        xgb_data = df[df['Model'] == 'XGBoost']
        
        # 1. Performance comparison for MLP
        plt.figure(figsize=(12, 8))
        
        # Plot accuracy and AUC bars for MLP
        x = np.arange(len(mlp_data))
        width = 0.35
        
        plt.bar(x - width/2, mlp_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, mlp_data['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('MLP Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in mlp_data.iterrows()]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(mlp_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(mlp_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(mlp_data['Avg Accuracy'].max(), mlp_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/COP', "mlp_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # 2. Performance comparison for XGBoost
        plt.figure(figsize=(12, 8))
        
        # Plot accuracy and AUC bars for XGBoost
        x = np.arange(len(xgb_data))
        
        plt.bar(x - width/2, xgb_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, xgb_data['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('XGBoost Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of XGBoost Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in xgb_data.iterrows()]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(xgb_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(xgb_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(xgb_data['Avg Accuracy'].max(), xgb_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/COP', "xgb_performance_comparison.png")
        plt.savefig(save_path)
        print(f"XGBoost summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """Build per-layer score records and plot them separately for MLP and XGBoost.

        Each key of ``self.results`` has the form ``"<model>|<text>|<label>"``.
        One record is produced per stored accuracy value, paired with the AUC
        stored at the same position; the position (starting at 1) is used as
        the layer number. Records are then passed to
        ``_create_model_layer_visualization`` once for 'MLP' and once for
        'XGBoost'.
        """
        mlp_rows = []
        xgb_rows = []

        for key, metrics in self.results.items():
            model_type, text_col, label_col = key.split('|')
            # Anything that is not an MLP result is reported under XGBoost.
            bucket = mlp_rows if model_type == 'MLP' else xgb_rows

            for layer_num, accuracy in enumerate(metrics.get('accuracy', []), start=1):
                # The AUC list is indexed with the same position as the accuracy list.
                auc = metrics.get('auc', [])[layer_num - 1]
                bucket.append({
                    'Model': model_type,
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {layer_num}",
                    'Layer_num': layer_num,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                })

        # One figure per model family
        self._create_model_layer_visualization(mlp_rows, 'MLP')
        self._create_model_layer_visualization(xgb_rows, 'XGBoost')
    
    def _create_model_layer_visualization(self, layer_data, model_type):
        """Create layer-specific visualizations for a given model type."""
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return
        
        # Create DataFrame
        df = pd.DataFrame(layer_data)
        
        # Create visualization - Accuracy by layer
        plt.figure(figsize=(14, 10))
        
        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', data=df)
        plt.title(f'{model_type} Accuracy by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)
        plt.tight_layout()
        
        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', data=df)
        plt.title(f'{model_type} AUC by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)
        
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/COP', f"{model_type.lower()}_layer_performance.png")
        plt.savefig(save_path)
        print(f"{model_type} layer performance visualization saved as: {save_path}")
        plt.close()


# Script entry point
if __name__ == "__main__":
    # Make sure every output directory exists before anything is written
    output_dirs = (
        'OpenAI_MLP_XGB/visualizations_mlp/COP',
        'OpenAI_MLP_XGB/visualizations_xgb/COP',
        'OpenAI_MLP_XGB/visualizations_summary/COP',
    )
    for output_dir in output_dirs:
        os.makedirs(output_dir, exist_ok=True)

    # Input CSV with the news articles and their OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_COP_completed_openai.csv'

    predictor = ClimateNewsOpenAIPredictor(csv_path)

    # Pipeline: load the data, define the time windows, then run all combinations.
    # Each step is called on the object returned by the previous one.
    loaded = predictor.load_data()
    windowed = loaded.define_time_windows()
    windowed.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'OpenAI_MLP_XGB' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 431, 1: 407}
Class distribution for long-term prediction: {1: 440, 0: 398}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_84"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   


 dense_316 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_231 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_170 (Dropout)       (None, 512)               0         
                                                                 
 dense_317 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_232 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_171 (Dropout)       (None, 256)               0         
                                                                 
 dense_318 (Dense)           (None, 128)               32896     
          

Epoch 17/100
Epoch 17: val_loss did not improve from 0.68725
Epoch 18/100
Epoch 18: val_loss did not improve from 0.68725
Epoch 19/100
Epoch 19: val_loss did not improve from 0.68725
Epoch 20/100
Epoch 20: val_loss did not improve from 0.68725
Epoch 21/100
Epoch 21: val_loss did not improve from 0.68725
Epoch 22/100
Epoch 22: val_loss did not improve from 0.68725
Epoch 23/100
Epoch 23: val_loss did not improve from 0.68725
Epoch 24/100
Epoch 24: val_loss did not improve from 0.68725
Epoch 25/100
Epoch 25: val_loss did not improve from 0.68725
Epoch 26/100
Epoch 26: val_loss did not improve from 0.68725
Epoch 27/100
Epoch 27: val_loss did not improve from 0.68725
Epoch 28/100

Epoch 28: val_loss did not improve from 0.68725
Epoch 28: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_3.png
Test Accuracy for Layer 3: 0.4714
Test AUC for Layer 3: 0.4719

Average Test Accuracy across all layers: 0.5397
Average Test AUC across all layers: 0.5193

Training MLP model for Titl

Epoch 7/100
Epoch 7: val_loss improved from 0.65974 to 0.65949, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_1.weights.h5
Epoch 8/100
Epoch 8: val_loss did not improve from 0.65949
Epoch 9/100
Epoch 9: val_loss did not improve from 0.65949
Epoch 10/100
Epoch 10: val_loss did not improve from 0.65949
Epoch 11/100
Epoch 11: val_loss did not improve from 0.65949
Epoch 12/100
Epoch 12: val_loss did not improve from 0.65949
Epoch 13/100
Epoch 13: val_loss did not improve from 0.65949
Epoch 14/100
Epoch 14: val_loss did not improve from 0.65949
Epoch 15/100
Epoch 15: val_loss did not improve from 0.65949
Epoch 16/100
Epoch 16: val_loss did not improve from 0.65949
Epoch 17/100
Epoch 17: val_loss did not improve from 0.65949
Epoch 18/100
Epoch 18: val_loss did not improve from 0.65949
Epoch 19/100
Epoch 19: val_loss did not improve from 0.65949
Epoch 20/100
Epoch 20: val_loss did not improve from 0.65949
Epoch 21/100
Epoch 21: val_loss did not improve fro

Epoch 4/100
Epoch 4: val_loss improved from 0.63323 to 0.62323, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_2.weights.h5
Epoch 5/100
Epoch 5: val_loss improved from 0.62323 to 0.61691, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_2.weights.h5
Epoch 6/100
Epoch 6: val_loss improved from 0.61691 to 0.61412, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_2.weights.h5
Epoch 7/100
Epoch 7: val_loss improved from 0.61412 to 0.61243, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_2.weights.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.61243 to 0.60969, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_2.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.60969 to 0.60835, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_2.weights.h5
Epoch 10/100
Epoch 10: val_loss improved

                                                                 
 dropout_178 (Dropout)       (None, 512)               0         
                                                                 
 dense_333 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_244 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_179 (Dropout)       (None, 256)               0         
                                                                 
 dense_334 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_245 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_335

                                                                 
 dense_339 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69426, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_S_label_layer_1.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.69426
Epoch 3/100
Epoch 3: val_loss did not improve from 0.69426
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69426
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69426
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69426
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69426
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69426
Epoch 9/100
Epoch 9: val_loss did not 

Epoch 3: val_loss improved from 0.69260 to 0.69246, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_S_label_layer_2.weights.h5
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69246
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69246
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69246
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69246
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69246
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69246
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69246
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69246
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69246
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69246
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69246
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69246
Epoch 16/100
Epoch 16: val_loss did not improve from 0.69246
Epoch 17/100
Epoch 17: val_loss did not improve from 0.69246
Epoch 

Epoch 7/100
Epoch 7: val_loss improved from 0.66411 to 0.66145, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.66145 to 0.66090, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.66090 to 0.65987, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.65987 to 0.65895, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.65895 to 0.65866, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 12/100
Epoch 12: val_loss did not improve from 0.65866
Epoch 13/100
Epoch 13: val_loss did not improve from 0.65866
Epoch 14/100
Epoch 14: val_loss did not improve from 0

                                                                 
 dense_355 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.71341, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.71341
Epoch 3/100
Epoch 3: val_loss did not improve from 0.71341
Epoch 4/100
Epoch 4: val_loss did not improve from 0.71341
Epoch 5/100
Epoch 5: val_loss did not improve from 0.71341
Epoch 6/100
Epoch 6: val_loss did not improve from 0.71341
Epoch 7/100
Epoch 7: val_loss did not improve from 0.71341
Epoch 8/100
Epoch 8: val_loss did not improve from 0.71341
Epoch 9/100
Epoch 9: val_loss did not 

Epoch 4/100
Epoch 4: val_loss did not improve from 0.69400
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69400
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69400
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69400
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69400
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69400
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69400
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69400
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69400
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69400
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69400
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69400
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69400
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_3.png
Test Accuracy for Layer 3: 0.5571
Test AUC for Layer 3: 0.4012

Average Test Accuracy across all layers: 0.5963
Average Test AUC a

TypeError: XGBClassifier.fit() got an unexpected keyword argument 'eval_metric'

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from xgboost import callback
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom Keras callback that records accuracy and loss at the end of every epoch
class PlotLearningCallback(Callback):
    """Collect per-epoch training and validation metrics for one training run.

    The constructor stores the identifiers of the run (model type, text
    column, label column, layer number), creates ``plot_dir`` if needed and
    starts four empty metric lists. ``on_epoch_end`` appends one value to each
    list per epoch.
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        super().__init__()

        # Identifiers of the run this callback is attached to
        self.model_type = model_type
        self.text_col = text_col
        self.label_col = label_col
        self.layer_num = layer_num

        # Output directory is created up front
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)

        # One entry per finished epoch
        self.train_acc = []
        self.val_acc = []
        self.train_loss = []
        self.val_loss = []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's accuracy and loss values; missing metrics are stored as 0."""
        metrics = {} if logs is None else logs

        # Accuracy may be reported under the long or the short key name
        train_accuracy = metrics.get('accuracy', metrics.get('acc', 0))
        val_accuracy = metrics.get('val_accuracy', metrics.get('val_acc', 0))

        self.train_acc.append(train_accuracy)
        self.val_acc.append(val_accuracy)
        self.train_loss.append(metrics.get('loss', 0))
        self.val_loss.append(metrics.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'OpenAI_MLP_XGB/visualizations_mlp/COP'
        self.xgb_viz_dir = 'OpenAI_MLP_XGB/visualizations_xgb/COP'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs(self.xgb_viz_dir, exist_ok=True)
        os.makedirs('OpenAI_MLP_XGB/visualizations_summary/COP', exist_ok=True)
            
    def load_data(self):
        """
        Load the news CSV, parse its dates and embeddings, and sort it by publication date.

        Reads ``self.csv_path`` into ``self.data`` and then:

        * converts 'Publication date', 'Predicting date Short' and
          'Predicting date Long' to datetimes (day-first), falling back to a
          value-by-value parser when the vectorised conversion fails;
        * sorts the rows by 'Publication date';
        * parses the string columns 'Title_embedding_vector' and
          'Full_text_embedding_vector' into float32 arrays stored in
          'Title_embedding' and 'Fulltext_embedding'.

        A missing or unparseable embedding becomes a float32 zero vector of
        length 1536, so every embedding has the same dtype and stacking them
        later does not upcast the whole matrix to float64.

        Returns:
            self, so that calls can be chained
        """
        date_columns = ('Publication date', 'Predicting date Short', 'Predicting date Long')
        embedding_dim = 1536  # length of the zero vector used for missing embeddings

        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)

        # Convert date strings to datetime objects
        try:
            # Fast path: let pandas convert each whole column, day first
            for name in date_columns:
                self.data[name] = pd.to_datetime(self.data[name], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")

            def parse_date(value):
                """Parse one value as DD/MM/YYYY, then YYYY-MM-DD, then by day-first guessing."""
                for date_format in ('%d/%m/%Y', '%Y-%m-%d'):
                    try:
                        return pd.to_datetime(value, format=date_format)
                    except ValueError:
                        continue
                # As a last resort, let pandas guess with dayfirst=True
                return pd.to_datetime(value, dayfirst=True)

            for name in date_columns:
                column = self.data[name]
                # Reuse the column's own index so the assignment stays row-aligned
                self.data[name] = pd.Series([parse_date(value) for value in column],
                                            index=column.index)

        # Sort by publication date
        self.data = self.data.sort_values('Publication date')

        # Parse embedding vectors from string format to numpy arrays
        print("Parsing OpenAI embedding vectors from string format...")

        def parse_embedding(embedding_str):
            """Turn the string form of a list of floats into a float32 vector."""
            if pd.isna(embedding_str):
                # Missing embedding: zero vector with the same dtype as parsed ones
                return np.zeros(embedding_dim, dtype=np.float32)
            try:
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                # str() keeps the message safe when the value is not a string
                print(f"Error parsing embedding: {str(embedding_str)[:50]}...")
                return np.zeros(embedding_dim, dtype=np.float32)

        # Convert embeddings to numpy arrays
        self.data['Title_embedding'] = self.data['Title_embedding_vector'].apply(parse_embedding)
        self.data['Fulltext_embedding'] = self.data['Full_text_embedding_vector'].apply(parse_embedding)

        # Report the dimensions of the first row as a sanity check
        sample_title_embedding = self.data['Title_embedding'].iloc[0]
        sample_fulltext_embedding = self.data['Fulltext_embedding'].iloc[0]

        print(f"Sample Title embedding shape: {sample_title_embedding.shape}")
        print(f"Sample Fulltext embedding shape: {sample_fulltext_embedding.shape}")

        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")

        return self
    
    def define_time_windows(self):
        """Store the three train/validation/test windows in ``self.layers``.

        Every layer trains from 2014-11-01. From one layer to the next the
        training period grows by one year and the validation and test periods
        move forward by one year:

            Layer 1: train to 2020-10-31, validate to 2021-10-31, test to 2022-10-31
            Layer 2: train to 2021-10-31, validate to 2022-10-31, test to 2023-10-31
            Layer 3: train to 2022-10-31, validate to 2023-10-31, test to 2024-11-01

        Returns:
            self, so that calls can be chained
        """
        boundary_names = ('train_start', 'train_end',
                          'val_start', 'val_end',
                          'test_start', 'test_end')
        boundary_dates = (
            ('2014-11-01', '2020-10-31', '2020-11-01', '2021-10-31', '2021-11-01', '2022-10-31'),
            ('2014-11-01', '2021-10-31', '2021-11-01', '2022-10-31', '2022-11-01', '2023-10-31'),
            # The last test window ends on 2024-11-01 rather than 2024-10-31
            ('2014-11-01', '2022-10-31', '2022-11-01', '2023-10-31', '2023-11-01', '2024-11-01'),
        )

        self.layers = [
            {name: pd.Timestamp(day) for name, day in zip(boundary_names, days)}
            for days in boundary_dates
        ]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.

        All three windows are inclusive at both ends, so an article published
        exactly on 'val_start' or 'test_start' lands in that split instead of
        being dropped between two windows.

        Args:
            layer: The time window layer (dict with 'train_start', 'train_end',
                'val_start', 'val_end', 'test_start', 'test_end')
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')

        Returns:
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data.
            A split without rows yields an embedding matrix with zero rows
            instead of raising, so the caller can decide to skip the layer.
        """
        dates = self.data['Publication date']

        def rows_between(start_key, end_key):
            """Select the rows whose publication date lies in [start, end]."""
            in_window = (dates >= layer[start_key]) & (dates <= layer[end_key])
            return self.data[in_window]

        def stack_embeddings(frame):
            """Stack the per-row embedding vectors of ``frame`` into a 2-D array."""
            vectors = frame[text_col].values
            if len(vectors) == 0:
                # np.stack raises on an empty sequence; return a zero-row matrix instead
                width = len(self.data[text_col].iloc[0]) if len(self.data) else 0
                return np.empty((0, width), dtype=np.float32)
            return np.stack(vectors)

        train_data = rows_between('train_start', 'train_end')
        val_data = rows_between('val_start', 'val_end')
        test_data = rows_between('test_start', 'test_end')

        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")

        # Extract embeddings
        X_train = stack_embeddings(train_data)
        X_val = stack_embeddings(val_data)
        X_test = stack_embeddings(test_data)

        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values

        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier used on document-level embeddings.

        The network stacks Dense(512), Dense(256) and Dense(128) layers with
        ReLU activations. Each is followed by batch normalisation, the first
        two additionally by dropout (0.3). A single sigmoid unit forms the
        output.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Model compiled with Adam (learning rate 0.00005), binary
            cross-entropy loss and accuracy as metric
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        model = Sequential()

        # First block: takes the embedding vector
        model.add(Dense(512, activation='relu', input_shape=input_shape))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))

        # Second block
        model.add(Dense(256, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))

        # Third block (no dropout)
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())

        # Binary output
        model.add(Dense(1, activation='sigmoid'))

        optimizer = Adam(learning_rate=0.00005)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return model
    
    def get_xgb_parameters(self):
        """
        Return the XGBoost settings shared by the Title and Full text models.

        A single parameter set is used for both because the two embedding
        types have the same dimension (1536).

        Returns:
            Dictionary of base parameters
        """
        # What is optimised and how it is scored
        task = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
        }

        # Size and pace of the boosted ensemble
        boosting = {
            'n_estimators': 100,
            'max_depth': 3,
            'learning_rate': 0.05,
        }

        # Settings chosen against overfitting on high-dimensional embeddings
        regularisation = {
            'subsample': 0.7,          # row subsampling
            'colsample_bytree': 0.5,   # column subsampling
            'min_child_weight': 3,
            'reg_alpha': 1.0,          # L1 regularization
            'reg_lambda': 2.0,         # L2 regularization
        }

        # Reproducibility and warning suppression
        housekeeping = {
            'random_state': 42,
            'use_label_encoder': False,  # avoid deprecation warning
        }

        return {**task, **boosting, **regularisation, **housekeeping}
    
    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Save a side-by-side figure of the accuracy and loss curves of a finished training run.

        Curves are drawn only when both the training and the validation series
        of a metric are present in ``history.history``; the axes, titles and
        legends are set up either way.

        Args:
            history: Training history (object with a ``history`` dict)
            model_type: Model type (MLP)
            text_col: Text column used
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        run_label = f'{model_type} with {display_text} ({label_col}, Layer {layer_num})'

        _figure, (acc_axis, loss_axis) = plt.subplots(1, 2, figsize=(15, 5))

        # Fall back to an empty dict when the object carries no history
        history_dict = getattr(history, 'history', {})

        # Accuracy may be stored under the long or the short key name
        acc_key = 'accuracy' if 'accuracy' in history_dict else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in history_dict else 'val_acc'

        # (axis, quantity shown, training key, validation key, legend position)
        panels = (
            (acc_axis, 'Accuracy', acc_key, val_acc_key, 'lower right'),
            (loss_axis, 'Loss', 'loss', 'val_loss', 'upper right'),
        )
        for axis, quantity, train_key, val_key, legend_loc in panels:
            if train_key in history_dict and val_key in history_dict:
                axis.plot(history_dict[train_key], label=f'Training {quantity}', color='blue')
                axis.plot(history_dict[val_key], label=f'Validation {quantity}', color='orange')
            axis.set_title(f'Final {quantity} Curves: {run_label}')
            axis.set_xlabel('Epochs')
            axis.set_ylabel(quantity)
            axis.legend(loc=legend_loc)
            if axis is acc_axis:
                # Accuracy is always shown on the full 0-1 scale
                axis.set_ylim([0, 1])
            axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_{model_type.lower()}_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.
        
        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP' or 'XGBoost')
        """
        # Store results
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")
        
        visualization_dir = self.mlp_viz_dir if model_type == 'MLP' else self.xgb_viz_dir
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            if model_type == 'MLP':
                # MLP model training
                model = self.create_mlp_model((1536,))
                print(f"Created MLP model for {display_text} ({label_col})")
                model.summary()
                
                # Setup callbacks
                plot_callback = PlotLearningCallback(model_type, display_text, label_col, i+1, visualization_dir)
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=15,
                    restore_best_weights=True,
                    verbose=1
                )
                model_checkpoint = ModelCheckpoint(
                    filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{i+1}.weights.h5",
                    monitor='val_loss',
                    save_weights_only=True,
                    save_best_only=True,
                    verbose=1
                )
                
                # Train model
                print(f"Training MLP model...")
                batch_size = 32  # Larger batch size for embeddings
                history = model.fit(
                    X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                    batch_size=batch_size,
                    callbacks=[early_stopping, model_checkpoint, plot_callback],
                    verbose=1
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict(X_test)
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create final learning curve visualization
                self.plot_final_learning_curves(history, model_type, text_col, label_col, i+1, visualization_dir)
                
                # Store training history
                training_history = history.history
                
            else:  # XGBoost - simplified approach to fix API issues
                # Get XGBoost parameters
                base_params = self.get_xgb_parameters()
                
                # Create and train XGBoost model
                print(f"Creating and training XGBoost model...")
                model = xgb.XGBClassifier(**base_params)
                
                # Only use validation set for evaluation (not training set)
                eval_set = [(X_val, y_val)]
                
                # Train the model
                model.fit(
                    X_train, y_train,
                    eval_set=eval_set,
                    verbose=True
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict_proba(X_test)[:, 1]
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create a simple summary for XGBoost (no detailed learning curves available)
                plt.figure(figsize=(10, 6))
                plt.text(0.5, 0.5, f'XGBoost Model Trained Successfully\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nAUC: {roc_auc_score(y_test, y_pred_proba):.4f}',
                         ha='center', va='center', size=14, fontweight='bold')
                plt.title(f'XGBoost Results ({display_text}, {label_col}, Layer {i+1})')
                plt.axis('off')
                plt.tight_layout()
                plt.savefig(f"{visualization_dir}/xgb_{display_text.replace(' ', '_')}_{label_col}_layer_{i+1}.png")
                plt.close()
                
                # Use None for training history since detailed learning curves aren't available
                training_history = None
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': training_history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """
        Run the analysis for all combinations of text inputs, label columns, and model types.

        Every model type ('MLP', 'XGBoost') is trained on every embedding column
        ('Title_embedding', 'Fulltext_embedding') against every label column
        ('S_label', 'L_label') through ``train_and_evaluate_model``. The metrics
        collected in ``self.results`` are then printed and handed to
        ``create_summary_visualization``.

        Returns:
            self, so that further calls can be chained.
        """
        # Define all combinations
        embedding_cols = ['Title_embedding', 'Fulltext_embedding']
        label_cols = ['S_label', 'L_label']
        model_types = ['MLP', 'XGBoost']
        
        # Run analysis for each combination
        for model_type in model_types:
            for embedding_col in embedding_cols:
                for label_col in label_cols:
                    self.train_and_evaluate_model(embedding_col, label_col, model_type)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        # Create a summary table for easier comparison
        summary_data = []
        
        for combination, results in self.results.items():
            # Result keys have the form "<model>|<text>|<label>"
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)
            
            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")
            
            accuracies = results.get('accuracy', [])
            aucs = results.get('auc', [])
            layer_results = results.get('layer_results', [])
            
            # Layers skipped during training add nothing to the metric lists, so
            # a list position is not necessarily the layer number. Use the number
            # recorded with each layer result when it is available; otherwise
            # fall back to the position.
            if len(layer_results) == len(accuracies):
                layer_numbers = [
                    entry.get('layer', position)
                    for position, entry in enumerate(layer_results, start=1)
                ]
            else:
                layer_numbers = range(1, len(accuracies) + 1)
            
            # Print layer-specific results
            for layer_number, accuracy, auc in zip(layer_numbers, accuracies, aucs):
                print(f"  Layer {layer_number} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
            
            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            })
        
        # Create summary visualizations
        self.create_summary_visualization(summary_data)
        
        # Clean up memory
        gc.collect()
        
        return self
    
    def create_summary_visualization(self, summary_data):
        """
        Create a summary visualization comparing all model combinations.

        One grouped bar chart (average accuracy next to average AUC) is saved
        per model type, 'MLP' and 'XGBoost'. A model type without any rows in
        ``summary_data`` is skipped instead of being plotted. Afterwards the
        per-layer charts are produced through
        ``create_layer_performance_visualizations``.

        Args:
            summary_data: List of dicts, one per combination, with the keys
                'Avg Accuracy', 'Avg AUC', 'Model', 'Text' and 'Label'.
                Nothing is drawn when it is empty.
        """
        if not summary_data:
            print("No data available for summary visualization")
            return
        
        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)
        
        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)
        
        summary_dir = 'OpenAI_MLP_XGB/visualizations_summary/COP'
        width = 0.35
        
        # (model label used in titles and filtering, output file name)
        chart_specs = (
            ('MLP', "mlp_performance_comparison.png"),
            ('XGBoost', "xgb_performance_comparison.png"),
        )
        
        for model_label, file_name in chart_specs:
            model_data = df[df['Model'] == model_label]
            
            # An empty subset would make the y-axis limit NaN, which matplotlib
            # rejects, so there is nothing sensible to draw for this model type.
            if model_data.empty:
                print(f"No summary data available for {model_label}")
                continue
            
            plt.figure(figsize=(12, 8))
            
            # Accuracy bar to the left of each tick, AUC bar to the right
            x = np.arange(len(model_data))
            plt.bar(x - width/2, model_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
            plt.bar(x + width/2, model_data['Avg AUC'], width, label='Average AUC', color='salmon')
            
            plt.xlabel(f'{model_label} Model Combination')
            plt.ylabel('Score')
            plt.title(f'Performance Comparison of {model_label} Models with OpenAI Embeddings')
            labels = [f"{text} + {label}" for text, label in zip(model_data['Text'], model_data['Label'])]
            plt.xticks(x, labels, rotation=0, ha='right')
            plt.legend()
            
            # Add value labels on top of bars
            for metric, offset in (('Avg Accuracy', -width/2), ('Avg AUC', width/2)):
                for i, v in enumerate(model_data[metric]):
                    plt.text(i + offset, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
            
            # Leave headroom above the tallest bar for its value label
            plt.ylim(0, max(model_data['Avg Accuracy'].max(), model_data['Avg AUC'].max()) + 0.1)
            plt.tight_layout()
            
            # Save the visualization (create the folder first so savefig cannot
            # fail on a missing directory)
            os.makedirs(summary_dir, exist_ok=True)
            save_path = os.path.join(summary_dir, file_name)
            plt.savefig(save_path)
            print(f"{model_label} summary comparison visualization saved as: {save_path}")
            plt.close()
        
        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """
        Create visualizations showing performance across different layers for MLP and XGBoost models.

        The per-layer accuracy and AUC values stored in ``self.results`` are
        flattened into one record per (combination, layer) and passed, split by
        model type, to ``_create_model_layer_visualization``.
        """
        # Prepare data for visualization
        mlp_layer_data = []
        xgb_layer_data = []
        
        for combination, results in self.results.items():
            # Result keys have the form "<model>|<text>|<label>"
            model_type, text_col, label_col = combination.split('|')
            
            accuracies = results.get('accuracy', [])
            aucs = results.get('auc', [])
            layer_results = results.get('layer_results', [])
            
            # Layers skipped during training add nothing to the metric lists, so
            # a list position is not necessarily the layer number. Use the number
            # recorded with each layer result when it is available; otherwise
            # fall back to the position.
            if len(layer_results) == len(accuracies):
                layer_numbers = [
                    entry.get('layer', position)
                    for position, entry in enumerate(layer_results, start=1)
                ]
            else:
                layer_numbers = range(1, len(accuracies) + 1)
            
            for layer_number, accuracy, auc in zip(layer_numbers, accuracies, aucs):
                layer_info = {
                    'Model': model_type,
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {layer_number}",
                    'Layer_num': layer_number,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                }
                
                if model_type == 'MLP':
                    mlp_layer_data.append(layer_info)
                else:  # XGBoost
                    xgb_layer_data.append(layer_info)
        
        # Create visualizations for each model type
        self._create_model_layer_visualization(mlp_layer_data, 'MLP')
        self._create_model_layer_visualization(xgb_layer_data, 'XGBoost')
    
    def _create_model_layer_visualization(self, layer_data, model_type):
        """Create layer-specific visualizations for a given model type."""
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return
        
        # Create DataFrame
        df = pd.DataFrame(layer_data)
        
        # Create visualization - Accuracy by layer
        plt.figure(figsize=(14, 10))
        
        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', data=df)
        plt.title(f'{model_type} Accuracy by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)
        plt.tight_layout()
        
        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', data=df)
        plt.title(f'{model_type} AUC by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)
        
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/COP', f"{model_type.lower()}_layer_performance.png")
        plt.savefig(save_path)
        print(f"{model_type} layer performance visualization saved as: {save_path}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Make sure every output folder exists before any plot is written
    output_dirs = (
        'OpenAI_MLP_XGB/visualizations_mlp/COP',
        'OpenAI_MLP_XGB/visualizations_xgb/COP',
        'OpenAI_MLP_XGB/visualizations_summary/COP',
    )
    for output_dir in output_dirs:
        os.makedirs(output_dir, exist_ok=True)
    
    # Input CSV with the news articles and their OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_COP_completed_openai.csv'
    
    # Build the predictor, then run the pipeline stage by stage; each stage
    # returns the object the next stage is called on
    predictor = ClimateNewsOpenAIPredictor(csv_path)
    loaded = predictor.load_data()
    windowed = loaded.define_time_windows()
    windowed.run_all_combinations()
    
    print("\nAnalysis complete! Results saved to 'OpenAI_MLP_XGB' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 431, 1: 407}
Class distribution for long-term prediction: {1: 440, 0: 398}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_121"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_464 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_342 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_244 (Dropout)       (None, 512)               0         
                                                                 
 dense_465 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_343 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_245 (Dropout)       (None, 256)               0         
          

 batch_normalization_347 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_471 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.68800, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_S_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.68800 to 0.68662, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_S_label_layer_3.weights.h5
Epoch 3/100
Epoch 3: val_loss did not improve from 0.68662
Epoch 4/100
Epoch 4: val_loss did not improve from 0.68662
Epoch 5/100
Epoch 5: val_loss did not improve

Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.72254, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_1.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.72254
Epoch 3/100
Epoch 3: val_loss did not improve from 0.72254
Epoch 4/100
Epoch 4: val_loss did not improve from 0.72254
Epoch 5/100
Epoch 5: val_loss did not improve from 0.72254
Epoch 6/100
Epoch 6: val_loss did not improve from 0.72254
Epoch 7/100
Epoch 7: val_loss did not improve from 0.72254
Epoch 8/100
Epoch 8: val_loss did not improve from 0.72254
Epoch 9/100
Epoch 9: val_loss did not improve from 0.72254
Epoch 10/100
Epoch 10: val_loss did not improve from 0.72254
Epoch 11/100
Epoch 11: val_loss did not improve from 0.72254
Epoch 12/100
Epoch 12: val_loss did not improve from 0.72254
Epoch 13/100
Epoch 13: val_loss did not improve from 0.72254
Epoch 14/100
Epoch 14: val_loss did not improve from 0.72254
Epoch 15/100
Epoch 15: val_loss did not impro

Epoch 5: val_loss did not improve from 0.71472
Epoch 6/100
Epoch 6: val_loss did not improve from 0.71472
Epoch 7/100
Epoch 7: val_loss did not improve from 0.71472
Epoch 8/100
Epoch 8: val_loss did not improve from 0.71472
Epoch 9/100
Epoch 9: val_loss did not improve from 0.71472
Epoch 10/100
Epoch 10: val_loss did not improve from 0.71472
Epoch 11/100
Epoch 11: val_loss improved from 0.71472 to 0.70516, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_2.weights.h5
Epoch 12/100
Epoch 12: val_loss improved from 0.70516 to 0.70046, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_2.weights.h5
Epoch 13/100
Epoch 13: val_loss improved from 0.70046 to 0.69573, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_2.weights.h5
Epoch 14/100
Epoch 14: val_loss improved from 0.69573 to 0.69247, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_title_L_label_layer_2.weights.h5
Epoch 15

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_480 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_354 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_252 (Dropout)       (None, 512)               0         
                                                                 
 dense_481 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_355 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_253 (Dropout)       (None, 256)               0         
          

                                                                 
 dropout_255 (Dropout)       (None, 256)               0         
                                                                 
 dense_486 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_359 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_487 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69842, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_S_label_layer_1.

Epoch 2/100
Epoch 2: val_loss did not improve from 0.70404
Epoch 3/100
Epoch 3: val_loss did not improve from 0.70404
Epoch 4/100
Epoch 4: val_loss did not improve from 0.70404
Epoch 5/100
Epoch 5: val_loss did not improve from 0.70404
Epoch 6/100
Epoch 6: val_loss did not improve from 0.70404
Epoch 7/100
Epoch 7: val_loss did not improve from 0.70404
Epoch 8/100
Epoch 8: val_loss did not improve from 0.70404
Epoch 9/100
Epoch 9: val_loss did not improve from 0.70404
Epoch 10/100
Epoch 10: val_loss did not improve from 0.70404
Epoch 11/100
Epoch 11: val_loss did not improve from 0.70404
Epoch 12/100
Epoch 12: val_loss did not improve from 0.70404
Epoch 13/100
Epoch 13: val_loss did not improve from 0.70404
Epoch 14/100
Epoch 14: val_loss did not improve from 0.70404
Epoch 15/100
Epoch 15: val_loss did not improve from 0.70404
Epoch 16/100

Epoch 16: val_loss did not improve from 0.70404
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_2.png
Test Accurac

Epoch 7/100
Epoch 7: val_loss did not improve from 0.69301
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69301
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69301
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69301
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69301
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69301
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69301
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69301
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69301
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69301
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_3.png
Test Accuracy for Layer 3: 0.5857
Test AUC for Layer 3: 0.6122

Average Test Accuracy across all layers: 0.4991
Average Test AUC across all layers: 0.5030

Training MLP model for Full text and L_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing peri

Epoch 8/100
Epoch 8: val_loss improved from 0.63722 to 0.63531, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.63531 to 0.63440, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.63440 to 0.63380, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.63380 to 0.63328, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 12/100
Epoch 12: val_loss improved from 0.63328 to 0.63287, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 13/100
Epoch 13: val_loss improved from 0.63287 to 0.63272, saving model to OpenAI_MLP_XGB/visualizations_mlp/COP\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 14

 batch_normalization_370 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_263 (Dropout)       (None, 256)               0         
                                                                 
 dense_502 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_371 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_503 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch

Epoch 2/100
Epoch 2: val_loss did not improve from 0.69511
Epoch 3/100
Epoch 3: val_loss did not improve from 0.69511
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69511
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69511
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69511
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69511
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69511
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69511
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69511
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69511
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69511
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69511
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69511
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69511
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69511
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_3.png
Test Accurac

Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Embeddings shapes - Train: (532, 1536), Val: (133, 1536), Test: (101, 1536)
Creating and training XGBoost model...
[0]	validation_0-auc:0.49523
[1]	validation_0-auc:0.38823
[2]	validation_0-auc:0.38130
[3]	validation_0-auc:0.39539
[4]	validation_0-auc:0.40572
[5]	validation_0-auc:0.46365
[6]	validation_0-auc:0.46615
[7]	validation_0-auc:0.46729
[8]	validation_0-auc:0.47388
[9]	validation_0-auc:0.48228
[10]	validation_0-auc:0.47478
[11]	validation_0-auc:0.49364
[12]	validation_0-auc:0.49364
[13]	validation_0-auc:0.50931
[14]	validation_0-auc:0.49500
[15]	validation_0-auc:0.49114
[16]	validation_0-auc:0.48523
[17]	validation_0-auc:0.49409
[18]	validation_0-auc:0.49886
[19]	validation_0-auc:0.49977
[20]	validation_0-auc:0.50477
[21]	validation_0-auc:0.50023
[22]	validation_0-auc:0.51090
[23]	validation_0-auc:0.51340
[24]	validation_0-auc:0.51045
[25]	validation_0-auc:0.51386
[26]	validation_0-auc:0.52272
[27]	v

[79]	validation_0-auc:0.54941
[80]	validation_0-auc:0.54392
[81]	validation_0-auc:0.54980
[82]	validation_0-auc:0.55686
[83]	validation_0-auc:0.55882
[84]	validation_0-auc:0.55647
[85]	validation_0-auc:0.55216
[86]	validation_0-auc:0.54392
[87]	validation_0-auc:0.54275
[88]	validation_0-auc:0.54235
[89]	validation_0-auc:0.54863
[90]	validation_0-auc:0.54902
[91]	validation_0-auc:0.54627
[92]	validation_0-auc:0.54314
[93]	validation_0-auc:0.54627
[94]	validation_0-auc:0.54549
[95]	validation_0-auc:0.54627
[96]	validation_0-auc:0.54980
[97]	validation_0-auc:0.54588
[98]	validation_0-auc:0.54627
[99]	validation_0-auc:0.54039
Test Accuracy for Layer 3: 0.5286
Test AUC for Layer 3: 0.5964

Average Test Accuracy across all layers: 0.4970
Average Test AUC across all layers: 0.5443

Training XGBoost model for Full text and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Vali

[4]	validation_0-auc:0.39861
[5]	validation_0-auc:0.43591
[6]	validation_0-auc:0.47024
[7]	validation_0-auc:0.45437
[8]	validation_0-auc:0.43135
[9]	validation_0-auc:0.44802
[10]	validation_0-auc:0.44683
[11]	validation_0-auc:0.43333
[12]	validation_0-auc:0.42500
[13]	validation_0-auc:0.42540
[14]	validation_0-auc:0.42183
[15]	validation_0-auc:0.41270
[16]	validation_0-auc:0.39683
[17]	validation_0-auc:0.37381
[18]	validation_0-auc:0.39048
[19]	validation_0-auc:0.39683
[20]	validation_0-auc:0.39683
[21]	validation_0-auc:0.38452
[22]	validation_0-auc:0.41270
[23]	validation_0-auc:0.41032
[24]	validation_0-auc:0.42500
[25]	validation_0-auc:0.41667
[26]	validation_0-auc:0.42103
[27]	validation_0-auc:0.42222
[28]	validation_0-auc:0.41310
[29]	validation_0-auc:0.40317
[30]	validation_0-auc:0.41865
[31]	validation_0-auc:0.42063
[32]	validation_0-auc:0.40992
[33]	validation_0-auc:0.42619
[34]	validation_0-auc:0.42024
[35]	validation_0-auc:0.41587
[36]	validation_0-auc:0.41230
[37]	validation_

[42]	validation_0-auc:0.55889
[43]	validation_0-auc:0.55160
[44]	validation_0-auc:0.54548
[45]	validation_0-auc:0.55306
[46]	validation_0-auc:0.56618
[47]	validation_0-auc:0.56939
[48]	validation_0-auc:0.56297
[49]	validation_0-auc:0.55656
[50]	validation_0-auc:0.56443
[51]	validation_0-auc:0.57172
[52]	validation_0-auc:0.56618
[53]	validation_0-auc:0.56997
[54]	validation_0-auc:0.57172
[55]	validation_0-auc:0.57289
[56]	validation_0-auc:0.57493
[57]	validation_0-auc:0.57201
[58]	validation_0-auc:0.55948
[59]	validation_0-auc:0.55773
[60]	validation_0-auc:0.55015
[61]	validation_0-auc:0.54898
[62]	validation_0-auc:0.56385
[63]	validation_0-auc:0.56327
[64]	validation_0-auc:0.56181
[65]	validation_0-auc:0.56414
[66]	validation_0-auc:0.56589
[67]	validation_0-auc:0.55918
[68]	validation_0-auc:0.56356
[69]	validation_0-auc:0.56210
[70]	validation_0-auc:0.56093
[71]	validation_0-auc:0.56035
[72]	validation_0-auc:0.56006
[73]	validation_0-auc:0.55802
[74]	validation_0-auc:0.55918
[75]	valid

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from xgboost import callback
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records training/validation accuracy and loss after each epoch
class PlotLearningCallback(Callback):
    """
    Keras callback that keeps a per-epoch record of accuracy and loss.

    The values are appended to ``train_acc``, ``val_acc``, ``train_loss`` and
    ``val_loss``; this class only collects them and draws nothing itself.

    Args:
        model_type: Name of the model being trained.
        text_col: Name of the text input used for this run.
        label_col: Name of the label column used for this run.
        layer_num: Number of the layer (time window) being trained.
        plot_dir: Output directory; created if it does not exist yet.
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        # Description of the run this callback is attached to
        self.model_type = model_type
        self.text_col = text_col
        self.label_col = label_col
        self.layer_num = layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One entry per finished epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append the metrics found in ``logs`` for the epoch that just ended."""
        epoch_logs = {} if logs is None else logs

        # Accuracy is looked up under 'accuracy' first and 'acc' second (same
        # for the 'val_' variants); any missing metric is recorded as 0.
        train_accuracy = epoch_logs.get('accuracy', epoch_logs.get('acc', 0))
        val_accuracy = epoch_logs.get('val_accuracy', epoch_logs.get('val_acc', 0))

        self.train_acc.append(train_accuracy)
        self.val_acc.append(val_accuracy)
        self.train_loss.append(epoch_logs.get('loss', 0))
        self.val_loss.append(epoch_logs.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'OpenAI_MLP_XGB/visualizations_mlp/CVX'
        self.xgb_viz_dir = 'OpenAI_MLP_XGB/visualizations_xgb/CVX'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs(self.xgb_viz_dir, exist_ok=True)
        os.makedirs('OpenAI_MLP_XGB/visualizations_summary/CVX', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert embeddings to numpy arrays
        self.data['Title_embedding'] = self.data['Title_embedding_vector'].apply(parse_embedding)
        self.data['Fulltext_embedding'] = self.data['Full_text_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_title_embedding = self.data['Title_embedding'].iloc[0]
        sample_fulltext_embedding = self.data['Fulltext_embedding'].iloc[0]
        
        print(f"Sample Title embedding shape: {sample_title_embedding.shape}")
        print(f"Sample Fulltext embedding shape: {sample_fulltext_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data[text_col].values)
        X_val = np.stack(val_data[text_col].values)
        X_test = np.stack(test_data[text_col].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward binary classifier for document-level embeddings.

        The network is Dense(512) -> BatchNorm -> Dropout(0.3) ->
        Dense(256) -> BatchNorm -> Dropout(0.3) -> Dense(128) -> BatchNorm ->
        Dense(1, sigmoid).  It is compiled with Adam (learning rate 0.00005),
        binary cross-entropy loss and the accuracy metric.

        Args:
            input_shape: Shape of one input sample, e.g. (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        network = Sequential()

        # First block: takes the raw embedding
        network.add(Dense(512, activation='relu', input_shape=input_shape))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Second block
        network.add(Dense(256, activation='relu'))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Third block (no dropout before the output)
        network.add(Dense(128, activation='relu'))
        network.add(BatchNormalization())

        # Single sigmoid unit for the binary label
        network.add(Dense(1, activation='sigmoid'))

        optimizer = Adam(learning_rate=0.00005)
        network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return network
    
    def get_xgb_parameters(self):
        """
        Get XGBoost parameters optimized for high-dimensional embeddings.
        Uses a single parameter set for both Title and Full text embeddings since they have the same dimension (1536).
        
        Returns:
            Dictionary of base parameters
        """
        # Setup XGBoost base parameters optimized for high-dimensional embeddings
        base_params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'n_estimators': 80,
            'max_depth': 3,
            'learning_rate': 0.001,
            'subsample': 0.7,          # Row subsampling to prevent overfitting
            'colsample_bytree': 0.5,   # Column subsampling to handle high dimensionality
            'min_child_weight': 3,     # Prevents overfitting on high-dimensional embeddings
            'reg_alpha': 1.0,          # L1 regularization
            'reg_lambda': 2.0,         # L2 regularization
            'random_state': 42,
            'use_label_encoder': False # Avoid deprecation warning
        }
        
        return base_params
    
    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy, loss) of a finished MLP training run.

        Curves are drawn only when both the training and the validation
        series of a metric are present in the history; the axes are titled
        and labelled either way.

        Args:
            history: Training history; its ``history`` attribute is read if present
            model_type: Model type (MLP)
            text_col: Text column used
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        display_text = 'Full text'
        if 'Title' in text_col:
            display_text = 'Title'

        # An object without a ``history`` attribute is treated as having no metrics
        history_dict = getattr(history, 'history', {})

        # Accuracy may be stored under the long or the short key
        acc_key = 'acc'
        if 'accuracy' in history_dict:
            acc_key = 'accuracy'
        val_acc_key = 'val_acc'
        if 'val_accuracy' in history_dict:
            val_acc_key = 'val_accuracy'

        run_label = f'{model_type} with {display_text} ({label_col}, Layer {layer_num})'
        fig, (acc_axis, loss_axis) = plt.subplots(1, 2, figsize=(15, 5))

        panels = (
            (acc_axis, acc_key, val_acc_key, 'Accuracy', 'lower right'),
            (loss_axis, 'loss', 'val_loss', 'Loss', 'upper right'),
        )
        for axis, train_key, val_key, quantity, legend_loc in panels:
            if train_key in history_dict and val_key in history_dict:
                axis.plot(history_dict[train_key], label=f'Training {quantity}', color='blue')
                axis.plot(history_dict[val_key], label=f'Validation {quantity}', color='orange')
            axis.set_title(f'Final {quantity} Curves: {run_label}')
            axis.set_xlabel('Epochs')
            axis.set_ylabel(quantity)
            axis.legend(loc=legend_loc)
            if axis is acc_axis:
                # Only the accuracy panel has a fixed 0..1 range
                axis.set_ylim([0, 1])
            axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_{model_type.lower()}_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.
        
        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP' or 'XGBoost')
        """
        # Store results
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")
        
        visualization_dir = self.mlp_viz_dir if model_type == 'MLP' else self.xgb_viz_dir
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            if model_type == 'MLP':
                # MLP model training
                model = self.create_mlp_model((1536,))
                print(f"Created MLP model for {display_text} ({label_col})")
                model.summary()
                
                # Setup callbacks
                plot_callback = PlotLearningCallback(model_type, display_text, label_col, i+1, visualization_dir)
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=15,
                    restore_best_weights=True,
                    verbose=1
                )
                model_checkpoint = ModelCheckpoint(
                    filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{i+1}.weights.h5",
                    monitor='val_loss',
                    save_weights_only=True,
                    save_best_only=True,
                    verbose=1
                )
                
                # Train model
                print(f"Training MLP model...")
                batch_size = 32  # Larger batch size for embeddings
                history = model.fit(
                    X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                    batch_size=batch_size,
                    callbacks=[early_stopping, model_checkpoint, plot_callback],
                    verbose=1
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict(X_test)
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create final learning curve visualization
                self.plot_final_learning_curves(history, model_type, text_col, label_col, i+1, visualization_dir)
                
                # Store training history
                training_history = history.history
                
            else:  # XGBoost - simplified approach to fix API issues
                # Get XGBoost parameters
                base_params = self.get_xgb_parameters()
                
                # Create and train XGBoost model
                print(f"Creating and training XGBoost model...")
                model = xgb.XGBClassifier(**base_params)
                
                # Only use validation set for evaluation (not training set)
                eval_set = [(X_val, y_val)]
                
                # Train the model
                model.fit(
                    X_train, y_train,
                    eval_set=eval_set,
                    verbose=True
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict_proba(X_test)[:, 1]
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create a simple summary for XGBoost (no detailed learning curves available)
                plt.figure(figsize=(10, 6))
                plt.text(0.5, 0.5, f'XGBoost Model Trained Successfully\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nAUC: {roc_auc_score(y_test, y_pred_proba):.4f}',
                         ha='center', va='center', size=14, fontweight='bold')
                plt.title(f'XGBoost Results ({display_text}, {label_col}, Layer {i+1})')
                plt.axis('off')
                plt.tight_layout()
                plt.savefig(f"{visualization_dir}/xgb_{display_text.replace(' ', '_')}_{label_col}_layer_{i+1}.png")
                plt.close()
                
                # Use None for training history since detailed learning curves aren't available
                training_history = None
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': training_history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text inputs, label columns, and model types."""
        # Define all combinations
        embedding_cols = ['Title_embedding', 'Fulltext_embedding']
        label_cols = ['S_label', 'L_label']
        model_types = ['MLP', 'XGBoost']
        
        # Run analysis for each combination
        for model_type in model_types:
            for embedding_col in embedding_cols:
                for label_col in label_cols:
                    self.train_and_evaluate_model(embedding_col, label_col, model_type)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        # Create a summary table for easier comparison
        summary_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)
            
            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")
            
            # Print layer-specific results
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
            
            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            })
        
        # Create summary visualizations
        self.create_summary_visualization(summary_data)
        
        # Clean up memory
        gc.collect()
        
        return self
    
    def create_summary_visualization(self, summary_data):
        """Create a summary visualization comparing all model combinations."""
        if not summary_data:
            print("No data available for summary visualization")
            return
        
        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)
        
        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)
        
        # Split by model type
        mlp_data = df[df['Model'] == 'MLP']
        xgb_data = df[df['Model'] == 'XGBoost']
        
        # 1. Performance comparison for MLP
        plt.figure(figsize=(12, 8))
        
        # Plot accuracy and AUC bars for MLP
        x = np.arange(len(mlp_data))
        width = 0.35
        
        plt.bar(x - width/2, mlp_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, mlp_data['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('MLP Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in mlp_data.iterrows()]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(mlp_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(mlp_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(mlp_data['Avg Accuracy'].max(), mlp_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/CVX', "mlp_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # 2. Performance comparison for XGBoost
        plt.figure(figsize=(12, 8))
        
        # Plot accuracy and AUC bars for XGBoost
        x = np.arange(len(xgb_data))
        
        plt.bar(x - width/2, xgb_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, xgb_data['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('XGBoost Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of XGBoost Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in xgb_data.iterrows()]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(xgb_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(xgb_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(xgb_data['Avg Accuracy'].max(), xgb_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/CVX', "xgb_performance_comparison.png")
        plt.savefig(save_path)
        print(f"XGBoost summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """Create visualizations showing performance across different layers for MLP and XGBoost models."""
        # Prepare data for visualization
        mlp_layer_data = []
        xgb_layer_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                layer_info = {
                    'Model': model_type,
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {i+1}",
                    'Layer_num': i+1,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                }
                
                if model_type == 'MLP':
                    mlp_layer_data.append(layer_info)
                else:  # XGBoost
                    xgb_layer_data.append(layer_info)
        
        # Create visualizations for each model type
        self._create_model_layer_visualization(mlp_layer_data, 'MLP')
        self._create_model_layer_visualization(xgb_layer_data, 'XGBoost')
    
    def _create_model_layer_visualization(self, layer_data, model_type):
        """Create layer-specific visualizations for a given model type."""
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return
        
        # Create DataFrame
        df = pd.DataFrame(layer_data)
        
        # Create visualization - Accuracy by layer
        plt.figure(figsize=(14, 10))
        
        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', data=df)
        plt.title(f'{model_type} Accuracy by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)
        plt.tight_layout()
        
        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', data=df)
        plt.title(f'{model_type} AUC by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)
        
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/CVX', f"{model_type.lower()}_layer_performance.png")
        plt.savefig(save_path)
        print(f"{model_type} layer performance visualization saved as: {save_path}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Make sure every output folder is present before anything is written to it
    output_directories = (
        'OpenAI_MLP_XGB/visualizations_mlp/CVX',
        'OpenAI_MLP_XGB/visualizations_xgb/CVX',
        'OpenAI_MLP_XGB/visualizations_summary/CVX',
    )
    for directory in output_directories:
        os.makedirs(directory, exist_ok=True)

    # Input data: news articles with pre-computed OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_CVX_completed_openai.csv'

    predictor = ClimateNewsOpenAIPredictor(csv_path)

    # Full pipeline: load -> define windows -> train and evaluate every combination
    predictor.load_data()
    predictor.define_time_windows()
    predictor.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'OpenAI_MLP_XGB' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 452, 0: 386}
Class distribution for long-term prediction: {1: 464, 0: 374}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_133"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   


Epoch 19: val_loss did not improve from 0.68468
Epoch 19: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_1.png
Test Accuracy for Layer 1: 0.6090
Test AUC for Layer 1: 0.6256

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Embeddings shapes - Train: (532, 1536), Val: (133, 1536), Test: (101, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_134"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_512 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_378 (B  (None, 512)               2048      
 atchNormalization)                                         

 batch_normalization_381 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_270 (Dropout)       (None, 512)               0         
                                                                 
 dense_517 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_382 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_271 (Dropout)       (None, 256)               0         
                                                                 
 dense_518 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_383 (B  (None, 128)               512       
 atchNorma

 batch_normalization_386 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_523 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69371, saving model to OpenAI_MLP_XGB/visualizations_mlp/CVX\best_mlp_title_L_label_layer_1.weights.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.69371 to 0.68678, saving model to OpenAI_MLP_XGB/visualizations_mlp/CVX\best_mlp_title_L_label_layer_1.weights.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.68678 to 0.67956, saving model to OpenAI_MLP_XGB/visualizations_mlp/CVX\best_mlp_title_L_label_layer_1.weights.h5
Epo

Epoch 10/100
Epoch 10: val_loss did not improve from 0.73995
Epoch 11/100
Epoch 11: val_loss did not improve from 0.73995
Epoch 12/100
Epoch 12: val_loss did not improve from 0.73995
Epoch 13/100
Epoch 13: val_loss did not improve from 0.73995
Epoch 14/100
Epoch 14: val_loss did not improve from 0.73995
Epoch 15/100
Epoch 15: val_loss did not improve from 0.73995
Epoch 16/100

Epoch 16: val_loss did not improve from 0.73995
Epoch 16: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_2.png
Test Accuracy for Layer 2: 0.5842
Test AUC for Layer 2: 0.5125

Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Embeddings shapes - Train: (666, 1536), Val: (101, 1536), Test: (70, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (L_label)
Model: "sequential_138"
_____________

Epoch 14/100
Epoch 14: val_loss did not improve from 0.69057
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69057
Epoch 16/100
Epoch 16: val_loss did not improve from 0.69057
Epoch 17/100

Epoch 17: val_loss did not improve from 0.69057
Epoch 17: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_3.png
Test Accuracy for Layer 3: 0.4429
Test AUC for Layer 3: 0.5798

Average Test Accuracy across all layers: 0.5880
Average Test AUC across all layers: 0.5019

Training MLP model for Full text and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (S_label)
Model: "sequential_139"
____________________________________________________

Epoch 18: val_loss did not improve from 0.67070
Epoch 19/100
Epoch 19: val_loss did not improve from 0.67070
Epoch 20/100
Epoch 20: val_loss did not improve from 0.67070
Epoch 21/100
Epoch 21: val_loss did not improve from 0.67070
Epoch 22/100

Epoch 22: val_loss did not improve from 0.67070
Epoch 22: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_2.png
Test Accuracy for Layer 2: 0.5347
Test AUC for Layer 2: 0.5386

Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Embeddings shapes - Train: (666, 1536), Val: (101, 1536), Test: (70, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (S_label)
Model: "sequential_141"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_5

Epoch 14/100
Epoch 14: val_loss improved from 0.63636 to 0.63540, saving model to OpenAI_MLP_XGB/visualizations_mlp/CVX\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 15/100
Epoch 15: val_loss improved from 0.63540 to 0.63399, saving model to OpenAI_MLP_XGB/visualizations_mlp/CVX\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 16/100
Epoch 16: val_loss improved from 0.63399 to 0.63370, saving model to OpenAI_MLP_XGB/visualizations_mlp/CVX\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 17/100
Epoch 17: val_loss improved from 0.63370 to 0.63306, saving model to OpenAI_MLP_XGB/visualizations_mlp/CVX\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 18/100
Epoch 18: val_loss did not improve from 0.63306
Epoch 19/100
Epoch 19: val_loss did not improve from 0.63306
Epoch 20/100
Epoch 20: val_loss did not improve from 0.63306
Epoch 21/100
Epoch 21: val_loss improved from 0.63306 to 0.63241, saving model to OpenAI_MLP_XGB/visualizations_mlp/CVX\best_mlp_full_text_L_label_layer_

                                                                 
 batch_normalization_406 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_287 (Dropout)       (None, 256)               0         
                                                                 
 dense_550 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_407 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_551 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_______________________________________

[57]	validation_0-auc:0.38454
[58]	validation_0-auc:0.37672
[59]	validation_0-auc:0.37340
[60]	validation_0-auc:0.38028
[61]	validation_0-auc:0.38383
[62]	validation_0-auc:0.37933
[63]	validation_0-auc:0.37767
[64]	validation_0-auc:0.38573
[65]	validation_0-auc:0.38146
[66]	validation_0-auc:0.37648
[67]	validation_0-auc:0.37482
[68]	validation_0-auc:0.37364
[69]	validation_0-auc:0.37150
[70]	validation_0-auc:0.37553
[71]	validation_0-auc:0.37790
[72]	validation_0-auc:0.38051
[73]	validation_0-auc:0.38407
[74]	validation_0-auc:0.38620
[75]	validation_0-auc:0.38407
[76]	validation_0-auc:0.38525
[77]	validation_0-auc:0.39142
[78]	validation_0-auc:0.40043
[79]	validation_0-auc:0.39545
Test Accuracy for Layer 1: 0.6090
Test AUC for Layer 1: 0.4734

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Embeddings shapes - Train: (532, 1

[26]	validation_0-auc:0.42769
[27]	validation_0-auc:0.42390
[28]	validation_0-auc:0.42366
[29]	validation_0-auc:0.42627
[30]	validation_0-auc:0.43030
[31]	validation_0-auc:0.43575
[32]	validation_0-auc:0.44168
[33]	validation_0-auc:0.42722
[34]	validation_0-auc:0.42769
[35]	validation_0-auc:0.42437
[36]	validation_0-auc:0.42485
[37]	validation_0-auc:0.42461
[38]	validation_0-auc:0.43480
[39]	validation_0-auc:0.43125
[40]	validation_0-auc:0.43101
[41]	validation_0-auc:0.43528
[42]	validation_0-auc:0.42982
[43]	validation_0-auc:0.43860
[44]	validation_0-auc:0.44476
[45]	validation_0-auc:0.44618
[46]	validation_0-auc:0.45211
[47]	validation_0-auc:0.45116
[48]	validation_0-auc:0.44855
[49]	validation_0-auc:0.45330
[50]	validation_0-auc:0.45519
[51]	validation_0-auc:0.45092
[52]	validation_0-auc:0.45567
[53]	validation_0-auc:0.45282
[54]	validation_0-auc:0.44239
[55]	validation_0-auc:0.44429
[56]	validation_0-auc:0.43646
[57]	validation_0-auc:0.44026
[58]	validation_0-auc:0.44286
[59]	valid

[11]	validation_0-auc:0.44462
[12]	validation_0-auc:0.45185
[13]	validation_0-auc:0.45613
[14]	validation_0-auc:0.45452
[15]	validation_0-auc:0.46977
[16]	validation_0-auc:0.49465
[17]	validation_0-auc:0.49973
[18]	validation_0-auc:0.50963
[19]	validation_0-auc:0.49572
[20]	validation_0-auc:0.49759
[21]	validation_0-auc:0.49572
[22]	validation_0-auc:0.49117
[23]	validation_0-auc:0.50027
[24]	validation_0-auc:0.50147
[25]	validation_0-auc:0.50107
[26]	validation_0-auc:0.50696
[27]	validation_0-auc:0.51632
[28]	validation_0-auc:0.51445
[29]	validation_0-auc:0.51364
[30]	validation_0-auc:0.52622
[31]	validation_0-auc:0.52996
[32]	validation_0-auc:0.53237
[33]	validation_0-auc:0.52782
[34]	validation_0-auc:0.51097
[35]	validation_0-auc:0.50776
[36]	validation_0-auc:0.50562
[37]	validation_0-auc:0.49906
[38]	validation_0-auc:0.49050
[39]	validation_0-auc:0.49973
[40]	validation_0-auc:0.49973
[41]	validation_0-auc:0.50669
[42]	validation_0-auc:0.50428
[43]	validation_0-auc:0.50803
[44]	valid

MLP summary comparison visualization saved as: OpenAI_MLP_XGB/visualizations_summary/CVX\mlp_performance_comparison.png
XGBoost summary comparison visualization saved as: OpenAI_MLP_XGB/visualizations_summary/CVX\xgb_performance_comparison.png
MLP layer performance visualization saved as: OpenAI_MLP_XGB/visualizations_summary/CVX\mlp_layer_performance.png
XGBoost layer performance visualization saved as: OpenAI_MLP_XGB/visualizations_summary/CVX\xgboost_layer_performance.png

Analysis complete! Results saved to 'OpenAI_MLP_XGB' directory.


In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from xgboost import callback
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records the training history after each epoch
class PlotLearningCallback(Callback):
    """Keras callback that accumulates accuracy and loss values epoch by epoch.

    Despite its name, the class draws nothing itself: it only makes sure the
    plot directory exists and keeps the per-epoch metrics in four lists
    (`train_acc`, `val_acc`, `train_loss`, `val_loss`).

    Args:
        model_type: Label of the model being trained (stored as given).
        text_col: Label of the text input in use (stored as given).
        label_col: Name of the target column (stored as given).
        layer_num: Number of the time-window layer (stored as given).
        plot_dir: Directory that is created if it does not exist yet.
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        # Identification of the run this callback is attached to
        self.model_type, self.text_col = model_type, text_col
        self.label_col, self.layer_num = label_col, layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One list per tracked metric, extended by one value per epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; a metric missing from `logs` counts as 0."""
        epoch_logs = {} if logs is None else logs

        def lookup(primary, fallback):
            # Try the long metric name first, then the short one
            return epoch_logs.get(primary, epoch_logs.get(fallback, 0))

        self.train_acc.append(lookup('accuracy', 'acc'))
        self.val_acc.append(lookup('val_accuracy', 'val_acc'))
        self.train_loss.append(epoch_logs.get('loss', 0))
        self.val_loss.append(epoch_logs.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'OpenAI_MLP_XGB/visualizations_mlp/MPC'
        self.xgb_viz_dir = 'OpenAI_MLP_XGB/visualizations_xgb/MPC'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs(self.xgb_viz_dir, exist_ok=True)
        os.makedirs('OpenAI_MLP_XGB/visualizations_summary/MPC', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert embeddings to numpy arrays
        self.data['Title_embedding'] = self.data['Title_embedding_vector'].apply(parse_embedding)
        self.data['Fulltext_embedding'] = self.data['Full_text_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_title_embedding = self.data['Title_embedding'].iloc[0]
        sample_fulltext_embedding = self.data['Fulltext_embedding'].iloc[0]
        
        print(f"Sample Title embedding shape: {sample_title_embedding.shape}")
        print(f"Sample Fulltext embedding shape: {sample_fulltext_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data[text_col].values)
        X_val = np.stack(val_data[text_col].values)
        X_test = np.stack(test_data[text_col].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier used on document-level embeddings.

        Layer stack: Dense(512, relu) -> BatchNorm -> Dropout(0.3) ->
        Dense(256, relu) -> BatchNorm -> Dropout(0.3) -> Dense(128, relu) ->
        BatchNorm -> Dense(1, sigmoid). The model is compiled with Adam
        (learning rate 0.00005), binary cross-entropy loss and an accuracy metric.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        network = Sequential()

        # First block receives the raw embedding
        network.add(Dense(512, activation='relu', input_shape=input_shape))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Second block
        network.add(Dense(256, activation='relu'))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Third block has no dropout
        network.add(Dense(128, activation='relu'))
        network.add(BatchNormalization())

        # Single sigmoid unit for the binary label
        network.add(Dense(1, activation='sigmoid'))

        optimizer = Adam(learning_rate=0.00005)
        network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return network
    
    def get_xgb_parameters(self):
        """
        Get XGBoost parameters optimized for high-dimensional embeddings.
        Uses a single parameter set for both Title and Full text embeddings since they have the same dimension (1536).
        
        Returns:
            Dictionary of base parameters
        """
        # Setup XGBoost base parameters optimized for high-dimensional embeddings
        base_params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'n_estimators': 80,
            'max_depth': 3,
            'learning_rate': 0.0001,
            'subsample': 0.7,          # Row subsampling to prevent overfitting
            'colsample_bytree': 0.5,   # Column subsampling to handle high dimensionality
            'min_child_weight': 3,     # Prevents overfitting on high-dimensional embeddings
            'reg_alpha': 1.0,          # L1 regularization
            'reg_lambda': 2.0,         # L2 regularization
            'random_state': 42,
            'use_label_encoder': False # Avoid deprecation warning
        }
        
        return base_params
    
    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Draw the accuracy and loss curves of a finished training run and save them as a PNG.

        The figure has two panels (accuracy on the left, loss on the right).
        A panel's curves are drawn only when both its training and validation
        series are present in the history; titles, labels and grid are always set.

        Args:
            history: Training history (object exposing a `history` dict)
            model_type: Model type (MLP)
            text_col: Text column used
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        display_text = 'Title' if 'Title' in text_col else 'Full text'

        _, (accuracy_axis, loss_axis) = plt.subplots(1, 2, figsize=(15, 5))

        # An object without a `history` attribute contributes no curves
        curves = getattr(history, 'history', {})

        def draw_pair(axis, train_key, val_key, train_label, val_label):
            # Both series must exist, otherwise the panel stays empty
            if train_key in curves and val_key in curves:
                axis.plot(curves[train_key], label=train_label, color='blue')
                axis.plot(curves[val_key], label=val_label, color='orange')

        def decorate(axis, title, y_label, legend_position):
            axis.set_title(title)
            axis.set_xlabel('Epochs')
            axis.set_ylabel(y_label)
            axis.legend(loc=legend_position)
            axis.grid(True, linestyle='--', alpha=0.7)

        # The accuracy keys come in a long and a short spelling
        train_acc_key = 'accuracy' if 'accuracy' in curves else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in curves else 'val_acc'

        # Left panel: accuracy, fixed to the [0, 1] range
        draw_pair(accuracy_axis, train_acc_key, val_acc_key, 'Training Accuracy', 'Validation Accuracy')
        decorate(
            accuracy_axis,
            f'Final Accuracy Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})',
            'Accuracy',
            'lower right',
        )
        accuracy_axis.set_ylim([0, 1])

        # Right panel: loss
        draw_pair(loss_axis, 'loss', 'val_loss', 'Training Loss', 'Validation Loss')
        decorate(
            loss_axis,
            f'Final Loss Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})',
            'Loss',
            'upper right',
        )

        plt.tight_layout()
        filename = f"final_{model_type.lower()}_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.
        
        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP' or 'XGBoost')
        """
        # Store results
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")
        
        visualization_dir = self.mlp_viz_dir if model_type == 'MLP' else self.xgb_viz_dir
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            if model_type == 'MLP':
                # MLP model training
                model = self.create_mlp_model((1536,))
                print(f"Created MLP model for {display_text} ({label_col})")
                model.summary()
                
                # Setup callbacks
                plot_callback = PlotLearningCallback(model_type, display_text, label_col, i+1, visualization_dir)
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=15,
                    restore_best_weights=True,
                    verbose=1
                )
                model_checkpoint = ModelCheckpoint(
                    filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{i+1}.weights.h5",
                    monitor='val_loss',
                    save_weights_only=True,
                    save_best_only=True,
                    verbose=1
                )
                
                # Train model
                print(f"Training MLP model...")
                batch_size = 32  # Larger batch size for embeddings
                history = model.fit(
                    X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                    batch_size=batch_size,
                    callbacks=[early_stopping, model_checkpoint, plot_callback],
                    verbose=1
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict(X_test)
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create final learning curve visualization
                self.plot_final_learning_curves(history, model_type, text_col, label_col, i+1, visualization_dir)
                
                # Store training history
                training_history = history.history
                
            else:  # XGBoost - simplified approach to fix API issues
                # Get XGBoost parameters
                base_params = self.get_xgb_parameters()
                
                # Create and train XGBoost model
                print(f"Creating and training XGBoost model...")
                model = xgb.XGBClassifier(**base_params)
                
                # Only use validation set for evaluation (not training set)
                eval_set = [(X_val, y_val)]
                
                # Train the model
                model.fit(
                    X_train, y_train,
                    eval_set=eval_set,
                    verbose=True
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict_proba(X_test)[:, 1]
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create a simple summary for XGBoost (no detailed learning curves available)
                plt.figure(figsize=(10, 6))
                plt.text(0.5, 0.5, f'XGBoost Model Trained Successfully\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nAUC: {roc_auc_score(y_test, y_pred_proba):.4f}',
                         ha='center', va='center', size=14, fontweight='bold')
                plt.title(f'XGBoost Results ({display_text}, {label_col}, Layer {i+1})')
                plt.axis('off')
                plt.tight_layout()
                plt.savefig(f"{visualization_dir}/xgb_{display_text.replace(' ', '_')}_{label_col}_layer_{i+1}.png")
                plt.close()
                
                # Use None for training history since detailed learning curves aren't available
                training_history = None
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': training_history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text inputs, label columns, and model types."""
        # Define all combinations
        embedding_cols = ['Title_embedding', 'Fulltext_embedding']
        label_cols = ['S_label', 'L_label']
        model_types = ['MLP', 'XGBoost']
        
        # Run analysis for each combination
        for model_type in model_types:
            for embedding_col in embedding_cols:
                for label_col in label_cols:
                    self.train_and_evaluate_model(embedding_col, label_col, model_type)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        # Create a summary table for easier comparison
        summary_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)
            
            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")
            
            # Print layer-specific results
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
            
            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            })
        
        # Create summary visualizations
        self.create_summary_visualization(summary_data)
        
        # Clean up memory
        gc.collect()
        
        return self
    
    def create_summary_visualization(self, summary_data):
        """Create a summary visualization comparing all model combinations."""
        if not summary_data:
            print("No data available for summary visualization")
            return
        
        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)
        
        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)
        
        # Split by model type
        mlp_data = df[df['Model'] == 'MLP']
        xgb_data = df[df['Model'] == 'XGBoost']
        
        # 1. Performance comparison for MLP
        plt.figure(figsize=(12, 8))
        
        # Plot accuracy and AUC bars for MLP
        x = np.arange(len(mlp_data))
        width = 0.35
        
        plt.bar(x - width/2, mlp_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, mlp_data['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('MLP Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in mlp_data.iterrows()]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(mlp_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(mlp_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(mlp_data['Avg Accuracy'].max(), mlp_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/MPC', "mlp_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # 2. Performance comparison for XGBoost
        plt.figure(figsize=(12, 8))
        
        # Plot accuracy and AUC bars for XGBoost
        x = np.arange(len(xgb_data))
        
        plt.bar(x - width/2, xgb_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, xgb_data['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('XGBoost Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of XGBoost Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in xgb_data.iterrows()]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(xgb_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(xgb_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(xgb_data['Avg Accuracy'].max(), xgb_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/MPC', "xgb_performance_comparison.png")
        plt.savefig(save_path)
        print(f"XGBoost summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """Create visualizations showing performance across different layers for MLP and XGBoost models."""
        # Prepare data for visualization
        mlp_layer_data = []
        xgb_layer_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                layer_info = {
                    'Model': model_type,
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {i+1}",
                    'Layer_num': i+1,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                }
                
                if model_type == 'MLP':
                    mlp_layer_data.append(layer_info)
                else:  # XGBoost
                    xgb_layer_data.append(layer_info)
        
        # Create visualizations for each model type
        self._create_model_layer_visualization(mlp_layer_data, 'MLP')
        self._create_model_layer_visualization(xgb_layer_data, 'XGBoost')
    
    def _create_model_layer_visualization(self, layer_data, model_type):
        """Create layer-specific visualizations for a given model type."""
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return
        
        # Create DataFrame
        df = pd.DataFrame(layer_data)
        
        # Create visualization - Accuracy by layer
        plt.figure(figsize=(14, 10))
        
        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', data=df)
        plt.title(f'{model_type} Accuracy by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)
        plt.tight_layout()
        
        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', data=df)
        plt.title(f'{model_type} AUC by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)
        
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/MPC', f"{model_type.lower()}_layer_performance.png")
        plt.savefig(save_path)
        print(f"{model_type} layer performance visualization saved as: {save_path}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Output folders for the MLP plots, the XGBoost plots and the summary charts
    output_directories = (
        'OpenAI_MLP_XGB/visualizations_mlp/MPC',
        'OpenAI_MLP_XGB/visualizations_xgb/MPC',
        'OpenAI_MLP_XGB/visualizations_summary/MPC',
    )
    for directory in output_directories:
        os.makedirs(directory, exist_ok=True)

    # Input dataset: news articles with pre-computed OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_MPC_completed_openai.csv'

    predictor = ClimateNewsOpenAIPredictor(csv_path)

    # Pipeline: load the data, define the time windows, run every combination
    loaded = predictor.load_data()
    windowed = loaded.define_time_windows()
    windowed.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'OpenAI_MLP_XGB' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 460, 0: 378}
Class distribution for long-term prediction: {1: 503, 0: 335}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_145"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   

 dense_560 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_414 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_292 (Dropout)       (None, 512)               0         
                                                                 
 dense_561 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_415 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_293 (Dropout)       (None, 256)               0         
                                                                 
 dense_562 (Dense)           (None, 128)               32896     
          


Epoch 22: val_loss did not improve from 0.68787
Epoch 22: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_2.png
Test Accuracy for Layer 2: 0.5347
Test AUC for Layer 2: 0.4350

Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Embeddings shapes - Train: (666, 1536), Val: (101, 1536), Test: (70, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_147"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_564 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_417 (B  (None, 512)               2048      
 atchNormalization)                                           


Epoch 16: val_loss did not improve from 0.70761
Epoch 16: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_1.png
Test Accuracy for Layer 1: 0.2632
Test AUC for Layer 1: 0.5032

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Embeddings shapes - Train: (532, 1536), Val: (133, 1536), Test: (101, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (L_label)
Model: "sequential_149"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_572 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_423 (B  (None, 512)               2048      
 atchNormalization)                                         

Epoch 15/100
Epoch 15: val_loss did not improve from 0.65775
Epoch 16/100
Epoch 16: val_loss improved from 0.65775 to 0.65757, saving model to OpenAI_MLP_XGB/visualizations_mlp/MPC\best_mlp_title_L_label_layer_3.weights.h5
Epoch 17/100
Epoch 17: val_loss did not improve from 0.65757
Epoch 18/100
Epoch 18: val_loss improved from 0.65757 to 0.65694, saving model to OpenAI_MLP_XGB/visualizations_mlp/MPC\best_mlp_title_L_label_layer_3.weights.h5
Epoch 19/100
Epoch 19: val_loss improved from 0.65694 to 0.65368, saving model to OpenAI_MLP_XGB/visualizations_mlp/MPC\best_mlp_title_L_label_layer_3.weights.h5
Epoch 20/100
Epoch 20: val_loss improved from 0.65368 to 0.65265, saving model to OpenAI_MLP_XGB/visualizations_mlp/MPC\best_mlp_title_L_label_layer_3.weights.h5
Epoch 21/100
Epoch 21: val_loss did not improve from 0.65265
Epoch 22/100
Epoch 22: val_loss did not improve from 0.65265
Epoch 23/100
Epoch 23: val_loss did not improve from 0.65265
Epoch 24/100
Epoch 24: val_loss did not improve

 dense_582 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_431 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_583 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69252, saving model to OpenAI_MLP_XGB/visualizations_mlp/MPC\best_mlp_full_text_S_label_layer_1.weights.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.69252 to 0.68964, saving model to OpenAI_MLP_XGB/visualizations_mlp/MPC\best_mlp_full_text_S_label_layer_1.weights.h5
Epoch 3/100
Epoch 3: va

Epoch 24/100
Epoch 24: val_loss did not improve from 0.67754
Epoch 25/100
Epoch 25: val_loss did not improve from 0.67754
Epoch 26/100
Epoch 26: val_loss did not improve from 0.67754
Epoch 27/100

Epoch 27: val_loss did not improve from 0.67754
Epoch 27: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_1.png
Test Accuracy for Layer 1: 0.5489
Test AUC for Layer 1: 0.5619

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Embeddings shapes - Train: (532, 1536), Val: (133, 1536), Test: (101, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (S_label)
Model: "sequential_152"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_584 (Dense)           (None, 512)              

 batch_normalization_438 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_308 (Dropout)       (None, 512)               0         
                                                                 
 dense_593 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_439 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_309 (Dropout)       (None, 256)               0         
                                                                 
 dense_594 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_440 (B  (None, 128)               512       
 atchNorma

Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.71755, saving model to OpenAI_MLP_XGB/visualizations_mlp/MPC\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.71755
Epoch 3/100
Epoch 3: val_loss did not improve from 0.71755
Epoch 4/100
Epoch 4: val_loss did not improve from 0.71755
Epoch 5/100
Epoch 5: val_loss did not improve from 0.71755
Epoch 6/100
Epoch 6: val_loss did not improve from 0.71755
Epoch 7/100
Epoch 7: val_loss did not improve from 0.71755
Epoch 8/100
Epoch 8: val_loss did not improve from 0.71755
Epoch 9/100
Epoch 9: val_loss did not improve from 0.71755
Epoch 10/100
Epoch 10: val_loss did not improve from 0.71755
Epoch 11/100
Epoch 11: val_loss did not improve from 0.71755
Epoch 12/100
Epoch 12: val_loss did not improve from 0.

Epoch 29/100
Epoch 29: val_loss did not improve from 0.65441
Epoch 30/100
Epoch 30: val_loss did not improve from 0.65441
Epoch 31/100
Epoch 31: val_loss did not improve from 0.65441
Epoch 32/100

Epoch 32: val_loss did not improve from 0.65441
Epoch 32: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_3.png
Test Accuracy for Layer 3: 0.5000
Test AUC for Layer 3: 0.4485

Average Test Accuracy across all layers: 0.3831
Average Test AUC across all layers: 0.5082

Training XGBoost model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating and training XGBoost model...
[0]	validation_0-auc:0.62034
[1]	validation_0-auc:0.55285
[2]	validation_0-auc:0.54572
[3]	validation_0-auc:0.52516
[4]	validation_0-auc:0

[6]	validation_0-auc:0.56266
[7]	validation_0-auc:0.59181
[8]	validation_0-auc:0.59553
[9]	validation_0-auc:0.61993
[10]	validation_0-auc:0.58644
[11]	validation_0-auc:0.56121
[12]	validation_0-auc:0.56307
[13]	validation_0-auc:0.55252
[14]	validation_0-auc:0.58065
[15]	validation_0-auc:0.54363
[16]	validation_0-auc:0.53929
[17]	validation_0-auc:0.52192
[18]	validation_0-auc:0.52316
[19]	validation_0-auc:0.53991
[20]	validation_0-auc:0.52771
[21]	validation_0-auc:0.52192
[22]	validation_0-auc:0.51261
[23]	validation_0-auc:0.51489
[24]	validation_0-auc:0.51406
[25]	validation_0-auc:0.50124
[26]	validation_0-auc:0.51427
[27]	validation_0-auc:0.51572
[28]	validation_0-auc:0.51799
[29]	validation_0-auc:0.52523
[30]	validation_0-auc:0.53102
[31]	validation_0-auc:0.52833
[32]	validation_0-auc:0.52978
[33]	validation_0-auc:0.52667
[34]	validation_0-auc:0.52502
[35]	validation_0-auc:0.52068
[36]	validation_0-auc:0.51861
[37]	validation_0-auc:0.52357
[38]	validation_0-auc:0.52026
[39]	validatio

Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Embeddings shapes - Train: (666, 1536), Val: (101, 1536), Test: (70, 1536)
Creating and training XGBoost model...
[0]	validation_0-auc:0.50020
[1]	validation_0-auc:0.54058
[2]	validation_0-auc:0.55713
[3]	validation_0-auc:0.57742
[4]	validation_0-auc:0.58649
[5]	validation_0-auc:0.56738
[6]	validation_0-auc:0.53704
[7]	validation_0-auc:0.55162
[8]	validation_0-auc:0.59338
[9]	validation_0-auc:0.60126
[10]	validation_0-auc:0.58156
[11]	validation_0-auc:0.58806
[12]	validation_0-auc:0.59456
[13]	validation_0-auc:0.56974
[14]	validation_0-auc:0.54610
[15]	validation_0-auc:0.53999
[16]	validation_0-auc:0.51872
[17]	validation_0-auc:0.51458
[18]	validation_0-auc:0.50709
[19]	validation_0-auc:0.50552
[20]	validation_0-auc:0.50532
[21]	validation_0-auc:0.51024
[22]	validation_0-auc:0.51478
[23]	validation_0-auc:0.52246
[24]	validation_0-auc:0.52009
[25]	validation_0-auc:0.53073
[26]	validation_0-auc:0.52522
[27]	val

In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from xgboost import callback
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom Keras callback that records the training history after each epoch
class PlotLearningCallback(Callback):
    """
    Collect per-epoch accuracy and loss values for the training and validation sets.

    The callback only accumulates the metrics in lists; it does not draw
    anything itself. The plot directory is created on construction.
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        """
        Args:
            model_type: Model type identifier
            text_col: Text column used for the run
            label_col: Label column used for the run
            layer_num: Layer number of the run
            plot_dir: Directory for plots; created if it does not exist
        """
        super().__init__()
        # Run identification
        self.model_type, self.text_col, self.label_col = model_type, text_col, label_col
        self.layer_num, self.plot_dir = layer_num, plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # Metric histories, one entry appended per finished epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append the metrics of the finished epoch; missing values are stored as 0."""
        metrics = {} if logs is None else logs

        # Accuracy may be reported under either the long or the short key
        train_accuracy = metrics.get('accuracy', metrics.get('acc', 0))
        val_accuracy = metrics.get('val_accuracy', metrics.get('val_acc', 0))

        self.train_acc.append(train_accuracy)
        self.val_acc.append(val_accuracy)
        self.train_loss.append(metrics.get('loss', 0))
        self.val_loss.append(metrics.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'OpenAI_MLP_XGB/visualizations_mlp/SLB'
        self.xgb_viz_dir = 'OpenAI_MLP_XGB/visualizations_xgb/SLB'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs(self.xgb_viz_dir, exist_ok=True)
        os.makedirs('OpenAI_MLP_XGB/visualizations_summary/SLB', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert embeddings to numpy arrays
        self.data['Title_embedding'] = self.data['Title_embedding_vector'].apply(parse_embedding)
        self.data['Fulltext_embedding'] = self.data['Full_text_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_title_embedding = self.data['Title_embedding'].iloc[0]
        sample_fulltext_embedding = self.data['Fulltext_embedding'].iloc[0]
        
        print(f"Sample Title embedding shape: {sample_title_embedding.shape}")
        print(f"Sample Fulltext embedding shape: {sample_fulltext_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data[text_col].values)
        X_val = np.stack(val_data[text_col].values)
        X_test = np.stack(test_data[text_col].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Create a feed-forward MLP classifier for document-level embeddings.

        Architecture: three ReLU dense blocks with 512, 256 and 128 units, each
        followed by batch normalization (the first two also by 30% dropout),
        then a single sigmoid output unit. Compiled with Adam, binary
        cross-entropy loss and the accuracy metric.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        # (units, dropout rate or None) for each dense block, in order
        block_specs = [(512, 0.3), (256, 0.3), (128, None)]

        stack = []
        for position, (units, drop_rate) in enumerate(block_specs):
            if position == 0:
                # Only the first layer declares the input shape
                stack.append(Dense(units, activation='relu', input_shape=input_shape))
            else:
                stack.append(Dense(units, activation='relu'))
            stack.append(BatchNormalization())
            if drop_rate is not None:
                stack.append(Dropout(drop_rate))

        # Output layer
        stack.append(Dense(1, activation='sigmoid'))

        network = Sequential(stack)
        network.compile(
            loss='binary_crossentropy',
            optimizer=Adam(learning_rate=0.00005),
            metrics=['accuracy']
        )

        print("MLP model created")
        return network
    
    def get_xgb_parameters(self):
        """
        Build the keyword arguments used to construct the XGBoost classifier.

        The same settings are used for the Title and the Full text embeddings.

        Returns:
            A new dictionary of parameters on every call
        """
        # What is being learned and how the validation set is scored
        learning_task = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
        }

        # Size and pace of the boosted ensemble.
        # NOTE(review): learning_rate equals the Adam rate used for the MLP and is
        # unusually small for gradient boosting with 150 trees - confirm it is intended.
        boosting = {
            'n_estimators': 150,
            'max_depth': 3,
            'learning_rate': 0.00005,
        }

        # Sampling and penalty terms meant to limit overfitting
        regularisation = {
            'subsample': 0.7,          # fraction of rows drawn per tree
            'colsample_bytree': 0.5,   # fraction of embedding columns drawn per tree
            'min_child_weight': 3,
            'reg_alpha': 1.0,          # L1 penalty
            'reg_lambda': 2.0,         # L2 penalty
        }

        # Reproducibility and warning suppression
        housekeeping = {
            'random_state': 42,
            'use_label_encoder': False,
        }

        return {**learning_task, **boosting, **regularisation, **housekeeping}
    
    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Draw the end-of-training accuracy and loss curves side by side and save them as a PNG.

        Args:
            history: Object returned by training; its `.history` mapping is read if present
            model_type: Model type shown in the titles and used in the file name
            text_col: Text column used (decides between the 'Title' and 'Full text' label)
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        if 'Title' in text_col:
            display_text = 'Title'
        else:
            display_text = 'Full text'

        # Without a `.history` attribute there is nothing to draw; the axes stay empty
        history_dict = getattr(history, 'history', {})

        # Metric keys are either 'accuracy'/'val_accuracy' or the short 'acc'/'val_acc'
        acc_metric = 'acc'
        if 'accuracy' in history_dict:
            acc_metric = 'accuracy'
        val_acc_metric = 'val_acc'
        if 'val_accuracy' in history_dict:
            val_acc_metric = 'val_accuracy'

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
        run_description = f"{model_type} with {display_text} ({label_col}, Layer {layer_num})"

        # (axis, training key, validation key, quantity name, legend position)
        panels = (
            (ax1, acc_metric, val_acc_metric, 'Accuracy', 'lower right'),
            (ax2, 'loss', 'val_loss', 'Loss', 'upper right'),
        )
        for axis, train_key, val_key, quantity, legend_loc in panels:
            # Curves are drawn only when both the training and validation series exist
            if train_key in history_dict and val_key in history_dict:
                axis.plot(history_dict[train_key], label=f'Training {quantity}', color='blue')
                axis.plot(history_dict[val_key], label=f'Validation {quantity}', color='orange')
            axis.set_title(f'Final {quantity} Curves: {run_description}')
            axis.set_xlabel('Epochs')
            axis.set_ylabel(quantity)
            axis.legend(loc=legend_loc)
            if quantity == 'Accuracy':
                # Accuracy is shown on a fixed 0-1 scale
                axis.set_ylim([0, 1])
            axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_{model_type.lower()}_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.

        One model is trained for every entry of `self.layers`. A layer is skipped
        when it has fewer than 10 training samples or when any of its
        train/validation/test label sets contains fewer than two classes.
        Per-layer and averaged test accuracy/AUC are stored in
        `self.results["<model_type>|<display text>|<label_col>"]`.

        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP'; any other value trains XGBoost)

        Returns:
            self, so calls can be chained
        """
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        combo_results = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        self.results[combination_key] = combo_results

        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")

        is_mlp = model_type == 'MLP'
        visualization_dir = self.mlp_viz_dir if is_mlp else self.xgb_viz_dir

        for i, layer in enumerate(self.layers):
            layer_num = i + 1
            print(f"\nLayer {layer_num}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")

            # Split data and get embeddings (the raw data frames are not needed here)
            (X_train, y_train), (X_val, y_val), (X_test, y_test), _, _, _ = self.split_data(
                layer, text_col, label_col)

            # AUC is undefined for a single-class label set, so such layers are skipped
            if len(X_train) < 10 or any(len(np.unique(y)) < 2 for y in (y_train, y_val, y_test)):
                print(f"Skipping layer {layer_num} due to insufficient data or class imbalance")
                continue

            if is_mlp:
                y_pred_proba, training_history = self._fit_mlp_for_layer(
                    X_train, y_train, X_val, y_val, X_test,
                    model_type, text_col, display_text, label_col, layer_num, visualization_dir)
            else:
                y_pred_proba, training_history = self._fit_xgb_for_layer(
                    X_train, y_train, X_val, y_val, X_test)

            # Both model paths deliver 1-D probabilities; threshold at 0.5 for class labels
            y_pred = (y_pred_proba > 0.5).astype(int)
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)

            if not is_mlp:
                self._save_xgb_summary_figure(
                    accuracy, auc, display_text, label_col, layer_num, visualization_dir)

            combo_results['accuracy'].append(accuracy)
            combo_results['auc'].append(auc)

            print(f"Test Accuracy for Layer {layer_num}: {accuracy:.4f}")
            print(f"Test AUC for Layer {layer_num}: {auc:.4f}")

            # Keep the real layer number so skipped layers do not shift later labels
            combo_results['layer_results'].append({
                'layer': layer_num,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': training_history
            })

        # Averages fall back to 0 when every layer was skipped
        avg_accuracy = np.mean(combo_results['accuracy']) if combo_results['accuracy'] else 0
        avg_auc = np.mean(combo_results['auc']) if combo_results['auc'] else 0

        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")

        combo_results['avg_accuracy'] = avg_accuracy
        combo_results['avg_auc'] = avg_auc

        return self

    def _fit_mlp_for_layer(self, X_train, y_train, X_val, y_val, X_test,
                           model_type, text_col, display_text, label_col, layer_num, visualization_dir):
        """Train the MLP for one layer; return (1-D test probabilities, Keras history dict)."""
        # Derive the input shape from the data instead of hard-coding the embedding size
        model = self.create_mlp_model(X_train.shape[1:])
        print(f"Created MLP model for {display_text} ({label_col})")
        model.summary()

        # Setup callbacks
        plot_callback = PlotLearningCallback(model_type, display_text, label_col, layer_num, visualization_dir)
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=15,
            restore_best_weights=True,
            verbose=1
        )
        model_checkpoint = ModelCheckpoint(
            filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{layer_num}.weights.h5",
            monitor='val_loss',
            save_weights_only=True,
            save_best_only=True,
            verbose=1
        )

        print(f"Training MLP model...")
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=100,  # upper bound only; early stopping normally ends training sooner
            batch_size=32,
            callbacks=[early_stopping, model_checkpoint, plot_callback],
            verbose=1
        )

        # Keras returns shape (n, 1); flatten so the metrics receive a 1-D vector
        y_pred_proba = model.predict(X_test).ravel()

        # Create final learning curve visualization
        self.plot_final_learning_curves(history, model_type, text_col, label_col, layer_num, visualization_dir)

        return y_pred_proba, history.history

    def _fit_xgb_for_layer(self, X_train, y_train, X_val, y_val, X_test):
        """Train XGBoost for one layer; return (1-D test probabilities, None)."""
        print(f"Creating and training XGBoost model...")
        model = xgb.XGBClassifier(**self.get_xgb_parameters())

        # The validation set is only used for evaluation output, not for fitting
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            verbose=True
        )

        # Column 1 holds the probability of the positive class.
        # No training history is recorded for XGBoost, hence None.
        return model.predict_proba(X_test)[:, 1], None

    def _save_xgb_summary_figure(self, accuracy, auc, display_text, label_col, layer_num, visualization_dir):
        """Save a text-only figure summarising the XGBoost test metrics of one layer."""
        plt.figure(figsize=(10, 6))
        plt.text(0.5, 0.5, f'XGBoost Model Trained Successfully\nAccuracy: {accuracy:.4f}\nAUC: {auc:.4f}',
                 ha='center', va='center', size=14, fontweight='bold')
        plt.title(f'XGBoost Results ({display_text}, {label_col}, Layer {layer_num})')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(f"{visualization_dir}/xgb_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """
        Run the analysis for all combinations of text inputs, label columns, and model types.

        Trains every (model type, embedding column, label column) combination,
        prints a per-combination summary and triggers the summary visualizations.

        Returns:
            self, so calls can be chained
        """
        # Define all combinations
        embedding_cols = ['Title_embedding', 'Fulltext_embedding']
        label_cols = ['S_label', 'L_label']
        model_types = ['MLP', 'XGBoost']

        # Run analysis for each combination
        for model_type in model_types:
            for embedding_col in embedding_cols:
                for label_col in label_cols:
                    self.train_and_evaluate_model(embedding_col, label_col, model_type)

        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)

        # Create a summary table for easier comparison
        summary_data = []

        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")

            # Print layer-specific results. Skipped layers leave no entry in the
            # metric lists, so the position in those lists is not the layer number;
            # prefer the number recorded in 'layer_results'.
            layer_results = results.get('layer_results')
            if layer_results:
                per_layer = [(entry['layer'], entry['accuracy'], entry['auc']) for entry in layer_results]
            else:
                # Fallback for results without per-layer records: positional numbering
                per_layer = [
                    (position, accuracy, auc)
                    for position, (accuracy, auc) in enumerate(
                        zip(results.get('accuracy', []), results.get('auc', [])), start=1)
                ]
            for layer_num, accuracy, auc in per_layer:
                print(f"  Layer {layer_num} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            })

        # Create summary visualizations
        self.create_summary_visualization(summary_data)

        # Clean up memory
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """
        Create a summary visualization comparing all model combinations.

        One grouped bar chart (average accuracy vs. average AUC) is saved per
        model family, followed by the layer-specific charts.

        Args:
            summary_data: List of dicts with the keys 'Avg Accuracy', 'Avg AUC',
                'Model', 'Text' and 'Label'; nothing is drawn when it is empty
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)

        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)

        # Make sure the target folder exists even when this is not run via the script entry point
        output_dir = 'OpenAI_MLP_XGB/visualizations_summary/SLB'
        os.makedirs(output_dir, exist_ok=True)

        # (value in the 'Model' column / name used in titles, file name prefix)
        for model_name, file_prefix in (('MLP', 'mlp'), ('XGBoost', 'xgb')):
            self._plot_model_summary(df[df['Model'] == model_name], model_name, file_prefix, output_dir)

        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()

    def _plot_model_summary(self, model_df, model_name, file_prefix, output_dir):
        """Draw and save the average accuracy/AUC bar chart for one model family."""
        if model_df.empty:
            # With no rows the y-limit would be NaN and matplotlib would raise
            print(f"No {model_name} results available for summary visualization")
            return

        plt.figure(figsize=(12, 8))

        x = np.arange(len(model_df))
        width = 0.35

        # Accuracy bars sit left of each tick, AUC bars right of it
        plt.bar(x - width/2, model_df['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, model_df['Avg AUC'], width, label='Average AUC', color='salmon')

        plt.xlabel(f'{model_name} Model Combination')
        plt.ylabel('Score')
        plt.title(f'Performance Comparison of {model_name} Models with OpenAI Embeddings')
        labels = [f"{text} + {label}" for text, label in zip(model_df['Text'], model_df['Label'])]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()

        # Add value labels on top of bars
        for offset, column in ((-width/2, 'Avg Accuracy'), (width/2, 'Avg AUC')):
            for i, v in enumerate(model_df[column]):
                plt.text(i + offset, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)

        # Leave headroom above the tallest bar for its value label
        plt.ylim(0, max(model_df['Avg Accuracy'].max(), model_df['Avg AUC'].max()) + 0.1)
        plt.tight_layout()

        # Save the visualization
        save_path = os.path.join(output_dir, f"{file_prefix}_performance_comparison.png")
        plt.savefig(save_path)
        print(f"{model_name} summary comparison visualization saved as: {save_path}")
        plt.close()
    
    def create_layer_performance_visualizations(self):
        """Create visualizations showing performance across different layers for MLP and XGBoost models."""
        # Prepare data for visualization
        mlp_layer_data = []
        xgb_layer_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                layer_info = {
                    'Model': model_type,
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {i+1}",
                    'Layer_num': i+1,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                }
                
                if model_type == 'MLP':
                    mlp_layer_data.append(layer_info)
                else:  # XGBoost
                    xgb_layer_data.append(layer_info)
        
        # Create visualizations for each model type
        self._create_model_layer_visualization(mlp_layer_data, 'MLP')
        self._create_model_layer_visualization(xgb_layer_data, 'XGBoost')
    
    def _create_model_layer_visualization(self, layer_data, model_type):
        """
        Save a two-panel bar chart (accuracy on top, AUC below) grouped by combination and layer.

        Args:
            layer_data: List of per-layer records with the keys 'Combination',
                'Layer', 'Accuracy' and 'AUC'; nothing is drawn when it is empty
            model_type: Model name used in the titles and, lower-cased, in the file name
        """
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return

        frame = pd.DataFrame(layer_data)
        plt.figure(figsize=(14, 10))

        # Panel 1 shows accuracy, panel 2 shows AUC; both share the same layout
        for panel_index, metric in enumerate(('Accuracy', 'AUC'), start=1):
            plt.subplot(2, 1, panel_index)
            sns.barplot(x='Combination', y=metric, hue='Layer', data=frame)
            plt.title(f'{model_type} {metric} by Model Combination and Layer with OpenAI Embeddings')
            # Show at least up to 0.8, or slightly above the best score when it is higher
            upper_bound = max(0.8, frame[metric].max() + 0.05)
            plt.ylim(0, upper_bound)
            plt.xticks(rotation=0)
            plt.tight_layout()

        target_name = f"{model_type.lower()}_layer_performance.png"
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/SLB', target_name)
        plt.savefig(save_path)
        print(f"{model_type} layer performance visualization saved as: {save_path}")
        plt.close()


# Script entry point
if __name__ == "__main__":
    # Folders that receive the MLP plots, the XGBoost plots and the summary charts
    output_directories = (
        'OpenAI_MLP_XGB/visualizations_mlp/SLB',
        'OpenAI_MLP_XGB/visualizations_xgb/SLB',
        'OpenAI_MLP_XGB/visualizations_summary/SLB',
    )
    for directory in output_directories:
        os.makedirs(directory, exist_ok=True)

    # CSV with the news articles and their precomputed OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_SLB_completed_openai.csv'

    predictor = ClimateNewsOpenAIPredictor(csv_path)

    # Full pipeline: load the data, define the time windows, then train every combination.
    # Each step is invoked on the value returned by the previous one.
    loaded = predictor.load_data()
    windowed = loaded.define_time_windows()
    windowed.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'OpenAI_MLP_XGB' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 433, 1: 405}
Class distribution for long-term prediction: {0: 437, 1: 401}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_157"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_608 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_450 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_316 (Dropout)       (None, 512)               0         
                                                                 
 dense_609 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_451 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_317 (Dropout)       (None, 256)               0         
          

Epoch 21/100
Epoch 21: val_loss did not improve from 0.69033
Epoch 22/100
Epoch 22: val_loss did not improve from 0.69033
Epoch 23/100
Epoch 23: val_loss did not improve from 0.69033
Epoch 24/100
Epoch 24: val_loss did not improve from 0.69033
Epoch 25/100

Epoch 25: val_loss did not improve from 0.69033
Epoch 25: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_2.png
Test Accuracy for Layer 2: 0.5347
Test AUC for Layer 2: 0.3793

Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Embeddings shapes - Train: (666, 1536), Val: (101, 1536), Test: (70, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_159"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 de

Epoch 15/100
Epoch 15: val_loss did not improve from 0.69471
Epoch 16/100
Epoch 16: val_loss did not improve from 0.69471
Epoch 17/100
Epoch 17: val_loss did not improve from 0.69471
Epoch 18/100
Epoch 18: val_loss did not improve from 0.69471
Epoch 19/100
Epoch 19: val_loss did not improve from 0.69471
Epoch 20/100
Epoch 20: val_loss did not improve from 0.69471
Epoch 21/100
Epoch 21: val_loss did not improve from 0.69471
Epoch 22/100
Epoch 22: val_loss did not improve from 0.69471
Epoch 23/100

Epoch 23: val_loss did not improve from 0.69471
Epoch 23: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_3.png
Test Accuracy for Layer 3: 0.4429
Test AUC for Layer 3: 0.4036

Average Test Accuracy across all layers: 0.4963
Average Test AUC across all layers: 0.4262

Training MLP model for Title and L_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validati

Epoch 8/100
Epoch 8: val_loss did not improve from 0.70230
Epoch 9/100
Epoch 9: val_loss did not improve from 0.70230
Epoch 10/100
Epoch 10: val_loss did not improve from 0.70230
Epoch 11/100
Epoch 11: val_loss did not improve from 0.70230
Epoch 12/100
Epoch 12: val_loss did not improve from 0.70230
Epoch 13/100
Epoch 13: val_loss did not improve from 0.70230
Epoch 14/100
Epoch 14: val_loss did not improve from 0.70230
Epoch 15/100
Epoch 15: val_loss did not improve from 0.70230
Epoch 16/100

Epoch 16: val_loss did not improve from 0.70230
Epoch 16: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_2.png
Test Accuracy for Layer 2: 0.5149
Test AUC for Layer 2: 0.5000

Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Embeddings shapes - Train: (666, 1536), Val: (101, 1536), Test: (70, 1536)
Creating MLP model w

Epoch 10/100
Epoch 10: val_loss improved from 0.68991 to 0.68960, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_title_L_label_layer_3.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.68960 to 0.68910, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_title_L_label_layer_3.weights.h5
Epoch 12/100
Epoch 12: val_loss improved from 0.68910 to 0.68847, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_title_L_label_layer_3.weights.h5
Epoch 13/100
Epoch 13: val_loss did not improve from 0.68847
Epoch 14/100
Epoch 14: val_loss improved from 0.68847 to 0.68808, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_title_L_label_layer_3.weights.h5
Epoch 15/100
Epoch 15: val_loss improved from 0.68808 to 0.68736, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_title_L_label_layer_3.weights.h5
Epoch 16/100
Epoch 16: val_loss improved from 0.68736 to 0.68556, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_628 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_465 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_326 (Dropout)       (None, 512)               0         
                                                                 
 dense_629 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_466 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_327 (Dropout)       (None, 256)               0         
          

 batch_normalization_470 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_635 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69666, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_full_text_S_label_layer_2.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.69666
Epoch 3/100
Epoch 3: val_loss did not improve from 0.69666
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69666
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69666
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69666
Epoch 7/100
Epoch 7: val

Epoch 3/100
Epoch 3: val_loss did not improve from 0.70086
Epoch 4/100
Epoch 4: val_loss did not improve from 0.70086
Epoch 5/100
Epoch 5: val_loss did not improve from 0.70086
Epoch 6/100
Epoch 6: val_loss did not improve from 0.70086
Epoch 7/100
Epoch 7: val_loss did not improve from 0.70086
Epoch 8/100
Epoch 8: val_loss did not improve from 0.70086
Epoch 9/100
Epoch 9: val_loss did not improve from 0.70086
Epoch 10/100
Epoch 10: val_loss did not improve from 0.70086
Epoch 11/100
Epoch 11: val_loss improved from 0.70086 to 0.70020, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_full_text_S_label_layer_3.weights.h5
Epoch 12/100
Epoch 12: val_loss improved from 0.70020 to 0.69869, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_full_text_S_label_layer_3.weights.h5
Epoch 13/100
Epoch 13: val_loss improved from 0.69869 to 0.69868, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_full_text_S_label_layer_3.weights.h5
Epoch 14/100
Epoch 14: val_l

Epoch 31/100

Epoch 31: val_loss did not improve from 0.69670
Epoch 31: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_3.png
Test Accuracy for Layer 3: 0.5143
Test AUC for Layer 3: 0.5037

Average Test Accuracy across all layers: 0.4737
Average Test AUC across all layers: 0.4774

Training MLP model for Full text and L_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (L_label)
Model: "sequential_166"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_640 (Dense)           (None, 512)               786944    
                                 


Epoch 18: val_loss did not improve from 0.66273
Epoch 18: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_1.png
Test Accuracy for Layer 1: 0.6316
Test AUC for Layer 1: 0.5211

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Embeddings shapes - Train: (532, 1536), Val: (133, 1536), Test: (101, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (L_label)
Model: "sequential_167"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_644 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_477 (B  (None, 512)               2048      
 atchNormalization)                                 

Epoch 17/100
Epoch 17: val_loss did not improve from 0.68133
Epoch 18/100
Epoch 18: val_loss improved from 0.68133 to 0.67931, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 19/100
Epoch 19: val_loss improved from 0.67931 to 0.67744, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 20/100
Epoch 20: val_loss did not improve from 0.67744
Epoch 21/100
Epoch 21: val_loss improved from 0.67744 to 0.67645, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 22/100
Epoch 22: val_loss improved from 0.67645 to 0.67569, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 23/100
Epoch 23: val_loss improved from 0.67569 to 0.67476, saving model to OpenAI_MLP_XGB/visualizations_mlp/SLB\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 24/100
Epoch 24: val_loss did not improve 

 atchNormalization)                                              
                                                                 
 dropout_337 (Dropout)       (None, 256)               0         
                                                                 
 dense_650 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_482 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_651 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69648, saving model to OpenAI_

[63]	validation_0-auc:0.48121
[64]	validation_0-auc:0.48389
[65]	validation_0-auc:0.48121
[66]	validation_0-auc:0.47572
[67]	validation_0-auc:0.47409
[68]	validation_0-auc:0.47806
[69]	validation_0-auc:0.48063
[70]	validation_0-auc:0.48436
[71]	validation_0-auc:0.49020
[72]	validation_0-auc:0.49288
[73]	validation_0-auc:0.48296
[74]	validation_0-auc:0.48471
[75]	validation_0-auc:0.48413
[76]	validation_0-auc:0.48366
[77]	validation_0-auc:0.48786
[78]	validation_0-auc:0.48751
[79]	validation_0-auc:0.48564
[80]	validation_0-auc:0.49148
[81]	validation_0-auc:0.48739
[82]	validation_0-auc:0.48401
[83]	validation_0-auc:0.48926
[84]	validation_0-auc:0.48926
[85]	validation_0-auc:0.49218
[86]	validation_0-auc:0.48950
[87]	validation_0-auc:0.48378
[88]	validation_0-auc:0.48413
[89]	validation_0-auc:0.48646
[90]	validation_0-auc:0.48763
[91]	validation_0-auc:0.48599
[92]	validation_0-auc:0.48051
[93]	validation_0-auc:0.48191
[94]	validation_0-auc:0.48284
[95]	validation_0-auc:0.48284
[96]	valid

[105]	validation_0-auc:0.55152
[106]	validation_0-auc:0.55587
[107]	validation_0-auc:0.55538
[108]	validation_0-auc:0.55525
[109]	validation_0-auc:0.55674
[110]	validation_0-auc:0.55177
[111]	validation_0-auc:0.55550
[112]	validation_0-auc:0.55338
[113]	validation_0-auc:0.55450
[114]	validation_0-auc:0.55376
[115]	validation_0-auc:0.55326
[116]	validation_0-auc:0.55824
[117]	validation_0-auc:0.56272
[118]	validation_0-auc:0.55774
[119]	validation_0-auc:0.55923
[120]	validation_0-auc:0.56010
[121]	validation_0-auc:0.55438
[122]	validation_0-auc:0.55127
[123]	validation_0-auc:0.54978
[124]	validation_0-auc:0.55202
[125]	validation_0-auc:0.55301
[126]	validation_0-auc:0.55027
[127]	validation_0-auc:0.55090
[128]	validation_0-auc:0.55276
[129]	validation_0-auc:0.55351
[130]	validation_0-auc:0.55376
[131]	validation_0-auc:0.55002
[132]	validation_0-auc:0.55301
[133]	validation_0-auc:0.55152
[134]	validation_0-auc:0.55002
[135]	validation_0-auc:0.55152
[136]	validation_0-auc:0.54953
[137]	va

[146]	validation_0-auc:0.43779
[147]	validation_0-auc:0.43487
[148]	validation_0-auc:0.43651
[149]	validation_0-auc:0.43441
Test Accuracy for Layer 1: 0.4887
Test AUC for Layer 1: 0.6656

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Embeddings shapes - Train: (532, 1536), Val: (133, 1536), Test: (101, 1536)
Creating and training XGBoost model...
[0]	validation_0-auc:0.48891
[1]	validation_0-auc:0.44106
[2]	validation_0-auc:0.46980
[3]	validation_0-auc:0.48371
[4]	validation_0-auc:0.46663
[5]	validation_0-auc:0.48224
[6]	validation_0-auc:0.48247
[7]	validation_0-auc:0.46437
[8]	validation_0-auc:0.45645
[9]	validation_0-auc:0.44581
[10]	validation_0-auc:0.41324
[11]	validation_0-auc:0.40328
[12]	validation_0-auc:0.38100
[13]	validation_0-auc:0.38914
[14]	validation_0-auc:0.38111
[15]	validation_0-auc:0.37579
[16]	validation

[25]	validation_0-auc:0.64723
[26]	validation_0-auc:0.65962
[27]	validation_0-auc:0.65428
[28]	validation_0-auc:0.65938
[29]	validation_0-auc:0.65986
[30]	validation_0-auc:0.65792
[31]	validation_0-auc:0.65136
[32]	validation_0-auc:0.65792
[33]	validation_0-auc:0.65075
[34]	validation_0-auc:0.64529
[35]	validation_0-auc:0.65403
[36]	validation_0-auc:0.65403
[37]	validation_0-auc:0.64832
[38]	validation_0-auc:0.64431
[39]	validation_0-auc:0.63338
[40]	validation_0-auc:0.64504
[41]	validation_0-auc:0.64480
[42]	validation_0-auc:0.64444
[43]	validation_0-auc:0.63958
[44]	validation_0-auc:0.63739
[45]	validation_0-auc:0.64128
[46]	validation_0-auc:0.63290
[47]	validation_0-auc:0.63022
[48]	validation_0-auc:0.62949
[49]	validation_0-auc:0.63630
[50]	validation_0-auc:0.64407
[51]	validation_0-auc:0.64456
[52]	validation_0-auc:0.64359
[53]	validation_0-auc:0.64662
[54]	validation_0-auc:0.63776
[55]	validation_0-auc:0.63776
[56]	validation_0-auc:0.63520
[57]	validation_0-auc:0.63435
[58]	valid

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from xgboost import callback
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records training/validation accuracy and loss at the end of each epoch
class PlotLearningCallback(Callback):
    """Keras callback that accumulates accuracy and loss values epoch by epoch.

    The constructor keeps the descriptive arguments on the instance and makes
    sure ``plot_dir`` exists on disk. The callback itself draws nothing: it
    only appends the per-epoch values to ``train_acc``, ``val_acc``,
    ``train_loss`` and ``val_loss``.
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        # Descriptive metadata about the run this callback is attached to
        self.model_type, self.text_col, self.label_col = model_type, text_col, label_col
        self.layer_num = layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One entry is appended to each list at the end of every epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; a metric absent from ``logs`` is stored as 0."""
        metrics = {} if logs is None else logs

        # Accuracy may be reported under the long or the short key
        epoch_acc = metrics.get('accuracy', metrics.get('acc', 0))
        epoch_val_acc = metrics.get('val_accuracy', metrics.get('val_acc', 0))

        self.train_acc.append(epoch_acc)
        self.val_acc.append(epoch_val_acc)
        self.train_loss.append(metrics.get('loss', 0))
        self.val_loss.append(metrics.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'OpenAI_MLP_XGB/visualizations_mlp/XOM'
        self.xgb_viz_dir = 'OpenAI_MLP_XGB/visualizations_xgb/XOM'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs(self.xgb_viz_dir, exist_ok=True)
        os.makedirs('OpenAI_MLP_XGB/visualizations_summary/XOM', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert embeddings to numpy arrays
        self.data['Title_embedding'] = self.data['Title_embedding_vector'].apply(parse_embedding)
        self.data['Fulltext_embedding'] = self.data['Full_text_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_title_embedding = self.data['Title_embedding'].iloc[0]
        sample_fulltext_embedding = self.data['Fulltext_embedding'].iloc[0]
        
        print(f"Sample Title embedding shape: {sample_title_embedding.shape}")
        print(f"Sample Fulltext embedding shape: {sample_fulltext_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data[text_col].values)
        X_val = np.stack(val_data[text_col].values)
        X_test = np.stack(test_data[text_col].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier used on document-level embeddings.

        The network is Dense(512) -> Dense(256) -> Dense(128) with ReLU
        activations, batch normalisation after each of those layers, dropout
        (0.3) after the first two, and a single sigmoid output unit. It is
        compiled with Adam (learning rate 0.00005), binary cross-entropy loss
        and the accuracy metric.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        model = Sequential()

        # First block: takes the raw embedding
        model.add(Dense(512, activation='relu', input_shape=input_shape))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))

        # Second block
        model.add(Dense(256, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))

        # Third block (no dropout before the output)
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())

        # Output layer: probability of the positive class
        model.add(Dense(1, activation='sigmoid'))

        optimizer = Adam(learning_rate=0.00005)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return model
    
    def get_xgb_parameters(self):
        """
        Get XGBoost parameters optimized for high-dimensional embeddings.
        Uses a single parameter set for both Title and Full text embeddings since they have the same dimension (1536).
        
        Returns:
            Dictionary of base parameters
        """
        # Setup XGBoost base parameters optimized for high-dimensional embeddings
        base_params = {
            'objective': 'binary:logistic',
            'eval_metric': 'auc',
            'n_estimators': 100,
            'max_depth': 3,
            'learning_rate': 0.00005,
            'subsample': 0.7,          # Row subsampling to prevent overfitting
            'colsample_bytree': 0.5,   # Column subsampling to handle high dimensionality
            'min_child_weight': 3,     # Prevents overfitting on high-dimensional embeddings
            'reg_alpha': 1.0,          # L1 regularization
            'reg_lambda': 2.0,         # L2 regularization
            'random_state': 42,
            'use_label_encoder': False # Avoid deprecation warning
        }
        
        return base_params
    
    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy, loss) for a completed MLP training run.

        Args:
            history: Training history; its ``history`` attribute (a dict of
                per-epoch lists) is used when present, otherwise nothing is drawn
            model_type: Model type (MLP)
            text_col: Text column used
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        display_text = 'Title' if 'Title' in text_col else 'Full text'

        # Fall back to an empty dict when the object carries no history
        curves = getattr(history, 'history', {})

        # Use the long metric names when present, otherwise the short ones
        train_acc_key = 'accuracy' if 'accuracy' in curves else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in curves else 'val_acc'

        # (quantity, training key, validation key, legend position) per panel
        panels = (
            ('Accuracy', train_acc_key, val_acc_key, 'lower right'),
            ('Loss', 'loss', 'val_loss', 'upper right'),
        )

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        for ax, (quantity, train_key, val_key, legend_loc) in zip(axes, panels):
            # Curves are drawn only when both the training and validation series exist
            if train_key in curves and val_key in curves:
                ax.plot(curves[train_key], label=f'Training {quantity}', color='blue')
                ax.plot(curves[val_key], label=f'Validation {quantity}', color='orange')
            ax.set_title(f'Final {quantity} Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})')
            ax.set_xlabel('Epochs')
            ax.set_ylabel(quantity)
            ax.legend(loc=legend_loc)
            if quantity == 'Accuracy':
                # Accuracy is shown on a fixed 0-1 scale
                ax.set_ylim([0, 1])
            ax.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_{model_type.lower()}_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.
        
        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP' or 'XGBoost')
        """
        # Store results
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")
        
        visualization_dir = self.mlp_viz_dir if model_type == 'MLP' else self.xgb_viz_dir
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            if model_type == 'MLP':
                # MLP model training
                model = self.create_mlp_model((1536,))
                print(f"Created MLP model for {display_text} ({label_col})")
                model.summary()
                
                # Setup callbacks
                plot_callback = PlotLearningCallback(model_type, display_text, label_col, i+1, visualization_dir)
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=15,
                    restore_best_weights=True,
                    verbose=1
                )
                model_checkpoint = ModelCheckpoint(
                    filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{i+1}.weights.h5",
                    monitor='val_loss',
                    save_weights_only=True,
                    save_best_only=True,
                    verbose=1
                )
                
                # Train model
                print(f"Training MLP model...")
                batch_size = 32  # Larger batch size for embeddings
                history = model.fit(
                    X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                    batch_size=batch_size,
                    callbacks=[early_stopping, model_checkpoint, plot_callback],
                    verbose=1
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict(X_test)
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create final learning curve visualization
                self.plot_final_learning_curves(history, model_type, text_col, label_col, i+1, visualization_dir)
                
                # Store training history
                training_history = history.history
                
            else:  # XGBoost - simplified approach to fix API issues
                # Get XGBoost parameters
                base_params = self.get_xgb_parameters()
                
                # Create and train XGBoost model
                print(f"Creating and training XGBoost model...")
                model = xgb.XGBClassifier(**base_params)
                
                # Only use validation set for evaluation (not training set)
                eval_set = [(X_val, y_val)]
                
                # Train the model
                model.fit(
                    X_train, y_train,
                    eval_set=eval_set,
                    verbose=True
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict_proba(X_test)[:, 1]
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create a simple summary for XGBoost (no detailed learning curves available)
                plt.figure(figsize=(10, 6))
                plt.text(0.5, 0.5, f'XGBoost Model Trained Successfully\nAccuracy: {accuracy_score(y_test, y_pred):.4f}\nAUC: {roc_auc_score(y_test, y_pred_proba):.4f}',
                         ha='center', va='center', size=14, fontweight='bold')
                plt.title(f'XGBoost Results ({display_text}, {label_col}, Layer {i+1})')
                plt.axis('off')
                plt.tight_layout()
                plt.savefig(f"{visualization_dir}/xgb_{display_text.replace(' ', '_')}_{label_col}_layer_{i+1}.png")
                plt.close()
                
                # Use None for training history since detailed learning curves aren't available
                training_history = None
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': training_history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """
        Run the analysis for all combinations of text inputs, label columns, and model types.

        Trains every (model type, embedding column, label column) combination,
        prints a textual summary of ``self.results`` and then produces the
        summary visualizations.

        Per-layer lines are labelled with the layer number recorded at training
        time, so a layer that was skipped does not shift the numbering of the
        layers that follow it.

        Returns:
            self, so calls can be chained.
        """
        # Define all combinations
        embedding_cols = ['Title_embedding', 'Fulltext_embedding']
        label_cols = ['S_label', 'L_label']
        model_types = ['MLP', 'XGBoost']

        # Run analysis for each combination
        for model_type in model_types:
            for embedding_col in embedding_cols:
                for label_col in label_cols:
                    self.train_and_evaluate_model(embedding_col, label_col, model_type)

        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)

        # Create a summary table for easier comparison
        summary_data = []

        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")

            # Prefer the per-layer records, which carry the true layer number;
            # fall back to list position only when they are absent
            layer_records = results.get('layer_results')
            if layer_records:
                per_layer = [(rec['layer'], rec['accuracy'], rec['auc']) for rec in layer_records]
            else:
                per_layer = [
                    (position, accuracy, auc)
                    for position, (accuracy, auc) in enumerate(
                        zip(results.get('accuracy', []), results.get('auc', [])), start=1)
                ]

            # Print layer-specific results
            for layer_num, accuracy, auc in per_layer:
                print(f"  Layer {layer_num} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            })

        # Create summary visualizations
        self.create_summary_visualization(summary_data)

        # Clean up memory
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """Create a summary visualization comparing all model combinations."""
        if not summary_data:
            print("No data available for summary visualization")
            return
        
        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)
        
        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)
        
        # Split by model type
        mlp_data = df[df['Model'] == 'MLP']
        xgb_data = df[df['Model'] == 'XGBoost']
        
        # 1. Performance comparison for MLP
        plt.figure(figsize=(12, 8))
        
        # Plot accuracy and AUC bars for MLP
        x = np.arange(len(mlp_data))
        width = 0.35
        
        plt.bar(x - width/2, mlp_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, mlp_data['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('MLP Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in mlp_data.iterrows()]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(mlp_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(mlp_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(mlp_data['Avg Accuracy'].max(), mlp_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/XOM', "mlp_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # 2. Performance comparison for XGBoost
        plt.figure(figsize=(12, 8))
        
        # Plot accuracy and AUC bars for XGBoost
        x = np.arange(len(xgb_data))
        
        plt.bar(x - width/2, xgb_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, xgb_data['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('XGBoost Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of XGBoost Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in xgb_data.iterrows()]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(xgb_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(xgb_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(xgb_data['Avg Accuracy'].max(), xgb_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP_XGB/visualizations_summary/XOM', "xgb_performance_comparison.png")
        plt.savefig(save_path)
        print(f"XGBoost summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """Plot per-layer accuracy and AUC for the MLP and XGBoost models.

        Flattens ``self.results`` (keyed by ``"model|text|label"``) into one
        record per layer, groups the records by model type, and passes each
        group to ``_create_model_layer_visualization``.
        """
        mlp_rows, xgb_rows = [], []

        for key, metrics in self.results.items():
            model_type, text_col, label_col = key.split('|')
            # Any model type other than 'MLP' is grouped with XGBoost
            bucket = mlp_rows if model_type == 'MLP' else xgb_rows

            for position, accuracy in enumerate(metrics.get('accuracy', [])):
                # The AUC list is indexed in parallel with the accuracy list
                auc = metrics.get('auc', [])[position]
                layer_number = position + 1
                bucket.append({
                    'Model': model_type,
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {layer_number}",
                    'Layer_num': layer_number,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                })

        # One figure per model type
        for rows, label in ((mlp_rows, 'MLP'), (xgb_rows, 'XGBoost')):
            self._create_model_layer_visualization(rows, label)
    
    def _create_model_layer_visualization(self, layer_data, model_type):
        """Create layer-specific visualizations for a given model type.

        Draws a two-panel bar chart (accuracy on top, AUC below) grouped by
        text/label combination and coloured by layer, then saves it as
        ``<model_type lower-cased>_layer_performance.png`` in the summary
        visualization directory.

        Args:
            layer_data: List of per-layer record dicts; the plot reads the
                'Combination', 'Layer', 'Accuracy' and 'AUC' keys.
            model_type: Model name used in the titles and the output file name.

        Returns:
            None. Prints a message and returns early when ``layer_data`` is empty.
        """
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return

        # Create DataFrame
        df = pd.DataFrame(layer_data)

        fig = plt.figure(figsize=(14, 10))
        try:
            # Panel 1: accuracy, panel 2: AUC -- identical styling for both
            for panel, metric in enumerate(('Accuracy', 'AUC'), start=1):
                plt.subplot(2, 1, panel)
                sns.barplot(x='Combination', y=metric, hue='Layer', data=df)
                plt.title(f'{model_type} {metric} by Model Combination and Layer with OpenAI Embeddings')
                # Keep at least a 0-0.8 range so near-chance scores stay comparable
                plt.ylim(0, max(0.8, df[metric].max() + 0.05))
                plt.xticks(rotation=0)
                plt.tight_layout()

            # Save the visualization; create the target directory here so the
            # method does not depend on the script entry point having done it.
            output_dir = 'OpenAI_MLP_XGB/visualizations_summary/XOM'
            os.makedirs(output_dir, exist_ok=True)
            save_path = os.path.join(output_dir, f"{model_type.lower()}_layer_performance.png")
            plt.savefig(save_path)
            print(f"{model_type} layer performance visualization saved as: {save_path}")
        finally:
            # Always release the figure, even if plotting or saving failed
            plt.close(fig)


# Main execution
if __name__ == "__main__":
    # Make sure the visualization output directories are present
    output_dirs = (
        'OpenAI_MLP_XGB/visualizations_mlp/XOM',
        'OpenAI_MLP_XGB/visualizations_xgb/XOM',
        'OpenAI_MLP_XGB/visualizations_summary/XOM',
    )
    for output_dir in output_dirs:
        os.makedirs(output_dir, exist_ok=True)

    # Input CSV with the news articles and their OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_XOM_completed_openai.csv'

    # Build the predictor on top of the OpenAI embeddings
    predictor = ClimateNewsOpenAIPredictor(csv_path)

    # Full pipeline: load -> define time windows -> train/evaluate every combination
    (
        predictor
        .load_data()
        .define_time_windows()
        .run_all_combinations()
    )

    print("\nAnalysis complete! Results saved to 'OpenAI_MLP_XGB' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 427, 0: 411}
Class distribution for long-term prediction: {1: 428, 0: 410}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_169"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   

 dense_656 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_486 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_340 (Dropout)       (None, 512)               0         
                                                                 
 dense_657 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_487 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_341 (Dropout)       (None, 256)               0         
                                                                 
 dense_658 (Dense)           (None, 128)               32896     
          

                                                                 
 batch_normalization_490 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_343 (Dropout)       (None, 256)               0         
                                                                 
 dense_662 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_491 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_663 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_______________________________________

Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.70047, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_title_L_label_layer_1.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.70047
Epoch 3/100
Epoch 3: val_loss did not improve from 0.70047
Epoch 4/100
Epoch 4: val_loss did not improve from 0.70047
Epoch 5/100
Epoch 5: val_loss did not improve from 0.70047
Epoch 6/100
Epoch 6: val_loss did not improve from 0.70047
Epoch 7/100
Epoch 7: val_loss did not improve from 0.70047
Epoch 8/100
Epoch 8: val_loss did not improve from 0.70047
Epoch 9/100
Epoch 9: val_loss did not improve from 0.70047
Epoch 10/100
Epoch 10: val_loss did not improve from 0.70047
Epoch 11/100
Epoch 11: val_loss did not improve from 0.70047
Epoch 12/100
Epoch 12: val_loss did not improve from 0.7004

Epoch 5/100
Epoch 5: val_loss did not improve from 0.71194
Epoch 6/100
Epoch 6: val_loss did not improve from 0.71194
Epoch 7/100
Epoch 7: val_loss did not improve from 0.71194
Epoch 8/100
Epoch 8: val_loss did not improve from 0.71194
Epoch 9/100
Epoch 9: val_loss improved from 0.71194 to 0.70502, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_title_L_label_layer_2.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.70502 to 0.69758, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_title_L_label_layer_2.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.69758 to 0.68913, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_title_L_label_layer_2.weights.h5
Epoch 12/100
Epoch 12: val_loss improved from 0.68913 to 0.68111, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_title_L_label_layer_2.weights.h5
Epoch 13/100
Epoch 13: val_loss improved from 0.68111 to 0.67405, saving model to OpenAI_MLP_XGB/visualizations_mlp/XO

Epoch 30: val_loss did not improve from 0.64814
Epoch 31/100
Epoch 31: val_loss did not improve from 0.64814
Epoch 32/100
Epoch 32: val_loss did not improve from 0.64814
Epoch 33/100
Epoch 33: val_loss did not improve from 0.64814
Epoch 34/100
Epoch 34: val_loss did not improve from 0.64814
Epoch 35/100

Epoch 35: val_loss did not improve from 0.64814
Epoch 35: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_2.png
Test Accuracy for Layer 2: 0.4653
Test AUC for Layer 2: 0.5807

Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Embeddings shapes - Train: (666, 1536), Val: (101, 1536), Test: (70, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (L_label)
Model: "sequential_174"
_________________________________________________________________
 Layer (type)        

Epoch 13/100
Epoch 13: val_loss improved from 0.67543 to 0.67453, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_title_L_label_layer_3.weights.h5
Epoch 14/100
Epoch 14: val_loss improved from 0.67453 to 0.67425, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_title_L_label_layer_3.weights.h5
Epoch 15/100
Epoch 15: val_loss improved from 0.67425 to 0.67377, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_title_L_label_layer_3.weights.h5
Epoch 16/100
Epoch 16: val_loss improved from 0.67377 to 0.67196, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_title_L_label_layer_3.weights.h5
Epoch 17/100
Epoch 17: val_loss did not improve from 0.67196
Epoch 18/100
Epoch 18: val_loss did not improve from 0.67196
Epoch 19/100
Epoch 19: val_loss did not improve from 0.67196
Epoch 20/100
Epoch 20: val_loss did not improve from 0.67196
Epoch 21/100
Epoch 21: val_loss did not improve from 0.67196
Epoch 22/100
Epoch 22: val_loss did not improve

Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.70095, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_S_label_layer_1.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.70095
Epoch 3/100
Epoch 3: val_loss did not improve from 0.70095
Epoch 4/100
Epoch 4: val_loss did not improve from 0.70095
Epoch 5/100
Epoch 5: val_loss did not improve from 0.70095
Epoch 6/100
Epoch 6: val_loss did not improve from 0.70095
Epoch 7/100
Epoch 7: val_loss did not improve from 0.70095
Epoch 8/100
Epoch 8: val_loss did not improve from 0.70095
Epoch 9/100
Epoch 9: val_loss did not improve from 0.70095
Epoch 10/100
Epoch 10: val_loss did not improve from 0.70095
Epoch 11/100
Epoch 11: val_loss did not improve from 0.70095
Epoch 12/100
Epoch 12: val_loss did not improve from 0.

Epoch 4/100
Epoch 4: val_loss did not improve from 0.69149
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69149
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69149
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69149
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69149
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69149
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69149
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69149
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69149
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69149
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69149
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69149
Epoch 16/100
Epoch 16: val_loss did not improve from 0.69149
Epoch 17/100
Epoch 17: val_loss did not improve from 0.69149
Epoch 18/100

Epoch 18: val_loss did not improve from 0.69149
Epoch 18: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_2.png
Test Acc

Epoch 7/100
Epoch 7: val_loss did not improve from 0.69602
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69602
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69602
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69602
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69602
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69602
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69602
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69602
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69602
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69602
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_3.png
Test Accuracy for Layer 3: 0.5571
Test AUC for Layer 3: 0.5798

Average Test Accuracy across all layers: 0.5019
Average Test AUC across all layers: 0.4756

Training MLP model for Full text and L_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing peri

Epoch 9/100
Epoch 9: val_loss improved from 0.69080 to 0.69053, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.69053 to 0.68969, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 11/100
Epoch 11: val_loss did not improve from 0.68969
Epoch 12/100
Epoch 12: val_loss did not improve from 0.68969
Epoch 13/100
Epoch 13: val_loss did not improve from 0.68969
Epoch 14/100
Epoch 14: val_loss did not improve from 0.68969
Epoch 15/100
Epoch 15: val_loss did not improve from 0.68969
Epoch 16/100
Epoch 16: val_loss did not improve from 0.68969
Epoch 17/100
Epoch 17: val_loss did not improve from 0.68969
Epoch 18/100
Epoch 18: val_loss did not improve from 0.68969
Epoch 19/100
Epoch 19: val_loss did not improve from 0.68969
Epoch 20/100
Epoch 20: val_loss did not improve from 0.68969
Epoch 21/100
Epoch 21: val_loss did not improve from 0.68969

Epoch 3/100
Epoch 3: val_loss improved from 0.67380 to 0.66758, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 4/100
Epoch 4: val_loss improved from 0.66758 to 0.66167, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 5/100
Epoch 5: val_loss improved from 0.66167 to 0.65636, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 6/100
Epoch 6: val_loss improved from 0.65636 to 0.65533, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 7/100
Epoch 7: val_loss improved from 0.65533 to 0.65523, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 8/100
Epoch 8: val_loss did not improve from 0.65523
Epoch 9/100
Epoch 9: val_loss did not improve from 0.65523
Epoch 10/100
Epoch 10: val_loss did not improve from 0.65523
E

Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69177, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.69177 to 0.69112, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_3.weights.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.69112 to 0.69090, saving model to OpenAI_MLP_XGB/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_3.weights.h5
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69090
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69090
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69090
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69090
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69090
Epoch 9/100
Epoch 9: val_l

[0]	validation_0-auc:0.51856
[1]	validation_0-auc:0.53455
[2]	validation_0-auc:0.50474
[3]	validation_0-auc:0.48130
[4]	validation_0-auc:0.46179
[5]	validation_0-auc:0.45908
[6]	validation_0-auc:0.50528
[7]	validation_0-auc:0.50298
[8]	validation_0-auc:0.50190
[9]	validation_0-auc:0.49864
[10]	validation_0-auc:0.49485
[11]	validation_0-auc:0.49228
[12]	validation_0-auc:0.51165
[13]	validation_0-auc:0.50081
[14]	validation_0-auc:0.49675
[15]	validation_0-auc:0.49146
[16]	validation_0-auc:0.50759
[17]	validation_0-auc:0.52629
[18]	validation_0-auc:0.51165
[19]	validation_0-auc:0.51396
[20]	validation_0-auc:0.50556
[21]	validation_0-auc:0.49702
[22]	validation_0-auc:0.50190
[23]	validation_0-auc:0.49444
[24]	validation_0-auc:0.49539
[25]	validation_0-auc:0.48970
[26]	validation_0-auc:0.48753
[27]	validation_0-auc:0.48604
[28]	validation_0-auc:0.47602
[29]	validation_0-auc:0.47669
[30]	validation_0-auc:0.47886
[31]	validation_0-auc:0.48076
[32]	validation_0-auc:0.48821
[33]	validation_0-au

[85]	validation_0-auc:0.46530
[86]	validation_0-auc:0.46748
[87]	validation_0-auc:0.46656
[88]	validation_0-auc:0.46748
[89]	validation_0-auc:0.46473
[90]	validation_0-auc:0.46679
[91]	validation_0-auc:0.46713
[92]	validation_0-auc:0.46759
[93]	validation_0-auc:0.46862
[94]	validation_0-auc:0.46839
[95]	validation_0-auc:0.46942
[96]	validation_0-auc:0.47000
[97]	validation_0-auc:0.46862
[98]	validation_0-auc:0.46427
[99]	validation_0-auc:0.46496
Test Accuracy for Layer 2: 0.5446
Test AUC for Layer 2: 0.3759

Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Embeddings shapes - Train: (666, 1536), Val: (101, 1536), Test: (70, 1536)
Creating and training XGBoost model...
[0]	validation_0-auc:0.51858
[1]	validation_0-auc:0.45652
[2]	validation_0-auc:0.49348
[3]	validation_0-auc:0.49190
[4]	validation_0-auc:0.48360
[5]	validation_0

XGBoost summary comparison visualization saved as: OpenAI_MLP_XGB/visualizations_summary/XOM\xgb_performance_comparison.png
MLP layer performance visualization saved as: OpenAI_MLP_XGB/visualizations_summary/XOM\mlp_layer_performance.png
XGBoost layer performance visualization saved as: OpenAI_MLP_XGB/visualizations_summary/XOM\xgboost_layer_performance.png

Analysis complete! Results saved to 'OpenAI_MLP_XGB' directory.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records training/validation metrics after each epoch
class PlotLearningCallback(Callback):
    """Keras callback that accumulates per-epoch accuracy and loss values.

    The callback itself only collects the numbers (in ``train_acc``,
    ``val_acc``, ``train_loss`` and ``val_loss``); it also makes sure the
    plot directory exists when it is constructed.
    """

    def __init__(self, text_col, label_col, layer_num, plot_dir):
        """Store the run identifiers and prepare empty metric histories.

        Args:
            text_col: Name of the text/embedding source for this run.
            label_col: Name of the label column for this run.
            layer_num: Number of the time-window layer being trained.
            plot_dir: Directory for plots; created if it does not exist.
        """
        super().__init__()
        self.text_col = text_col
        self.label_col = label_col
        self.layer_num = layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One entry is appended to each list per finished epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; a missing metric is recorded as 0."""
        epoch_logs = {} if logs is None else logs

        # Accept both the long ('accuracy') and short ('acc') metric names
        train_accuracy = epoch_logs.get('accuracy', epoch_logs.get('acc', 0))
        val_accuracy = epoch_logs.get('val_accuracy', epoch_logs.get('val_acc', 0))

        self.train_acc.append(train_accuracy)
        self.val_acc.append(val_accuracy)
        self.train_loss.append(epoch_logs.get('loss', 0))
        self.val_loss.append(epoch_logs.get('val_loss', 0))

class MergedEmbeddingMLPPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using merged OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with merged OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'Merged_OpenAI_MLP/visualizations_mlp/COP'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('Merged_OpenAI_MLP/visualizations_summary/COP', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with merged OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing merged OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert merged embeddings to numpy arrays
        self.data['Merged_embedding'] = self.data['Merged_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_merged_embedding = self.data['Merged_embedding'].iloc[0]
        print(f"Sample Merged embedding shape: {sample_merged_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data['Merged_embedding'].values)
        X_val = np.stack(val_data['Merged_embedding'].values)
        X_test = np.stack(test_data['Merged_embedding'].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier for document-level embeddings.

        Layer stack: Dense(512, relu) -> BatchNormalization -> Dropout(0.3) ->
        Dense(256, relu) -> BatchNormalization -> Dropout(0.3) ->
        Dense(128, relu) -> BatchNormalization -> Dense(1, sigmoid).
        Compiled with Adam (learning rate 0.00005), binary cross-entropy loss
        and the accuracy metric.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        layer_stack = [
            # First dense block; also declares the input shape
            Dense(512, activation='relu', input_shape=input_shape),
            BatchNormalization(),
            Dropout(0.3),

            # Second dense block
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),

            # Third dense block (no dropout)
            Dense(128, activation='relu'),
            BatchNormalization(),

            # Single sigmoid unit for the binary label
            Dense(1, activation='sigmoid'),
        ]
        model = Sequential(layer_stack)

        optimizer = Adam(learning_rate=0.00005)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return model
    
    def plot_final_learning_curves(self, history, label_col, layer_num, visualization_dir):
        """
        Draw and save the accuracy and loss curves of a finished MLP training run.

        The figure has two side-by-side panels (accuracy left, loss right) and is
        written to ``final_mlp_merged_<label_col>_layer_<layer_num>.png`` inside
        ``visualization_dir``. A curve pair is drawn only when both its training
        and validation series are present in the history.

        Args:
            history: Training history; its ``history`` attribute (if any) is read as a dict
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 5))

        # Objects without a ``history`` attribute are treated as having no curves
        curves = getattr(history, 'history', {})

        # Metric names may be the long ('accuracy') or the short ('acc') form
        train_acc_key = 'accuracy' if 'accuracy' in curves else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in curves else 'val_acc'

        # Left panel: accuracy
        if train_acc_key in curves and val_acc_key in curves:
            acc_ax.plot(curves[train_acc_key], label='Training Accuracy', color='blue')
            acc_ax.plot(curves[val_acc_key], label='Validation Accuracy', color='orange')
        acc_ax.set_title(f'Final Accuracy Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
        acc_ax.set_xlabel('Epochs')
        acc_ax.set_ylabel('Accuracy')
        acc_ax.legend(loc='lower right')
        acc_ax.set_ylim([0, 1])
        acc_ax.grid(True, linestyle='--', alpha=0.7)

        # Right panel: loss
        if 'loss' in curves and 'val_loss' in curves:
            loss_ax.plot(curves['loss'], label='Training Loss', color='blue')
            loss_ax.plot(curves['val_loss'], label='Validation Loss', color='orange')
        loss_ax.set_title(f'Final Loss Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
        loss_ax.set_xlabel('Epochs')
        loss_ax.set_ylabel('Loss')
        loss_ax.legend(loc='upper right')
        loss_ax.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()

        # Write the figure to disk and release it
        filename = f"final_mlp_merged_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, label_col):
        """
        Train and evaluate an MLP model for merged embeddings and a specific label column.

        For every time-window layer in ``self.layers`` the data is split, a fresh model is
        trained with early stopping and checkpointing on validation loss, and the test
        accuracy and AUC are stored under ``self.results["MLP|Merged|<label_col>"]``.
        Layers with fewer than 10 training samples, or with a single class in any of the
        three splits, are skipped and leave no entry in the result lists.

        Args:
            label_col: The label column to use ('S_label' or 'L_label')

        Returns:
            self, to allow call chaining
        """
        # Store results
        combination_key = f"MLP|Merged|{label_col}"
        combo_results = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        self.results[combination_key] = combo_results

        print(f"\n{'='*80}")
        print(f"Training MLP model for Merged Embeddings and {label_col}")
        print(f"{'='*80}")

        for layer_num, layer in enumerate(self.layers, start=1):
            print(f"\nLayer {layer_num}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")

            # Split data and get embeddings (the raw frames are not needed here)
            (X_train, y_train), (X_val, y_val), (X_test, y_test), _, _, _ = self.split_data(
                layer, label_col)

            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {layer_num} due to insufficient data or class imbalance")
                continue

            # Size the network from the data instead of assuming a fixed embedding width
            model = self.create_mlp_model(X_train.shape[1:])
            print(f"Created MLP model for Merged Embeddings ({label_col})")
            model.summary()

            # Setup callbacks
            plot_callback = PlotLearningCallback('Merged', label_col, layer_num, self.mlp_viz_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{self.mlp_viz_dir}/best_mlp_merged_{label_col}_layer_{layer_num}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )

            # Train model
            print("Training MLP model...")
            batch_size = 32
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Upper bound; early stopping normally ends training sooner
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )

            # Evaluate on test set; flatten the (n, 1) sigmoid output so that the
            # metrics receive 1-D scores matching the 1-D label vector
            y_pred_proba = np.ravel(model.predict(X_test))
            y_pred = (y_pred_proba > 0.5).astype(int)

            # Create final learning curve visualization
            self.plot_final_learning_curves(history, label_col, layer_num, self.mlp_viz_dir)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)

            combo_results['accuracy'].append(accuracy)
            combo_results['auc'].append(auc)

            print(f"Test Accuracy for Layer {layer_num}: {accuracy:.4f}")
            print(f"Test AUC for Layer {layer_num}: {auc:.4f}")

            # Store layer results (including the true layer number) for visualization
            combo_results['layer_results'].append({
                'layer': layer_num,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            })

        # Calculate average metrics (0 when every layer was skipped)
        avg_accuracy = np.mean(combo_results['accuracy']) if combo_results['accuracy'] else 0
        avg_auc = np.mean(combo_results['auc']) if combo_results['auc'] else 0

        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")

        combo_results['avg_accuracy'] = avg_accuracy
        combo_results['avg_auc'] = avg_auc

        return self
    
    def run_all_combinations(self):
        """
        Run the analysis for merged embeddings with both short-term and long-term labels.

        Trains and evaluates a model per label column, prints a textual summary of
        everything stored in ``self.results``, and passes one summary row per
        combination to ``create_summary_visualization``.

        Returns:
            self, to allow call chaining
        """
        # Run analysis for each label column
        for label_col in ('S_label', 'L_label'):
            self.train_and_evaluate_model(label_col)

        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (COP)")
        print("="*80)

        # One row per combination, used for the summary chart
        summary_data = []

        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")

            accuracies = results.get('accuracy', [])
            aucs = results.get('auc', [])
            layer_results = results.get('layer_results', [])

            # Skipped layers leave no entry in the metric lists, so a list position is
            # not a layer number. Prefer the number recorded with each layer result and
            # fall back to the position only when that record is unavailable.
            if len(layer_results) == len(accuracies):
                layer_nums = [
                    entry.get('layer', position)
                    for position, entry in enumerate(layer_results, start=1)
                ]
            else:
                layer_nums = list(range(1, len(accuracies) + 1))

            # Print layer-specific results
            for layer_num, accuracy, auc in zip(layer_nums, accuracies, aucs):
                print(f"  Layer {layer_num} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Label': label_col
            })

        # Create summary visualizations
        self.create_summary_visualization(summary_data)

        # Clean up memory
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """
        Draw a grouped bar chart of average accuracy and average AUC per model combination.

        The chart is saved as a PNG in the summary visualization folder; afterwards the
        per-layer charts are produced via ``create_layer_performance_visualizations``.

        Args:
            summary_data: List of dicts providing 'Avg Accuracy', 'Avg AUC' and 'Label';
                when empty, a notice is printed and nothing is drawn
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        table = pd.DataFrame(summary_data)

        # Force float dtype on both metric columns before plotting
        for metric_col in ('Avg Accuracy', 'Avg AUC'):
            table[metric_col] = table[metric_col].astype(float)
        accuracy_means = table['Avg Accuracy']
        auc_means = table['Avg AUC']

        bar_width = 0.35
        half_width = bar_width / 2
        positions = np.arange(len(table))

        plt.figure(figsize=(10, 6))

        # Two bars per combination, placed side by side around each tick
        plt.bar(positions - half_width, accuracy_means, bar_width, label='Average Accuracy', color='skyblue')
        plt.bar(positions + half_width, auc_means, bar_width, label='Average AUC', color='salmon')

        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with Merged OpenAI Embeddings')
        tick_labels = [f"Merged + {label}" for label in table['Label']]
        plt.xticks(positions, tick_labels, rotation=0)
        plt.legend()

        # Print each value just above its bar: accuracy bars first, then AUC bars
        for offset, series in ((-half_width, accuracy_means), (half_width, auc_means)):
            for idx, value in enumerate(series):
                plt.text(idx + offset, value + 0.01, f"{value:.3f}", ha='center', va='bottom', fontsize=9)

        # Leave headroom above the tallest bar for the value labels
        tallest = max(accuracy_means.max(), auc_means.max())
        plt.ylim(0, tallest + 0.1)
        plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/COP', "mlp_merged_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()

        # Follow up with the per-layer breakdown
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """
        Create visualizations showing performance across different layers for MLP models.

        Builds one row per evaluated layer from ``self.results`` and saves a two-panel
        bar chart (accuracy on top, AUC below) grouped by combination and layer. When no
        layer was evaluated, a notice is printed and nothing is drawn.
        """
        # Prepare data for visualization
        layer_data = []

        for combination, results in self.results.items():
            _model_type, text_col, label_col = combination.split('|')

            accuracies = results.get('accuracy', [])
            aucs = results.get('auc', [])
            layer_results = results.get('layer_results', [])

            # Skipped layers leave no entry in the metric lists, so a list position is
            # not a layer number. Prefer the number recorded with each layer result and
            # fall back to the position only when that record is unavailable.
            if len(layer_results) == len(accuracies):
                layer_nums = [
                    entry.get('layer', position)
                    for position, entry in enumerate(layer_results, start=1)
                ]
            else:
                layer_nums = list(range(1, len(accuracies) + 1))

            for layer_num, accuracy, auc in zip(layer_nums, accuracies, aucs):
                layer_data.append({
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {layer_num}",
                    'Layer_num': layer_num,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                })

        if not layer_data:
            print("No layer data available")
            return

        # Create DataFrame
        df = pd.DataFrame(layer_data)

        plt.figure(figsize=(10, 8))

        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', data=df)
        plt.title('MLP Accuracy by Model Combination and Layer with Merged OpenAI Embeddings')
        # Show at least the 0-0.8 range, extending it when a score is higher
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)
        plt.tight_layout()

        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', data=df)
        plt.title('MLP AUC by Model Combination and Layer with Merged OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)

        plt.tight_layout()

        # Save the visualization
        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/COP', "mlp_merged_layer_performance.png")
        plt.savefig(save_path)
        print(f"MLP layer performance visualization saved as: {save_path}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Output folders have to exist before any figure is written into them
    output_dirs = (
        'Merged_OpenAI_MLP/visualizations_mlp/COP',
        'Merged_OpenAI_MLP/visualizations_summary/COP',
    )
    for output_dir in output_dirs:
        os.makedirs(output_dir, exist_ok=True)

    # CSV holding the news articles together with their merged OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_title_full_text_SP500_database/semantic/wall_street_news_semantics_COP_completed_openai_Merged.csv'

    predictor = MergedEmbeddingMLPPredictor(csv_path)

    # Pipeline: load the data, define the time windows, then train and evaluate
    loaded = predictor.load_data()
    windowed = loaded.define_time_windows()
    windowed.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'Merged_OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing merged OpenAI embedding vectors from string format...
Sample Merged embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 431, 1: 407}
Class distribution for long-term prediction: {1: 440, 0: 398}

Training MLP model for Merged Embeddings and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dens

Epoch 18/100
Epoch 18: val_loss did not improve from 0.68880
Epoch 19/100
Epoch 19: val_loss did not improve from 0.68880
Epoch 20/100
Epoch 20: val_loss did not improve from 0.68880
Epoch 21/100
Epoch 21: val_loss did not improve from 0.68880
Epoch 22/100

Epoch 22: val_loss did not improve from 0.68880
Epoch 22: early stopping
Saved learning curves: final_mlp_merged_S_label_layer_1.png
Test Accuracy for Layer 1: 0.5338
Test AUC for Layer 1: 0.5395

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Embeddings shapes - Train: (532, 1536), Val: (133, 1536), Test: (101, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              P


Epoch 16: val_loss did not improve from 0.69768
Epoch 16: early stopping
Saved learning curves: final_mlp_merged_S_label_layer_2.png
Test Accuracy for Layer 2: 0.4455
Test AUC for Layer 2: 0.5040

Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Embeddings shapes - Train: (666, 1536), Val: (101, 1536), Test: (70, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 512)               786944    
                                                                 
 batch_normalization_6 (Bat  (None, 512)               2048      
 chNormalization)                                  

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 512)               786944    
                                                                 
 batch_normalization_9 (Bat  (None, 512)               2048      
 chNormalization)                                                
                                                                 
 dropout_6 (Dropout)         (None, 512)               0         
                                                                 
 dense_13 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_10 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_7 (Dropout)         (None, 256)               0         
          

Epoch 20/100
Epoch 20: val_loss did not improve from 0.63176
Epoch 21/100
Epoch 21: val_loss did not improve from 0.63176
Epoch 22/100
Epoch 22: val_loss did not improve from 0.63176
Epoch 23/100
Epoch 23: val_loss did not improve from 0.63176
Epoch 24/100
Epoch 24: val_loss did not improve from 0.63176
Epoch 25/100
Epoch 25: val_loss did not improve from 0.63176
Epoch 26/100
Epoch 26: val_loss did not improve from 0.63176
Epoch 27/100

Epoch 27: val_loss did not improve from 0.63176
Epoch 27: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_1.png
Test Accuracy for Layer 1: 0.7368
Test AUC for Layer 1: 0.6222

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Embeddings shapes - Train: (532, 1536), Val: (133, 1536), Test: (101, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_20 (Dense)            (None, 512)               786944    
                                                                 
 batch_normalization_15 (Ba  (None, 512)               2048      
 tchNormalization)                                               
                                                                 
 dropout_10 (Dropout)        (None, 512)               0         
                                                                 
 dense_21 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_16 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_11 (Dropout)        (None, 256)               0         
          

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records training/validation metrics after each epoch (it does not plot anything itself)
class PlotLearningCallback(Callback):
    """Callback that collects per-epoch accuracy and loss values for one training run.

    Nothing is drawn by this class: it only appends the metrics reported at the end of
    each epoch to ``train_acc``, ``val_acc``, ``train_loss`` and ``val_loss``.
    """

    def __init__(self, text_col, label_col, layer_num, plot_dir):
        """Store the run identifiers and make sure ``plot_dir`` exists.

        Args:
            text_col: Identifier of the text/embedding source for this run
            label_col: Label column the model is trained on
            layer_num: Number of the time-window layer
            plot_dir: Directory associated with this run; created if missing
        """
        super().__init__()
        self.text_col, self.label_col = text_col, label_col
        self.layer_num, self.plot_dir = layer_num, plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One list per tracked metric, extended once per epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; any metric missing from ``logs`` is recorded as 0."""
        epoch_logs = {} if logs is None else logs

        # Accuracy may be reported under the long or the short key
        train_accuracy = epoch_logs.get('accuracy', epoch_logs.get('acc', 0))
        val_accuracy = epoch_logs.get('val_accuracy', epoch_logs.get('val_acc', 0))

        self.train_acc.append(train_accuracy)
        self.val_acc.append(val_accuracy)
        self.train_loss.append(epoch_logs.get('loss', 0))
        self.val_loss.append(epoch_logs.get('val_loss', 0))

class MergedEmbeddingMLPPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using merged OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with merged OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'Merged_OpenAI_MLP/visualizations_mlp/CVX'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('Merged_OpenAI_MLP/visualizations_summary/CVX', exist_ok=True)
            
    def load_data(self):
        """
        Load and preprocess the CSV data containing climate change news with merged OpenAI embeddings.

        Steps: read ``self.csv_path``; convert the three date columns to datetimes
        (day-first, with a value-by-value fallback when bulk parsing fails); sort by
        publication date; parse ``Merged_embedding_vector`` strings into float32 arrays
        stored in a new ``Merged_embedding`` column; print a short data summary.

        Missing or unparsable embeddings are replaced by a zero vector of length 1536.

        Returns:
            self, to allow call chaining
        """
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)

        date_columns = ('Publication date', 'Predicting date Short', 'Predicting date Long')

        # Helper function to handle different date formats
        def parse_date_column(column):
            parsed = []
            for date_str in column:
                try:
                    # Try to parse as DD/MM/YYYY
                    date = pd.to_datetime(date_str, format='%d/%m/%Y')
                except ValueError:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        # As a last resort, let pandas guess with dayfirst=True
                        date = pd.to_datetime(date_str, dayfirst=True)
                parsed.append(date)
            # Reuse the column's index so the assignment aligns row-for-row even when
            # the frame does not carry a default 0..n-1 index
            return pd.Series(parsed, index=column.index)

        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            for col in date_columns:
                self.data[col] = pd.to_datetime(self.data[col], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            for col in date_columns:
                self.data[col] = parse_date_column(self.data[col])

        # Sort by publication date
        self.data = self.data.sort_values('Publication date')

        # Parse embedding vectors from string format to numpy arrays
        print("Parsing merged OpenAI embedding vectors from string format...")

        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Zero vector for a missing embedding; float32 to match parsed vectors
                # so that stacking does not silently upcast everything to float64
                return np.zeros(1536, dtype=np.float32)
            try:
                # literal_eval only evaluates Python literals, never arbitrary code
                return np.array(ast.literal_eval(embedding_str), dtype=np.float32)
            except (ValueError, SyntaxError):
                # str() keeps the message safe when the cell is not a string
                print(f"Error parsing embedding: {str(embedding_str)[:50]}...")
                return np.zeros(1536, dtype=np.float32)

        # Convert merged embeddings to numpy arrays
        self.data['Merged_embedding'] = self.data['Merged_embedding_vector'].apply(parse_embedding)

        # Verify dimensions
        sample_merged_embedding = self.data['Merged_embedding'].iloc[0]
        print(f"Sample Merged embedding shape: {sample_merged_embedding.shape}")

        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")

        return self
    
    def define_time_windows(self):
        """
        Populate ``self.layers`` with the three train/validation/test time windows.

        Every layer trains from 2014-11-01; the end of the training window moves forward
        by one year per layer, followed by one validation window and one test window.

        Returns:
            self, to allow call chaining
        """
        boundary_names = ('train_start', 'train_end', 'val_start', 'val_end', 'test_start', 'test_end')

        # One row per layer, in the same order as ``boundary_names``
        boundary_dates = (
            # Layer 1: train (2014-2020), validation (2020-2021), test (2021-2022)
            ('2014-11-01', '2020-10-31', '2020-11-01', '2021-10-31', '2021-11-01', '2022-10-31'),
            # Layer 2: train (2014-2021), validation (2021-2022), test (2022-2023)
            ('2014-11-01', '2021-10-31', '2021-11-01', '2022-10-31', '2022-11-01', '2023-10-31'),
            # Layer 3: train (2014-2022), validation (2022-2023), test (2023-2024)
            ('2014-11-01', '2022-10-31', '2022-11-01', '2023-10-31', '2023-11-01', '2024-11-01'),
        )

        self.layers = [
            {name: pd.Timestamp(day) for name, day in zip(boundary_names, days)}
            for days in boundary_dates
        ]
        return self
    
    def split_data(self, layer, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract the merged embeddings and the labels for each set.

        Both ends of every window are inclusive. The windows of a layer are contiguous
        (each one ends the day before the next one starts), so an article published
        exactly on a start date belongs to that window instead of to none.

        Args:
            layer: The time window layer (dict with '<split>_start' / '<split>_end' timestamps)
            label_col: The column containing the labels ('S_label' or 'L_label')

        Returns:
            Tuple ``((X_train, y_train), (X_val, y_val), (X_test, y_test),
            train_data, val_data, test_data)``: stacked embedding matrices with their
            label arrays, followed by the three filtered DataFrames. An empty split
            yields an embedding matrix with zero rows rather than raising.
        """
        publication_dates = self.data['Publication date']

        def rows_between(start, end):
            # Inclusive on both sides, identical for all three splits
            return self.data[(publication_dates >= start) & (publication_dates <= end)]

        train_data = rows_between(layer['train_start'], layer['train_end'])
        val_data = rows_between(layer['val_start'], layer['val_end'])
        test_data = rows_between(layer['test_start'], layer['test_end'])

        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")

        # Width used for the zero-row placeholder of an empty split
        embedding_dim = len(self.data['Merged_embedding'].iloc[0]) if len(self.data) else 0

        def stack_embeddings(frame):
            # np.stack raises on an empty sequence; return an empty matrix instead so
            # the caller's "not enough data" check can skip the layer gracefully
            if len(frame) == 0:
                return np.empty((0, embedding_dim), dtype=np.float32)
            return np.stack(frame['Merged_embedding'].values)

        # Extract embeddings
        X_train = stack_embeddings(train_data)
        X_val = stack_embeddings(val_data)
        X_test = stack_embeddings(test_data)

        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values

        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier for document-level embeddings.

        The network has three ReLU Dense layers (512, 256 and 128 units), each followed
        by batch normalization, with dropout of 0.3 after the first two, and ends in a
        single sigmoid unit. It is compiled with Adam (learning rate 0.00005), binary
        cross-entropy loss and the accuracy metric.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        # First block receives the embeddings
        layer_stack = [
            Dense(512, activation='relu', input_shape=input_shape),
            BatchNormalization(),
            Dropout(0.3),
        ]
        # Hidden blocks; the last one has no dropout
        layer_stack += [
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),
            Dense(128, activation='relu'),
            BatchNormalization(),
        ]
        # Single sigmoid output unit for the binary label
        layer_stack.append(Dense(1, activation='sigmoid'))

        network = Sequential(layer_stack)

        optimizer = Adam(learning_rate=0.00005)
        network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return network
    
    def plot_final_learning_curves(self, history, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy left, loss right) of a finished MLP training run.

        Args:
            history: Training history; metrics are read from its ``history`` attribute
                when it has one, otherwise no curves are drawn
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        history_dict = history.history if hasattr(history, 'history') else {}

        # Accuracy may be stored under the long or the short key
        acc_metric = 'accuracy' if 'accuracy' in history_dict else 'acc'
        val_acc_metric = 'val_accuracy' if 'val_accuracy' in history_dict else 'val_acc'

        # Everything that differs between the two panels, in drawing order
        panel_specs = (
            {
                'train_key': acc_metric,
                'val_key': val_acc_metric,
                'train_label': 'Training Accuracy',
                'val_label': 'Validation Accuracy',
                'title': f'Final Accuracy Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})',
                'ylabel': 'Accuracy',
                'legend_loc': 'lower right',
                'ylim': [0, 1],
            },
            {
                'train_key': 'loss',
                'val_key': 'val_loss',
                'train_label': 'Training Loss',
                'val_label': 'Validation Loss',
                'title': f'Final Loss Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})',
                'ylabel': 'Loss',
                'legend_loc': 'upper right',
                'ylim': None,  # loss keeps automatic limits
            },
        )

        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        for axis, spec in zip(axes, panel_specs):
            # Only draw when both the training and the validation series are present
            if spec['train_key'] in history_dict and spec['val_key'] in history_dict:
                axis.plot(history_dict[spec['train_key']], label=spec['train_label'], color='blue')
                axis.plot(history_dict[spec['val_key']], label=spec['val_label'], color='orange')
            axis.set_title(spec['title'])
            axis.set_xlabel('Epochs')
            axis.set_ylabel(spec['ylabel'])
            axis.legend(loc=spec['legend_loc'])
            if spec['ylim'] is not None:
                axis.set_ylim(spec['ylim'])
            axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_mlp_merged_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, label_col):
        """
        Train and evaluate an MLP model for merged embeddings and a specific label column.

        One model is trained per entry in ``self.layers``. Each run splits the
        data with ``self.split_data``, trains with early stopping and
        best-weights checkpointing on ``val_loss``, then scores the test
        window. Layers with fewer than 10 training samples, or with a single
        class in any split, are skipped. Per-layer accuracy/AUC, the training
        history and the averages over the evaluated layers are stored in
        ``self.results["MLP|Merged|<label_col>"]``.

        Args:
            label_col: The label column to use ('S_label' or 'L_label')

        Returns:
            self, so calls can be chained.
        """
        # Store results
        combination_key = f"MLP|Merged|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }

        print(f"\n{'='*80}")
        print(f"Training MLP model for Merged Embeddings and {label_col}")
        print(f"{'='*80}")

        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")

            # Split data and get embeddings (the raw per-split frames are not needed here)
            (X_train, y_train), (X_val, y_val), (X_test, y_test), _, _, _ = self.split_data(
                layer, label_col)

            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue

            # Models from earlier layers/labels are no longer used once their
            # metrics are stored; reset the Keras global state so it does not
            # keep growing with every model built in this loop.
            tf.keras.backend.clear_session()

            # Create and train model
            model = self.create_mlp_model((1536,))
            print(f"Created MLP model for Merged Embeddings ({label_col})")
            model.summary()

            # Setup callbacks
            plot_callback = PlotLearningCallback('Merged', label_col, i+1, self.mlp_viz_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{self.mlp_viz_dir}/best_mlp_merged_{label_col}_layer_{i+1}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )

            # Train model
            print("Training MLP model...")
            batch_size = 32
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Upper bound only; early stopping normally ends training sooner
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )

            # Evaluate on test set. The single sigmoid unit yields an (n, 1)
            # array; flatten it so the metrics receive 1-D scores and labels.
            y_pred_proba = model.predict(X_test).ravel()
            y_pred = (y_pred_proba > 0.5).astype(int)

            # Create final learning curve visualization
            self.plot_final_learning_curves(history, label_col, i+1, self.mlp_viz_dir)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)

            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)

            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")

            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            }

            self.results[combination_key]['layer_results'].append(layer_result)

        # Calculate average metrics (0 when every layer was skipped)
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0

        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")

        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc

        return self
    
    def run_all_combinations(self):
        """
        Train on both label horizons, print a summary of the results and plot it.

        Returns:
            self, so calls can be chained.
        """
        # Short-term labels first, then long-term labels
        for target in ['S_label', 'L_label']:
            self.train_and_evaluate_model(target)

        # Summary header
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (CVX)")
        print("="*80)

        # One row per combination, later handed to the summary plot
        summary_rows = []

        for key, metrics in self.results.items():
            model_type, text_col, label_col = key.split('|')
            mean_accuracy = metrics.get('avg_accuracy', 0)
            mean_auc = metrics.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {mean_accuracy:.4f}")
            print(f"Average AUC: {mean_auc:.4f}")

            # Per-layer breakdown; accuracy and AUC are stored in parallel lists
            for layer_no, accuracy in enumerate(metrics.get('accuracy', []), start=1):
                auc = metrics.get('auc', [])[layer_no - 1]
                print(f"  Layer {layer_no} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

            row = {
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': mean_accuracy,
                'Avg AUC': mean_auc,
                'Label': label_col
            }
            summary_rows.append(row)

        # Create summary visualizations
        self.create_summary_visualization(summary_rows)

        # Clean up memory
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """
        Draw a grouped bar chart of average accuracy and AUC per label and save it.

        Prints a notice and returns early when ``summary_data`` is empty.
        Otherwise the chart is written to the CVX summary folder and the
        per-layer charts are produced afterwards.

        Args:
            summary_data: List of dicts with 'Avg Accuracy', 'Avg AUC' and 'Label' entries
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        # Tabulate the rows and make sure both metrics are numeric
        table = pd.DataFrame(summary_data)
        table['Avg Accuracy'] = table['Avg Accuracy'].astype(float)
        table['Avg AUC'] = table['Avg AUC'].astype(float)
        accuracy_values = table['Avg Accuracy']
        auc_values = table['Avg AUC']

        plt.figure(figsize=(10, 6))

        # Two bars per combination, centred on the tick position
        positions = np.arange(len(table))
        width = 0.35
        plt.bar(positions - width/2, accuracy_values, width, label='Average Accuracy', color='skyblue')
        plt.bar(positions + width/2, auc_values, width, label='Average AUC', color='salmon')

        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with Merged OpenAI Embeddings')
        tick_labels = [f"Merged + {label}" for label in table['Label']]
        plt.xticks(positions, tick_labels, rotation=0)
        plt.legend()

        # Print each value just above its bar
        for slot, value in enumerate(accuracy_values):
            plt.text(slot - width/2, value + 0.01, f"{value:.3f}", ha='center', va='bottom', fontsize=9)

        for slot, value in enumerate(auc_values):
            plt.text(slot + width/2, value + 0.01, f"{value:.3f}", ha='center', va='bottom', fontsize=9)

        # Leave headroom above the tallest bar for the value labels
        tallest = max(accuracy_values.max(), auc_values.max())
        plt.ylim(0, tallest + 0.1)
        plt.tight_layout()

        # Save the visualization
        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/CVX', "mlp_merged_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()

        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """
        Plot per-layer accuracy and AUC for every combination in ``self.results``.

        Produces one figure with two stacked bar charts (accuracy on top, AUC
        below) and saves it to the CVX summary folder. Prints a notice and
        returns early when no layer has recorded results.
        """
        # Flatten the nested results into one record per (combination, layer)
        records = []
        for key, metrics in self.results.items():
            model_type, text_col, label_col = key.split('|')
            accuracies = metrics.get('accuracy', [])

            for position, accuracy in enumerate(accuracies):
                layer_no = position + 1
                records.append({
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {layer_no}",
                    'Layer_num': layer_no,
                    'Accuracy': accuracy,
                    'AUC': metrics.get('auc', [])[position],
                    'Combination': f"{text_col} + {label_col}"
                })

        if not records:
            print("No layer data available")
            return

        frame = pd.DataFrame(records)

        plt.figure(figsize=(10, 8))

        # (subplot slot, metric column, chart title) for the two stacked charts
        charts = (
            (1, 'Accuracy', 'MLP Accuracy by Model Combination and Layer with Merged OpenAI Embeddings'),
            (2, 'AUC', 'MLP AUC by Model Combination and Layer with Merged OpenAI Embeddings'),
        )
        for slot, metric, title in charts:
            plt.subplot(2, 1, slot)
            sns.barplot(x='Combination', y=metric, hue='Layer', data=frame)
            plt.title(title)
            # The axis always reaches at least 0.8, more if a score is higher
            plt.ylim(0, max(0.8, frame[metric].max() + 0.05))
            plt.xticks(rotation=0)
            plt.tight_layout()

        # Save the visualization
        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/CVX', "mlp_merged_layer_performance.png")
        plt.savefig(save_path)
        print(f"MLP layer performance visualization saved as: {save_path}")
        plt.close()


# Script entry point: run the full CVX analysis
if __name__ == "__main__":
    # Both output folders must exist before any figure or checkpoint is saved
    os.makedirs('Merged_OpenAI_MLP/visualizations_mlp/CVX', exist_ok=True)
    os.makedirs('Merged_OpenAI_MLP/visualizations_summary/CVX', exist_ok=True)

    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_title_full_text_SP500_database/semantic/wall_street_news_semantics_CVX_completed_openai_Merged.csv'

    # Build the predictor on the merged OpenAI embeddings
    predictor = MergedEmbeddingMLPPredictor(csv_path)

    # Pipeline: load the CSV, define the time windows, then train and evaluate
    loaded = predictor.load_data()
    windowed = loaded.define_time_windows()
    windowed.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'Merged_OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing merged OpenAI embedding vectors from string format...
Sample Merged embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 452, 0: 386}
Class distribution for long-term prediction: {1: 464, 0: 374}

Training MLP model for Merged Embeddings and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_24 

 dense_28 (Dense)            (None, 512)               786944    
                                                                 
 batch_normalization_21 (Ba  (None, 512)               2048      
 tchNormalization)                                               
                                                                 
 dropout_14 (Dropout)        (None, 512)               0         
                                                                 
 dense_29 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_22 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_15 (Dropout)        (None, 256)               0         
                                                                 
 dense_30 (Dense)            (None, 128)               32896     
          

Epoch 4: val_loss did not improve from 0.69082
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69082
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69082
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69082
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69082
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69082
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69082
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69082
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69082
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69082
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69082
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69082
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69082
Epoch 16: early stopping
Saved learning curves: final_mlp_merged_S_label_layer_3.png
Test Accuracy for Layer 3: 0.5143
Test AUC for Layer 3: 0.3725

Average Test Accuracy across all layers: 0.4635
Average Test AUC across all layer

Epoch 6/100
Epoch 6: val_loss improved from 0.66484 to 0.66120, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 7/100
Epoch 7: val_loss improved from 0.66120 to 0.65778, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.65778 to 0.65470, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.65470 to 0.65201, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.65201 to 0.64994, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.64994 to 0.64807, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 12/100

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_40 (Dense)            (None, 512)               786944    
                                                                 
 batch_normalization_30 (Ba  (None, 512)               2048      
 tchNormalization)                                               
                                                                 
 dropout_20 (Dropout)        (None, 512)               0         
                                                                 
 dense_41 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_31 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_21 (Dropout)        (None, 256)               0         
          

 batch_normalization_35 (Ba  (None, 128)               512       
 tchNormalization)                                               
                                                                 
 dense_47 (Dense)            (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.70287, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.70287
Epoch 3/100
Epoch 3: val_loss did not improve from 0.70287
Epoch 4/100
Epoch 4: val_loss did not improve from 0.70287
Epoch 5/100
Epoch 5: val_loss did not improve from 0.70287
Epoch 6/100
Epoch 6: val_loss did not improve from 0.70287
Epoch 7/100
Epoch 7: val

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records accuracy and loss after each epoch
class PlotLearningCallback(Callback):
    """Keras callback that keeps per-epoch training/validation accuracy and loss.

    The constructor stores the run identifiers it is given and creates
    ``plot_dir`` if it does not exist yet. The callback itself only collects
    the metrics; it does not draw anything.
    """

    def __init__(self, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        # Identifiers of the run this callback is attached to
        self.text_col, self.label_col = text_col, label_col
        self.layer_num, self.plot_dir = layer_num, plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One entry is appended to each list per finished epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        metrics = {} if logs is None else logs

        # Accept either the long or the short accuracy key; missing values count as 0
        accuracy = metrics.get('accuracy', metrics.get('acc', 0))
        val_accuracy = metrics.get('val_accuracy', metrics.get('val_acc', 0))

        self.train_acc.append(accuracy)
        self.val_acc.append(val_accuracy)
        self.train_loss.append(metrics.get('loss', 0))
        self.val_loss.append(metrics.get('val_loss', 0))

class MergedEmbeddingMLPPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using merged OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with merged OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'Merged_OpenAI_MLP/visualizations_mlp/MPC'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('Merged_OpenAI_MLP/visualizations_summary/MPC', exist_ok=True)
            
    def load_data(self):
        """
        Load and preprocess the CSV data containing climate change news with merged OpenAI embeddings.

        Reads ``self.csv_path``, converts the three date columns to datetimes
        (day-first), sorts the rows by publication date and parses the
        ``Merged_embedding_vector`` strings into float32 numpy arrays stored in
        a new ``Merged_embedding`` column. Missing or unparseable embeddings
        become an all-zero float32 vector of length 1536.

        Returns:
            self, so calls can be chained.
        """
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)

        date_columns = ['Publication date', 'Predicting date Short', 'Predicting date Long']
        embedding_dim = 1536  # Length of the zero vector used when an embedding is unusable

        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            for name in date_columns:
                self.data[name] = pd.to_datetime(self.data[name], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")

            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                # Keep the column's own index so the assignment below aligns row for row
                return pd.Series(result, index=column.index)

            for name in date_columns:
                self.data[name] = parse_date_column(self.data[name])

        # Sort by publication date
        self.data = self.data.sort_values('Publication date')

        # Parse embedding vectors from string format to numpy arrays
        print("Parsing merged OpenAI embedding vectors from string format...")

        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing (same dtype as parsed vectors)
                return np.zeros(embedding_dim, dtype=np.float32)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError, TypeError):
                # str() keeps the message printable even for non-string cell values
                print(f"Error parsing embedding: {str(embedding_str)[:50]}...")
                return np.zeros(embedding_dim, dtype=np.float32)

        # Convert merged embeddings to numpy arrays
        self.data['Merged_embedding'] = self.data['Merged_embedding_vector'].apply(parse_embedding)

        # Verify dimensions
        sample_merged_embedding = self.data['Merged_embedding'].iloc[0]
        print(f"Sample Merged embedding shape: {sample_merged_embedding.shape}")

        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")

        return self
    
    def define_time_windows(self):
        """
        Define the three expanding train/validation/test windows.

        Every layer trains from 2014-11-01, and each successive layer moves
        the end of training, the validation window and the test window
        forward by one year. The result is stored in ``self.layers`` as a
        list of dicts of ``pd.Timestamp`` values.

        Returns:
            self, so calls can be chained.
        """
        boundary_names = ('train_start', 'train_end', 'val_start', 'val_end', 'test_start', 'test_end')

        # One row per layer, in the same order as boundary_names
        schedule = (
            # train (2014-2020), validation (2020-2021), test (2021-2022)
            ('2014-11-01', '2020-10-31', '2020-11-01', '2021-10-31', '2021-11-01', '2022-10-31'),
            # train (2014-2021), validation (2021-2022), test (2022-2023)
            ('2014-11-01', '2021-10-31', '2021-11-01', '2022-10-31', '2022-11-01', '2023-10-31'),
            # train (2014-2022), validation (2022-2023), test (2023-2024)
            ('2014-11-01', '2022-10-31', '2022-11-01', '2023-10-31', '2023-11-01', '2024-11-01'),
        )

        self.layers = [
            {name: pd.Timestamp(day) for name, day in zip(boundary_names, row)}
            for row in schedule
        ]
        return self
    
    def split_data(self, layer, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.

        Rows are selected by 'Publication date'. Every window is inclusive on
        both ends, so an article published exactly on a window's start date
        belongs to that window. The 'Merged_embedding' arrays of each split
        are stacked into one 2-D matrix; an empty split yields a matrix with
        zero rows instead of raising.

        Args:
            layer: The time window layer (dict with 'train_start', 'train_end',
                'val_start', 'val_end', 'test_start' and 'test_end' entries)
            label_col: The column containing the labels ('S_label' or 'L_label')

        Returns:
            ``((X_train, y_train), (X_val, y_val), (X_test, y_test),
            train_data, val_data, test_data)`` where the X/y pairs are numpy
            arrays and the last three items are the matching DataFrame slices.
        """
        published = self.data['Publication date']

        def rows_in(prefix):
            # Same inclusive rule for all three splits; adjacent windows do not
            # share dates, so no article is dropped or counted twice.
            in_window = (published >= layer[f'{prefix}_start']) & (published <= layer[f'{prefix}_end'])
            return self.data[in_window]

        def stack_embeddings(frame):
            vectors = frame['Merged_embedding'].values
            if len(vectors) == 0:
                # np.stack rejects an empty sequence; return a zero-row matrix so
                # callers can still inspect the length and skip this layer.
                width = len(self.data['Merged_embedding'].iloc[0]) if len(self.data) else 0
                return np.empty((0, width), dtype=np.float32)
            return np.stack(vectors)

        train_data = rows_in('train')
        val_data = rows_in('val')
        test_data = rows_in('test')

        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")

        # Extract embeddings
        X_train = stack_embeddings(train_data)
        X_val = stack_embeddings(val_data)
        X_test = stack_embeddings(test_data)

        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values

        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier for document-level embeddings.

        The network stacks three ReLU dense layers of 512, 256 and 128 units,
        each followed by batch normalization (plus dropout of 0.3 after the
        first two), and ends in a single sigmoid unit. It is compiled with
        Adam (learning rate 0.00005), binary cross-entropy loss and the
        accuracy metric.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        stack = [
            # First block receives the embedding vector
            Dense(512, activation='relu', input_shape=input_shape),
            BatchNormalization(),
            Dropout(0.3),

            # Second block
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),

            # Third block (no dropout before the output)
            Dense(128, activation='relu'),
            BatchNormalization(),

            # Single probability output for the binary label
            Dense(1, activation='sigmoid'),
        ]
        network = Sequential(stack)

        optimizer = Adam(learning_rate=0.00005)
        network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return network
    
    def plot_final_learning_curves(self, history, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure with the final accuracy and loss curves of one run.

        The left panel shows training/validation accuracy, the right panel
        training/validation loss. A panel is left without curves when its
        series are missing from the history.

        Args:
            history: Training history (object exposing a ``history`` dict)
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        figure, (accuracy_axis, loss_axis) = plt.subplots(1, 2, figsize=(15, 5))

        # Objects without a ``history`` attribute are treated as an empty record
        record = getattr(history, 'history', {})

        # Accept either the long or the short accuracy key
        train_acc_key = 'accuracy' if 'accuracy' in record else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in record else 'val_acc'

        def draw(axis, train_key, val_key, train_label, val_label, title, y_label, legend_loc):
            # Curves are drawn only when both the training and validation series exist
            if train_key in record and val_key in record:
                axis.plot(record[train_key], label=train_label, color='blue')
                axis.plot(record[val_key], label=val_label, color='orange')
            axis.set_title(title)
            axis.set_xlabel('Epochs')
            axis.set_ylabel(y_label)
            axis.legend(loc=legend_loc)

        # Accuracy panel, pinned to the [0, 1] range
        draw(accuracy_axis, train_acc_key, val_acc_key,
             'Training Accuracy', 'Validation Accuracy',
             f'Final Accuracy Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})',
             'Accuracy', 'lower right')
        accuracy_axis.set_ylim([0, 1])
        accuracy_axis.grid(True, linestyle='--', alpha=0.7)

        # Loss panel, automatic y-range
        draw(loss_axis, 'loss', 'val_loss',
             'Training Loss', 'Validation Loss',
             f'Final Loss Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})',
             'Loss', 'upper right')
        loss_axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_mlp_merged_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, label_col):
        """
        Train and evaluate an MLP model for merged embeddings and a specific label column.

        One model is trained per entry in ``self.layers``. Each run splits the
        data with ``self.split_data``, trains with early stopping and
        best-weights checkpointing on ``val_loss``, then scores the test
        window. Layers with fewer than 10 training samples, or with a single
        class in any split, are skipped. Per-layer accuracy/AUC, the training
        history and the averages over the evaluated layers are stored in
        ``self.results["MLP|Merged|<label_col>"]``.

        Args:
            label_col: The label column to use ('S_label' or 'L_label')

        Returns:
            self, so calls can be chained.
        """
        # Store results
        combination_key = f"MLP|Merged|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }

        print(f"\n{'='*80}")
        print(f"Training MLP model for Merged Embeddings and {label_col}")
        print(f"{'='*80}")

        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")

            # Split data and get embeddings (the raw per-split frames are not needed here)
            (X_train, y_train), (X_val, y_val), (X_test, y_test), _, _, _ = self.split_data(
                layer, label_col)

            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue

            # Models from earlier layers/labels are no longer used once their
            # metrics are stored; reset the Keras global state so it does not
            # keep growing with every model built in this loop.
            tf.keras.backend.clear_session()

            # Create and train model
            model = self.create_mlp_model((1536,))
            print(f"Created MLP model for Merged Embeddings ({label_col})")
            model.summary()

            # Setup callbacks
            plot_callback = PlotLearningCallback('Merged', label_col, i+1, self.mlp_viz_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{self.mlp_viz_dir}/best_mlp_merged_{label_col}_layer_{i+1}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )

            # Train model
            print("Training MLP model...")
            batch_size = 32
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Upper bound only; early stopping normally ends training sooner
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )

            # Evaluate on test set. The single sigmoid unit yields an (n, 1)
            # array; flatten it so the metrics receive 1-D scores and labels.
            y_pred_proba = model.predict(X_test).ravel()
            y_pred = (y_pred_proba > 0.5).astype(int)

            # Create final learning curve visualization
            self.plot_final_learning_curves(history, label_col, i+1, self.mlp_viz_dir)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)

            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)

            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")

            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            }

            self.results[combination_key]['layer_results'].append(layer_result)

        # Calculate average metrics (0 when every layer was skipped)
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0

        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")

        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc

        return self
    
    def run_all_combinations(self):
        """
        Train and evaluate the merged-embedding model for both label horizons.

        Calls train_and_evaluate_model for 'S_label' and then 'L_label', prints a
        textual summary of everything stored in self.results, passes the summary
        rows to create_summary_visualization and finally runs the garbage collector.

        Returns:
            self, so that calls can be chained
        """
        # Short-term first, long-term second
        for target in ('S_label', 'L_label'):
            self.train_and_evaluate_model(target)

        banner = "=" * 80
        print("\n" + banner)
        print("SUMMARY OF RESULTS (MPC)")
        print(banner)

        # One row per combination, used for the comparison chart
        rows = []
        for key, outcome in self.results.items():
            # Result keys have the form "<model>|<text>|<label>"
            model_type, text_col, label_col = key.split('|')
            mean_accuracy = outcome.get('avg_accuracy', 0)
            mean_auc = outcome.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {mean_accuracy:.4f}")
            print(f"Average AUC: {mean_auc:.4f}")

            # Per-layer scores; the AUC list is indexed in step with the accuracy list
            layer_aucs = outcome.get('auc', [])
            for layer_no, layer_accuracy in enumerate(outcome.get('accuracy', []), start=1):
                layer_auc = layer_aucs[layer_no - 1]
                print(f"  Layer {layer_no} - Accuracy: {layer_accuracy:.4f}, AUC: {layer_auc:.4f}")

            rows.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': mean_accuracy,
                'Avg AUC': mean_auc,
                'Label': label_col,
            })

        self.create_summary_visualization(rows)

        # Release memory held by objects that are no longer referenced
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """
        Draw a grouped bar chart of average accuracy and AUC per model combination.

        The chart is saved as a PNG in the MPC summary folder; afterwards the
        per-layer charts are produced via create_layer_performance_visualizations.

        Args:
            summary_data: List of dicts with the keys 'Combination', 'Avg Accuracy',
                'Avg AUC' and 'Label'; nothing is drawn when it is empty
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        frame = pd.DataFrame(summary_data)

        # Make sure both metric columns are numeric before plotting
        for metric in ('Avg Accuracy', 'Avg AUC'):
            frame[metric] = frame[metric].astype(float)

        plt.figure(figsize=(10, 6))

        # Two bars per combination, placed left and right of the tick position
        positions = np.arange(len(frame))
        bar_width = 0.35
        half_width = bar_width / 2

        plt.bar(positions - half_width, frame['Avg Accuracy'], bar_width,
                label='Average Accuracy', color='skyblue')
        plt.bar(positions + half_width, frame['Avg AUC'], bar_width,
                label='Average AUC', color='salmon')

        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with Merged OpenAI Embeddings')
        tick_labels = [f"Merged + {label_name}" for label_name in frame['Label']]
        plt.xticks(positions, tick_labels, rotation=0)
        plt.legend()

        # Write each score just above its bar
        for metric, shift in (('Avg Accuracy', -half_width), ('Avg AUC', half_width)):
            for idx, score in enumerate(frame[metric]):
                plt.text(idx + shift, score + 0.01, f"{score:.3f}",
                         ha='center', va='bottom', fontsize=9)

        # Leave head-room above the tallest bar for the value labels
        tallest = max(frame['Avg Accuracy'].max(), frame['Avg AUC'].max())
        plt.ylim(0, tallest + 0.1)
        plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/MPC', "mlp_merged_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()

        # Follow up with the per-layer breakdown
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """
        Plot per-layer accuracy and AUC for every combination stored in self.results.

        Produces one figure with two stacked bar charts (accuracy on top, AUC
        below), grouped by combination and coloured by layer, and saves it as a
        PNG in the MPC summary folder. Prints a message and returns early when
        no per-layer results are available.
        """
        records = []
        for key, outcome in self.results.items():
            # Result keys have the form "<model>|<text>|<label>"
            model_type, text_col, label_col = key.split('|')

            for position, layer_accuracy in enumerate(outcome.get('accuracy', [])):
                # AUC values are stored in a parallel list
                layer_auc = outcome.get('auc', [])[position]
                records.append({
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {position+1}",
                    'Layer_num': position + 1,
                    'Accuracy': layer_accuracy,
                    'AUC': layer_auc,
                    'Combination': f"{text_col} + {label_col}",
                })

        if len(records) == 0:
            print("No layer data available")
            return

        frame = pd.DataFrame(records)

        plt.figure(figsize=(10, 8))

        # Panel 1 shows accuracy, panel 2 shows AUC; both share the same layout
        for panel, metric in enumerate(('Accuracy', 'AUC'), start=1):
            plt.subplot(2, 1, panel)
            sns.barplot(x='Combination', y=metric, hue='Layer', data=frame)
            plt.title(f'MLP {metric} by Model Combination and Layer with Merged OpenAI Embeddings')
            # Keep the axis at least 0.8 high so low scores remain comparable
            plt.ylim(0, max(0.8, frame[metric].max() + 0.05))
            plt.xticks(rotation=0)
            plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/MPC', "mlp_merged_layer_performance.png")
        plt.savefig(save_path)
        print(f"MLP layer performance visualization saved as: {save_path}")
        plt.close()


# Script entry point: run the full merged-embedding MLP analysis
if __name__ == "__main__":
    # Create the output folders for plots and checkpoints if they are missing
    required_dirs = (
        'Merged_OpenAI_MLP/visualizations_mlp/MPC',
        'Merged_OpenAI_MLP/visualizations_summary/MPC',
    )
    for directory in required_dirs:
        os.makedirs(directory, exist_ok=True)

    # Input CSV holding the news articles together with their merged embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_title_full_text_SP500_database/semantic/wall_street_news_semantics_MPC_completed_openai_Merged.csv'

    predictor = MergedEmbeddingMLPPredictor(csv_path)

    # Load the data, define the sliding windows, then train and evaluate everything
    (
        predictor
        .load_data()
        .define_time_windows()
        .run_all_combinations()
    )

    print("\nAnalysis complete! Results saved to 'Merged_OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing merged OpenAI embedding vectors from string format...
Sample Merged embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 460, 0: 378}
Class distribution for long-term prediction: {1: 503, 0: 335}

Training MLP model for Merged Embeddings and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_48

Epoch 2: val_loss did not improve from 0.69361
Epoch 3/100
Epoch 3: val_loss did not improve from 0.69361
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69361
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69361
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69361
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69361
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69361
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69361
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69361
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69361
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69361
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69361
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69361
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69361
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69361
Epoch 16: early stopping
Saved learning curves: final_mlp_merged_S_label_layer_2.png
Test Accuracy for Layer 2: 

Epoch 31/100
Epoch 31: val_loss did not improve from 0.67798
Epoch 32/100
Epoch 32: val_loss did not improve from 0.67798
Epoch 33/100
Epoch 33: val_loss did not improve from 0.67798
Epoch 34/100
Epoch 34: val_loss did not improve from 0.67798
Epoch 35/100
Epoch 35: val_loss did not improve from 0.67798
Epoch 36/100
Epoch 36: val_loss did not improve from 0.67798
Epoch 37/100
Epoch 37: val_loss did not improve from 0.67798
Epoch 38/100
Epoch 38: val_loss did not improve from 0.67798
Epoch 39/100
Epoch 39: val_loss did not improve from 0.67798
Epoch 40/100

Epoch 40: val_loss did not improve from 0.67798
Epoch 40: early stopping
Saved learning curves: final_mlp_merged_S_label_layer_3.png
Test Accuracy for Layer 3: 0.5000
Test AUC for Layer 3: 0.4292

Average Test Accuracy across all layers: 0.5303
Average Test AUC across all layers: 0.4751

Training MLP model for Merged Embeddings and L_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
T

Epoch 8/100
Epoch 8: val_loss did not improve from 0.65586
Epoch 9/100
Epoch 9: val_loss did not improve from 0.65586
Epoch 10/100
Epoch 10: val_loss did not improve from 0.65586
Epoch 11/100
Epoch 11: val_loss did not improve from 0.65586
Epoch 12/100
Epoch 12: val_loss did not improve from 0.65586
Epoch 13/100
Epoch 13: val_loss did not improve from 0.65586
Epoch 14/100
Epoch 14: val_loss did not improve from 0.65586
Epoch 15/100
Epoch 15: val_loss did not improve from 0.65586
Epoch 16/100
Epoch 16: val_loss did not improve from 0.65586
Epoch 17/100
Epoch 17: val_loss did not improve from 0.65586
Epoch 18/100
Epoch 18: val_loss did not improve from 0.65586
Epoch 19/100
Epoch 19: val_loss did not improve from 0.65586
Epoch 20/100
Epoch 20: val_loss did not improve from 0.65586
Epoch 21/100

Epoch 21: val_loss did not improve from 0.65586
Epoch 21: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_1.png
Test Accuracy for Layer 1: 0.7368
Test AUC for Layer 1: 0.6496



Epoch 6/100
Epoch 6: val_loss improved from 0.63305 to 0.62730, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 7/100
Epoch 7: val_loss improved from 0.62730 to 0.62366, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.62366 to 0.62210, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.62210 to 0.62068, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.62068 to 0.61974, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.61974 to 0.61966, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 12/100

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_68 (Dense)            (None, 512)               786944    
                                                                 
 batch_normalization_51 (Ba  (None, 512)               2048      
 tchNormalization)                                               
                                                                 
 dropout_34 (Dropout)        (None, 512)               0         
                                                                 
 dense_69 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_52 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_35 (Dropout)        (None, 256)               0         
          

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records the training history after each epoch
class PlotLearningCallback(Callback):
    """
    Keras callback that collects accuracy and loss values epoch by epoch.

    The recorded values are kept in the lists train_acc, val_acc, train_loss
    and val_loss. The target directory for plots is created on construction.
    """

    def __init__(self, text_col, label_col, layer_num, plot_dir):
        """
        Args:
            text_col: Name of the text/embedding source this run belongs to
            label_col: Label column used for this run
            layer_num: Number of the time-window layer being trained
            plot_dir: Directory for plots; created if it does not exist
        """
        super().__init__()
        self.text_col, self.label_col = text_col, label_col
        self.layer_num, self.plot_dir = layer_num, plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # Per-epoch metric history
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append the metrics reported for this epoch to the history lists."""
        metrics = {} if logs is None else logs

        # Fall back to the short 'acc' key names, then to 0, when a key is absent
        epoch_train_acc = metrics.get('accuracy', metrics.get('acc', 0))
        epoch_val_acc = metrics.get('val_accuracy', metrics.get('val_acc', 0))

        self.train_acc.append(epoch_train_acc)
        self.val_acc.append(epoch_val_acc)
        self.train_loss.append(metrics.get('loss', 0))
        self.val_loss.append(metrics.get('val_loss', 0))

class MergedEmbeddingMLPPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using merged OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with merged OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'Merged_OpenAI_MLP/visualizations_mlp/SLB'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('Merged_OpenAI_MLP/visualizations_summary/SLB', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with merged OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing merged OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert merged embeddings to numpy arrays
        self.data['Merged_embedding'] = self.data['Merged_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_merged_embedding = self.data['Merged_embedding'].iloc[0]
        print(f"Sample Merged embedding shape: {sample_merged_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data['Merged_embedding'].values)
        X_val = np.stack(val_data['Merged_embedding'].values)
        X_test = np.stack(test_data['Merged_embedding'].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier used on document-level embeddings.

        Architecture: Dense(512) -> BatchNorm -> Dropout(0.3) ->
        Dense(256) -> BatchNorm -> Dropout(0.3) -> Dense(128) -> BatchNorm ->
        Dense(1, sigmoid). All hidden Dense layers use ReLU. The model is
        compiled with Adam (learning rate 0.00005), binary cross-entropy loss
        and the accuracy metric.

        Args:
            input_shape: Shape of a single input sample, e.g. (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        # First block receives the raw embedding
        entry_block = [
            Dense(512, activation='relu', input_shape=input_shape),
            BatchNormalization(),
            Dropout(0.3),
        ]

        # Two narrower hidden stages; the last one has no dropout
        hidden_blocks = [
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),
            Dense(128, activation='relu'),
            BatchNormalization(),
        ]

        # Single sigmoid unit for the binary label
        output_head = [Dense(1, activation='sigmoid')]

        network = Sequential(entry_block + hidden_blocks + output_head)

        optimizer = Adam(learning_rate=0.00005)
        network.compile(
            optimizer=optimizer,
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )

        print("MLP model created")
        return network
    
    def plot_final_learning_curves(self, history, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure with the accuracy and loss curves of a finished MLP training run.

        Args:
            history: Training history; its ``history`` attribute (if present) is
                read as a dict of per-epoch metric lists
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        figure, (accuracy_axis, loss_axis) = plt.subplots(1, 2, figsize=(15, 5))

        # An object without a ``history`` attribute is treated as having no metrics
        recorded = getattr(history, 'history', {})

        # Use the long metric names when present, otherwise the short ones
        train_key = 'accuracy' if 'accuracy' in recorded else 'acc'
        val_key = 'val_accuracy' if 'val_accuracy' in recorded else 'val_acc'

        # Left panel: accuracy
        if train_key in recorded and val_key in recorded:
            accuracy_axis.plot(recorded[train_key], label='Training Accuracy', color='blue')
            accuracy_axis.plot(recorded[val_key], label='Validation Accuracy', color='orange')
        accuracy_axis.set_title(f'Final Accuracy Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
        accuracy_axis.set_xlabel('Epochs')
        accuracy_axis.set_ylabel('Accuracy')
        accuracy_axis.legend(loc='lower right')
        accuracy_axis.set_ylim([0, 1])
        accuracy_axis.grid(True, linestyle='--', alpha=0.7)

        # Right panel: loss
        if 'loss' in recorded and 'val_loss' in recorded:
            loss_axis.plot(recorded['loss'], label='Training Loss', color='blue')
            loss_axis.plot(recorded['val_loss'], label='Validation Loss', color='orange')
        loss_axis.set_title(f'Final Loss Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
        loss_axis.set_xlabel('Epochs')
        loss_axis.set_ylabel('Loss')
        loss_axis.legend(loc='upper right')
        loss_axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_mlp_merged_{label_col}_layer_{layer_num}.png"
        target = f"{visualization_dir}/{filename}"
        plt.savefig(target)
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, label_col):
        """
        Train and evaluate an MLP model for merged embeddings and a specific label column.
        
        Args:
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"MLP|Merged|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training MLP model for Merged Embeddings and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_mlp_model((1536,))
            print(f"Created MLP model for Merged Embeddings ({label_col})")
            model.summary()
            
            # Setup callbacks
            plot_callback = PlotLearningCallback('Merged', label_col, i+1, self.mlp_viz_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{self.mlp_viz_dir}/best_mlp_merged_{label_col}_layer_{i+1}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )
            
            # Train model
            print(f"Training MLP model...")
            batch_size = 32  # Larger batch size for embeddings
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )
            
            # Evaluate on test set
            y_pred_proba = model.predict(X_test)
            y_pred = (y_pred_proba > 0.5).astype(int)
            
            # Create final learning curve visualization
            self.plot_final_learning_curves(history, label_col, i+1, self.mlp_viz_dir)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """
        Train and evaluate the merged-embedding model for both label horizons.

        Calls train_and_evaluate_model for 'S_label' and then 'L_label', prints a
        textual summary of everything stored in self.results, passes the summary
        rows to create_summary_visualization and finally runs the garbage collector.

        Returns:
            self, so that calls can be chained
        """
        # Short-term first, long-term second
        for target in ('S_label', 'L_label'):
            self.train_and_evaluate_model(target)

        banner = "=" * 80
        print("\n" + banner)
        print("SUMMARY OF RESULTS (SLB)")
        print(banner)

        # One row per combination, used for the comparison chart
        rows = []
        for key, outcome in self.results.items():
            # Result keys have the form "<model>|<text>|<label>"
            model_type, text_col, label_col = key.split('|')
            mean_accuracy = outcome.get('avg_accuracy', 0)
            mean_auc = outcome.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {mean_accuracy:.4f}")
            print(f"Average AUC: {mean_auc:.4f}")

            # Per-layer scores; the AUC list is indexed in step with the accuracy list
            layer_aucs = outcome.get('auc', [])
            for layer_no, layer_accuracy in enumerate(outcome.get('accuracy', []), start=1):
                layer_auc = layer_aucs[layer_no - 1]
                print(f"  Layer {layer_no} - Accuracy: {layer_accuracy:.4f}, AUC: {layer_auc:.4f}")

            rows.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': mean_accuracy,
                'Avg AUC': mean_auc,
                'Label': label_col,
            })

        self.create_summary_visualization(rows)

        # Release memory held by objects that are no longer referenced
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """
        Draw a grouped bar chart of average accuracy and average AUC.

        Args:
            summary_data: List of dicts with the keys 'Avg Accuracy',
                'Avg AUC' and 'Label' (one dict per model combination).

        The figure is written to the SLB summary directory; afterwards the
        per-layer figure is produced via
        ``create_layer_performance_visualizations``. Nothing is drawn when
        ``summary_data`` is empty.
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        frame = pd.DataFrame(summary_data)

        # Make sure both metric columns are numeric before plotting
        for metric_col in ('Avg Accuracy', 'Avg AUC'):
            frame[metric_col] = frame[metric_col].astype(float)

        plt.figure(figsize=(10, 6))

        positions = np.arange(len(frame))
        bar_width = 0.35
        half_width = bar_width / 2

        # (column, horizontal offset, legend label, colour) for each bar group
        bar_groups = (
            ('Avg Accuracy', -half_width, 'Average Accuracy', 'skyblue'),
            ('Avg AUC', half_width, 'Average AUC', 'salmon'),
        )

        for column, offset, legend_label, colour in bar_groups:
            plt.bar(positions + offset, frame[column], bar_width, label=legend_label, color=colour)

        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with Merged OpenAI Embeddings')
        tick_labels = [f"Merged + {label}" for label in frame['Label']]
        plt.xticks(positions, tick_labels, rotation=0)
        plt.legend()

        # Numeric value just above every bar
        for column, offset, _legend_label, _colour in bar_groups:
            for idx, value in enumerate(frame[column]):
                plt.text(idx + offset, value + 0.01, f"{value:.3f}", ha='center', va='bottom', fontsize=9)

        # Leave headroom above the tallest bar for the value labels
        tallest = max(frame['Avg Accuracy'].max(), frame['Avg AUC'].max())
        plt.ylim(0, tallest + 0.1)
        plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/SLB', "mlp_merged_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()

        # Follow up with the per-layer breakdown
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """
        Plot per-layer accuracy and AUC for every combination in ``self.results``.

        Builds one row per (combination, layer) from the 'accuracy' and 'auc'
        lists stored under each "model|text|label" key, then draws two stacked
        seaborn bar charts (accuracy on top, AUC below) and saves them to the
        SLB summary directory. Returns early when no layer metrics exist.
        """
        layer_data = []

        for key, outcome in self.results.items():
            _model_type, text_col, label_col = key.split('|')

            for layer_number, layer_accuracy in enumerate(outcome.get('accuracy', []), start=1):
                # AUC list is index-aligned with the accuracy list
                layer_auc = outcome.get('auc', [])[layer_number - 1]
                layer_data.append({
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {layer_number}",
                    'Layer_num': layer_number,
                    'Accuracy': layer_accuracy,
                    'AUC': layer_auc,
                    'Combination': f"{text_col} + {label_col}",
                })

        if not layer_data:
            print("No layer data available")
            return

        frame = pd.DataFrame(layer_data)

        plt.figure(figsize=(10, 8))

        # Panel 1: accuracy, panel 2: AUC - identical layout for both
        for panel, metric in enumerate(('Accuracy', 'AUC'), start=1):
            plt.subplot(2, 1, panel)
            sns.barplot(x='Combination', y=metric, hue='Layer', data=frame)
            plt.title(f'MLP {metric} by Model Combination and Layer with Merged OpenAI Embeddings')
            # Keep at least a 0-0.8 range so low scores are not visually exaggerated
            plt.ylim(0, max(0.8, frame[metric].max() + 0.05))
            plt.xticks(rotation=0)
            plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/SLB', "mlp_merged_layer_performance.png")
        plt.savefig(save_path)
        print(f"MLP layer performance visualization saved as: {save_path}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Output folders for per-layer artefacts and for the summary figures
    for directory in (
        'Merged_OpenAI_MLP/visualizations_mlp/SLB',
        'Merged_OpenAI_MLP/visualizations_summary/SLB',
    ):
        os.makedirs(directory, exist_ok=True)

    # Input CSV with the merged OpenAI embeddings for SLB
    csv_path = (
        'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_title_full_text_SP500_database/semantic/'
        'wall_street_news_semantics_SLB_completed_openai_Merged.csv'
    )

    predictor = MergedEmbeddingMLPPredictor(csv_path)

    # Pipeline: load -> define sliding windows -> train/evaluate/report
    predictor.load_data()
    predictor.define_time_windows()
    predictor.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'Merged_OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing merged OpenAI embedding vectors from string format...
Sample Merged embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 433, 1: 405}
Class distribution for long-term prediction: {0: 437, 1: 401}

Training MLP model for Merged Embeddings and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_72

 dense_76 (Dense)            (None, 512)               786944    
                                                                 
 batch_normalization_57 (Ba  (None, 512)               2048      
 tchNormalization)                                               
                                                                 
 dropout_38 (Dropout)        (None, 512)               0         
                                                                 
 dense_77 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_58 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_39 (Dropout)        (None, 256)               0         
                                                                 
 dense_78 (Dense)            (None, 128)               32896     
          

Epoch 18/100
Epoch 18: val_loss did not improve from 0.69058
Epoch 19/100
Epoch 19: val_loss did not improve from 0.69058
Epoch 20/100
Epoch 20: val_loss did not improve from 0.69058
Epoch 21/100

Epoch 21: val_loss did not improve from 0.69058
Epoch 21: early stopping
Saved learning curves: final_mlp_merged_S_label_layer_3.png
Test Accuracy for Layer 3: 0.4429
Test AUC for Layer 3: 0.5724

Average Test Accuracy across all layers: 0.4897
Average Test AUC across all layers: 0.5198

Training MLP model for Merged Embeddings and L_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (L_label)
Model: "sequential_21"
____________________________________

                                                                 
 batch_normalization_69 (Ba  (None, 512)               2048      
 tchNormalization)                                               
                                                                 
 dropout_46 (Dropout)        (None, 512)               0         
                                                                 
 dense_93 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_70 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_47 (Dropout)        (None, 256)               0         
                                                                 
 dense_94 (Dense)            (None, 128)               32896     
                                                                 
 batch_nor

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records training/validation accuracy and loss after each epoch (it does not draw plots itself)
class PlotLearningCallback(Callback):
    """
    Keras callback that accumulates per-epoch accuracy and loss values.

    The constructor stores the identifying metadata it is given, creates
    ``plot_dir`` if it does not exist and initialises four empty metric
    histories: ``train_acc``, ``val_acc``, ``train_loss`` and ``val_loss``.
    """

    def __init__(self, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        self.text_col, self.label_col = text_col, label_col
        self.layer_num, self.plot_dir = layer_num, plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One list per tracked metric, appended to once per epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; a missing metric is recorded as 0."""
        epoch_logs = {} if logs is None else logs

        # Accuracy may be reported under 'accuracy' or the older 'acc' key
        recorded = (
            (self.train_acc, epoch_logs.get('accuracy', epoch_logs.get('acc', 0))),
            (self.val_acc, epoch_logs.get('val_accuracy', epoch_logs.get('val_acc', 0))),
            (self.train_loss, epoch_logs.get('loss', 0)),
            (self.val_loss, epoch_logs.get('val_loss', 0)),
        )
        for series, value in recorded:
            series.append(value)

class MergedEmbeddingMLPPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using merged OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with merged OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'Merged_OpenAI_MLP/visualizations_mlp/XOM'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('Merged_OpenAI_MLP/visualizations_summary/XOM', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with merged OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing merged OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert merged embeddings to numpy arrays
        self.data['Merged_embedding'] = self.data['Merged_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_merged_embedding = self.data['Merged_embedding'].iloc[0]
        print(f"Sample Merged embedding shape: {sample_merged_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data['Merged_embedding'].values)
        X_val = np.stack(val_data['Merged_embedding'].values)
        X_test = np.stack(test_data['Merged_embedding'].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier for document-level embeddings.

        Architecture: Dense(512) -> BatchNorm -> Dropout(0.3) ->
        Dense(256) -> BatchNorm -> Dropout(0.3) -> Dense(128) -> BatchNorm ->
        Dense(1, sigmoid). Compiled with Adam (learning rate 0.00005),
        binary cross-entropy loss and the accuracy metric.

        Args:
            input_shape: Shape of a single input sample, e.g. (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        network = Sequential()

        # First block also declares the input shape
        network.add(Dense(512, activation='relu', input_shape=input_shape))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Second block
        network.add(Dense(256, activation='relu'))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Third block (no dropout before the output)
        network.add(Dense(128, activation='relu'))
        network.add(BatchNormalization())

        # Single sigmoid unit for the binary label
        network.add(Dense(1, activation='sigmoid'))

        optimizer = Adam(learning_rate=0.00005)
        network.compile(
            optimizer=optimizer,
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )

        print("MLP model created")
        return network
    
    def plot_final_learning_curves(self, history, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy, loss) of a finished training run.

        Args:
            history: Object whose ``history`` attribute maps metric names to
                per-epoch value lists; an object without that attribute
                produces empty panels
            label_col: Label column used (appears in titles and file name)
            layer_num: Layer number (appears in titles and file name)
            visualization_dir: Directory to save the PNG into
        """
        fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 5))

        # Fall back to an empty mapping when no history is attached
        curves = getattr(history, 'history', {})

        # Accuracy may be logged under 'accuracy' or the older 'acc' key
        train_acc_key = 'accuracy' if 'accuracy' in curves else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in curves else 'val_acc'

        # (axis, training key, validation key, quantity name, legend position)
        panels = (
            (acc_ax, train_acc_key, val_acc_key, 'Accuracy', 'lower right'),
            (loss_ax, 'loss', 'val_loss', 'Loss', 'upper right'),
        )

        for axis, train_key, val_key, quantity, legend_loc in panels:
            # Only draw curves when both series are available
            if train_key in curves and val_key in curves:
                axis.plot(curves[train_key], label=f'Training {quantity}', color='blue')
                axis.plot(curves[val_key], label=f'Validation {quantity}', color='orange')
            axis.set_title(f'Final {quantity} Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
            axis.set_xlabel('Epochs')
            axis.set_ylabel(quantity)
            axis.legend(loc=legend_loc)
            if axis is acc_ax:
                # Accuracy is a proportion, so pin its axis to [0, 1]
                axis.set_ylim([0, 1])
            axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_mlp_merged_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, label_col):
        """
        Train and evaluate an MLP model for merged embeddings and a specific label column.

        For every window in ``self.layers`` the data is split, a fresh model is
        built and trained with early stopping on validation loss, and accuracy
        and ROC AUC are computed on the test split. A layer is skipped when the
        training split has fewer than 10 samples or any split contains fewer
        than two classes. Per-layer metrics and their averages are stored in
        ``self.results`` under the key ``"MLP|Merged|<label_col>"``.

        Args:
            label_col: The label column to use ('S_label' or 'L_label')

        Returns:
            self, so that calls can be chained.
        """
        # Store results
        combination_key = f"MLP|Merged|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        combination_results = self.results[combination_key]

        print(f"\n{'='*80}")
        print(f"Training MLP model for Merged Embeddings and {label_col}")
        print(f"{'='*80}")

        for layer_num, layer in enumerate(self.layers, start=1):
            print(f"\nLayer {layer_num}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")

            # Split data and get embeddings (the raw frames are not needed here)
            (X_train, y_train), (X_val, y_val), (X_test, y_test), _, _, _ = self.split_data(
                layer, label_col)

            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {layer_num} due to insufficient data or class imbalance")
                continue

            # Size the network from the data instead of assuming a fixed width
            model = self.create_mlp_model(X_train.shape[1:])
            print(f"Created MLP model for Merged Embeddings ({label_col})")
            model.summary()

            # Setup callbacks
            plot_callback = PlotLearningCallback('Merged', label_col, layer_num, self.mlp_viz_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{self.mlp_viz_dir}/best_mlp_merged_{label_col}_layer_{layer_num}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )

            # Train model
            print("Training MLP model...")
            batch_size = 32  # Larger batch size for embeddings
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Upper bound only; early stopping ends training sooner
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )

            # Evaluate on test set; flatten (n, 1) predictions to 1-D so they
            # line up with the 1-D label vector
            y_pred_proba = np.asarray(model.predict(X_test)).ravel()
            y_pred = (y_pred_proba > 0.5).astype(int)

            # Create final learning curve visualization
            self.plot_final_learning_curves(history, label_col, layer_num, self.mlp_viz_dir)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)

            combination_results['accuracy'].append(accuracy)
            combination_results['auc'].append(auc)

            print(f"Test Accuracy for Layer {layer_num}: {accuracy:.4f}")
            print(f"Test AUC for Layer {layer_num}: {auc:.4f}")

            # Store layer results for visualization
            combination_results['layer_results'].append({
                'layer': layer_num,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            })

            # Several models are built in this loop; drop this one and reset
            # Keras' global state so memory does not grow from layer to layer
            del model, history
            tf.keras.backend.clear_session()
            gc.collect()

        # Calculate average metrics (0 when every layer was skipped)
        avg_accuracy = np.mean(combination_results['accuracy']) if combination_results['accuracy'] else 0
        avg_auc = np.mean(combination_results['auc']) if combination_results['auc'] else 0

        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")

        combination_results['avg_accuracy'] = avg_accuracy
        combination_results['avg_auc'] = avg_auc

        return self
    
    def run_all_combinations(self):
        """Run the analysis for merged embeddings with both short-term and long-term labels."""
        # Define label columns
        label_cols = ['S_label', 'L_label']
        
        # Run analysis for each label column
        for label_col in label_cols:
            self.train_and_evaluate_model(label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (XOM)")
        print("="*80)
        
        # Create a summary table for easier comparison
        summary_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)
            
            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")
            
            # Print layer-specific results
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
            
            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Label': label_col
            })
        
        # Create summary visualizations
        self.create_summary_visualization(summary_data)
        
        # Clean up memory
        gc.collect()
        
        return self
    
    def create_summary_visualization(self, summary_data):
        """
        Draw a grouped bar chart of average accuracy and average AUC.

        Args:
            summary_data: List of dicts with the keys 'Avg Accuracy',
                'Avg AUC' and 'Label' (one dict per model combination).

        The figure is written to the XOM summary directory; afterwards the
        per-layer figure is produced via
        ``create_layer_performance_visualizations``. Nothing is drawn when
        ``summary_data`` is empty.
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        frame = pd.DataFrame(summary_data)

        # Make sure both metric columns are numeric before plotting
        for metric_col in ('Avg Accuracy', 'Avg AUC'):
            frame[metric_col] = frame[metric_col].astype(float)

        plt.figure(figsize=(10, 6))

        positions = np.arange(len(frame))
        bar_width = 0.35
        half_width = bar_width / 2

        # (column, horizontal offset, legend label, colour) for each bar group
        bar_groups = (
            ('Avg Accuracy', -half_width, 'Average Accuracy', 'skyblue'),
            ('Avg AUC', half_width, 'Average AUC', 'salmon'),
        )

        for column, offset, legend_label, colour in bar_groups:
            plt.bar(positions + offset, frame[column], bar_width, label=legend_label, color=colour)

        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with Merged OpenAI Embeddings')
        tick_labels = [f"Merged + {label}" for label in frame['Label']]
        plt.xticks(positions, tick_labels, rotation=0)
        plt.legend()

        # Numeric value just above every bar
        for column, offset, _legend_label, _colour in bar_groups:
            for idx, value in enumerate(frame[column]):
                plt.text(idx + offset, value + 0.01, f"{value:.3f}", ha='center', va='bottom', fontsize=9)

        # Leave headroom above the tallest bar for the value labels
        tallest = max(frame['Avg Accuracy'].max(), frame['Avg AUC'].max())
        plt.ylim(0, tallest + 0.1)
        plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/XOM', "mlp_merged_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()

        # Follow up with the per-layer breakdown
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """
        Plot per-layer accuracy and AUC for every combination in ``self.results``.

        Builds one row per (combination, layer) from the 'accuracy' and 'auc'
        lists stored under each "model|text|label" key, then draws two stacked
        seaborn bar charts (accuracy on top, AUC below) and saves them to the
        XOM summary directory. Returns early when no layer metrics exist.
        """
        layer_data = []

        for key, outcome in self.results.items():
            _model_type, text_col, label_col = key.split('|')

            for layer_number, layer_accuracy in enumerate(outcome.get('accuracy', []), start=1):
                # AUC list is index-aligned with the accuracy list
                layer_auc = outcome.get('auc', [])[layer_number - 1]
                layer_data.append({
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {layer_number}",
                    'Layer_num': layer_number,
                    'Accuracy': layer_accuracy,
                    'AUC': layer_auc,
                    'Combination': f"{text_col} + {label_col}",
                })

        if not layer_data:
            print("No layer data available")
            return

        frame = pd.DataFrame(layer_data)

        plt.figure(figsize=(10, 8))

        # Panel 1: accuracy, panel 2: AUC - identical layout for both
        for panel, metric in enumerate(('Accuracy', 'AUC'), start=1):
            plt.subplot(2, 1, panel)
            sns.barplot(x='Combination', y=metric, hue='Layer', data=frame)
            plt.title(f'MLP {metric} by Model Combination and Layer with Merged OpenAI Embeddings')
            # Keep at least a 0-0.8 range so low scores are not visually exaggerated
            plt.ylim(0, max(0.8, frame[metric].max() + 0.05))
            plt.xticks(rotation=0)
            plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/XOM', "mlp_merged_layer_performance.png")
        plt.savefig(save_path)
        print(f"MLP layer performance visualization saved as: {save_path}")
        plt.close()


# Script entry point: run the merged-embedding MLP analysis end to end
if __name__ == "__main__":
    # Both output folders must exist before any figure or checkpoint is written
    for directory in (
        'Merged_OpenAI_MLP/visualizations_mlp/XOM',
        'Merged_OpenAI_MLP/visualizations_summary/XOM',
    ):
        os.makedirs(directory, exist_ok=True)

    # Input CSV holding the merged OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_title_full_text_SP500_database/semantic/wall_street_news_semantics_XOM_completed_openai_Merged.csv'

    predictor = MergedEmbeddingMLPPredictor(csv_path)

    # Pipeline: load the data, define the time windows, then train and evaluate
    (
        predictor
        .load_data()
        .define_time_windows()
        .run_all_combinations()
    )

    print("\nAnalysis complete! Results saved to 'Merged_OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing merged OpenAI embedding vectors from string format...
Sample Merged embedding shape: (1536,)
Loaded 838 climate change news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 427, 0: 411}
Class distribution for long-term prediction: {1: 428, 0: 410}

Training MLP model for Merged Embeddings and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Embeddings shapes - Train: (401, 1536), Val: (131, 1536), Test: (133, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_96

 dense_100 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_75 (Ba  (None, 512)               2048      
 tchNormalization)                                               
                                                                 
 dropout_50 (Dropout)        (None, 512)               0         
                                                                 
 dense_101 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_76 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_51 (Dropout)        (None, 256)               0         
                                                                 
 dense_102 (Dense)           (None, 128)               32896     
          

 dense_107 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.70349, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_S_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.70349
Epoch 3/100
Epoch 3: val_loss did not improve from 0.70349
Epoch 4/100
Epoch 4: val_loss did not improve from 0.70349
Epoch 5/100
Epoch 5: val_loss did not improve from 0.70349
Epoch 6/100
Epoch 6: val_loss did not improve from 0.70349
Epoch 7/100
Epoch 7: val_loss did not improve from 0.70349
Epoch 8/100
Epoch 8: val_loss did not improve from 0.70349
Epoch 9/100
Epoch 9: val_loss did not improve from 0.70349
Epoch 10/100
Epoch 10: val_loss did not impro

Epoch 3/100
Epoch 3: val_loss did not improve from 0.73475
Epoch 4/100
Epoch 4: val_loss did not improve from 0.73475
Epoch 5/100
Epoch 5: val_loss did not improve from 0.73475
Epoch 6/100
Epoch 6: val_loss did not improve from 0.73475
Epoch 7/100
Epoch 7: val_loss did not improve from 0.73475
Epoch 8/100
Epoch 8: val_loss did not improve from 0.73475
Epoch 9/100
Epoch 9: val_loss did not improve from 0.73475
Epoch 10/100
Epoch 10: val_loss did not improve from 0.73475
Epoch 11/100
Epoch 11: val_loss did not improve from 0.73475
Epoch 12/100
Epoch 12: val_loss did not improve from 0.73475
Epoch 13/100
Epoch 13: val_loss did not improve from 0.73475
Epoch 14/100
Epoch 14: val_loss did not improve from 0.73475
Epoch 15/100
Epoch 15: val_loss did not improve from 0.73475
Epoch 16/100

Epoch 16: val_loss did not improve from 0.73475
Epoch 16: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_1.png
Test Accuracy for Layer 1: 0.2707
Test AUC for Layer 1: 0.4104

Layer 2:
T

Epoch 6/100
Epoch 6: val_loss improved from 0.65446 to 0.64766, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 7/100
Epoch 7: val_loss improved from 0.64766 to 0.64184, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.64184 to 0.63671, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.63671 to 0.63242, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.63242 to 0.62978, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.62978 to 0.62721, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 12/100

Epoch 31/100
Epoch 31: val_loss did not improve from 0.61312
Epoch 32/100
Epoch 32: val_loss did not improve from 0.61312
Epoch 33/100

Epoch 33: val_loss did not improve from 0.61312
Epoch 33: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_2.png
Test Accuracy for Layer 2: 0.4158
Test AUC for Layer 2: 0.4681

Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Embeddings shapes - Train: (666, 1536), Val: (101, 1536), Test: (70, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (L_label)
Model: "sequential_29"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_116 (Dense)           (None, 512)               786944    
                                               

MLP layer performance visualization saved as: Merged_OpenAI_MLP/visualizations_summary/XOM\mlp_merged_layer_performance.png

Analysis complete! Results saved to 'Merged_OpenAI_MLP' directory.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from xgboost import callback
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom Keras callback that records accuracy and loss at the end of each epoch
class PlotLearningCallback(Callback):
    """Accumulate per-epoch training and validation metrics.

    The callback stores identifying information about the run (model type,
    text column, label column, layer number), makes sure ``plot_dir`` exists,
    and appends accuracy/loss values to its lists after every epoch. It does
    not draw anything itself.
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        """Store the run identifiers and create ``plot_dir`` if it is missing."""
        super().__init__()
        self.model_type = model_type
        self.text_col = text_col
        self.label_col = label_col
        self.layer_num = layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # Per-epoch metric histories, one entry appended per finished epoch
        self.train_acc = []
        self.val_acc = []
        self.train_loss = []
        self.val_loss = []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; values missing from ``logs`` become 0."""
        logs = {} if logs is None else logs

        # Accuracy may be reported under 'accuracy' or the shorter 'acc' key
        epoch_acc = logs.get('accuracy', logs.get('acc', 0))
        epoch_val_acc = logs.get('val_accuracy', logs.get('val_acc', 0))

        self.train_acc.append(epoch_acc)
        self.val_acc.append(epoch_val_acc)
        self.train_loss.append(logs.get('loss', 0))
        self.val_loss.append(logs.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'OpenAI_MLP/visualizations_mlp/XOM'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('OpenAI_MLP/visualizations_summary/XOM', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert embeddings to numpy arrays
        self.data['Title_embedding'] = self.data['Title_embedding_vector'].apply(parse_embedding)
        self.data['Fulltext_embedding'] = self.data['Full_text_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_title_embedding = self.data['Title_embedding'].iloc[0]
        sample_fulltext_embedding = self.data['Fulltext_embedding'].iloc[0]
        
        print(f"Sample Title embedding shape: {sample_title_embedding.shape}")
        print(f"Sample Fulltext embedding shape: {sample_fulltext_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data[text_col].values)
        X_val = np.stack(val_data[text_col].values)
        X_test = np.stack(test_data[text_col].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the MLP classifier used on document-level embeddings.

        The network stacks three ReLU dense layers (512, 256, 128 units), each
        followed by batch normalisation, with 30% dropout after the first two,
        and ends in a single sigmoid unit. It is compiled with Adam
        (learning rate 0.00005), binary cross-entropy and accuracy.

        Args:
            input_shape: Shape of one input sample, e.g. (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        network = Sequential()

        # First block receives the raw embedding
        network.add(Dense(512, activation='relu', input_shape=input_shape))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Second block
        network.add(Dense(256, activation='relu'))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Third block has no dropout before the output
        network.add(Dense(128, activation='relu'))
        network.add(BatchNormalization())

        # Single sigmoid unit for the binary label
        network.add(Dense(1, activation='sigmoid'))

        optimizer = Adam(learning_rate=0.00005)
        network.compile(
            optimizer=optimizer,
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        print("MLP model created")
        return network
    
    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy left, loss right) for a finished training run.

        Curves are drawn only when both the training and validation series of
        a metric are present in ``history.history``; titles, axes and grid are
        set either way.

        Args:
            history: Training history (object exposing a ``history`` dict)
            model_type: Model type (MLP)
            text_col: Text column used
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        display_text = 'Title' if 'Title' in text_col else 'Full text'

        # Fall back to an empty dict when the object carries no history
        recorded = getattr(history, 'history', {})

        # Accuracy may be logged under the long or the short key name
        acc_key = 'accuracy' if 'accuracy' in recorded else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in recorded else 'val_acc'

        def draw_panel(axis, train_key, val_key, train_label, val_label,
                       title, y_label, legend_loc, y_limits=None):
            """Draw one metric panel; curves only if both series are available."""
            if train_key in recorded and val_key in recorded:
                axis.plot(recorded[train_key], label=train_label, color='blue')
                axis.plot(recorded[val_key], label=val_label, color='orange')
            axis.set_title(title)
            axis.set_xlabel('Epochs')
            axis.set_ylabel(y_label)
            axis.legend(loc=legend_loc)
            if y_limits is not None:
                axis.set_ylim(y_limits)
            axis.grid(True, linestyle='--', alpha=0.7)

        fig, (acc_axis, loss_axis) = plt.subplots(1, 2, figsize=(15, 5))

        draw_panel(
            acc_axis, acc_key, val_acc_key,
            'Training Accuracy', 'Validation Accuracy',
            f'Final Accuracy Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})',
            'Accuracy', 'lower right', y_limits=[0, 1],
        )
        draw_panel(
            loss_axis, 'loss', 'val_loss',
            'Training Loss', 'Validation Loss',
            f'Final Loss Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})',
            'Loss', 'upper right',
        )

        plt.tight_layout()
        filename = f"final_{model_type.lower()}_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.
        
        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP')
        """
        # Store results
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")
        
        visualization_dir = self.mlp_viz_dir if model_type == 'MLP' else self.xgb_viz_dir
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            if model_type == 'MLP':
                # MLP model training
                model = self.create_mlp_model((1536,))
                print(f"Created MLP model for {display_text} ({label_col})")
                model.summary()
                
                # Setup callbacks
                plot_callback = PlotLearningCallback(model_type, display_text, label_col, i+1, visualization_dir)
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=15,
                    restore_best_weights=True,
                    verbose=1
                )
                model_checkpoint = ModelCheckpoint(
                    filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{i+1}.weights.h5",
                    monitor='val_loss',
                    save_weights_only=True,
                    save_best_only=True,
                    verbose=1
                )
                
                # Train model
                print(f"Training MLP model...")
                batch_size = 32  # Larger batch size for embeddings
                history = model.fit(
                    X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                    batch_size=batch_size,
                    callbacks=[early_stopping, model_checkpoint, plot_callback],
                    verbose=1
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict(X_test)
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create final learning curve visualization
                self.plot_final_learning_curves(history, model_type, text_col, label_col, i+1, visualization_dir)
                
                # Store training history
                training_history = history.history
                
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': training_history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text inputs, label columns, and model types."""
        # Define all combinations
        embedding_cols = ['Title_embedding', 'Fulltext_embedding']
        label_cols = ['S_label', 'L_label']
        model_types = ['MLP']
        
        # Run analysis for each combination
        for model_type in model_types:
            for embedding_col in embedding_cols:
                for label_col in label_cols:
                    self.train_and_evaluate_model(embedding_col, label_col, model_type)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS （XOM)")
        print("="*80)
        
        # Create a summary table for easier comparison
        summary_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)
            
            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")
            
            # Print layer-specific results
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
            
            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            })
        
        # Create summary visualizations
        self.create_summary_visualization(summary_data)
        
        # Clean up memory
        gc.collect()
        
        return self
    
    def create_summary_visualization(self, summary_data):
        """Create a summary visualization comparing all model combinations."""
        if not summary_data:
            print("No data available for summary visualization")
            return
        
        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)
        
        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)
        
        # Split by model type
        mlp_data = df[df['Model'] == 'MLP']
        
        # 1. Performance comparison for MLP
        plt.figure(figsize=(12, 8))
        
        # Plot accuracy and AUC bars for MLP
        x = np.arange(len(mlp_data))
        width = 0.35
        
        plt.bar(x - width/2, mlp_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, mlp_data['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('MLP Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in mlp_data.iterrows()]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(mlp_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(mlp_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(mlp_data['Avg Accuracy'].max(), mlp_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP/visualizations_summary/XOM', "mlp_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """Create visualizations showing performance across different layers for MLP models."""
        # Prepare data for visualization
        mlp_layer_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                layer_info = {
                    'Model': model_type,
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {i+1}",
                    'Layer_num': i+1,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                }
                
                if model_type == 'MLP':
                    mlp_layer_data.append(layer_info)
        
        # Create visualizations for each model type
        self._create_model_layer_visualization(mlp_layer_data, 'MLP')
    
    def _create_model_layer_visualization(self, layer_data, model_type):
        """Create layer-specific visualizations for a given model type."""
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return
        
        # Create DataFrame
        df = pd.DataFrame(layer_data)
        
        # Create visualization - Accuracy by layer
        plt.figure(figsize=(14, 10))
        
        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', data=df)
        plt.title(f'{model_type} Accuracy by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)
        plt.tight_layout()
        
        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', data=df)
        plt.title(f'{model_type} AUC by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)
        
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP/visualizations_summary/XOM', f"{model_type.lower()}_layer_performance.png")
        plt.savefig(save_path)
        print(f"{model_type} layer performance visualization saved as: {save_path}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Figures and weight checkpoints are written into these folders,
    # so create them before anything runs
    required_dirs = (
        'OpenAI_MLP/visualizations_mlp/XOM',
        'OpenAI_MLP/visualizations_summary/XOM',
    )
    for directory in required_dirs:
        os.makedirs(directory, exist_ok=True)

    # Input CSV holding the XOM news rows together with their OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_XOM_completed_openai.csv'

    predictor = ClimateNewsOpenAIPredictor(csv_path)

    # Full pipeline; each step is called on the object returned by the previous one
    (
        predictor
        .load_data()
        .define_time_windows()
        .run_all_combinations()
    )

    print("\nAnalysis complete! Results saved to 'OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 929 climate change news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {0: 466, 1: 463}
Class distribution for long-term prediction: {1: 507, 0: 422}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dens

Epoch 19/100
Epoch 19: val_loss did not improve from 0.69135
Epoch 20/100
Epoch 20: val_loss did not improve from 0.69135
Epoch 21/100
Epoch 21: val_loss did not improve from 0.69135
Epoch 22/100
Epoch 22: val_loss improved from 0.69135 to 0.69077, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_S_label_layer_1.weights.h5
Epoch 23/100
Epoch 23: val_loss improved from 0.69077 to 0.68955, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_S_label_layer_1.weights.h5
Epoch 24/100
Epoch 24: val_loss did not improve from 0.68955
Epoch 25/100
Epoch 25: val_loss did not improve from 0.68955
Epoch 26/100
Epoch 26: val_loss did not improve from 0.68955
Epoch 27/100
Epoch 27: val_loss did not improve from 0.68955
Epoch 28/100
Epoch 28: val_loss did not improve from 0.68955
Epoch 29/100
Epoch 29: val_loss did not improve from 0.68955
Epoch 30/100
Epoch 30: val_loss did not improve from 0.68955
Epoch 31/100
Epoch 31: val_loss did not improve from 0.68955
Epoch 32/100


Epoch 1: val_loss improved from inf to 0.67951, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_S_label_layer_2.weights.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.67951 to 0.67366, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_S_label_layer_2.weights.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.67366 to 0.67100, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_S_label_layer_2.weights.h5
Epoch 4/100
Epoch 4: val_loss improved from 0.67100 to 0.66970, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_S_label_layer_2.weights.h5
Epoch 5/100
Epoch 5: val_loss did not improve from 0.66970
Epoch 6/100
Epoch 6: val_loss did not improve from 0.66970
Epoch 7/100
Epoch 7: val_loss did not improve from 0.66970
Epoch 8/100
Epoch 8: val_loss did not improve from 0.66970
Epoch 9/100
Epoch 9: val_loss did not improve from 0.66970
Epoch 10/100
Epoch 10: val_loss did not improve from 0.66970
Epoch 11/100
Epoch 11: val_loss did 

Epoch 2/100
Epoch 2: val_loss did not improve from 0.70933
Epoch 3/100
Epoch 3: val_loss did not improve from 0.70933
Epoch 4/100
Epoch 4: val_loss did not improve from 0.70933
Epoch 5/100
Epoch 5: val_loss did not improve from 0.70933
Epoch 6/100
Epoch 6: val_loss did not improve from 0.70933
Epoch 7/100
Epoch 7: val_loss did not improve from 0.70933
Epoch 8/100
Epoch 8: val_loss did not improve from 0.70933
Epoch 9/100
Epoch 9: val_loss did not improve from 0.70933
Epoch 10/100
Epoch 10: val_loss did not improve from 0.70933
Epoch 11/100
Epoch 11: val_loss did not improve from 0.70933
Epoch 12/100
Epoch 12: val_loss did not improve from 0.70933
Epoch 13/100
Epoch 13: val_loss did not improve from 0.70933
Epoch 14/100
Epoch 14: val_loss did not improve from 0.70933
Epoch 15/100
Epoch 15: val_loss did not improve from 0.70933
Epoch 16/100

Epoch 16: val_loss did not improve from 0.70933
Epoch 16: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_3.png
Test Accuracy fo

Epoch 5/100
Epoch 5: val_loss did not improve from 0.69669
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69669
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69669
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69669
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69669
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69669
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69669
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69669
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69669
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69669
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69669
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69669
Epoch 16: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_1.png
Test Accuracy for Layer 1: 0.1364
Test AUC for Layer 1: 0.4639

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2

Epoch 8/100
Epoch 8: val_loss improved from 0.54132 to 0.53789, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_L_label_layer_2.weights.h5
Epoch 9/100
Epoch 9: val_loss did not improve from 0.53789
Epoch 10/100
Epoch 10: val_loss did not improve from 0.53789
Epoch 11/100
Epoch 11: val_loss did not improve from 0.53789
Epoch 12/100
Epoch 12: val_loss did not improve from 0.53789
Epoch 13/100
Epoch 13: val_loss did not improve from 0.53789
Epoch 14/100
Epoch 14: val_loss did not improve from 0.53789
Epoch 15/100
Epoch 15: val_loss did not improve from 0.53789
Epoch 16/100
Epoch 16: val_loss did not improve from 0.53789
Epoch 17/100
Epoch 17: val_loss did not improve from 0.53789
Epoch 18/100
Epoch 18: val_loss did not improve from 0.53789
Epoch 19/100
Epoch 19: val_loss did not improve from 0.53789
Epoch 20/100
Epoch 20: val_loss did not improve from 0.53789
Epoch 21/100
Epoch 21: val_loss did not improve from 0.53789
Epoch 22/100
Epoch 22: val_loss did not improve from 

Epoch 5/100
Epoch 5: val_loss improved from 0.65793 to 0.65341, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_L_label_layer_3.weights.h5
Epoch 6/100
Epoch 6: val_loss improved from 0.65341 to 0.65039, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_L_label_layer_3.weights.h5
Epoch 7/100
Epoch 7: val_loss improved from 0.65039 to 0.64890, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_L_label_layer_3.weights.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.64890 to 0.64716, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_L_label_layer_3.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.64716 to 0.64617, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_L_label_layer_3.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.64617 to 0.64548, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_title_L_label_layer_3.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.64548 to 0.644

Epoch 32/100
Epoch 32: val_loss did not improve from 0.64267
Epoch 33/100
Epoch 33: val_loss did not improve from 0.64267
Epoch 34/100

Epoch 34: val_loss did not improve from 0.64267
Epoch 34: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_3.png
Test Accuracy for Layer 3: 0.3768
Test AUC for Layer 3: 0.4832

Average Test Accuracy across all layers: 0.3933
Average Test AUC across all layers: 0.4812

Training MLP model for Full text and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (S_label)
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape          

Saved learning curves: final_mlp_Full_text_S_label_layer_1.png
Test Accuracy for Layer 1: 0.3788
Test AUC for Layer 1: 0.6283

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Embeddings shapes - Train: (685, 1536), Val: (66, 1536), Test: (108, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (S_label)
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_28 (Dense)            (None, 512)               786944    
                                                                 
 batch_normalization_21 (Ba  (None, 512)               2048      
 tchNormalization)                                               
                                                                

 dense_33 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_25 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_17 (Dropout)        (None, 256)               0         
                                                                 
 dense_34 (Dense)            (None, 128)               32896     
                                                                 
 batch_normalization_26 (Ba  (None, 128)               512       
 tchNormalization)                                               
                                                                 
 dense_35 (Dense)            (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-traina

Epoch 24/100

Epoch 24: val_loss did not improve from 0.67902
Epoch 24: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_3.png
Test Accuracy for Layer 3: 0.5072
Test AUC for Layer 3: 0.4613

Average Test Accuracy across all layers: 0.4404
Average Test AUC across all layers: 0.5220

Training MLP model for Full text and L_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (L_label)
Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_36 (Dense)            (None, 512)               786944    
                                     

Epoch 17/100
Epoch 17: val_loss did not improve from 0.69144
Epoch 18/100
Epoch 18: val_loss did not improve from 0.69144
Epoch 19/100
Epoch 19: val_loss did not improve from 0.69144
Epoch 20/100
Epoch 20: val_loss did not improve from 0.69144
Epoch 21/100
Epoch 21: val_loss did not improve from 0.69144
Epoch 22/100
Epoch 22: val_loss did not improve from 0.69144
Epoch 23/100
Epoch 23: val_loss did not improve from 0.69144
Epoch 24/100
Epoch 24: val_loss did not improve from 0.69144
Epoch 25/100
Epoch 25: val_loss did not improve from 0.69144
Epoch 26/100
Epoch 26: val_loss did not improve from 0.69144
Epoch 27/100

Epoch 27: val_loss did not improve from 0.69144
Epoch 27: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_1.png
Test Accuracy for Layer 1: 0.6212
Test AUC for Layer 1: 0.7193

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data:

Epoch 8: val_loss improved from 0.52873 to 0.52668, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.52668 to 0.52137, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.52137 to 0.52128, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.52128 to 0.52056, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 12/100
Epoch 12: val_loss improved from 0.52056 to 0.51774, saving model to OpenAI_MLP/visualizations_mlp/XOM\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 13/100
Epoch 13: val_loss did not improve from 0.51774
Epoch 14/100
Epoch 14: val_loss did not improve from 0.51774
Epoch 15/100
Epoch 15: val_loss improved from 0.51774 to 0.51525, saving model to Op

 dense_45 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_34 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_23 (Dropout)        (None, 256)               0         
                                                                 
 dense_46 (Dense)            (None, 128)               32896     
                                                                 
 batch_normalization_35 (Ba  (None, 128)               512       
 tchNormalization)                                               
                                                                 
 dense_47 (Dense)            (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-traina

MLP layer performance visualization saved as: OpenAI_MLP/visualizations_summary/XOM\mlp_layer_performance.png

Analysis complete! Results saved to 'OpenAI_MLP' directory.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

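# Custom callback that records training/validation accuracy and loss at the end of each epoch
# (it only collects the values; no figure is drawn by the callback itself)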
class PlotLearningCallback(Callback):
    """Keras callback that accumulates accuracy and loss values epoch by epoch.

    The constructor stores the identifiers of the run it is attached to and
    makes sure ``plot_dir`` exists. After every epoch the training/validation
    accuracy and loss found in ``logs`` are appended to ``train_acc``,
    ``val_acc``, ``train_loss`` and ``val_loss`` (0 is stored for a missing key).
    """

    def __init__(self, text_col, label_col, layer_num, plot_dir):
        """Remember the run identifiers and create ``plot_dir`` if needed."""
        super().__init__()
        self.text_col, self.label_col = text_col, label_col
        self.layer_num = layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One list per tracked quantity; each grows by one entry per epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics from ``logs`` to the history lists."""
        metrics = {} if logs is None else logs

        # Accuracy may be reported under the long or the short key name
        epoch_acc = metrics.get('accuracy', metrics.get('acc', 0))
        epoch_val_acc = metrics.get('val_accuracy', metrics.get('val_acc', 0))
        epoch_loss = metrics.get('loss', 0)
        epoch_val_loss = metrics.get('val_loss', 0)

        self.train_acc.append(epoch_acc)
        self.val_acc.append(epoch_val_acc)
        self.train_loss.append(epoch_loss)
        self.val_loss.append(epoch_val_loss)

class MergedEmbeddingMLPPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using merged OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with merged OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'Merged_OpenAI_MLP/visualizations_mlp/XOM'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('Merged_OpenAI_MLP/visualizations_summary/XOM', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with merged OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing merged OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert merged embeddings to numpy arrays
        self.data['Merged_embedding'] = self.data['Merged_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_merged_embedding = self.data['Merged_embedding'].iloc[0]
        print(f"Sample Merged embedding shape: {sample_merged_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data['Merged_embedding'].values)
        X_val = np.stack(val_data['Merged_embedding'].values)
        X_test = np.stack(test_data['Merged_embedding'].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier for document-level embeddings.

        The network stacks three ReLU dense layers (512, 256 and 128 units), each
        followed by batch normalization, with dropout (rate 0.3) after the first
        two, and ends in a single sigmoid unit. It is compiled with Adam
        (learning rate 0.00005), binary cross-entropy loss and the accuracy metric.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        network_layers = [
            # First block; also declares the input shape
            Dense(512, activation='relu', input_shape=input_shape),
            BatchNormalization(),
            Dropout(0.3),

            # Second block
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),

            # Third block (no dropout before the output)
            Dense(128, activation='relu'),
            BatchNormalization(),

            # Single sigmoid output unit
            Dense(1, activation='sigmoid'),
        ]
        model = Sequential(network_layers)

        optimizer = Adam(learning_rate=0.00005)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return model
    
    def plot_final_learning_curves(self, history, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy left, loss right) of the finished MLP training run.

        Curves are drawn only when both the training and the validation series of
        a quantity are present in the history; titles, axis labels, legend and
        grid are applied either way.

        Args:
            history: Training history
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        _, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 5))

        # Fall back to an empty mapping when the object carries no history
        history_dict = history.history if hasattr(history, 'history') else {}

        # Accuracy may be logged under the long or the short key name
        acc_metric = 'acc' if 'accuracy' not in history_dict else 'accuracy'
        val_acc_metric = 'val_acc' if 'val_accuracy' not in history_dict else 'val_accuracy'

        # (axis, training key, validation key, quantity name, legend position, y-limits)
        panels = (
            (acc_ax, acc_metric, val_acc_metric, 'Accuracy', 'lower right', [0, 1]),
            (loss_ax, 'loss', 'val_loss', 'Loss', 'upper right', None),
        )

        for axis, train_key, val_key, quantity, legend_loc, y_limits in panels:
            if train_key in history_dict and val_key in history_dict:
                axis.plot(history_dict[train_key], label=f'Training {quantity}', color='blue')
                axis.plot(history_dict[val_key], label=f'Validation {quantity}', color='orange')
            axis.set_title(f'Final {quantity} Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
            axis.set_xlabel('Epochs')
            axis.set_ylabel(quantity)
            axis.legend(loc=legend_loc)
            if y_limits is not None:
                axis.set_ylim(y_limits)
            axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_mlp_merged_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, label_col):
        """
        Train and evaluate an MLP model for merged embeddings and a specific label column.
        
        Args:
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"MLP|Merged|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training MLP model for Merged Embeddings and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_mlp_model((1536,))
            print(f"Created MLP model for Merged Embeddings ({label_col})")
            model.summary()
            
            # Setup callbacks
            plot_callback = PlotLearningCallback('Merged', label_col, i+1, self.mlp_viz_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{self.mlp_viz_dir}/best_mlp_merged_{label_col}_layer_{i+1}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )
            
            # Train model
            print(f"Training MLP model...")
            batch_size = 32  # Larger batch size for embeddings
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )
            
            # Evaluate on test set
            y_pred_proba = model.predict(X_test)
            y_pred = (y_pred_proba > 0.5).astype(int)
            
            # Create final learning curve visualization
            self.plot_final_learning_curves(history, label_col, i+1, self.mlp_viz_dir)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """
        Train on every label column, then print and plot a results summary.

        Calls ``train_and_evaluate_model`` once for 'S_label' and once for
        'L_label', prints the averaged and per-layer metrics stored in
        ``self.results``, and passes one summary row per combination to
        ``create_summary_visualization``.

        Returns:
            self, so calls can be chained
        """
        for target_label in ('S_label', 'L_label'):
            self.train_and_evaluate_model(target_label)

        # Banner around the summary heading
        rule = "=" * 80
        print("\n" + rule)
        print("SUMMARY OF RESULTS (XOM)")
        print(rule)

        # One row per combination key ("model|text|label") found in self.results
        summary_rows = []
        for key, outcome in self.results.items():
            model_name, text_name, label_name = key.split('|')
            mean_accuracy = outcome.get('avg_accuracy', 0)
            mean_auc = outcome.get('avg_auc', 0)

            print(f"\nCombination: {model_name} with {text_name} + {label_name}")
            print(f"Average Accuracy: {mean_accuracy:.4f}")
            print(f"Average AUC: {mean_auc:.4f}")

            # Per-layer metrics; the accuracy and AUC lists are read position by position
            layer_aucs = outcome.get('auc', [])
            for layer_no, layer_accuracy in enumerate(outcome.get('accuracy', []), start=1):
                layer_auc = layer_aucs[layer_no - 1]
                print(f"  Layer {layer_no} - Accuracy: {layer_accuracy:.4f}, AUC: {layer_auc:.4f}")

            summary_rows.append({
                'Combination': f"{text_name} + {label_name}",
                'Avg Accuracy': mean_accuracy,
                'Avg AUC': mean_auc,
                'Label': label_name,
            })

        self.create_summary_visualization(summary_rows)

        # Ask the garbage collector to reclaim memory once everything has run
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """
        Draw a grouped bar chart of average accuracy and AUC per combination.

        Args:
            summary_data: List of dicts with the keys 'Avg Accuracy',
                'Avg AUC' and 'Label' (one dict per combination). When it is
                empty a message is printed and nothing is plotted.

        The chart is saved as 'mlp_merged_performance_comparison.png' under
        'Merged_OpenAI_MLP/visualizations_summary/XOM'; afterwards the
        per-layer charts are produced via
        ``create_layer_performance_visualizations``.
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        table = pd.DataFrame(summary_data)

        # Make sure both metric columns are plain floats before plotting
        for metric in ('Avg Accuracy', 'Avg AUC'):
            table[metric] = table[metric].astype(float)

        plt.figure(figsize=(10, 6))

        # Two bars per combination, centred on the integer positions
        positions = np.arange(len(table))
        width = 0.35

        plt.bar(positions - width/2, table['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(positions + width/2, table['Avg AUC'], width, label='Average AUC', color='salmon')

        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with Merged OpenAI Embeddings')
        tick_labels = [f"Merged + {label_name}" for label_name in table['Label']]
        plt.xticks(positions, tick_labels, rotation=0)
        plt.legend()

        # Numeric value just above every bar: accuracy bars first, then AUC bars
        for idx, score in enumerate(table['Avg Accuracy']):
            plt.text(idx - width/2, score + 0.01, f"{score:.3f}", ha='center', va='bottom', fontsize=9)
        for idx, score in enumerate(table['Avg AUC']):
            plt.text(idx + width/2, score + 0.01, f"{score:.3f}", ha='center', va='bottom', fontsize=9)

        # Leave headroom above the tallest bar for its value label
        tallest = max(table['Avg Accuracy'].max(), table['Avg AUC'].max())
        plt.ylim(0, tallest + 0.1)
        plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/XOM', "mlp_merged_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()

        # Follow up with the per-layer breakdown
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """
        Plot per-layer accuracy and AUC for every combination in ``self.results``.

        Builds one row per (combination, layer) from the 'accuracy' and 'auc'
        lists, draws two stacked seaborn bar charts (accuracy on top, AUC
        below) and saves them as 'mlp_merged_layer_performance.png' under
        'Merged_OpenAI_MLP/visualizations_summary/XOM'. Prints a message and
        returns without plotting when there are no per-layer results.
        """
        rows = []
        for key, outcome in self.results.items():
            # Keys have the form "model|text|label"
            _model_name, text_name, label_name = key.split('|')
            layer_aucs = outcome.get('auc', [])

            for layer_no, layer_accuracy in enumerate(outcome.get('accuracy', []), start=1):
                rows.append({
                    'Text': text_name,
                    'Label': label_name,
                    'Layer': f"Layer {layer_no}",
                    'Layer_num': layer_no,
                    'Accuracy': layer_accuracy,
                    'AUC': layer_aucs[layer_no - 1],
                    'Combination': f"{text_name} + {label_name}",
                })

        if not rows:
            print("No layer data available")
            return

        table = pd.DataFrame(rows)

        plt.figure(figsize=(10, 8))

        # Same recipe for both panels: accuracy in the top slot, AUC in the bottom one
        for slot, metric in enumerate(('Accuracy', 'AUC'), start=1):
            plt.subplot(2, 1, slot)
            sns.barplot(x='Combination', y=metric, hue='Layer', data=table)
            plt.title(f'MLP {metric} by Model Combination and Layer with Merged OpenAI Embeddings')
            # Axis always reaches at least 0.8
            plt.ylim(0, max(0.8, table[metric].max() + 0.05))
            plt.xticks(rotation=0)
            plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/XOM', "mlp_merged_layer_performance.png")
        plt.savefig(save_path)
        print(f"MLP layer performance visualization saved as: {save_path}")
        plt.close()


# Script entry point: run the merged-embedding MLP analysis for XOM
if __name__ == "__main__":
    # Both output folders must exist before any figure or weight file is written
    output_dirs = (
        'Merged_OpenAI_MLP/visualizations_mlp/XOM',
        'Merged_OpenAI_MLP/visualizations_summary/XOM',
    )
    for directory in output_dirs:
        os.makedirs(directory, exist_ok=True)

    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_XOM_completed_openai.csv'

    # Build the predictor on top of the merged OpenAI embeddings
    predictor = MergedEmbeddingMLPPredictor(csv_path)

    # Full pipeline: load the CSV, set up the time windows, train and summarise
    (
        predictor
        .load_data()
        .define_time_windows()
        .run_all_combinations()
    )

    print("\nAnalysis complete! Results saved to 'Merged_OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing merged OpenAI embedding vectors from string format...
Sample Merged embedding shape: (1536,)
Loaded 929 climate change news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {0: 466, 1: 463}
Class distribution for long-term prediction: {1: 507, 0: 422}

Training MLP model for Merged Embeddings and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_48 (

 dense_52 (Dense)            (None, 512)               786944    
                                                                 
 batch_normalization_39 (Ba  (None, 512)               2048      
 tchNormalization)                                               
                                                                 
 dropout_26 (Dropout)        (None, 512)               0         
                                                                 
 dense_53 (Dense)            (None, 256)               131328    
                                                                 
 batch_normalization_40 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_27 (Dropout)        (None, 256)               0         
                                                                 
 dense_54 (Dense)            (None, 128)               32896     
          

 dense_59 (Dense)            (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69657, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_S_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.69657
Epoch 3/100
Epoch 3: val_loss did not improve from 0.69657
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69657
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69657
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69657
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69657
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69657
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69657
Epoch 10/100
Epoch 10: val_loss did not impro

Epoch 3/100
Epoch 3: val_loss did not improve from 0.69267
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69267
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69267
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69267
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69267
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69267
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69267
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69267
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69267
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69267
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69267
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69267
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69267
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69267
Epoch 16: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_1.png
Test Accuracy for Layer 1: 0.2727
Test AUC for Layer 1: 0.4561

Layer 2:
T

Epoch 6: val_loss did not improve from 0.70186
Epoch 7/100
Epoch 7: val_loss improved from 0.70186 to 0.70101, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.70101 to 0.69660, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.69660 to 0.69514, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.69514 to 0.68933, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.68933 to 0.68623, saving model to Merged_OpenAI_MLP/visualizations_mlp/XOM\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 12/100
Epoch 12: val_loss did not improve from 0.68623
Epoch 13/100
Epoch 13: val_loss improved from 0.68623 to 0.67535, sa

Epoch 33/100
Epoch 33: val_loss did not improve from 0.65792
Epoch 34/100
Epoch 34: val_loss did not improve from 0.65792
Epoch 35/100

Epoch 35: val_loss did not improve from 0.65792
Epoch 35: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_2.png
Test Accuracy for Layer 2: 0.5648
Test AUC for Layer 2: 0.5575

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Embeddings shapes - Train: (751, 1536), Val: (108, 1536), Test: (69, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (L_label)
Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_68 (Dense)            (None, 512)               786944    
                                               

Epoch 17/100
Epoch 17: val_loss did not improve from 0.63881
Epoch 18/100
Epoch 18: val_loss did not improve from 0.63881
Epoch 19/100

Epoch 19: val_loss did not improve from 0.63881
Epoch 19: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_3.png
Test Accuracy for Layer 3: 0.3913
Test AUC for Layer 3: 0.5053

Average Test Accuracy across all layers: 0.4096
Average Test AUC across all layers: 0.5063

SUMMARY OF RESULTS (XOM)

Combination: MLP with Merged + S_label
Average Accuracy: 0.5164
Average AUC: 0.5004
  Layer 1 - Accuracy: 0.6212, AUC: 0.4868
  Layer 2 - Accuracy: 0.4352, AUC: 0.5075
  Layer 3 - Accuracy: 0.4928, AUC: 0.5067

Combination: MLP with Merged + L_label
Average Accuracy: 0.4096
Average AUC: 0.5063
  Layer 1 - Accuracy: 0.2727, AUC: 0.4561
  Layer 2 - Accuracy: 0.5648, AUC: 0.5575
  Layer 3 - Accuracy: 0.3913, AUC: 0.5053
MLP summary comparison visualization saved as: Merged_OpenAI_MLP/visualizations_summary/XOM\mlp_merged_performance_comparison.pn

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from xgboost import callback
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Keras callback that records per-epoch accuracy and loss so they can be plotted later
class PlotLearningCallback(Callback):
    """
    Collect training/validation accuracy and loss at the end of every epoch.

    The constructor stores the identifying labels (model type, text column,
    label column, layer number) and makes sure ``plot_dir`` exists. The
    callback itself only accumulates the metrics in ``train_acc``,
    ``val_acc``, ``train_loss`` and ``val_loss``; it does not draw anything.
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        self.model_type, self.text_col, self.label_col = model_type, text_col, label_col
        self.layer_num = layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One entry is appended to each list per finished epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; a metric missing from ``logs`` is recorded as 0."""
        metrics = {} if logs is None else logs

        # Accuracy is looked up under 'accuracy' first, then under 'acc'
        self.train_acc.append(metrics.get('accuracy', metrics.get('acc', 0)))
        self.val_acc.append(metrics.get('val_accuracy', metrics.get('val_acc', 0)))
        self.train_loss.append(metrics.get('loss', 0))
        self.val_loss.append(metrics.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'OpenAI_MLP/visualizations_mlp/SLB'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('OpenAI_MLP/visualizations_summary/SLB', exist_ok=True)
            
    def load_data(self):
        """
        Read the CSV at ``self.csv_path`` into ``self.data`` and prepare it for modelling.

        The three date columns are converted to datetimes, the rows are sorted
        by publication date, and the stringified embedding columns are decoded
        into numpy arrays stored under 'Title_embedding' and
        'Fulltext_embedding'. A short summary of the loaded data is printed.

        Returns:
            self, so calls can be chained
        """
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)

        date_columns = ('Publication date', 'Predicting date Short', 'Predicting date Long')

        def parse_single_date(raw_value):
            # Explicit formats first (DD/MM/YYYY, then YYYY-MM-DD); otherwise
            # let pandas guess with dayfirst=True
            for date_format in ('%d/%m/%Y', '%Y-%m-%d'):
                try:
                    return pd.to_datetime(raw_value, format=date_format)
                except ValueError:
                    pass
            return pd.to_datetime(raw_value, dayfirst=True)

        try:
            # Fast path: whole-column conversion, day before month
            for column_name in date_columns:
                self.data[column_name] = pd.to_datetime(self.data[column_name], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")

            # Slow path: convert every value of every date column one by one
            for column_name in date_columns:
                converted = [parse_single_date(raw_value) for raw_value in self.data[column_name]]
                self.data[column_name] = pd.Series(converted)

        # Chronological order
        self.data = self.data.sort_values('Publication date')

        print("Parsing OpenAI embedding vectors from string format...")

        def decode_vector(raw_value):
            # Missing or unparsable embeddings become a 1536-long zero vector
            if pd.isna(raw_value):
                return np.zeros(1536)
            try:
                return np.array(ast.literal_eval(raw_value), dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {raw_value[:50]}...")
                return np.zeros(1536)

        column_pairs = (
            ('Title_embedding_vector', 'Title_embedding'),
            ('Full_text_embedding_vector', 'Fulltext_embedding'),
        )
        for source_column, target_column in column_pairs:
            self.data[target_column] = self.data[source_column].apply(decode_vector)

        # Report the shape of the first row's vectors as a quick sanity check
        first_title = self.data['Title_embedding'].iloc[0]
        first_fulltext = self.data['Fulltext_embedding'].iloc[0]

        print(f"Sample Title embedding shape: {first_title.shape}")
        print(f"Sample Fulltext embedding shape: {first_fulltext.shape}")

        published = self.data['Publication date']
        earliest = published.min().strftime('%d/%m/%Y')
        latest = published.max().strftime('%d/%m/%Y')
        print(f"Loaded {len(self.data)} climate change news articles spanning from {earliest} to {latest}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")

        return self
    
    def define_time_windows(self):
        """
        Populate ``self.layers`` with the three train/validation/test windows.

        Every layer is a dict of ``pd.Timestamp`` values keyed by
        'train_start', 'train_end', 'val_start', 'val_end', 'test_start' and
        'test_end'. All layers train from 2019-01-01; the end of training and
        the validation and test windows move forward from layer to layer.

        Returns:
            self, so calls can be chained
        """
        boundary_names = ('train_start', 'train_end', 'val_start', 'val_end', 'test_start', 'test_end')

        # One tuple per layer, in the same order as boundary_names
        boundary_dates = (
            # Layer 1: train to May 2021, validate Jun-Dec 2021, test Jan-May 2022
            ('2019-01-01', '2021-05-31', '2021-06-01', '2021-12-31', '2022-01-01', '2022-05-31'),
            # Layer 2: train to Dec 2021, validate Jan-May 2022, test Jun-Dec 2022
            ('2019-01-01', '2021-12-31', '2022-01-01', '2022-05-31', '2022-06-01', '2022-12-31'),
            # Layer 3: train to May 2022, validate Jun-Dec 2022, test Jan-May 2023
            ('2019-01-01', '2022-05-31', '2022-06-01', '2022-12-31', '2023-01-01', '2023-05-31'),
        )

        self.layers = [
            {name: pd.Timestamp(day) for name, day in zip(boundary_names, window)}
            for window in boundary_dates
        ]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.

        Every window is inclusive on both ends, so an article published
        exactly on a window's start date belongs to that window.

        Args:
            layer: The time window layer (dict with 'train_start', 'train_end',
                'val_start', 'val_end', 'test_start', 'test_end')
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')

        Returns:
            (X_train, y_train), (X_val, y_val), (X_test, y_test),
            train_data, val_data, test_data. A window without rows yields an
            empty embedding array of shape (0, 0) instead of raising.
        """
        publication_dates = self.data['Publication date']

        def rows_between(start_key, end_key):
            # Inclusive on both ends. The validation and test windows used to
            # exclude their own start date, which silently dropped articles
            # published on that day from every split.
            in_window = (publication_dates >= layer[start_key]) & (publication_dates <= layer[end_key])
            return self.data[in_window]

        def stack_embeddings(frame):
            vectors = frame[text_col].values
            if len(vectors) == 0:
                # np.stack raises on an empty sequence; returning an empty
                # array lets the caller's "insufficient data" check decide.
                return np.empty((0, 0), dtype=np.float32)
            return np.stack(vectors)

        train_data = rows_between('train_start', 'train_end')
        val_data = rows_between('val_start', 'val_end')
        test_data = rows_between('test_start', 'test_end')

        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")

        # Extract embeddings
        X_train = stack_embeddings(train_data)
        X_val = stack_embeddings(val_data)
        X_test = stack_embeddings(test_data)

        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values

        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward binary classifier for document-level embeddings.

        Layer stack: Dense(512) -> BatchNorm -> Dropout(0.3) ->
        Dense(256) -> BatchNorm -> Dropout(0.3) -> Dense(128) -> BatchNorm ->
        Dense(1, sigmoid). Compiled with Adam (learning rate 0.00005),
        binary cross-entropy loss and the accuracy metric.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        network = Sequential()

        # First block receives the embedding vector
        network.add(Dense(512, activation='relu', input_shape=input_shape))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Second block
        network.add(Dense(256, activation='relu'))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Third block has no dropout
        network.add(Dense(128, activation='relu'))
        network.add(BatchNormalization())

        # Single sigmoid unit for the binary label
        network.add(Dense(1, activation='sigmoid'))

        optimizer = Adam(learning_rate=0.00005)
        network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return network
    
    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy left, loss right) for a finished training run.

        Args:
            history: Training history; its ``history`` attribute, when present,
                is read as a dict of per-epoch metric lists
            model_type: Model type (MLP)
            text_col: Text column used
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        display_text = 'Full text'
        if 'Title' in text_col:
            display_text = 'Title'

        fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 5))

        # An object without a ``history`` attribute is treated as having no metrics
        records = getattr(history, 'history', {})

        # Metric names are looked up under 'accuracy' first, then under 'acc'
        train_acc_key = 'accuracy' if 'accuracy' in records else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in records else 'val_acc'

        # Left panel: accuracy (curves only drawn when both series exist)
        if train_acc_key in records and val_acc_key in records:
            accuracy_ax.plot(records[train_acc_key], label='Training Accuracy', color='blue')
            accuracy_ax.plot(records[val_acc_key], label='Validation Accuracy', color='orange')
        accuracy_ax.set_title(f'Final Accuracy Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})')
        accuracy_ax.set_xlabel('Epochs')
        accuracy_ax.set_ylabel('Accuracy')
        accuracy_ax.legend(loc='lower right')
        accuracy_ax.set_ylim([0, 1])
        accuracy_ax.grid(True, linestyle='--', alpha=0.7)

        # Right panel: loss (curves only drawn when both series exist)
        if 'loss' in records and 'val_loss' in records:
            loss_ax.plot(records['loss'], label='Training Loss', color='blue')
            loss_ax.plot(records['val_loss'], label='Validation Loss', color='orange')
        loss_ax.set_title(f'Final Loss Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})')
        loss_ax.set_xlabel('Epochs')
        loss_ax.set_ylabel('Loss')
        loss_ax.legend(loc='upper right')
        loss_ax.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()

        # Spaces in the text label become underscores in the file name
        text_slug = display_text.replace(' ', '_')
        filename = f"final_{model_type.lower()}_{text_slug}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.

        For every time-window layer in ``self.layers`` a fresh MLP is trained
        with early stopping and checkpointing on validation loss, evaluated on
        the layer's test window, and its accuracy/AUC are appended to
        ``self.results["<model_type>|<Title or Full text>|<label_col>"]``.
        A layer is skipped when the training set has fewer than 10 rows or any
        of the three splits contains fewer than two distinct labels. The
        averaged metrics are stored under 'avg_accuracy' and 'avg_auc'
        (0 when every layer was skipped).

        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP')

        Returns:
            self, so calls can be chained

        Raises:
            ValueError: If ``model_type`` is anything other than 'MLP'.
        """
        # Only the MLP pipeline exists here. Fail fast instead of reaching for a
        # visualization directory that was never set up and then computing
        # metrics from predictions that were never made for this model type.
        if model_type != 'MLP':
            raise ValueError(f"Unsupported model_type: {model_type!r} (only 'MLP' is implemented)")

        # Store results
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        combination_results = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        self.results[combination_key] = combination_results

        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")

        visualization_dir = self.mlp_viz_dir

        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")

            # Split data and get embeddings (the raw data frames are not needed here)
            (X_train, y_train), (X_val, y_val), (X_test, y_test), *_ = self.split_data(
                layer, text_col, label_col)

            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue

            # Size the network from the data rather than assuming 1536-dimensional vectors
            model = self.create_mlp_model(X_train.shape[1:])
            print(f"Created MLP model for {display_text} ({label_col})")
            model.summary()

            # Setup callbacks
            plot_callback = PlotLearningCallback(model_type, display_text, label_col, i+1, visualization_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{i+1}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )

            # Train model; early stopping ends the run well before the epoch cap
            # when validation loss stops improving
            print(f"Training MLP model...")
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,
                batch_size=32,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )

            # Evaluate on test set: probabilities above 0.5 count as class 1
            y_pred_proba = model.predict(X_test)
            y_pred = (y_pred_proba > 0.5).astype(int)

            # Create final learning curve visualization
            self.plot_final_learning_curves(history, model_type, text_col, label_col, i+1, visualization_dir)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)

            combination_results['accuracy'].append(accuracy)
            combination_results['auc'].append(auc)

            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")

            # Store layer results for visualization
            combination_results['layer_results'].append({
                'layer': i+1,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            })

        # Calculate average metrics (0 when no layer produced a result)
        avg_accuracy = np.mean(combination_results['accuracy']) if combination_results['accuracy'] else 0
        avg_auc = np.mean(combination_results['auc']) if combination_results['auc'] else 0

        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")

        combination_results['avg_accuracy'] = avg_accuracy
        combination_results['avg_auc'] = avg_auc

        return self
    
    def run_all_combinations(self):
        """Train/evaluate every (model, embedding, label) combination, then summarise.

        Each combination is passed to ``train_and_evaluate_model``; afterwards the
        contents of ``self.results`` are printed per combination and per layer and
        handed to ``create_summary_visualization`` as a list of row dicts.

        Returns:
            self, so the call can be chained.
        """
        # Full run grid; model type varies slowest and label column fastest.
        run_grid = [
            (embedding_col, label_col, model_type)
            for model_type in ['MLP']
            for embedding_col in ['Title_embedding', 'Fulltext_embedding']
            for label_col in ['S_label', 'L_label']
        ]
        for embedding_col, label_col, model_type in run_grid:
            self.train_and_evaluate_model(embedding_col, label_col, model_type)

        # Banner for the textual summary
        rule = "=" * 80
        print("\n" + rule)
        print("SUMMARY OF RESULTS （SLB)")
        print(rule)

        # One row per combination, consumed by the summary chart
        summary_data = []

        for key, outcome in self.results.items():
            # Result keys have the form "<model>|<text>|<label>"
            model_type, text_col, label_col = key.split('|')
            mean_accuracy = outcome.get('avg_accuracy', 0)
            mean_auc = outcome.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {mean_accuracy:.4f}")
            print(f"Average AUC: {mean_auc:.4f}")

            # Per-layer breakdown; the AUC list is indexed in step with accuracy
            for idx, layer_accuracy in enumerate(outcome.get('accuracy', [])):
                layer_auc = outcome.get('auc', [])[idx]
                print(f"  Layer {idx+1} - Accuracy: {layer_accuracy:.4f}, AUC: {layer_auc:.4f}")

            row = {
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': mean_accuracy,
                'Avg AUC': mean_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            }
            summary_data.append(row)

        # Charts comparing all combinations
        self.create_summary_visualization(summary_data)

        # Release memory held by finished runs
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """Plot average accuracy and AUC per MLP combination and save the chart.

        Args:
            summary_data: List of dicts, one per combination, providing the keys
                'Model', 'Text', 'Label', 'Avg Accuracy' and 'Avg AUC'.

        Returns:
            None. When ``summary_data`` is empty nothing is drawn. Otherwise the
            grouped bar chart is written to
            'OpenAI_MLP/visualizations_summary/SLB/mlp_performance_comparison.png'
            (skipped if there are no 'MLP' rows) and the per-layer charts are
            produced via ``create_layer_performance_visualizations``.
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)

        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)

        # Only MLP rows are charted here
        mlp_data = df[df['Model'] == 'MLP']

        if mlp_data.empty:
            # Without rows the y-limit below would be NaN, which matplotlib rejects
            print("No MLP results available for summary visualization")
        else:
            # Make sure the target folder exists even when the class is used
            # outside the script entry point that normally creates it
            output_dir = 'OpenAI_MLP/visualizations_summary/SLB'
            os.makedirs(output_dir, exist_ok=True)
            save_path = os.path.join(output_dir, "mlp_performance_comparison.png")

            plt.figure(figsize=(12, 8))
            try:
                # Two bars per combination: accuracy to the left, AUC to the right
                x = np.arange(len(mlp_data))
                width = 0.35

                plt.bar(x - width/2, mlp_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
                plt.bar(x + width/2, mlp_data['Avg AUC'], width, label='Average AUC', color='salmon')

                plt.xlabel('MLP Model Combination')
                plt.ylabel('Score')
                plt.title('Performance Comparison of MLP Models with OpenAI Embeddings')
                labels = [f"{text} + {label}" for text, label in zip(mlp_data['Text'], mlp_data['Label'])]
                # Unrotated labels are centred under their bar group
                plt.xticks(x, labels, rotation=0, ha='center')
                plt.legend()

                # Add value labels on top of bars
                for i, (acc, auc) in enumerate(zip(mlp_data['Avg Accuracy'], mlp_data['Avg AUC'])):
                    plt.text(i - width/2, acc + 0.01, f"{acc:.3f}", ha='center', va='bottom', fontsize=9)
                    plt.text(i + width/2, auc + 0.01, f"{auc:.3f}", ha='center', va='bottom', fontsize=9)

                # Leave headroom above the tallest bar for its value label
                plt.ylim(0, max(mlp_data['Avg Accuracy'].max(), mlp_data['Avg AUC'].max()) + 0.1)
                plt.tight_layout()

                # Save the visualization
                plt.savefig(save_path)
                print(f"MLP summary comparison visualization saved as: {save_path}")
            finally:
                # Always release the figure, even if drawing or saving failed
                plt.close()

        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """Collect per-layer accuracy/AUC rows for MLP results and chart them.

        Every entry of ``self.results`` is expanded into one row dict per layer;
        rows whose model type is 'MLP' are passed to
        ``_create_model_layer_visualization``.
        """
        def expand(key, outcome):
            # Result keys have the form "<model>|<text>|<label>"
            model_type, text_col, label_col = key.split('|')
            rows = []
            for idx, layer_accuracy in enumerate(outcome.get('accuracy', [])):
                # AUC values are stored in a parallel list
                layer_auc = outcome.get('auc', [])[idx]
                rows.append({
                    'Model': model_type,
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {idx+1}",
                    'Layer_num': idx+1,
                    'Accuracy': layer_accuracy,
                    'AUC': layer_auc,
                    'Combination': f"{text_col} + {label_col}"
                })
            return rows

        # Flatten all combinations, keeping only the MLP rows
        mlp_layer_data = [
            row
            for key, outcome in self.results.items()
            for row in expand(key, outcome)
            if row['Model'] == 'MLP'
        ]

        # Create visualizations for each model type
        self._create_model_layer_visualization(mlp_layer_data, 'MLP')
    
    def _create_model_layer_visualization(self, layer_data, model_type):
        """Draw per-layer accuracy and AUC bar charts for one model type and save them.

        Args:
            layer_data: List of row dicts; the columns 'Combination', 'Layer',
                'Accuracy' and 'AUC' are read for plotting.
            model_type: Name used in the chart titles and, lower-cased, in the
                output file name.

        Returns:
            None. Prints a notice and draws nothing when ``layer_data`` is empty;
            otherwise writes '<model_type>_layer_performance.png' into
            'OpenAI_MLP/visualizations_summary/SLB'.
        """
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return

        # Create DataFrame
        df = pd.DataFrame(layer_data)

        # Make sure the target folder exists even when the class is used
        # outside the script entry point that normally creates it
        output_dir = 'OpenAI_MLP/visualizations_summary/SLB'
        os.makedirs(output_dir, exist_ok=True)
        save_path = os.path.join(output_dir, f"{model_type.lower()}_layer_performance.png")

        plt.figure(figsize=(14, 10))
        try:
            # Stacked panels: accuracy on top, AUC below, bars grouped by layer
            for position, metric in enumerate(('Accuracy', 'AUC'), start=1):
                plt.subplot(2, 1, position)
                sns.barplot(x='Combination', y=metric, hue='Layer', data=df)
                plt.title(f'{model_type} {metric} by Model Combination and Layer with OpenAI Embeddings')
                # Keep at least a 0-0.8 axis so panels stay comparable
                plt.ylim(0, max(0.8, df[metric].max() + 0.05))
                plt.xticks(rotation=0)

            # A single layout pass once both panels exist
            plt.tight_layout()

            # Save the visualization
            plt.savefig(save_path)
            print(f"{model_type} layer performance visualization saved as: {save_path}")
        finally:
            # Always release the figure, even if drawing or saving failed
            plt.close()


# Main execution
if __name__ == "__main__":
    # Script entry point: prepare output folders, build the predictor and run
    # the full pipeline end to end.

    # Ensure visualization directories exist
    # (exist_ok=True makes re-running the cell harmless)
    for directory in ['OpenAI_MLP/visualizations_mlp/SLB',
                      'OpenAI_MLP/visualizations_summary/SLB']:
        os.makedirs(directory, exist_ok=True)
    
    # NOTE(review): absolute, machine-specific path; must be edited to run elsewhere
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_SLB_completed_openai.csv'
    
    # Initialize the predictor with OpenAI embeddings
    predictor = ClimateNewsOpenAIPredictor(csv_path)
    
    # Run the complete analysis
    # The chain relies on load_data() and define_time_windows() each returning an
    # object exposing the next method (presumably self - confirm in the class body)
    predictor.load_data().define_time_windows().run_all_combinations()
    
    print("\nAnalysis complete! Results saved to 'OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 929 climate change news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 472, 0: 457}
Class distribution for long-term prediction: {1: 494, 0: 435}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 d

Epoch 20/100
Epoch 20: val_loss did not improve from 0.69372
Epoch 21/100
Epoch 21: val_loss did not improve from 0.69372
Epoch 22/100
Epoch 22: val_loss did not improve from 0.69372
Epoch 23/100
Epoch 23: val_loss did not improve from 0.69372
Epoch 24/100
Epoch 24: val_loss did not improve from 0.69372
Epoch 25/100
Epoch 25: val_loss did not improve from 0.69372
Epoch 26/100
Epoch 26: val_loss did not improve from 0.69372
Epoch 27/100

Epoch 27: val_loss did not improve from 0.69372
Epoch 27: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_1.png
Test Accuracy for Layer 1: 0.4394
Test AUC for Layer 1: 0.4430

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Embeddings shapes - Train: (685, 1536), Val: (66, 1536), Test: (108, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP mo

Epoch 14/100
Epoch 14: val_loss did not improve from 0.69961
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69961
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69961
Epoch 16: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_2.png
Test Accuracy for Layer 2: 0.5741
Test AUC for Layer 2: 0.4972

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Embeddings shapes - Train: (751, 1536), Val: (108, 1536), Test: (69, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_80 (Dense)            (None, 512)               786944    
                                                            

Epoch 17: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_3.png
Test Accuracy for Layer 3: 0.5652
Test AUC for Layer 3: 0.4726

Average Test Accuracy across all layers: 0.5262
Average Test AUC across all layers: 0.4710

Training MLP model for Title and L_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (L_label)
Model: "sequential_21"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_84 (Dense)            (None, 512)               786944    
                                                                 
 batch_normalization_63 (Ba  (None, 512)    

Epoch 19/100
Epoch 19: val_loss did not improve from 0.68804
Epoch 20/100
Epoch 20: val_loss did not improve from 0.68804
Epoch 21/100
Epoch 21: val_loss did not improve from 0.68804
Epoch 22/100
Epoch 22: val_loss did not improve from 0.68804
Epoch 23/100
Epoch 23: val_loss did not improve from 0.68804
Epoch 24/100
Epoch 24: val_loss did not improve from 0.68804
Epoch 25/100
Epoch 25: val_loss did not improve from 0.68804
Epoch 26/100
Epoch 26: val_loss did not improve from 0.68804
Epoch 27/100
Epoch 27: val_loss did not improve from 0.68804
Epoch 28/100
Epoch 28: val_loss did not improve from 0.68804
Epoch 29/100
Epoch 29: val_loss did not improve from 0.68804
Epoch 30/100
Epoch 30: val_loss did not improve from 0.68804
Epoch 31/100
Epoch 31: val_loss did not improve from 0.68804
Epoch 32/100
Epoch 32: val_loss did not improve from 0.68804
Epoch 33/100

Epoch 33: val_loss did not improve from 0.68804
Epoch 33: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_1.png


Epoch 6/100
Epoch 6: val_loss did not improve from 0.67639
Epoch 7/100
Epoch 7: val_loss did not improve from 0.67639
Epoch 8/100
Epoch 8: val_loss did not improve from 0.67639
Epoch 9/100
Epoch 9: val_loss did not improve from 0.67639
Epoch 10/100
Epoch 10: val_loss did not improve from 0.67639
Epoch 11/100
Epoch 11: val_loss did not improve from 0.67639
Epoch 12/100
Epoch 12: val_loss did not improve from 0.67639
Epoch 13/100
Epoch 13: val_loss did not improve from 0.67639
Epoch 14/100
Epoch 14: val_loss did not improve from 0.67639
Epoch 15/100
Epoch 15: val_loss did not improve from 0.67639
Epoch 16/100
Epoch 16: val_loss did not improve from 0.67639
Epoch 17/100
Epoch 17: val_loss did not improve from 0.67639
Epoch 18/100
Epoch 18: val_loss did not improve from 0.67639
Epoch 19/100

Epoch 19: val_loss did not improve from 0.67639
Epoch 19: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_2.png
Test Accuracy for Layer 2: 0.6667
Test AUC for Layer 2: 0.4907

Layer

Epoch 6: val_loss did not improve from 0.63563
Epoch 7/100
Epoch 7: val_loss did not improve from 0.63563
Epoch 8/100
Epoch 8: val_loss did not improve from 0.63563
Epoch 9/100
Epoch 9: val_loss did not improve from 0.63563
Epoch 10/100
Epoch 10: val_loss did not improve from 0.63563
Epoch 11/100
Epoch 11: val_loss did not improve from 0.63563
Epoch 12/100
Epoch 12: val_loss did not improve from 0.63563
Epoch 13/100
Epoch 13: val_loss did not improve from 0.63563
Epoch 14/100
Epoch 14: val_loss did not improve from 0.63563
Epoch 15/100
Epoch 15: val_loss did not improve from 0.63563
Epoch 16/100
Epoch 16: val_loss did not improve from 0.63563
Epoch 17/100
Epoch 17: val_loss did not improve from 0.63563
Epoch 18/100

Epoch 18: val_loss did not improve from 0.63563
Epoch 18: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_3.png
Test Accuracy for Layer 3: 0.3188
Test AUC for Layer 3: 0.3085

Average Test Accuracy across all layers: 0.5103
Average Test AUC across all la

Epoch 7: val_loss did not improve from 0.69047
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69047
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69047
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69047
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69047
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69047
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69047
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69047
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69047
Epoch 16/100
Epoch 16: val_loss did not improve from 0.69047
Epoch 17/100

Epoch 17: val_loss did not improve from 0.69047
Epoch 17: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_1.png
Test Accuracy for Layer 1: 0.4848
Test AUC for Layer 1: 0.4816

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test d

Epoch 11/100
Epoch 11: val_loss did not improve from 0.70295
Epoch 12/100
Epoch 12: val_loss did not improve from 0.70295
Epoch 13/100
Epoch 13: val_loss did not improve from 0.70295
Epoch 14/100
Epoch 14: val_loss did not improve from 0.70295
Epoch 15/100
Epoch 15: val_loss did not improve from 0.70295
Epoch 16/100

Epoch 16: val_loss did not improve from 0.70295
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_2.png
Test Accuracy for Layer 2: 0.5741
Test AUC for Layer 2: 0.5712

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Embeddings shapes - Train: (751, 1536), Val: (108, 1536), Test: (69, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (S_label)
Model: "sequential_26"
_________________________________________________________________
 

Epoch 15/100
Epoch 15: val_loss did not improve from 0.70991
Epoch 16/100

Epoch 16: val_loss did not improve from 0.70991
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_3.png
Test Accuracy for Layer 3: 0.4348
Test AUC for Layer 3: 0.4248

Average Test Accuracy across all layers: 0.4979
Average Test AUC across all layers: 0.4925

Training MLP model for Full text and L_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (L_label)
Model: "sequential_27"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_108 (Dense)           (None, 512) 

Epoch 17/100
Epoch 17: val_loss did not improve from 0.69501
Epoch 18/100
Epoch 18: val_loss did not improve from 0.69501
Epoch 19/100
Epoch 19: val_loss did not improve from 0.69501
Epoch 20/100
Epoch 20: val_loss did not improve from 0.69501
Epoch 21/100
Epoch 21: val_loss did not improve from 0.69501
Epoch 22/100
Epoch 22: val_loss did not improve from 0.69501
Epoch 23/100
Epoch 23: val_loss did not improve from 0.69501
Epoch 24/100
Epoch 24: val_loss did not improve from 0.69501
Epoch 25/100
Epoch 25: val_loss did not improve from 0.69501
Epoch 26/100
Epoch 26: val_loss did not improve from 0.69501
Epoch 27/100
Epoch 27: val_loss did not improve from 0.69501
Epoch 28/100
Epoch 28: val_loss did not improve from 0.69501
Epoch 29/100

Epoch 29: val_loss did not improve from 0.69501
Epoch 29: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_1.png
Test Accuracy for Layer 1: 0.4242
Test AUC for Layer 1: 0.5531

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Vali

Epoch 7/100
Epoch 7: val_loss did not improve from 0.67707
Epoch 8/100
Epoch 8: val_loss did not improve from 0.67707
Epoch 9/100
Epoch 9: val_loss did not improve from 0.67707
Epoch 10/100
Epoch 10: val_loss improved from 0.67707 to 0.67638, saving model to OpenAI_MLP/visualizations_mlp/SLB\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.67638 to 0.67450, saving model to OpenAI_MLP/visualizations_mlp/SLB\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 12/100
Epoch 12: val_loss did not improve from 0.67450
Epoch 13/100
Epoch 13: val_loss did not improve from 0.67450
Epoch 14/100
Epoch 14: val_loss improved from 0.67450 to 0.67164, saving model to OpenAI_MLP/visualizations_mlp/SLB\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 15/100
Epoch 15: val_loss improved from 0.67164 to 0.67098, saving model to OpenAI_MLP/visualizations_mlp/SLB\best_mlp_full_text_L_label_layer_2.weights.h5
Epoch 16/100
Epoch 16: val_loss did not improve from 

                                                                 
 dense_117 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_88 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_59 (Dropout)        (None, 256)               0         
                                                                 
 dense_118 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_89 (Ba  (None, 128)               512       
 tchNormalization)                                               
                                                                 
 dense_119 (Dense)           (None, 1)                 129       
                                                                 
Total para

Epoch 21/100
Epoch 21: val_loss did not improve from 0.62444
Epoch 22/100
Epoch 22: val_loss did not improve from 0.62444
Epoch 23/100
Epoch 23: val_loss did not improve from 0.62444
Epoch 24/100
Epoch 24: val_loss did not improve from 0.62444
Epoch 25/100
Epoch 25: val_loss did not improve from 0.62444
Epoch 26/100
Epoch 26: val_loss did not improve from 0.62444
Epoch 27/100
Epoch 27: val_loss did not improve from 0.62444
Epoch 28/100
Epoch 28: val_loss did not improve from 0.62444
Epoch 29/100
Epoch 29: val_loss did not improve from 0.62444
Epoch 30/100
Epoch 30: val_loss did not improve from 0.62444
Epoch 31/100

Epoch 31: val_loss did not improve from 0.62444
Epoch 31: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_3.png
Test Accuracy for Layer 3: 0.3188
Test AUC for Layer 3: 0.4072

Average Test Accuracy across all layers: 0.4514
Average Test AUC across all layers: 0.5208

SUMMARY OF RESULTS （SLB)

Combination: MLP with Title + S_label
Average Accuracy: 0.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records training/validation metrics after each epoch
class PlotLearningCallback(Callback):
    """Keras callback that accumulates per-epoch accuracy and loss values.

    Args:
        text_col: Identifier of the text input; stored as given.
        label_col: Identifier of the label column; stored as given.
        layer_num: Layer number; stored as given.
        plot_dir: Directory path; created on construction if it is missing.
    """

    def __init__(self, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        self.text_col, self.label_col = text_col, label_col
        self.layer_num, self.plot_dir = layer_num, plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One history list per tracked metric, extended once per epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics to the history lists (missing keys become 0)."""
        metrics = {} if logs is None else logs

        # Accuracy is looked up under 'accuracy' first, then under 'acc'
        epoch_values = (
            (self.train_acc, metrics.get('accuracy', metrics.get('acc', 0))),
            (self.val_acc, metrics.get('val_accuracy', metrics.get('val_acc', 0))),
            (self.train_loss, metrics.get('loss', 0)),
            (self.val_loss, metrics.get('val_loss', 0)),
        )
        for history, value in epoch_values:
            history.append(value)

class MergedEmbeddingMLPPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using merged OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with merged OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'Merged_OpenAI_MLP/visualizations_mlp/SLB'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('Merged_OpenAI_MLP/visualizations_summary/SLB', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with merged OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing merged OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert merged embeddings to numpy arrays
        self.data['Merged_embedding'] = self.data['Merged_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_merged_embedding = self.data['Merged_embedding'].iloc[0]
        print(f"Sample Merged embedding shape: {sample_merged_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data['Merged_embedding'].values)
        X_val = np.stack(val_data['Merged_embedding'].values)
        X_test = np.stack(test_data['Merged_embedding'].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward binary classifier for document-level embeddings.

        Architecture: Dense(512) -> BatchNorm -> Dropout(0.3) -> Dense(256) ->
        BatchNorm -> Dropout(0.3) -> Dense(128) -> BatchNorm -> Dense(1, sigmoid).
        Compiled with Adam (learning rate 5e-5), binary cross-entropy loss and
        the accuracy metric.

        Args:
            input_shape: Shape of one input sample, e.g. (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        model = Sequential()

        # First block consumes the raw embedding
        model.add(Dense(512, activation='relu', input_shape=input_shape))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))

        # Two narrowing hidden blocks; only the first one uses dropout
        model.add(Dense(256, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dropout(0.3))
        model.add(Dense(128, activation='relu'))
        model.add(BatchNormalization())

        # Single sigmoid unit for the binary label
        model.add(Dense(1, activation='sigmoid'))

        optimizer = Adam(learning_rate=0.00005)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return model
    
    def plot_final_learning_curves(self, history, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy left, loss right) for a finished training run.

        A panel is left without curves when its training or validation series
        is missing from the history.

        Args:
            history: Object returned by ``model.fit``; its ``history`` dict is read if present
            label_col: Label column used (appears in titles and the file name)
            layer_num: Layer number (appears in titles and the file name)
            visualization_dir: Directory the PNG is written to
        """
        fig, (accuracy_axis, loss_axis) = plt.subplots(1, 2, figsize=(15, 5))

        # Anything without a ``history`` attribute is treated as "no curves recorded"
        curves = getattr(history, 'history', {})

        # Keras has reported accuracy under both 'accuracy' and the older 'acc'
        train_acc_key = 'accuracy' if 'accuracy' in curves else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in curves else 'val_acc'

        # Left panel: accuracy
        if train_acc_key in curves and val_acc_key in curves:
            accuracy_axis.plot(curves[train_acc_key], label='Training Accuracy', color='blue')
            accuracy_axis.plot(curves[val_acc_key], label='Validation Accuracy', color='orange')
        accuracy_axis.set_title(f'Final Accuracy Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
        accuracy_axis.set_xlabel('Epochs')
        accuracy_axis.set_ylabel('Accuracy')
        accuracy_axis.legend(loc='lower right')
        accuracy_axis.set_ylim([0, 1])
        accuracy_axis.grid(True, linestyle='--', alpha=0.7)

        # Right panel: loss
        if 'loss' in curves and 'val_loss' in curves:
            loss_axis.plot(curves['loss'], label='Training Loss', color='blue')
            loss_axis.plot(curves['val_loss'], label='Validation Loss', color='orange')
        loss_axis.set_title(f'Final Loss Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
        loss_axis.set_xlabel('Epochs')
        loss_axis.set_ylabel('Loss')
        loss_axis.legend(loc='upper right')
        loss_axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_mlp_merged_{label_col}_layer_{layer_num}.png"
        target = f"{visualization_dir}/{filename}"
        plt.savefig(target)
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, label_col):
        """
        Train and evaluate an MLP model for merged embeddings and a specific label column.

        For every time-window layer in ``self.layers`` a fresh model is built,
        trained with early stopping on validation loss, and scored on the
        layer's test window. Per-layer accuracy/AUC, the training history and
        the averages over the evaluated layers are stored in
        ``self.results["MLP|Merged|<label_col>"]``. Layers with fewer than 10
        training samples, or with a single class in any split, are skipped.

        Args:
            label_col: The label column to use ('S_label' or 'L_label')

        Returns:
            self, so that calls can be chained.
        """
        # Store results
        combination_key = f"MLP|Merged|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        combo_results = self.results[combination_key]

        print(f"\n{'='*80}")
        print(f"Training MLP model for Merged Embeddings and {label_col}")
        print(f"{'='*80}")

        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")

            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, label_col)

            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue

            # Create and train model; the input width follows the data instead
            # of assuming a fixed embedding size.
            model = self.create_mlp_model(X_train.shape[1:])
            print(f"Created MLP model for Merged Embeddings ({label_col})")
            model.summary()

            # Setup callbacks
            plot_callback = PlotLearningCallback('Merged', label_col, i+1, self.mlp_viz_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{self.mlp_viz_dir}/best_mlp_merged_{label_col}_layer_{i+1}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )

            # Train model
            print("Training MLP model...")
            batch_size = 32  # Larger batch size for embeddings
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Upper bound only; early stopping normally ends training sooner
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )

            # Evaluate on test set. predict() returns an (n, 1) column; flatten
            # it so the metrics receive 1-D arrays matching y_test.
            y_pred_proba = model.predict(X_test).ravel()
            y_pred = (y_pred_proba > 0.5).astype(int)

            # Create final learning curve visualization
            self.plot_final_learning_curves(history, label_col, i+1, self.mlp_viz_dir)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)

            combo_results['accuracy'].append(accuracy)
            combo_results['auc'].append(auc)

            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")

            # Store layer results for visualization
            combo_results['layer_results'].append({
                'layer': i+1,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            })

            # Release this layer's model and Keras' global graph state;
            # otherwise every layer and label run keeps its model alive and
            # memory grows for the whole session.
            del model
            tf.keras.backend.clear_session()
            gc.collect()

        # Calculate average metrics (0 when every layer was skipped)
        avg_accuracy = np.mean(combo_results['accuracy']) if combo_results['accuracy'] else 0
        avg_auc = np.mean(combo_results['auc']) if combo_results['auc'] else 0

        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")

        combo_results['avg_accuracy'] = avg_accuracy
        combo_results['avg_auc'] = avg_auc

        return self
    
    def run_all_combinations(self):
        """Run the analysis for merged embeddings with both short-term and long-term labels.

        Trains and evaluates one model series per label column, prints a
        textual summary of the stored results and then produces the summary
        charts.

        Returns:
            self, so that calls can be chained.
        """
        # Define label columns
        label_cols = ['S_label', 'L_label']

        # Run analysis for each label column
        for label_col in label_cols:
            self.train_and_evaluate_model(label_col)

        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (SLB)")
        print("="*80)

        # Create a summary table for easier comparison
        summary_data = []

        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")

            # Print layer-specific results. Prefer the recorded layer number:
            # skipped layers leave no entry, so numbering by list position
            # would mislabel every layer after a skipped one.
            layer_results = results.get('layer_results')
            if layer_results:
                per_layer = [(entry['layer'], entry['accuracy'], entry['auc']) for entry in layer_results]
            else:
                per_layer = [
                    (position + 1, accuracy, auc)
                    for position, (accuracy, auc) in enumerate(
                        zip(results.get('accuracy', []), results.get('auc', [])))
                ]
            for layer_number, accuracy, auc in per_layer:
                print(f"  Layer {layer_number} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Label': label_col
            })

        # Create summary visualizations
        self.create_summary_visualization(summary_data)

        # Clean up memory
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """Create a summary visualization comparing model combinations."""
        if not summary_data:
            print("No data available for summary visualization")
            return
        
        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)
        
        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)
        
        # Performance comparison for MLP with merged embeddings
        plt.figure(figsize=(10, 6))
        
        # Plot accuracy and AUC bars
        x = np.arange(len(df))
        width = 0.35
        
        plt.bar(x - width/2, df['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, df['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with Merged OpenAI Embeddings')
        labels = [f"Merged + {row['Label']}" for _, row in df.iterrows()]
        plt.xticks(x, labels, rotation=0)
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(df['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(df['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(df['Avg Accuracy'].max(), df['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/SLB', "mlp_merged_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """Create visualizations showing performance across different layers for MLP models."""
        # Prepare data for visualization
        layer_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                layer_info = {
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {i+1}",
                    'Layer_num': i+1,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                }
                
                layer_data.append(layer_info)
        
        if not layer_data:
            print("No layer data available")
            return
        
        # Create DataFrame
        df = pd.DataFrame(layer_data)
        
        # Create visualization - Accuracy by layer
        plt.figure(figsize=(10, 8))
        
        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', data=df)
        plt.title('MLP Accuracy by Model Combination and Layer with Merged OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)
        plt.tight_layout()
        
        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', data=df)
        plt.title('MLP AUC by Model Combination and Layer with Merged OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)
        
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/SLB', "mlp_merged_layer_performance.png")
        plt.savefig(save_path)
        print(f"MLP layer performance visualization saved as: {save_path}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Make sure both output folders exist before anything is written
    output_dirs = (
        'Merged_OpenAI_MLP/visualizations_mlp/SLB',
        'Merged_OpenAI_MLP/visualizations_summary/SLB',
    )
    for directory in output_dirs:
        os.makedirs(directory, exist_ok=True)

    # Input file: SLB news articles with precomputed merged OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_SLB_completed_openai.csv'

    predictor = MergedEmbeddingMLPPredictor(csv_path)

    # Full pipeline: load -> define windows -> train/evaluate/visualize
    predictor.load_data()
    predictor.define_time_windows()
    predictor.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'Merged_OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing merged OpenAI embedding vectors from string format...
Sample Merged embedding shape: (1536,)
Loaded 929 climate change news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 472, 0: 457}
Class distribution for long-term prediction: {1: 494, 0: 435}

Training MLP model for Merged Embeddings and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_30"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_120 

 dense_124 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_93 (Ba  (None, 512)               2048      
 tchNormalization)                                               
                                                                 
 dropout_62 (Dropout)        (None, 512)               0         
                                                                 
 dense_125 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_94 (Ba  (None, 256)               1024      
 tchNormalization)                                               
                                                                 
 dropout_63 (Dropout)        (None, 256)               0         
                                                                 
 dense_126 (Dense)           (None, 128)               32896     
          

 dense_131 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69025, saving model to Merged_OpenAI_MLP/visualizations_mlp/SLB\best_mlp_merged_S_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.69025 to 0.68817, saving model to Merged_OpenAI_MLP/visualizations_mlp/SLB\best_mlp_merged_S_label_layer_3.weights.h5
Epoch 3/100
Epoch 3: val_loss did not improve from 0.68817
Epoch 4/100
Epoch 4: val_loss improved from 0.68817 to 0.68774, saving model to Merged_OpenAI_MLP/visualizations_mlp/SLB\best_mlp_merged_S_label_layer_3.weights.h5
Epoch 5/100
Epoch 5: val_loss improved from 0.68774 to 0.68707, saving model to Merged_OpenAI_MLP/visualizations_mlp/SLB\best_mlp_

 dropout_67 (Dropout)        (None, 256)               0         
                                                                 
 dense_134 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_101 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_135 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69863, saving model to Merged_OpenAI_MLP/visualizations_mlp/SLB\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.69

Epoch 2/100
Epoch 2: val_loss did not improve from 0.69897
Epoch 3/100
Epoch 3: val_loss did not improve from 0.69897
Epoch 4/100
Epoch 4: val_loss improved from 0.69897 to 0.69893, saving model to Merged_OpenAI_MLP/visualizations_mlp/SLB\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 5/100
Epoch 5: val_loss improved from 0.69893 to 0.69730, saving model to Merged_OpenAI_MLP/visualizations_mlp/SLB\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69730
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69730
Epoch 8/100
Epoch 8: val_loss improved from 0.69730 to 0.69700, saving model to Merged_OpenAI_MLP/visualizations_mlp/SLB\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69700
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69700
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69700
Epoch 12/100
Epoch 12: val_loss improved from 0.69700 to 0.69666, saving model to Merged_Op

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_140 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_105 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_70 (Dropout)        (None, 512)               0         
                                                                 
 dense_141 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_106 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_71 (Dropout)        (None, 256)               0         
          

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from xgboost import callback
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records training/validation accuracy and loss after each epoch (it does not draw plots itself)
class PlotLearningCallback(Callback):
    """Keras callback that keeps a running record of accuracy and loss.

    Nothing is plotted here; the callback only ensures ``plot_dir`` exists and
    appends each epoch's metrics to ``train_acc``, ``val_acc``, ``train_loss``
    and ``val_loss``.

    Args:
        model_type: Identifier of the model family being trained.
        text_col: Identifier of the text/embedding source being trained on.
        label_col: Name of the label column being predicted.
        layer_num: Number of the time-window layer being trained.
        plot_dir: Directory created on construction (if missing).
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        # Run identification
        self.model_type = model_type
        self.text_col = text_col
        self.label_col = label_col
        self.layer_num = layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # Metric series, one value per completed epoch
        self.train_acc = []
        self.val_acc = []
        self.train_loss = []
        self.val_loss = []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; anything Keras did not report is stored as 0."""
        epoch_logs = logs if logs is not None else {}

        # Accuracy may be reported as 'accuracy' or under the legacy name 'acc'
        if 'accuracy' in epoch_logs:
            train_accuracy = epoch_logs['accuracy']
        else:
            train_accuracy = epoch_logs.get('acc', 0)

        if 'val_accuracy' in epoch_logs:
            val_accuracy = epoch_logs['val_accuracy']
        else:
            val_accuracy = epoch_logs.get('val_acc', 0)

        self.train_acc.append(train_accuracy)
        self.val_acc.append(val_accuracy)
        self.train_loss.append(epoch_logs.get('loss', 0))
        self.val_loss.append(epoch_logs.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'OpenAI_MLP/visualizations_mlp/MPC'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('OpenAI_MLP/visualizations_summary/MPC', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert embeddings to numpy arrays
        self.data['Title_embedding'] = self.data['Title_embedding_vector'].apply(parse_embedding)
        self.data['Fulltext_embedding'] = self.data['Full_text_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_title_embedding = self.data['Title_embedding'].iloc[0]
        sample_fulltext_embedding = self.data['Fulltext_embedding'].iloc[0]
        
        print(f"Sample Title embedding shape: {sample_title_embedding.shape}")
        print(f"Sample Fulltext embedding shape: {sample_fulltext_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data[text_col].values)
        X_val = np.stack(val_data[text_col].values)
        X_test = np.stack(test_data[text_col].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier used for document-level embeddings.

        The network has three ReLU dense layers (512, 256, 128 units), each
        followed by batch normalization, with dropout of 0.3 after the first
        two, and a single sigmoid output unit. It is compiled with Adam
        (learning rate 0.00005), binary cross-entropy loss and accuracy.

        Args:
            input_shape: Shape of one input sample, e.g. (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        layer_stack = [
            # First block receives the raw embedding
            Dense(512, activation='relu', input_shape=input_shape),
            BatchNormalization(),
            Dropout(0.3),

            # Second block
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),

            # Third block (no dropout before the output)
            Dense(128, activation='relu'),
            BatchNormalization(),

            # Binary probability output
            Dense(1, activation='sigmoid'),
        ]
        network = Sequential(layer_stack)

        optimizer = Adam(learning_rate=0.00005)
        network.compile(
            optimizer=optimizer,
            loss='binary_crossentropy',
            metrics=['accuracy'],
        )

        print("MLP model created")
        return network
    
    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy, loss) of the finished training run.

        Curves are only drawn when both the training and the validation
        series are present in the history; the panels are still titled and
        saved otherwise.

        Args:
            history: Object whose ``history`` attribute maps metric names to per-epoch values
            model_type: Model type (MLP)
            text_col: Text column used; anything containing 'Title' is shown as 'Title', else 'Full text'
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        if 'Title' in text_col:
            display_text = 'Title'
        else:
            display_text = 'Full text'

        _figure, (acc_axis, loss_axis) = plt.subplots(1, 2, figsize=(15, 5))

        # An object without a .history attribute is treated as an empty run
        curves = getattr(history, 'history', {})

        # Keras has reported accuracy under both long and short key names
        train_acc_key = 'accuracy' if 'accuracy' in curves else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in curves else 'val_acc'

        def draw_pair(axis, train_key, val_key, quantity):
            # Skip drawing unless both series exist
            if train_key in curves and val_key in curves:
                axis.plot(curves[train_key], label=f'Training {quantity}', color='blue')
                axis.plot(curves[val_key], label=f'Validation {quantity}', color='orange')

        def finish_panel(axis, quantity, legend_corner, y_range=None):
            axis.set_title(f'Final {quantity} Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})')
            axis.set_xlabel('Epochs')
            axis.set_ylabel(quantity)
            axis.legend(loc=legend_corner)
            if y_range is not None:
                axis.set_ylim(y_range)
            axis.grid(True, linestyle='--', alpha=0.7)

        # Accuracy panel, fixed to the [0, 1] range
        draw_pair(acc_axis, train_acc_key, val_acc_key, 'Accuracy')
        finish_panel(acc_axis, 'Accuracy', 'lower right', y_range=[0, 1])

        # Loss panel, auto-scaled
        draw_pair(loss_axis, 'loss', 'val_loss', 'Loss')
        finish_panel(loss_axis, 'Loss', 'upper right')

        plt.tight_layout()
        filename = f"final_{model_type.lower()}_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.
        
        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP')
        """
        # Store results
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")
        
        visualization_dir = self.mlp_viz_dir if model_type == 'MLP' else self.xgb_viz_dir
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            if model_type == 'MLP':
                # MLP model training
                model = self.create_mlp_model((1536,))
                print(f"Created MLP model for {display_text} ({label_col})")
                model.summary()
                
                # Setup callbacks
                plot_callback = PlotLearningCallback(model_type, display_text, label_col, i+1, visualization_dir)
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=15,
                    restore_best_weights=True,
                    verbose=1
                )
                model_checkpoint = ModelCheckpoint(
                    filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{i+1}.weights.h5",
                    monitor='val_loss',
                    save_weights_only=True,
                    save_best_only=True,
                    verbose=1
                )
                
                # Train model
                print(f"Training MLP model...")
                batch_size = 32  # Larger batch size for embeddings
                history = model.fit(
                    X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                    batch_size=batch_size,
                    callbacks=[early_stopping, model_checkpoint, plot_callback],
                    verbose=1
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict(X_test)
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create final learning curve visualization
                self.plot_final_learning_curves(history, model_type, text_col, label_col, i+1, visualization_dir)
                
                # Store training history
                training_history = history.history
                
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': training_history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text inputs, label columns, and model types."""
        # Define all combinations
        embedding_cols = ['Title_embedding', 'Fulltext_embedding']
        label_cols = ['S_label', 'L_label']
        model_types = ['MLP']
        
        # Run analysis for each combination
        for model_type in model_types:
            for embedding_col in embedding_cols:
                for label_col in label_cols:
                    self.train_and_evaluate_model(embedding_col, label_col, model_type)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS （MPC)")
        print("="*80)
        
        # Create a summary table for easier comparison
        summary_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)
            
            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")
            
            # Print layer-specific results
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
            
            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            })
        
        # Create summary visualizations
        self.create_summary_visualization(summary_data)
        
        # Clean up memory
        gc.collect()
        
        return self
    
    def create_summary_visualization(self, summary_data):
        """Create a summary visualization comparing all model combinations."""
        if not summary_data:
            print("No data available for summary visualization")
            return
        
        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)
        
        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)
        
        # Split by model type
        mlp_data = df[df['Model'] == 'MLP']
        
        # 1. Performance comparison for MLP
        plt.figure(figsize=(12, 8))
        
        # Plot accuracy and AUC bars for MLP
        x = np.arange(len(mlp_data))
        width = 0.35
        
        plt.bar(x - width/2, mlp_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, mlp_data['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('MLP Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in mlp_data.iterrows()]
        plt.xticks(x, labels, rotation=0, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(mlp_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(mlp_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(mlp_data['Avg Accuracy'].max(), mlp_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP/visualizations_summary/MPC', "mlp_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """Create visualizations showing performance across different layers for MLP models."""
        # Prepare data for visualization
        mlp_layer_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                layer_info = {
                    'Model': model_type,
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {i+1}",
                    'Layer_num': i+1,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                }
                
                if model_type == 'MLP':
                    mlp_layer_data.append(layer_info)
        
        # Create visualizations for each model type
        self._create_model_layer_visualization(mlp_layer_data, 'MLP')
    
    def _create_model_layer_visualization(self, layer_data, model_type):
        """Create layer-specific visualizations for a given model type."""
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return
        
        # Create DataFrame
        df = pd.DataFrame(layer_data)
        
        # Create visualization - Accuracy by layer
        plt.figure(figsize=(14, 10))
        
        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', data=df)
        plt.title(f'{model_type} Accuracy by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)
        plt.tight_layout()
        
        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', data=df)
        plt.title(f'{model_type} AUC by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)
        
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('OpenAI_MLP/visualizations_summary/MPC', f"{model_type.lower()}_layer_performance.png")
        plt.savefig(save_path)
        print(f"{model_type} layer performance visualization saved as: {save_path}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Create the output folders up front so later saves cannot fail on a missing path
    for directory in ('OpenAI_MLP/visualizations_mlp/MPC',
                      'OpenAI_MLP/visualizations_summary/MPC'):
        os.makedirs(directory, exist_ok=True)

    # Location of the news dataset with precomputed OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_MPC_completed_openai.csv'

    predictor = ClimateNewsOpenAIPredictor(csv_path)

    # Full pipeline: load -> define windows -> train and summarize every combination
    predictor.load_data()
    predictor.define_time_windows()
    predictor.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 929 climate change news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 482, 0: 447}
Class distribution for long-term prediction: {1: 555, 0: 374}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_36"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 d

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_148 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_111 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_74 (Dropout)        (None, 512)               0         
                                                                 
 dense_149 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_112 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_75 (Dropout)        (None, 256)               0         
          

 batch_normalization_116 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_155 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.68974, saving model to OpenAI_MLP/visualizations_mlp/MPC\best_mlp_title_S_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.68974 to 0.68878, saving model to OpenAI_MLP/visualizations_mlp/MPC\best_mlp_title_S_label_layer_3.weights.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.68878 to 0.68875, saving model to OpenAI_MLP/visualizations_mlp/MPC\best_mlp_title_S_label_layer_3.weights.h5
Epoch 4/100
Epo

Epoch 26/100
Epoch 26: val_loss did not improve from 0.68061
Epoch 27/100
Epoch 27: val_loss did not improve from 0.68061
Epoch 28/100
Epoch 28: val_loss did not improve from 0.68061
Epoch 29/100
Epoch 29: val_loss did not improve from 0.68061
Epoch 30/100
Epoch 30: val_loss did not improve from 0.68061
Epoch 31/100
Epoch 31: val_loss did not improve from 0.68061
Epoch 32/100
Epoch 32: val_loss did not improve from 0.68061
Epoch 33/100

Epoch 33: val_loss did not improve from 0.68061
Epoch 33: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_3.png
Test Accuracy for Layer 3: 0.5072
Test AUC for Layer 3: 0.4588

Average Test Accuracy across all layers: 0.4645
Average Test AUC across all layers: 0.5216

Training MLP model for Title and L_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes 

Epoch 13/100
Epoch 13: val_loss did not improve from 0.72510
Epoch 14/100
Epoch 14: val_loss did not improve from 0.72510
Epoch 15/100
Epoch 15: val_loss did not improve from 0.72510
Epoch 16/100

Epoch 16: val_loss did not improve from 0.72510
Epoch 16: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_1.png
Test Accuracy for Layer 1: 0.1061
Test AUC for Layer 1: 0.6441

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Embeddings shapes - Train: (685, 1536), Val: (66, 1536), Test: (108, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (L_label)
Model: "sequential_40"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_160 (Dense)           (None, 512)               786944    

Epoch 16: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_2.png
Test Accuracy for Layer 2: 0.2685
Test AUC for Layer 2: 0.5260

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Embeddings shapes - Train: (751, 1536), Val: (108, 1536), Test: (69, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (L_label)
Model: "sequential_41"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_164 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_123 (B  (None, 512)               2048      
 atchNormalization)                                              
                                              

Epoch 19/100
Epoch 19: val_loss did not improve from 0.62460
Epoch 20/100
Epoch 20: val_loss did not improve from 0.62460
Epoch 21/100
Epoch 21: val_loss did not improve from 0.62460
Epoch 22/100
Epoch 22: val_loss did not improve from 0.62460
Epoch 23/100
Epoch 23: val_loss did not improve from 0.62460
Epoch 24/100
Epoch 24: val_loss did not improve from 0.62460
Epoch 25/100
Epoch 25: val_loss did not improve from 0.62460
Epoch 26/100
Epoch 26: val_loss did not improve from 0.62460
Epoch 27/100

Epoch 27: val_loss did not improve from 0.62460
Epoch 27: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_3.png
Test Accuracy for Layer 3: 0.5217
Test AUC for Layer 3: 0.4571

Average Test Accuracy across all layers: 0.2988
Average Test AUC across all layers: 0.5424

Training MLP model for Full text and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Vali

Epoch 11/100
Epoch 11: val_loss did not improve from 0.69425
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69425
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69425
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69425
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69425
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69425
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_1.png
Test Accuracy for Layer 1: 0.3939
Test AUC for Layer 1: 0.5538

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Embeddings shapes - Train: (685, 1536), Val: (66, 1536), Test: (108, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (S_label)
Model: "sequential_43"
_________________________________________________________________
 

Epoch 16/100

Epoch 16: val_loss did not improve from 0.70138
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_2.png
Test Accuracy for Layer 2: 0.4167
Test AUC for Layer 2: 0.4942

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Embeddings shapes - Train: (751, 1536), Val: (108, 1536), Test: (69, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (S_label)
Model: "sequential_44"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_176 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_132 (B  (None, 512)               2048      
 atchNormalization)                       

Epoch 18/100
Epoch 18: val_loss did not improve from 0.68188
Epoch 19/100
Epoch 19: val_loss did not improve from 0.68188
Epoch 20/100
Epoch 20: val_loss did not improve from 0.68188
Epoch 21/100
Epoch 21: val_loss did not improve from 0.68188
Epoch 22/100
Epoch 22: val_loss did not improve from 0.68188
Epoch 23/100

Epoch 23: val_loss did not improve from 0.68188
Epoch 23: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_3.png
Test Accuracy for Layer 3: 0.5072
Test AUC for Layer 3: 0.4664

Average Test Accuracy across all layers: 0.4393
Average Test AUC across all layers: 0.5048

Training MLP model for Full text and L_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model cr

Epoch 12/100
Epoch 12: val_loss improved from 0.68463 to 0.68367, saving model to OpenAI_MLP/visualizations_mlp/MPC\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 13/100
Epoch 13: val_loss did not improve from 0.68367
Epoch 14/100
Epoch 14: val_loss did not improve from 0.68367
Epoch 15/100
Epoch 15: val_loss did not improve from 0.68367
Epoch 16/100
Epoch 16: val_loss did not improve from 0.68367
Epoch 17/100
Epoch 17: val_loss did not improve from 0.68367
Epoch 18/100
Epoch 18: val_loss did not improve from 0.68367
Epoch 19/100
Epoch 19: val_loss did not improve from 0.68367
Epoch 20/100
Epoch 20: val_loss did not improve from 0.68367
Epoch 21/100
Epoch 21: val_loss did not improve from 0.68367
Epoch 22/100
Epoch 22: val_loss did not improve from 0.68367
Epoch 23/100
Epoch 23: val_loss did not improve from 0.68367
Epoch 24/100
Epoch 24: val_loss did not improve from 0.68367
Epoch 25/100
Epoch 25: val_loss did not improve from 0.68367
Epoch 26/100
Epoch 26: val_loss did not impro

Epoch 5/100
Epoch 5: val_loss did not improve from 0.80384
Epoch 6/100
Epoch 6: val_loss did not improve from 0.80384
Epoch 7/100
Epoch 7: val_loss did not improve from 0.80384
Epoch 8/100
Epoch 8: val_loss did not improve from 0.80384
Epoch 9/100
Epoch 9: val_loss did not improve from 0.80384
Epoch 10/100
Epoch 10: val_loss did not improve from 0.80384
Epoch 11/100
Epoch 11: val_loss did not improve from 0.80384
Epoch 12/100
Epoch 12: val_loss did not improve from 0.80384
Epoch 13/100
Epoch 13: val_loss did not improve from 0.80384
Epoch 14/100
Epoch 14: val_loss did not improve from 0.80384
Epoch 15/100
Epoch 15: val_loss did not improve from 0.80384
Epoch 16/100

Epoch 16: val_loss did not improve from 0.80384
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_2.png
Test Accuracy for Layer 2: 0.2685
Test AUC for Layer 2: 0.4644

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/

Epoch 8/100
Epoch 8: val_loss improved from 0.61595 to 0.60772, saving model to OpenAI_MLP/visualizations_mlp/MPC\best_mlp_full_text_L_label_layer_3.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.60772 to 0.60137, saving model to OpenAI_MLP/visualizations_mlp/MPC\best_mlp_full_text_L_label_layer_3.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.60137 to 0.59865, saving model to OpenAI_MLP/visualizations_mlp/MPC\best_mlp_full_text_L_label_layer_3.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.59865 to 0.59475, saving model to OpenAI_MLP/visualizations_mlp/MPC\best_mlp_full_text_L_label_layer_3.weights.h5
Epoch 12/100
Epoch 12: val_loss improved from 0.59475 to 0.58922, saving model to OpenAI_MLP/visualizations_mlp/MPC\best_mlp_full_text_L_label_layer_3.weights.h5
Epoch 13/100
Epoch 13: val_loss improved from 0.58922 to 0.58697, saving model to OpenAI_MLP/visualizations_mlp/MPC\best_mlp_full_text_L_label_layer_3.weights.h5
Epoch 14/100
Epoch 14: val_loss 

Epoch 33: val_loss did not improve from 0.57782
Epoch 34/100
Epoch 34: val_loss did not improve from 0.57782
Epoch 35/100
Epoch 35: val_loss did not improve from 0.57782
Epoch 36/100
Epoch 36: val_loss did not improve from 0.57782
Epoch 37/100
Epoch 37: val_loss did not improve from 0.57782
Epoch 38/100
Epoch 38: val_loss did not improve from 0.57782
Epoch 39/100
Epoch 39: val_loss did not improve from 0.57782
Epoch 40/100
Epoch 40: val_loss did not improve from 0.57782
Epoch 41/100
Epoch 41: val_loss did not improve from 0.57782
Epoch 42/100
Epoch 42: val_loss did not improve from 0.57782
Epoch 43/100
Epoch 43: val_loss did not improve from 0.57782
Epoch 44/100

Epoch 44: val_loss did not improve from 0.57782
Epoch 44: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_3.png
Test Accuracy for Layer 3: 0.5362
Test AUC for Layer 3: 0.4773

Average Test Accuracy across all layers: 0.5258
Average Test AUC across all layers: 0.4406

SUMMARY OF RESULTS （MPC)

Combinatio

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Keras callback that records accuracy and loss after every epoch
class PlotLearningCallback(Callback):
    """Accumulate per-epoch training/validation accuracy and loss.

    The constructor stores the identifying metadata it is given, makes sure
    ``plot_dir`` exists and starts four empty metric histories. The class
    itself only collects values; it does not draw any figure.
    """

    def __init__(self, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        # Metadata describing the run this callback is attached to
        self.text_col, self.label_col = text_col, label_col
        self.layer_num = layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One history list per metric; each grows by one entry per epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; a metric missing from ``logs`` is stored as 0."""
        metrics = {} if logs is None else logs

        # Both the long ('accuracy') and the short ('acc') metric names are accepted
        epoch_train_acc = metrics.get('accuracy', metrics.get('acc', 0))
        epoch_val_acc = metrics.get('val_accuracy', metrics.get('val_acc', 0))

        self.train_acc.append(epoch_train_acc)
        self.val_acc.append(epoch_val_acc)
        self.train_loss.append(metrics.get('loss', 0))
        self.val_loss.append(metrics.get('val_loss', 0))

class MergedEmbeddingMLPPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using merged OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with merged OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'Merged_OpenAI_MLP/visualizations_mlp/MPC'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('Merged_OpenAI_MLP/visualizations_summary/MPC', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with merged OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing merged OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert merged embeddings to numpy arrays
        self.data['Merged_embedding'] = self.data['Merged_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_merged_embedding = self.data['Merged_embedding'].iloc[0]
        print(f"Sample Merged embedding shape: {sample_merged_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data['Merged_embedding'].values)
        X_val = np.stack(val_data['Merged_embedding'].values)
        X_test = np.stack(test_data['Merged_embedding'].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile a feed-forward binary classifier for document-level embeddings.

        Layer order: Dense(512) -> BatchNorm -> Dropout(0.3) -> Dense(256) ->
        BatchNorm -> Dropout(0.3) -> Dense(128) -> BatchNorm -> Dense(1, sigmoid).
        The model is compiled with Adam (learning rate 0.00005), binary
        cross-entropy loss and the accuracy metric.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        # First block receives the embedding vector
        layer_stack = [
            Dense(512, activation='relu', input_shape=input_shape),
            BatchNormalization(),
            Dropout(0.3),
        ]

        # Second block, again regularised with dropout
        layer_stack += [
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),
        ]

        # Last hidden block has no dropout before the output
        layer_stack += [
            Dense(128, activation='relu'),
            BatchNormalization(),
        ]

        # Single sigmoid unit for the binary label
        layer_stack.append(Dense(1, activation='sigmoid'))

        model = Sequential(layer_stack)

        optimizer = Adam(learning_rate=0.00005)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return model
    
    def plot_final_learning_curves(self, history, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy left, loss right) for a finished MLP training run.

        Curves are drawn only when both the training and the validation series
        of a metric are present in the history; the axes are titled and
        decorated either way. The figure is written to
        ``<visualization_dir>/final_mlp_merged_<label_col>_layer_<layer_num>.png``
        and then closed.

        Args:
            history: Training history (object exposing a ``history`` dict)
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 5))

        # Fall back to an empty dict when the object carries no history
        curves = getattr(history, 'history', {})

        # Keras has used both long and short accuracy names
        train_acc_key = 'accuracy' if 'accuracy' in curves else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in curves else 'val_acc'
        subtitle = f'MLP with Merged Embeddings ({label_col}, Layer {layer_num})'

        # Left panel: accuracy
        if train_acc_key in curves and val_acc_key in curves:
            acc_ax.plot(curves[train_acc_key], label='Training Accuracy', color='blue')
            acc_ax.plot(curves[val_acc_key], label='Validation Accuracy', color='orange')
        acc_ax.set_title(f'Final Accuracy Curves: {subtitle}')
        acc_ax.set_xlabel('Epochs')
        acc_ax.set_ylabel('Accuracy')
        acc_ax.legend(loc='lower right')
        acc_ax.set_ylim([0, 1])
        acc_ax.grid(True, linestyle='--', alpha=0.7)

        # Right panel: loss
        if 'loss' in curves and 'val_loss' in curves:
            loss_ax.plot(curves['loss'], label='Training Loss', color='blue')
            loss_ax.plot(curves['val_loss'], label='Validation Loss', color='orange')
        loss_ax.set_title(f'Final Loss Curves: {subtitle}')
        loss_ax.set_xlabel('Epochs')
        loss_ax.set_ylabel('Loss')
        loss_ax.legend(loc='upper right')
        loss_ax.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()

        filename = f"final_mlp_merged_{label_col}_layer_{layer_num}.png"
        output_path = f"{visualization_dir}/{filename}"
        plt.savefig(output_path)
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, label_col):
        """
        Train and evaluate an MLP model for merged embeddings and a specific label column.
        
        Args:
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"MLP|Merged|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training MLP model for Merged Embeddings and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_mlp_model((1536,))
            print(f"Created MLP model for Merged Embeddings ({label_col})")
            model.summary()
            
            # Setup callbacks
            plot_callback = PlotLearningCallback('Merged', label_col, i+1, self.mlp_viz_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{self.mlp_viz_dir}/best_mlp_merged_{label_col}_layer_{i+1}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )
            
            # Train model
            print(f"Training MLP model...")
            batch_size = 32  # Larger batch size for embeddings
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )
            
            # Evaluate on test set
            y_pred_proba = model.predict(X_test)
            y_pred = (y_pred_proba > 0.5).astype(int)
            
            # Create final learning curve visualization
            self.plot_final_learning_curves(history, label_col, i+1, self.mlp_viz_dir)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """Train and evaluate the merged-embedding MLP for both label columns, then summarise.

        Runs ``train_and_evaluate_model`` for 'S_label' and 'L_label', prints
        the average and per-layer metrics of every combination stored in
        ``self.results``, and passes the averages to
        ``create_summary_visualization``.

        Per-layer lines are labelled with the layer number recorded at training
        time, so a layer that was skipped does not shift the numbering of the
        layers that follow it.

        Returns:
            self, to allow method chaining
        """
        # Run analysis for each label column
        for label_col in ('S_label', 'L_label'):
            self.train_and_evaluate_model(label_col)

        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MPC)")
        print("="*80)

        # Create a summary table for easier comparison
        summary_data = []

        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")

            # Prefer the stored per-layer records: they carry the real layer
            # number, whereas the flat metric lists only hold evaluated layers.
            layer_results = results.get('layer_results')
            if layer_results:
                per_layer = [
                    (entry['layer'], entry['accuracy'], entry['auc'])
                    for entry in layer_results
                ]
            else:
                per_layer = [
                    (position, accuracy, auc)
                    for position, (accuracy, auc) in enumerate(
                        zip(results.get('accuracy', []), results.get('auc', [])), start=1)
                ]

            # Print layer-specific results
            for layer_num, accuracy, auc in per_layer:
                print(f"  Layer {layer_num} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Label': label_col
            })

        # Create summary visualizations
        self.create_summary_visualization(summary_data)

        # Clean up memory
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """Create a summary visualization comparing model combinations."""
        if not summary_data:
            print("No data available for summary visualization")
            return
        
        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)
        
        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)
        
        # Performance comparison for MLP with merged embeddings
        plt.figure(figsize=(10, 6))
        
        # Plot accuracy and AUC bars
        x = np.arange(len(df))
        width = 0.35
        
        plt.bar(x - width/2, df['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, df['Avg AUC'], width, label='Average AUC', color='salmon')
        
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with Merged OpenAI Embeddings')
        labels = [f"Merged + {row['Label']}" for _, row in df.iterrows()]
        plt.xticks(x, labels, rotation=0)
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(df['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(df['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(df['Avg Accuracy'].max(), df['Avg AUC'].max()) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/MPC', "mlp_merged_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()
        
        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """Create visualizations showing performance across different layers for MLP models."""
        # Prepare data for visualization
        layer_data = []
        
        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                layer_info = {
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {i+1}",
                    'Layer_num': i+1,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                }
                
                layer_data.append(layer_info)
        
        if not layer_data:
            print("No layer data available")
            return
        
        # Create DataFrame
        df = pd.DataFrame(layer_data)
        
        # Create visualization - Accuracy by layer
        plt.figure(figsize=(10, 8))
        
        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', data=df)
        plt.title('MLP Accuracy by Model Combination and Layer with Merged OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)
        plt.tight_layout()
        
        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', data=df)
        plt.title('MLP AUC by Model Combination and Layer with Merged OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)
        
        plt.tight_layout()
        
        # Save the visualization
        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/MPC', "mlp_merged_layer_performance.png")
        plt.savefig(save_path)
        print(f"MLP layer performance visualization saved as: {save_path}")
        plt.close()


# Script entry point: run the full merged-embedding MLP analysis
if __name__ == "__main__":
    # Make sure the output directories exist before anything is written
    output_dirs = (
        'Merged_OpenAI_MLP/visualizations_mlp/MPC',
        'Merged_OpenAI_MLP/visualizations_summary/MPC',
    )
    for output_dir in output_dirs:
        os.makedirs(output_dir, exist_ok=True)

    # Input CSV holding the news articles and their merged OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_MPC_completed_openai.csv'

    predictor = MergedEmbeddingMLPPredictor(csv_path)

    # Load -> define the time windows -> train/evaluate every label column
    predictor.load_data()
    predictor.define_time_windows()
    predictor.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'Merged_OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing merged OpenAI embedding vectors from string format...
Sample Merged embedding shape: (1536,)
Loaded 929 climate change news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 482, 0: 447}
Class distribution for long-term prediction: {1: 555, 0: 374}

Training MLP model for Merged Embeddings and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_48"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_192 

 dense_196 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_147 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_98 (Dropout)        (None, 512)               0         
                                                                 
 dense_197 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_148 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_99 (Dropout)        (None, 256)               0         
                                                                 
 dense_198 (Dense)           (None, 128)               32896     
          

                                                                 
 dense_202 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_152 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_203 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.71687, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_S_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.71687
Epoch 3/100
Epoch 3: val_loss did not improve from 0.71687
Epo

Epoch 2/100
Epoch 2: val_loss did not improve from 0.70022
Epoch 3/100
Epoch 3: val_loss did not improve from 0.70022
Epoch 4/100
Epoch 4: val_loss did not improve from 0.70022
Epoch 5/100
Epoch 5: val_loss did not improve from 0.70022
Epoch 6/100
Epoch 6: val_loss did not improve from 0.70022
Epoch 7/100
Epoch 7: val_loss did not improve from 0.70022
Epoch 8/100
Epoch 8: val_loss did not improve from 0.70022
Epoch 9/100
Epoch 9: val_loss did not improve from 0.70022
Epoch 10/100
Epoch 10: val_loss did not improve from 0.70022
Epoch 11/100
Epoch 11: val_loss did not improve from 0.70022
Epoch 12/100
Epoch 12: val_loss did not improve from 0.70022
Epoch 13/100
Epoch 13: val_loss did not improve from 0.70022
Epoch 14/100
Epoch 14: val_loss did not improve from 0.70022
Epoch 15/100
Epoch 15: val_loss did not improve from 0.70022
Epoch 16/100

Epoch 16: val_loss did not improve from 0.70022
Epoch 16: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_1.png
Test Accuracy f

Epoch 5/100
Epoch 5: val_loss improved from 0.61180 to 0.59420, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 6/100
Epoch 6: val_loss improved from 0.59420 to 0.58512, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 7/100
Epoch 7: val_loss improved from 0.58512 to 0.57156, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.57156 to 0.56173, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.56173 to 0.55085, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.55085 to 0.54421, saving model to Merged_OpenAI_MLP/visualizations_mlp/MPC\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 11/100
E

Epoch 30/100
Epoch 30: val_loss did not improve from 0.52487
Epoch 31/100
Epoch 31: val_loss did not improve from 0.52487
Epoch 32/100
Epoch 32: val_loss did not improve from 0.52487
Epoch 33/100
Epoch 33: val_loss did not improve from 0.52487
Epoch 34/100
Epoch 34: val_loss did not improve from 0.52487
Epoch 35/100
Epoch 35: val_loss did not improve from 0.52487
Epoch 36/100
Epoch 36: val_loss did not improve from 0.52487
Epoch 37/100

Epoch 37: val_loss did not improve from 0.52487
Epoch 37: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_2.png
Test Accuracy for Layer 2: 0.7037
Test AUC for Layer 2: 0.6124

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Embeddings shapes - Train: (751, 1536), Val: (108, 1536), Test: (69, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP m

Epoch 14/100
Epoch 14: val_loss did not improve from 0.70554
Epoch 15/100
Epoch 15: val_loss did not improve from 0.70554
Epoch 16/100

Epoch 16: val_loss did not improve from 0.70554
Epoch 16: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_3.png
Test Accuracy for Layer 3: 0.4783
Test AUC for Layer 3: 0.4882

Average Test Accuracy across all layers: 0.4293
Average Test AUC across all layers: 0.4742

SUMMARY OF RESULTS (MPC)

Combination: MLP with Merged + S_label
Average Accuracy: 0.5607
Average AUC: 0.5154
  Layer 1 - Accuracy: 0.6061, AUC: 0.4606
  Layer 2 - Accuracy: 0.5833, AUC: 0.5563
  Layer 3 - Accuracy: 0.4928, AUC: 0.5294

Combination: MLP with Merged + L_label
Average Accuracy: 0.4293
Average AUC: 0.4742
  Layer 1 - Accuracy: 0.1061, AUC: 0.3220
  Layer 2 - Accuracy: 0.7037, AUC: 0.6124
  Layer 3 - Accuracy: 0.4783, AUC: 0.4882
MLP summary comparison visualization saved as: Merged_OpenAI_MLP/visualizations_summary/MPC\mlp_merged_performance_comparison.pn

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from xgboost import callback
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records per-epoch training/validation accuracy and loss
class PlotLearningCallback(Callback):
    """Keras callback that accumulates accuracy and loss values epoch by epoch.

    The constructor stores the identifying labels it is given, makes sure
    ``plot_dir`` exists, and starts four empty metric lists. At the end of
    every epoch one value is appended to each list; no figure is drawn by
    this class itself.

    Args:
        model_type: Label of the model being trained (stored as-is).
        text_col: Label of the text/embedding input (stored as-is).
        label_col: Name of the target column (stored as-is).
        layer_num: Number of the time-window layer (stored as-is).
        plot_dir: Directory that is created if it does not exist yet.
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        # The output directory is created eagerly, before training begins
        os.makedirs(plot_dir, exist_ok=True)
        self.plot_dir = plot_dir

        # Identification of the run this callback is attached to
        self.model_type, self.text_col = model_type, text_col
        self.label_col, self.layer_num = label_col, layer_num

        # One entry per finished epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; values absent from ``logs`` are recorded as 0."""
        metrics = {} if logs is None else logs

        # Accuracy may be reported under either the long or the short key
        epoch_acc = metrics.get('accuracy', metrics.get('acc', 0))
        epoch_val_acc = metrics.get('val_accuracy', metrics.get('val_acc', 0))

        self.train_acc.append(epoch_acc)
        self.val_acc.append(epoch_val_acc)
        self.train_loss.append(metrics.get('loss', 0))
        self.val_loss.append(metrics.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    """Train and evaluate MLP classifiers on OpenAI embeddings of climate change news.

    The intended workflow is
    ``load_data() -> define_time_windows() -> run_all_combinations()``;
    each of these methods returns ``self`` so the calls can be chained.

    Attributes:
        csv_path: Path of the input CSV file.
        data: DataFrame filled by ``load_data`` (``None`` until then).
        layers: List of time-window dicts filled by ``define_time_windows``.
        results: Metrics per combination, keyed by ``"<model>|<text>|<label>"``.
        mlp_viz_dir: Directory for learning-curve plots and checkpoint weights.
        summary_viz_dir: Directory for the summary comparison plots.
    """

    # Length of the all-zero placeholder used when an embedding is missing or unparsable
    EMBEDDING_DIM = 1536

    # Output locations (kept in one place so every plot uses the same folders)
    MLP_VIZ_DIR = 'OpenAI_MLP/visualizations_mlp/CVX'
    SUMMARY_VIZ_DIR = 'OpenAI_MLP/visualizations_summary/CVX'

    # Columns of the CSV that hold dates
    DATE_COLUMNS = ('Publication date', 'Predicting date Short', 'Predicting date Long')

    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.

        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}

        # Create directories for visualizations
        self.mlp_viz_dir = self.MLP_VIZ_DIR
        self.summary_viz_dir = self.SUMMARY_VIZ_DIR
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs(self.summary_viz_dir, exist_ok=True)

    @staticmethod
    def _parse_date_column(column):
        """Parse a Series of date values one by one.

        Each value is tried as DD/MM/YYYY, then as YYYY-MM-DD, and finally
        handed to pandas' own inference with ``dayfirst=True``.

        Args:
            column: pandas Series of raw date values.

        Returns:
            Series of parsed timestamps carrying the same index as ``column``,
            so assigning it back to the frame cannot misalign rows.
        """
        parsed = []
        for raw in column:
            try:
                # Try to parse as DD/MM/YYYY
                value = pd.to_datetime(raw, format='%d/%m/%Y')
            except ValueError:
                try:
                    # Try to parse as YYYY-MM-DD
                    value = pd.to_datetime(raw, format='%Y-%m-%d')
                except ValueError:
                    # As a last resort, let pandas guess with dayfirst=True
                    value = pd.to_datetime(raw, dayfirst=True)
            parsed.append(value)
        return pd.Series(parsed, index=column.index)

    @classmethod
    def _parse_embedding(cls, embedding_str):
        """Convert the string form of an embedding into a float32 numpy array.

        Args:
            embedding_str: Text such as ``"[0.1, 0.2, ...]"``, or a missing value.

        Returns:
            float32 array with the parsed values. A missing or unparsable input
            yields an all-zero float32 vector of length ``EMBEDDING_DIM`` so that
            every fallback has the same dtype as the parsed vectors.
        """
        if pd.isna(embedding_str):
            # Return zeros array if embedding is missing
            return np.zeros(cls.EMBEDDING_DIM, dtype=np.float32)
        try:
            # literal_eval only accepts Python literals, so the cell content is never executed
            return np.array(ast.literal_eval(embedding_str), dtype=np.float32)
        except (ValueError, SyntaxError, TypeError):
            # str() keeps the message itself from failing on a non-string cell
            print(f"Error parsing embedding: {str(embedding_str)[:50]}...")
            return np.zeros(cls.EMBEDDING_DIM, dtype=np.float32)

    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with OpenAI embeddings.

        Reads ``self.csv_path``, converts the three date columns, sorts the rows
        by publication date and adds the parsed ``Title_embedding`` and
        ``Fulltext_embedding`` columns.

        Returns:
            self, to allow method chaining.
        """
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)

        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            converted = {
                col: pd.to_datetime(self.data[col], dayfirst=True)
                for col in self.DATE_COLUMNS
            }
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            converted = {
                col: self._parse_date_column(self.data[col])
                for col in self.DATE_COLUMNS
            }
        for col, values in converted.items():
            self.data[col] = values

        # Sort by publication date
        self.data = self.data.sort_values('Publication date')

        # Parse embedding vectors from string format to numpy arrays
        print("Parsing OpenAI embedding vectors from string format...")
        self.data['Title_embedding'] = self.data['Title_embedding_vector'].apply(self._parse_embedding)
        self.data['Fulltext_embedding'] = self.data['Full_text_embedding_vector'].apply(self._parse_embedding)

        # Report the dimensions of the first row as a quick sanity check
        sample_title_embedding = self.data['Title_embedding'].iloc[0]
        sample_fulltext_embedding = self.data['Fulltext_embedding'].iloc[0]

        print(f"Sample Title embedding shape: {sample_title_embedding.shape}")
        print(f"Sample Fulltext embedding shape: {sample_fulltext_embedding.shape}")

        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")

        return self

    def define_time_windows(self):
        """Define the three train/validation/test time windows ("layers").

        Every layer trains from 2019-01-01; the training end, validation window
        and test window move forward from one layer to the next.

        Returns:
            self, to allow method chaining.
        """
        keys = ('train_start', 'train_end', 'val_start', 'val_end', 'test_start', 'test_end')
        bounds_per_layer = [
            # Layer 1: train to May 2021, validate Jun-Dec 2021, test Jan-May 2022
            ('2019-01-01', '2021-05-31', '2021-06-01', '2021-12-31', '2022-01-01', '2022-05-31'),
            # Layer 2: train to Dec 2021, validate Jan-May 2022, test Jun-Dec 2022
            ('2019-01-01', '2021-12-31', '2022-01-01', '2022-05-31', '2022-06-01', '2022-12-31'),
            # Layer 3: train to May 2022, validate Jun-Dec 2022, test Jan-May 2023
            ('2019-01-01', '2022-05-31', '2022-06-01', '2022-12-31', '2023-01-01', '2023-05-31'),
        ]

        self.layers = [
            {key: pd.Timestamp(day) for key, day in zip(keys, bounds)}
            for bounds in bounds_per_layer
        ]
        return self

    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.

        Both ends of every window are inclusive, so an article published on the
        first day of the validation or test window belongs to that window.

        Args:
            layer: The time window layer
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')

        Returns:
            (X_train, y_train), (X_val, y_val), (X_test, y_test),
            train_data, val_data, test_data
        """
        published = self.data['Publication date']

        def rows_in_window(prefix):
            start, end = layer[f'{prefix}_start'], layer[f'{prefix}_end']
            return self.data[(published >= start) & (published <= end)]

        def stacked_embeddings(frame):
            # np.stack rejects an empty sequence; return an empty matrix instead so the
            # caller's "insufficient data" check can skip the layer cleanly
            if frame.empty:
                return np.empty((0, self.EMBEDDING_DIM), dtype=np.float32)
            return np.stack(frame[text_col].values)

        train_data = rows_in_window('train')
        val_data = rows_in_window('val')
        test_data = rows_in_window('test')

        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")

        # Extract embeddings
        X_train = stacked_embeddings(train_data)
        X_val = stacked_embeddings(val_data)
        X_test = stacked_embeddings(test_data)

        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values

        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data

    def create_mlp_model(self, input_shape):
        """
        Create an MLP with dense layers (512, 256, 128) and a sigmoid output unit.

        Batch normalization follows each hidden dense layer; dropout (0.3)
        follows the first two. The model is compiled with Adam
        (learning rate 0.00005), binary cross-entropy and the accuracy metric.

        Args:
            input_shape: Shape of one input sample, e.g. (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        # Create a feed-forward neural network
        model = Sequential([
            # Input layer
            Dense(512, activation='relu', input_shape=input_shape),
            BatchNormalization(),
            Dropout(0.3),

            # Hidden layers
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),
            Dense(128, activation='relu'),
            BatchNormalization(),

            # Output layer
            Dense(1, activation='sigmoid')
        ])

        model.compile(
            optimizer=Adam(learning_rate=0.00005),
            loss='binary_crossentropy',
            metrics=['accuracy']
        )

        print("MLP model created")
        return model

    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Plot final learning curves after training is complete for MLP.

        Draws accuracy (left) and loss (right) for training and validation and
        saves the figure as a PNG inside ``visualization_dir``.

        Args:
            history: Training history (object exposing a ``history`` dict)
            model_type: Model type (MLP)
            text_col: Text column used
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        display_text = 'Title' if 'Title' in text_col else 'Full text'

        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

        # Get history dictionary safely
        history_dict = {}
        if hasattr(history, 'history'):
            history_dict = history.history

        # Determine the correct metric names
        acc_metric = 'accuracy' if 'accuracy' in history_dict else 'acc'
        val_acc_metric = 'val_accuracy' if 'val_accuracy' in history_dict else 'val_acc'

        # Plot accuracy
        if acc_metric in history_dict and val_acc_metric in history_dict:
            ax1.plot(history_dict[acc_metric], label='Training Accuracy', color='blue')
            ax1.plot(history_dict[val_acc_metric], label='Validation Accuracy', color='orange')
        ax1.set_title(f'Final Accuracy Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})')
        ax1.set_xlabel('Epochs')
        ax1.set_ylabel('Accuracy')
        ax1.legend(loc='lower right')
        ax1.set_ylim([0, 1])
        ax1.grid(True, linestyle='--', alpha=0.7)

        # Plot loss
        if 'loss' in history_dict and 'val_loss' in history_dict:
            ax2.plot(history_dict['loss'], label='Training Loss', color='blue')
            ax2.plot(history_dict['val_loss'], label='Validation Loss', color='orange')
        ax2.set_title(f'Final Loss Curves: {model_type} with {display_text} ({label_col}, Layer {layer_num})')
        ax2.set_xlabel('Epochs')
        ax2.set_ylabel('Loss')
        ax2.legend(loc='upper right')
        ax2.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()
        filename = f"final_{model_type.lower()}_{display_text.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()

    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.

        One model is trained per time-window layer. Test accuracy and AUC are
        stored in ``self.results`` under ``"<model_type>|<display text>|<label_col>"``,
        together with per-layer entries that record the real layer number.

        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP')

        Returns:
            self, to allow method chaining.

        Raises:
            ValueError: If ``model_type`` is anything other than 'MLP'.
        """
        # Only the MLP branch is implemented; fail early with a clear message instead of
        # hitting an undefined attribute or unbound prediction variables later on
        if model_type != 'MLP':
            raise ValueError(f"Unsupported model type: {model_type!r} (only 'MLP' is implemented)")

        # Store results
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        combo_results = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        self.results[combination_key] = combo_results

        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")

        visualization_dir = self.mlp_viz_dir

        for layer_num, layer in enumerate(self.layers, start=1):
            print(f"\nLayer {layer_num}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")

            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, text_col, label_col)

            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {layer_num} due to insufficient data or class imbalance")
                continue

            # Size the input from the data itself rather than assuming a fixed width
            model = self.create_mlp_model(X_train.shape[1:])
            print(f"Created MLP model for {display_text} ({label_col})")
            model.summary()

            # Setup callbacks
            plot_callback = PlotLearningCallback(model_type, display_text, label_col, layer_num, visualization_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{layer_num}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )

            # Train model
            print(f"Training MLP model...")
            batch_size = 32  # Larger batch size for embeddings
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Upper bound only; early stopping normally ends training sooner
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )

            # Evaluate on test set; flatten the (n, 1) output so the metrics get 1-D arrays
            y_pred_proba = model.predict(X_test).ravel()
            y_pred = (y_pred_proba > 0.5).astype(int)

            # Create final learning curve visualization
            self.plot_final_learning_curves(history, model_type, text_col, label_col, layer_num, visualization_dir)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)

            combo_results['accuracy'].append(accuracy)
            combo_results['auc'].append(auc)

            print(f"Test Accuracy for Layer {layer_num}: {accuracy:.4f}")
            print(f"Test AUC for Layer {layer_num}: {auc:.4f}")

            # Store layer results for visualization
            combo_results['layer_results'].append({
                'layer': layer_num,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            })

        # Calculate average metrics (0 when every layer was skipped)
        avg_accuracy = np.mean(combo_results['accuracy']) if combo_results['accuracy'] else 0
        avg_auc = np.mean(combo_results['auc']) if combo_results['auc'] else 0

        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")

        combo_results['avg_accuracy'] = avg_accuracy
        combo_results['avg_auc'] = avg_auc

        return self

    def run_all_combinations(self):
        """Run the analysis for all combinations of text inputs, label columns, and model types.

        Trains every (embedding column, label column) pair with the MLP, prints
        a textual summary and writes the summary figures.

        Returns:
            self, to allow method chaining.
        """
        # Define all combinations
        embedding_cols = ['Title_embedding', 'Fulltext_embedding']
        label_cols = ['S_label', 'L_label']
        model_types = ['MLP']

        # Run analysis for each combination
        for model_type in model_types:
            for embedding_col in embedding_cols:
                for label_col in label_cols:
                    self.train_and_evaluate_model(embedding_col, label_col, model_type)

        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS （CVX)")
        print("="*80)

        # Create a summary table for easier comparison
        summary_data = []

        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")

            # Print layer-specific results using the recorded layer number, so a
            # skipped layer does not shift the numbering of the ones after it
            for layer_result in results.get('layer_results', []):
                print(f"  Layer {layer_result['layer']} - Accuracy: {layer_result['accuracy']:.4f}, "
                      f"AUC: {layer_result['auc']:.4f}")

            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            })

        # Create summary visualizations
        self.create_summary_visualization(summary_data)

        # Clean up memory
        gc.collect()

        return self

    def create_summary_visualization(self, summary_data):
        """Create a summary visualization comparing all model combinations.

        Args:
            summary_data: List of dicts with the keys 'Avg Accuracy', 'Avg AUC',
                'Model', 'Text' and 'Label' (as built by ``run_all_combinations``).
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)

        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)

        # Split by model type
        mlp_data = df[df['Model'] == 'MLP']

        # 1. Performance comparison for MLP
        plt.figure(figsize=(12, 8))

        # Plot accuracy and AUC bars for MLP
        x = np.arange(len(mlp_data))
        width = 0.35

        plt.bar(x - width/2, mlp_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, mlp_data['Avg AUC'], width, label='Average AUC', color='salmon')

        plt.xlabel('MLP Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with OpenAI Embeddings')
        labels = [f"{row['Text']} + {row['Label']}" for _, row in mlp_data.iterrows()]
        # Unrotated labels are centred under their bar pair
        plt.xticks(x, labels, rotation=0, ha='center')
        plt.legend()

        # Add value labels on top of bars
        for i, v in enumerate(mlp_data['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)

        for i, v in enumerate(mlp_data['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)

        plt.ylim(0, max(mlp_data['Avg Accuracy'].max(), mlp_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()

        # Save the visualization
        save_path = os.path.join(self.summary_viz_dir, "mlp_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()

        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()

    def create_layer_performance_visualizations(self):
        """Create visualizations showing performance across different layers for MLP models."""
        # Prepare data for visualization
        mlp_layer_data = []

        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            if model_type != 'MLP':
                continue

            # Use the stored layer number rather than the list position
            for layer_result in results.get('layer_results', []):
                layer_num = layer_result['layer']
                mlp_layer_data.append({
                    'Model': model_type,
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {layer_num}",
                    'Layer_num': layer_num,
                    'Accuracy': layer_result['accuracy'],
                    'AUC': layer_result['auc'],
                    'Combination': f"{text_col} + {label_col}"
                })

        # Create visualizations for each model type
        self._create_model_layer_visualization(mlp_layer_data, 'MLP')

    def _create_model_layer_visualization(self, layer_data, model_type):
        """Create layer-specific visualizations for a given model type.

        Args:
            layer_data: List of per-layer dicts with at least the keys
                'Combination', 'Layer', 'Accuracy' and 'AUC'.
            model_type: Model label used in the titles and in the file name.
        """
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return

        # Create DataFrame
        df = pd.DataFrame(layer_data)

        plt.figure(figsize=(14, 10))

        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', data=df)
        plt.title(f'{model_type} Accuracy by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)

        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', data=df)
        plt.title(f'{model_type} AUC by Model Combination and Layer with OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)

        # Lay out once, after both panels exist
        plt.tight_layout()

        # Save the visualization
        save_path = os.path.join(self.summary_viz_dir, f"{model_type.lower()}_layer_performance.png")
        plt.savefig(save_path)
        print(f"{model_type} layer performance visualization saved as: {save_path}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Output folders used by the run; created up front if they are missing
    output_dirs = (
        'OpenAI_MLP/visualizations_mlp/CVX',
        'OpenAI_MLP/visualizations_summary/CVX',
    )
    for directory in output_dirs:
        os.makedirs(directory, exist_ok=True)

    # Input CSV for the CVX run (absolute path on the author's machine)
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_CVX_completed_openai.csv'

    # Build the predictor on top of the CSV
    predictor = ClimateNewsOpenAIPredictor(csv_path)

    # Full pipeline, one step per line: load, define windows, train/evaluate everything
    predictor = predictor.load_data()
    predictor = predictor.define_time_windows()
    predictor = predictor.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 929 climate change news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 499, 0: 430}
Class distribution for long-term prediction: {1: 521, 0: 408}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_54"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 d

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_220 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_165 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_110 (Dropout)       (None, 512)               0         
                                                                 
 dense_221 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_166 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_111 (Dropout)       (None, 256)               0         
          

Epoch 22/100
Epoch 22: val_loss did not improve from 0.67482
Epoch 23/100
Epoch 23: val_loss did not improve from 0.67482
Epoch 24/100
Epoch 24: val_loss did not improve from 0.67482
Epoch 25/100
Epoch 25: val_loss did not improve from 0.67482
Epoch 26/100
Epoch 26: val_loss did not improve from 0.67482
Epoch 27/100
Epoch 27: val_loss did not improve from 0.67482
Epoch 28/100
Epoch 28: val_loss did not improve from 0.67482
Epoch 29/100
Epoch 29: val_loss did not improve from 0.67482
Epoch 30/100
Epoch 30: val_loss did not improve from 0.67482
Epoch 31/100
Epoch 31: val_loss did not improve from 0.67482
Epoch 32/100
Epoch 32: val_loss did not improve from 0.67482
Epoch 33/100
Epoch 33: val_loss did not improve from 0.67482
Epoch 34/100

Epoch 34: val_loss did not improve from 0.67482
Epoch 34: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_2.png
Test Accuracy for Layer 2: 0.5000
Test AUC for Layer 2: 0.4243

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validati

Epoch 7/100
Epoch 7: val_loss did not improve from 0.67472
Epoch 8/100
Epoch 8: val_loss did not improve from 0.67472
Epoch 9/100
Epoch 9: val_loss did not improve from 0.67472
Epoch 10/100
Epoch 10: val_loss did not improve from 0.67472
Epoch 11/100
Epoch 11: val_loss did not improve from 0.67472
Epoch 12/100
Epoch 12: val_loss did not improve from 0.67472
Epoch 13/100
Epoch 13: val_loss did not improve from 0.67472
Epoch 14/100
Epoch 14: val_loss did not improve from 0.67472
Epoch 15/100
Epoch 15: val_loss did not improve from 0.67472
Epoch 16/100
Epoch 16: val_loss did not improve from 0.67472
Epoch 17/100
Epoch 17: val_loss did not improve from 0.67472
Epoch 18/100
Epoch 18: val_loss did not improve from 0.67472
Epoch 19/100

Epoch 19: val_loss did not improve from 0.67472
Epoch 19: early stopping
Saved learning curves: final_mlp_Title_S_label_layer_3.png
Test Accuracy for Layer 3: 0.4638
Test AUC for Layer 3: 0.5414

Average Test Accuracy across all layers: 0.5384
Average Test AUC

Epoch 6/100
Epoch 6: val_loss improved from 0.66234 to 0.66172, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_title_L_label_layer_1.weights.h5
Epoch 7/100
Epoch 7: val_loss did not improve from 0.66172
Epoch 8/100
Epoch 8: val_loss did not improve from 0.66172
Epoch 9/100
Epoch 9: val_loss did not improve from 0.66172
Epoch 10/100
Epoch 10: val_loss did not improve from 0.66172
Epoch 11/100
Epoch 11: val_loss did not improve from 0.66172
Epoch 12/100
Epoch 12: val_loss did not improve from 0.66172
Epoch 13/100
Epoch 13: val_loss did not improve from 0.66172
Epoch 14/100
Epoch 14: val_loss did not improve from 0.66172
Epoch 15/100
Epoch 15: val_loss did not improve from 0.66172
Epoch 16/100
Epoch 16: val_loss did not improve from 0.66172
Epoch 17/100
Epoch 17: val_loss did not improve from 0.66172
Epoch 18/100
Epoch 18: val_loss did not improve from 0.66172
Epoch 19/100
Epoch 19: val_loss did not improve from 0.66172
Epoch 20/100
Epoch 20: val_loss did not improve from 0.66

Epoch 5: val_loss did not improve from 0.74419
Epoch 6/100
Epoch 6: val_loss did not improve from 0.74419
Epoch 7/100
Epoch 7: val_loss did not improve from 0.74419
Epoch 8/100
Epoch 8: val_loss did not improve from 0.74419
Epoch 9/100
Epoch 9: val_loss did not improve from 0.74419
Epoch 10/100
Epoch 10: val_loss did not improve from 0.74419
Epoch 11/100
Epoch 11: val_loss did not improve from 0.74419
Epoch 12/100
Epoch 12: val_loss did not improve from 0.74419
Epoch 13/100
Epoch 13: val_loss did not improve from 0.74419
Epoch 14/100
Epoch 14: val_loss did not improve from 0.74419
Epoch 15/100
Epoch 15: val_loss did not improve from 0.74419
Epoch 16/100

Epoch 16: val_loss did not improve from 0.74419
Epoch 16: early stopping
Saved learning curves: final_mlp_Title_L_label_layer_2.png
Test Accuracy for Layer 2: 0.3704
Test AUC for Layer 2: 0.5055

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training

Epoch 8: val_loss did not improve from 0.66850
Epoch 9/100
Epoch 9: val_loss did not improve from 0.66850
Epoch 10/100
Epoch 10: val_loss did not improve from 0.66850
Epoch 11/100
Epoch 11: val_loss did not improve from 0.66850
Epoch 12/100
Epoch 12: val_loss did not improve from 0.66850
Epoch 13/100
Epoch 13: val_loss did not improve from 0.66850
Epoch 14/100
Epoch 14: val_loss did not improve from 0.66850
Epoch 15/100
Epoch 15: val_loss did not improve from 0.66850
Epoch 16/100
Epoch 16: val_loss did not improve from 0.66850
Epoch 17/100
Epoch 17: val_loss did not improve from 0.66850
Epoch 18/100
Epoch 18: val_loss did not improve from 0.66850
Epoch 19/100
Epoch 19: val_loss did not improve from 0.66850
Epoch 20/100
Epoch 20: val_loss improved from 0.66850 to 0.66354, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_title_L_label_layer_3.weights.h5
Epoch 21/100
Epoch 21: val_loss did not improve from 0.66354
Epoch 22/100
Epoch 22: val_loss did not improve from 0.66354
Epoc

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_240 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_180 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_120 (Dropout)       (None, 512)               0         
                                                                 
 dense_241 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_181 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_121 (Dropout)       (None, 256)               0         
          

 batch_normalization_185 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_247 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.74706, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_full_text_S_label_layer_2.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.74706
Epoch 3/100
Epoch 3: val_loss did not improve from 0.74706
Epoch 4/100
Epoch 4: val_loss did not improve from 0.74706
Epoch 5/100
Epoch 5: val_loss did not improve from 0.74706
Epoch 6/100
Epoch 6: val_loss did not improve from 0.74706
Epoch 7/100
Epoch 7: val_los

Epoch 3/100
Epoch 3: val_loss improved from 0.68347 to 0.68090, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_full_text_S_label_layer_3.weights.h5
Epoch 4/100
Epoch 4: val_loss improved from 0.68090 to 0.67937, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_full_text_S_label_layer_3.weights.h5
Epoch 5/100
Epoch 5: val_loss improved from 0.67937 to 0.67801, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_full_text_S_label_layer_3.weights.h5
Epoch 6/100
Epoch 6: val_loss improved from 0.67801 to 0.67736, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_full_text_S_label_layer_3.weights.h5
Epoch 7/100
Epoch 7: val_loss did not improve from 0.67736
Epoch 8/100
Epoch 8: val_loss did not improve from 0.67736
Epoch 9/100
Epoch 9: val_loss did not improve from 0.67736
Epoch 10/100
Epoch 10: val_loss did not improve from 0.67736
Epoch 11/100
Epoch 11: val_loss did not improve from 0.67736
Epoch 12/100
Epoch 12: val_loss did not improve from 0.67736


Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69021, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.69021 to 0.68542, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.68542 to 0.68372, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 4/100
Epoch 4: val_loss improved from 0.68372 to 0.68216, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 5/100
Epoch 5: val_loss improved from 0.68216 to 0.68031, saving model to OpenAI_MLP/visualizations_mlp/CVX\best_mlp_full_text_L_label_layer_1.weights.h5
Epoch 6/100
E

Epoch 26: val_loss did not improve from 0.67537
Epoch 27/100
Epoch 27: val_loss did not improve from 0.67537
Epoch 28/100

Epoch 28: val_loss did not improve from 0.67537
Epoch 28: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_1.png
Test Accuracy for Layer 1: 0.7576
Test AUC for Layer 1: 0.4533

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Embeddings shapes - Train: (685, 1536), Val: (66, 1536), Test: (108, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (L_label)
Model: "sequential_64"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_256 (Dense)           (None, 512)               786944    
                                                                 

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_260 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_195 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_130 (Dropout)       (None, 512)               0         
                                                                 
 dense_261 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_196 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_131 (Dropout)       (None, 256)               0         
          

Epoch 20/100
Epoch 20: val_loss did not improve from 0.65137
Epoch 21/100
Epoch 21: val_loss did not improve from 0.65137
Epoch 22/100
Epoch 22: val_loss did not improve from 0.65137
Epoch 23/100
Epoch 23: val_loss did not improve from 0.65137
Epoch 24/100
Epoch 24: val_loss did not improve from 0.65137
Epoch 25/100
Epoch 25: val_loss did not improve from 0.65137
Epoch 26/100
Epoch 26: val_loss did not improve from 0.65137
Epoch 27/100
Epoch 27: val_loss did not improve from 0.65137
Epoch 28/100
Epoch 28: val_loss did not improve from 0.65137
Epoch 29/100

Epoch 29: val_loss did not improve from 0.65137
Epoch 29: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_3.png
Test Accuracy for Layer 3: 0.2899
Test AUC for Layer 3: 0.4684

Average Test Accuracy across all layers: 0.4726
Average Test AUC across all layers: 0.4659

SUMMARY OF RESULTS （CVX)

Combination: MLP with Title + S_label
Average Accuracy: 0.5384
Average AUC: 0.4530
  Layer 1 - Accuracy: 0.6515, AUC: 0

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records training history after each epoch
class PlotLearningCallback(Callback):
    """
    Keras callback that accumulates per-epoch accuracy and loss values.

    The callback itself only collects the numbers into lists; it also makes
    sure ``plot_dir`` exists when it is constructed.

    Args:
        text_col: Identifier of the text/embedding source being modelled
        label_col: Label column being predicted
        layer_num: Number of the time-window layer being trained
        plot_dir: Directory for plots (created if missing)
    """

    def __init__(self, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        self.text_col, self.label_col = text_col, label_col
        self.layer_num, self.plot_dir = layer_num, plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One list per tracked metric, appended to at the end of every epoch
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; anything missing from ``logs`` is recorded as 0."""
        metrics = {} if logs is None else logs

        def first_available(*names):
            # Return the value of the first name present in the logs, else 0
            for name in names:
                if name in metrics:
                    return metrics[name]
            return 0

        # Accuracy may be reported under either the long or the short key
        self.train_acc.append(first_available('accuracy', 'acc'))
        self.val_acc.append(first_available('val_accuracy', 'val_acc'))
        self.train_loss.append(first_available('loss'))
        self.val_loss.append(first_available('val_loss'))

class MergedEmbeddingMLPPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using merged OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with merged OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'Merged_OpenAI_MLP/visualizations_mlp/CVX'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('Merged_OpenAI_MLP/visualizations_summary/CVX', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with merged OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing merged OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert merged embeddings to numpy arrays
        self.data['Merged_embedding'] = self.data['Merged_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_merged_embedding = self.data['Merged_embedding'].iloc[0]
        print(f"Sample Merged embedding shape: {sample_merged_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data['Merged_embedding'].values)
        X_val = np.stack(val_data['Merged_embedding'].values)
        X_test = np.stack(test_data['Merged_embedding'].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward classifier used on document-level embeddings.

        The network has three ReLU dense layers (512, 256 and 128 units), each
        followed by batch normalization; dropout of 0.3 is applied after the
        first two. A single sigmoid unit produces the output. The model is
        compiled with Adam (learning rate 0.00005), binary cross-entropy loss
        and accuracy as the reported metric.

        Args:
            input_shape: Shape of one input sample, e.g. (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        # First block: receives the embedding vector
        stack = [
            Dense(512, activation='relu', input_shape=input_shape),
            BatchNormalization(),
            Dropout(0.3),
        ]
        # Narrower hidden blocks; only the 256-unit block is followed by dropout
        stack += [Dense(256, activation='relu'), BatchNormalization(), Dropout(0.3)]
        stack += [Dense(128, activation='relu'), BatchNormalization()]
        # Output: one sigmoid unit for the binary label
        stack.append(Dense(1, activation='sigmoid'))

        classifier = Sequential(stack)
        classifier.compile(
            loss='binary_crossentropy',
            optimizer=Adam(learning_rate=0.00005),
            metrics=['accuracy'],
        )

        print("MLP model created")
        return classifier
    
    def plot_final_learning_curves(self, history, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure (accuracy left, loss right) for a finished MLP training run.

        Curves are drawn only when both the training and the validation series
        of a metric are present in the history; the axes, titles and legends
        are set up either way.

        Args:
            history: Training history (object exposing a ``history`` dict)
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        figure, (acc_axis, loss_axis) = plt.subplots(1, 2, figsize=(15, 5))

        # Use an empty record when the object has no `history` attribute
        records = getattr(history, 'history', {})

        # Accuracy may be logged under the long or the short metric name
        train_acc_key = 'accuracy' if 'accuracy' in records else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in records else 'val_acc'

        # Accuracy panel
        if train_acc_key in records and val_acc_key in records:
            accuracy_series = (
                (train_acc_key, 'Training Accuracy', 'blue'),
                (val_acc_key, 'Validation Accuracy', 'orange'),
            )
            for key, caption, shade in accuracy_series:
                acc_axis.plot(records[key], label=caption, color=shade)
        acc_axis.set_title(f'Final Accuracy Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
        acc_axis.set_xlabel('Epochs')
        acc_axis.set_ylabel('Accuracy')
        acc_axis.legend(loc='lower right')
        acc_axis.set_ylim([0, 1])
        acc_axis.grid(True, linestyle='--', alpha=0.7)

        # Loss panel
        if 'loss' in records and 'val_loss' in records:
            loss_series = (
                ('loss', 'Training Loss', 'blue'),
                ('val_loss', 'Validation Loss', 'orange'),
            )
            for key, caption, shade in loss_series:
                loss_axis.plot(records[key], label=caption, color=shade)
        loss_axis.set_title(f'Final Loss Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
        loss_axis.set_xlabel('Epochs')
        loss_axis.set_ylabel('Loss')
        loss_axis.legend(loc='upper right')
        loss_axis.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()

        # Write the figure to disk, report the file name, then release the figure
        filename = f"final_mlp_merged_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, label_col):
        """
        Train and evaluate an MLP model for merged embeddings and a specific label column.

        For every time-window layer in ``self.layers`` the data is split, a
        fresh MLP is trained with early stopping and best-weights
        checkpointing, and test accuracy and AUC are computed. Layers with
        fewer than 10 training samples, or with a single class in any split,
        are skipped. Per-layer and average metrics are stored in
        ``self.results`` under the key ``"MLP|Merged|<label_col>"``.

        The model's input shape is taken from the training matrix, so the
        embedding length is not assumed here.

        Args:
            label_col: The label column to use ('S_label' or 'L_label')

        Returns:
            self, to allow method chaining.
        """
        # Store results
        combination_key = f"MLP|Merged|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        combination_results = self.results[combination_key]

        print(f"\n{'='*80}")
        print(f"Training MLP model for Merged Embeddings and {label_col}")
        print(f"{'='*80}")

        for i, layer in enumerate(self.layers):
            layer_num = i + 1
            print(f"\nLayer {layer_num}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")

            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, label_col)

            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {layer_num} due to insufficient data or class imbalance")
                continue

            # Create and train model; the input shape follows the data, e.g. (1536,)
            model = self.create_mlp_model(tuple(X_train.shape[1:]))
            print(f"Created MLP model for Merged Embeddings ({label_col})")
            model.summary()

            # Setup callbacks
            plot_callback = PlotLearningCallback('Merged', label_col, layer_num, self.mlp_viz_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{self.mlp_viz_dir}/best_mlp_merged_{label_col}_layer_{layer_num}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )

            # Train model
            print("Training MLP model...")
            batch_size = 32  # Larger batch size for embeddings
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Upper bound only; early stopping ends training once val_loss stalls
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )

            # Evaluate on test set; flatten the (n, 1) output so metrics receive 1-D arrays
            y_pred_proba = np.asarray(model.predict(X_test)).ravel()
            y_pred = (y_pred_proba > 0.5).astype(int)

            # Create final learning curve visualization
            self.plot_final_learning_curves(history, label_col, layer_num, self.mlp_viz_dir)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)

            combination_results['accuracy'].append(accuracy)
            combination_results['auc'].append(auc)

            print(f"Test Accuracy for Layer {layer_num}: {accuracy:.4f}")
            print(f"Test AUC for Layer {layer_num}: {auc:.4f}")

            # Store layer results for visualization
            combination_results['layer_results'].append({
                'layer': layer_num,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            })

        # Calculate average metrics (0 when every layer was skipped)
        avg_accuracy = np.mean(combination_results['accuracy']) if combination_results['accuracy'] else 0
        avg_auc = np.mean(combination_results['auc']) if combination_results['auc'] else 0

        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")

        combination_results['avg_accuracy'] = avg_accuracy
        combination_results['avg_auc'] = avg_auc

        return self
    
    def run_all_combinations(self):
        """
        Run the analysis for merged embeddings with both short-term and long-term labels.

        Trains and evaluates one model per label column, prints a summary of
        the average and per-layer metrics, and passes the averages to the
        summary visualization.

        Per-layer lines are labelled with the layer number recorded during
        training, so a skipped layer does not shift the numbering of the
        layers that did run.

        Returns:
            self, to allow method chaining.
        """
        # Define label columns
        label_cols = ['S_label', 'L_label']

        # Run analysis for each label column
        for label_col in label_cols:
            self.train_and_evaluate_model(label_col)

        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (CVX)")
        print("="*80)

        # Create a summary table for easier comparison
        summary_data = []

        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")

            # Print layer-specific results
            accuracies = results.get('accuracy', [])
            aucs = results.get('auc', [])
            layer_numbers = [entry['layer'] for entry in results.get('layer_results', [])]
            if len(layer_numbers) != len(accuracies):
                # No usable per-layer record: fall back to positional numbering
                layer_numbers = range(1, len(accuracies) + 1)
            for layer_number, accuracy, auc in zip(layer_numbers, accuracies, aucs):
                print(f"  Layer {layer_number} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Label': label_col
            })

        # Create summary visualizations
        self.create_summary_visualization(summary_data)

        # Clean up memory
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """
        Draw a grouped bar chart of average accuracy and average AUC per label column.

        Args:
            summary_data: List of dicts with at least the keys 'Avg Accuracy',
                'Avg AUC' and 'Label'; nothing is drawn when it is empty

        The figure is written to
        'Merged_OpenAI_MLP/visualizations_summary/CVX/mlp_merged_performance_comparison.png',
        after which the per-layer charts are produced as well.
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        frame = pd.DataFrame(summary_data)

        # Make sure both metric columns are plain floats before plotting
        for metric in ('Avg Accuracy', 'Avg AUC'):
            frame[metric] = frame[metric].astype(float)

        plt.figure(figsize=(10, 6))

        # Two bars per combination, centred on the integer tick positions
        positions = np.arange(len(frame))
        width = 0.35
        half_width = width/2

        plt.bar(positions - half_width, frame['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(positions + half_width, frame['Avg AUC'], width, label='Average AUC', color='salmon')

        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with Merged OpenAI Embeddings')
        tick_labels = [f"Merged + {label}" for label in frame['Label']]
        plt.xticks(positions, tick_labels, rotation=0)
        plt.legend()

        # Numeric value just above each bar: accuracy bars first, then AUC bars
        for offset, metric in ((-half_width, 'Avg Accuracy'), (half_width, 'Avg AUC')):
            for pos, score in enumerate(frame[metric]):
                plt.text(pos + offset, score + 0.01, f"{score:.3f}", ha='center', va='bottom', fontsize=9)

        # Leave headroom above the tallest bar for the value labels
        tallest = max(frame['Avg Accuracy'].max(), frame['Avg AUC'].max())
        plt.ylim(0, tallest + 0.1)
        plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/CVX', "mlp_merged_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()

        # Follow up with the per-layer breakdown
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """
        Plot per-layer accuracy and AUC for every combination stored in ``self.results``.

        Produces a two-panel figure (accuracy on top, AUC below), grouped by combination
        and coloured by layer, and saves it to
        'Merged_OpenAI_MLP/visualizations_summary/CVX/mlp_merged_layer_performance.png'.
        Prints a notice and returns without plotting when no layer results exist.
        """
        # Flatten the nested results into one record per (combination, layer)
        records = []

        for key, metrics in self.results.items():
            # Result keys have the form "<model>|<text>|<label>"
            model_type, text_col, label_col = key.split('|')
            combination = f"{text_col} + {label_col}"

            for idx, layer_accuracy in enumerate(metrics.get('accuracy', [])):
                layer_auc = metrics.get('auc', [])[idx]
                layer_number = idx + 1
                records.append({
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {layer_number}",
                    'Layer_num': layer_number,
                    'Accuracy': layer_accuracy,
                    'AUC': layer_auc,
                    'Combination': combination
                })

        if not records:
            print("No layer data available")
            return

        frame = pd.DataFrame(records)

        plt.figure(figsize=(10, 8))

        # Panel 1 shows accuracy, panel 2 shows AUC; both share the same layout
        for panel, metric in enumerate(('Accuracy', 'AUC'), start=1):
            plt.subplot(2, 1, panel)
            sns.barplot(x='Combination', y=metric, hue='Layer', data=frame)
            plt.title(f'MLP {metric} by Model Combination and Layer with Merged OpenAI Embeddings')
            # Y axis reaches at least 0.8, or slightly above the best score
            plt.ylim(0, max(0.8, frame[metric].max() + 0.05))
            plt.xticks(rotation=0)
            plt.tight_layout()

        save_path = os.path.join('Merged_OpenAI_MLP/visualizations_summary/CVX', "mlp_merged_layer_performance.png")
        plt.savefig(save_path)
        print(f"MLP layer performance visualization saved as: {save_path}")
        plt.close()


# Script entry point: run the merged-embedding analysis for the CVX data set
if __name__ == "__main__":
    # The figures and checkpoints are written into these folders, so create them first
    required_dirs = (
        'Merged_OpenAI_MLP/visualizations_mlp/CVX',
        'Merged_OpenAI_MLP/visualizations_summary/CVX',
    )
    for directory in required_dirs:
        os.makedirs(directory, exist_ok=True)

    # Input CSV with the news articles and their merged OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_CVX_completed_openai.csv'

    predictor = MergedEmbeddingMLPPredictor(csv_path)

    # Load -> define the sliding time windows -> train/evaluate every combination
    (
        predictor
        .load_data()
        .define_time_windows()
        .run_all_combinations()
    )

    print("\nAnalysis complete! Results saved to 'Merged_OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing merged OpenAI embedding vectors from string format...
Sample Merged embedding shape: (1536,)
Loaded 929 climate change news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 499, 0: 430}
Class distribution for long-term prediction: {1: 521, 0: 408}

Training MLP model for Merged Embeddings and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_66"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_264 

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_268 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_201 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_134 (Dropout)       (None, 512)               0         
                                                                 
 dense_269 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_202 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_135 (Dropout)       (None, 256)               0         
          

 batch_normalization_206 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_275 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.67975, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_S_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.67975 to 0.67480, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_S_label_layer_3.weights.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.67480 to 0.67361, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_S_label_layer_3.we

 dense_279 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.67585, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 2/100
Epoch 2: val_loss improved from 0.67585 to 0.66680, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 3/100
Epoch 3: val_loss improved from 0.66680 to 0.66185, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 4/100
Epoch 4: val_loss improved from 0.66185 to 0.65983, saving model to Merged_OpenAI_MLP/visualizations_mlp/CVX\best_mlp_merged_L_label_layer_1.weights.h5
Epoch 5/100
Epoch 5: val_

Epoch 24: val_loss did not improve from 0.65417
Epoch 25/100
Epoch 25: val_loss did not improve from 0.65417
Epoch 26/100
Epoch 26: val_loss did not improve from 0.65417
Epoch 27/100
Epoch 27: val_loss did not improve from 0.65417
Epoch 28/100
Epoch 28: val_loss did not improve from 0.65417
Epoch 29/100

Epoch 29: val_loss did not improve from 0.65417
Epoch 29: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_1.png
Test Accuracy for Layer 1: 0.7879
Test AUC for Layer 1: 0.6332

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Embeddings shapes - Train: (685, 1536), Val: (66, 1536), Test: (108, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (L_label)
Model: "sequential_70"
_________________________________________________________________
 Layer (t

Epoch 16/100

Epoch 16: val_loss did not improve from 0.74546
Epoch 16: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_2.png
Test Accuracy for Layer 2: 0.3704
Test AUC for Layer 2: 0.4485

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Embeddings shapes - Train: (751, 1536), Val: (108, 1536), Test: (69, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (L_label)
Model: "sequential_71"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_284 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_213 (B  (None, 512)               2048      
 atchNormalization)                  

Epoch 17/100
Epoch 17: val_loss did not improve from 0.67316
Epoch 18/100
Epoch 18: val_loss did not improve from 0.67316
Epoch 19/100
Epoch 19: val_loss did not improve from 0.67316
Epoch 20/100
Epoch 20: val_loss did not improve from 0.67316
Epoch 21/100
Epoch 21: val_loss did not improve from 0.67316
Epoch 22/100
Epoch 22: val_loss did not improve from 0.67316
Epoch 23/100
Epoch 23: val_loss did not improve from 0.67316
Epoch 24/100
Epoch 24: val_loss did not improve from 0.67316
Epoch 25/100
Epoch 25: val_loss did not improve from 0.67316
Epoch 26/100
Epoch 26: val_loss did not improve from 0.67316
Epoch 27/100

Epoch 27: val_loss did not improve from 0.67316
Epoch 27: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_3.png
Test Accuracy for Layer 3: 0.2754
Test AUC for Layer 3: 0.3674

Average Test Accuracy across all layers: 0.4779
Average Test AUC across all layers: 0.4830

SUMMARY OF RESULTS (CVX)

Combination: MLP with Merged + S_label
Average Accuracy: 0.51

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
import xgboost as xgb
from xgboost import callback
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records the training history after each epoch
class PlotLearningCallback(Callback):
    """
    Keras callback that accumulates per-epoch accuracy and loss values.

    The constructor stores the run description (model type, text column, label
    column, layer number) and creates ``plot_dir`` if it does not exist. After
    every epoch the training/validation accuracy and loss found in ``logs`` are
    appended to ``train_acc``, ``val_acc``, ``train_loss`` and ``val_loss``.
    """

    def __init__(self, model_type, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        # Description of the run this callback is attached to
        self.model_type = model_type
        self.text_col = text_col
        self.label_col = label_col
        self.layer_num = layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # One entry per finished epoch
        self.train_acc = []
        self.val_acc = []
        self.train_loss = []
        self.val_loss = []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; a metric missing from ``logs`` is recorded as 0."""
        epoch_logs = {} if logs is None else logs

        # Accuracy may be reported under 'accuracy' or the older 'acc' key
        train_accuracy = epoch_logs.get('accuracy', epoch_logs.get('acc', 0))
        val_accuracy = epoch_logs.get('val_accuracy', epoch_logs.get('val_acc', 0))

        self.train_acc.append(train_accuracy)
        self.val_acc.append(val_accuracy)
        self.train_loss.append(epoch_logs.get('loss', 0))
        self.val_loss.append(epoch_logs.get('val_loss', 0))

class ClimateNewsOpenAIPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'OpenAI_MLP/visualizations_mlp/COP'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('OpenAI_MLP/visualizations_summary/COP', exist_ok=True)
            
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news with OpenAI embeddings."""
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as DD/MM/YYYY
                        date = pd.to_datetime(date_str, format='%d/%m/%Y')
                    except ValueError:
                        try:
                            # Try to parse as YYYY-MM-DD
                            date = pd.to_datetime(date_str, format='%Y-%m-%d')
                        except ValueError:
                            # As a last resort, let pandas guess with dayfirst=True
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Parse embedding vectors from string format to numpy arrays
        print("Parsing OpenAI embedding vectors from string format...")
        
        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Return zeros array if embedding is missing
                return np.zeros(1536)
            try:
                # Try parsing as a string representation of a list
                embedding_list = ast.literal_eval(embedding_str)
                return np.array(embedding_list, dtype=np.float32)
            except (ValueError, SyntaxError):
                print(f"Error parsing embedding: {embedding_str[:50]}...")
                return np.zeros(1536)
        
        # Convert embeddings to numpy arrays
        self.data['Title_embedding'] = self.data['Title_embedding_vector'].apply(parse_embedding)
        self.data['Fulltext_embedding'] = self.data['Full_text_embedding_vector'].apply(parse_embedding)
        
        # Verify dimensions
        sample_title_embedding = self.data['Title_embedding'].iloc[0]
        sample_fulltext_embedding = self.data['Fulltext_embedding'].iloc[0]
        
        print(f"Sample Title embedding shape: {sample_title_embedding.shape}")
        print(f"Sample Fulltext embedding shape: {sample_fulltext_embedding.shape}")
        
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract OpenAI embeddings for text data.
        
        Args:
            layer: The time window layer
            text_col: The column containing the embeddings ('Title_embedding' or 'Fulltext_embedding')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data with embeddings
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Extract embeddings
        X_train = np.stack(train_data[text_col].values)
        X_val = np.stack(val_data[text_col].values)
        X_test = np.stack(test_data[text_col].values)
        
        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the MLP classifier used for document-level embeddings.

        The network consists of Dense layers with 512, 256 and 128 ReLU units, each
        followed by batch normalization (and dropout of 0.3 after the first two),
        and a single sigmoid output unit. It is compiled with Adam (learning rate
        0.00005), binary cross-entropy loss and the accuracy metric.

        Args:
            input_shape: Shape of one input sample, e.g. (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        layer_stack = [
            # First block, which also fixes the input shape
            Dense(512, activation='relu', input_shape=input_shape),
            BatchNormalization(),
            Dropout(0.3),

            # Second block
            Dense(256, activation='relu'),
            BatchNormalization(),
            Dropout(0.3),

            # Third block (no dropout before the output)
            Dense(128, activation='relu'),
            BatchNormalization(),

            # Single-unit sigmoid output for binary classification
            Dense(1, activation='sigmoid')
        ]
        model = Sequential(layer_stack)

        optimizer = Adam(learning_rate=0.00005)
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return model
    
    def plot_final_learning_curves(self, history, model_type, text_col, label_col, layer_num, visualization_dir):
        """
        Save a two-panel figure with the accuracy and loss curves of a finished training run.

        Args:
            history: Training history; its ``history`` attribute (if present) is read as a
                dict of per-epoch metric lists
            model_type: Model type (MLP)
            text_col: Text column used
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        run_label = f'{model_type} with {display_text} ({label_col}, Layer {layer_num})'

        fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 5))

        # Fall back to an empty dict when the object carries no history
        metrics = getattr(history, 'history', {})

        # The accuracy metric is stored under either the long or the short name
        train_acc_key = 'accuracy' if 'accuracy' in metrics else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in metrics else 'val_acc'

        # Left panel: accuracy (curves are drawn only when both series exist)
        if train_acc_key in metrics and val_acc_key in metrics:
            acc_ax.plot(metrics[train_acc_key], label='Training Accuracy', color='blue')
            acc_ax.plot(metrics[val_acc_key], label='Validation Accuracy', color='orange')
        acc_ax.set_title(f'Final Accuracy Curves: {run_label}')
        acc_ax.set_xlabel('Epochs')
        acc_ax.set_ylabel('Accuracy')
        acc_ax.legend(loc='lower right')
        acc_ax.set_ylim([0, 1])
        acc_ax.grid(True, linestyle='--', alpha=0.7)

        # Right panel: loss (curves are drawn only when both series exist)
        if 'loss' in metrics and 'val_loss' in metrics:
            loss_ax.plot(metrics['loss'], label='Training Loss', color='blue')
            loss_ax.plot(metrics['val_loss'], label='Validation Loss', color='orange')
        loss_ax.set_title(f'Final Loss Curves: {run_label}')
        loss_ax.set_xlabel('Epochs')
        loss_ax.set_ylabel('Loss')
        loss_ax.legend(loc='upper right')
        loss_ax.grid(True, linestyle='--', alpha=0.7)

        plt.tight_layout()

        # File name encodes model, text source, label column and layer
        text_tag = display_text.replace(' ', '_')
        filename = f"final_{model_type.lower()}_{text_tag}_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, text_col, label_col, model_type):
        """
        Train and evaluate a model for a specific text column and label column.
        
        Args:
            text_col: The embedding column to use ('Title_embedding' or 'Fulltext_embedding')
            label_col: The label column to use ('S_label' or 'L_label')
            model_type: The model type to use ('MLP')
        """
        # Store results
        display_text = 'Title' if 'Title' in text_col else 'Full text'
        combination_key = f"{model_type}|{display_text}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training {model_type} model for {display_text} and {label_col}")
        print(f"{'='*80}")
        
        visualization_dir = self.mlp_viz_dir if model_type == 'MLP' else self.xgb_viz_dir
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data and get embeddings
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(
                layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            if model_type == 'MLP':
                # MLP model training
                model = self.create_mlp_model((1536,))
                print(f"Created MLP model for {display_text} ({label_col})")
                model.summary()
                
                # Setup callbacks
                plot_callback = PlotLearningCallback(model_type, display_text, label_col, i+1, visualization_dir)
                early_stopping = EarlyStopping(
                    monitor='val_loss',
                    patience=15,
                    restore_best_weights=True,
                    verbose=1
                )
                model_checkpoint = ModelCheckpoint(
                    filepath=f"{visualization_dir}/best_{model_type.lower()}_{display_text.lower().replace(' ', '_')}_{label_col}_layer_{i+1}.weights.h5",
                    monitor='val_loss',
                    save_weights_only=True,
                    save_best_only=True,
                    verbose=1
                )
                
                # Train model
                print(f"Training MLP model...")
                batch_size = 32  # Larger batch size for embeddings
                history = model.fit(
                    X_train, y_train,
                    validation_data=(X_val, y_val),
                    epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                    batch_size=batch_size,
                    callbacks=[early_stopping, model_checkpoint, plot_callback],
                    verbose=1
                )
                
                # Evaluate on test set
                y_pred_proba = model.predict(X_test)
                y_pred = (y_pred_proba > 0.5).astype(int)
                
                # Create final learning curve visualization
                self.plot_final_learning_curves(history, model_type, text_col, label_col, i+1, visualization_dir)
                
                # Store training history
                training_history = history.history
                
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model_type': model_type,
                'accuracy': accuracy,
                'auc': auc,
                'history': training_history
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy']) if self.results[combination_key]['accuracy'] else 0
        avg_auc = np.mean(self.results[combination_key]['auc']) if self.results[combination_key]['auc'] else 0
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def run_all_combinations(self):
        """
        Run the analysis for all combinations of text inputs, label columns, and model types.

        Trains and evaluates every (model type, embedding column, label column)
        combination, prints the averaged and per-layer metrics stored in
        ``self.results``, and passes one summary row per combination to
        ``create_summary_visualization``.

        Returns:
            self, so calls can be chained
        """
        # Define all combinations
        embedding_cols = ['Title_embedding', 'Fulltext_embedding']
        label_cols = ['S_label', 'L_label']
        model_types = ['MLP']

        # Run analysis for each combination
        for model_type in model_types:
            for embedding_col in embedding_cols:
                for label_col in label_cols:
                    self.train_and_evaluate_model(embedding_col, label_col, model_type)

        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (COP)")
        print("="*80)

        # Create a summary table for easier comparison
        summary_data = []

        for combination, results in self.results.items():
            # Result keys have the form "<model>|<text>|<label>"
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")

            # Print layer-specific results; accuracy and AUC are recorded pairwise per layer
            layer_metrics = zip(results.get('accuracy', []), results.get('auc', []))
            for layer_num, (accuracy, auc) in enumerate(layer_metrics, start=1):
                print(f"  Layer {layer_num} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Model': model_type,
                'Text': text_col,
                'Label': label_col
            })

        # Create summary visualizations
        self.create_summary_visualization(summary_data)

        # Clean up memory
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """Plot average accuracy and AUC of every MLP combination as a grouped bar chart.

        Args:
            summary_data: List of dicts carrying the keys 'Avg Accuracy',
                'Avg AUC', 'Model', 'Text' and 'Label' (the rows built by
                ``run_all_combinations``).

        The chart is written as "mlp_performance_comparison.png" inside
        'OpenAI_MLP/visualizations_summary/COP'.  Afterwards the per-layer charts
        are produced through ``create_layer_performance_visualizations``.
        Nothing happens when ``summary_data`` is empty; when it contains no MLP
        rows the bar chart is skipped but the per-layer step still runs.
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)

        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)

        # Only MLP rows are charted here
        mlp_data = df[df['Model'] == 'MLP']

        if mlp_data.empty:
            # With no MLP rows the maxima below are NaN and plt.ylim() rejects a
            # NaN limit, so skip the chart instead of failing.
            print("No MLP results available for summary visualization")
            self.create_layer_performance_visualizations()
            return

        # 1. Performance comparison for MLP
        plt.figure(figsize=(12, 8))

        # Two bars per combination, placed side by side around each tick
        x = np.arange(len(mlp_data))
        width = 0.35

        plt.bar(x - width/2, mlp_data['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, mlp_data['Avg AUC'], width, label='Average AUC', color='salmon')

        plt.xlabel('MLP Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with OpenAI Embeddings')
        labels = [f"{text} + {label}" for text, label in zip(mlp_data['Text'], mlp_data['Label'])]
        # Labels are not rotated, so centre them under their bar group
        # (right-aligning unrotated labels shifts them away from the bars).
        plt.xticks(x, labels, rotation=0, ha='center')
        plt.legend()

        # Add value labels on top of bars
        for offset, column in ((-width/2, 'Avg Accuracy'), (width/2, 'Avg AUC')):
            for i, v in enumerate(mlp_data[column]):
                plt.text(i + offset, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)

        # Leave headroom above the tallest bar for its value label
        plt.ylim(0, max(mlp_data['Avg Accuracy'].max(), mlp_data['Avg AUC'].max()) + 0.1)
        plt.tight_layout()

        # Save the visualization
        save_path = os.path.join('OpenAI_MLP/visualizations_summary/COP', "mlp_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()

        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """Collect per-layer accuracy/AUC records and chart them for the MLP models.

        Walks ``self.results``, builds one record per evaluated layer and forwards
        the records whose model type is 'MLP' to
        ``_create_model_layer_visualization``.
        """
        mlp_records = []

        for key, metrics in self.results.items():
            # Result keys are split into model type, text column and label column
            model_type, text_col, label_col = key.split('|')
            is_mlp = model_type == 'MLP'
            combo_label = f"{text_col} + {label_col}"

            for index, layer_accuracy in enumerate(metrics.get('accuracy', [])):
                layer_number = index + 1
                record = dict(
                    Model=model_type,
                    Text=text_col,
                    Label=label_col,
                    Layer=f"Layer {layer_number}",
                    Layer_num=layer_number,
                    Accuracy=layer_accuracy,
                    # AUC is read from the same position as the accuracy value
                    AUC=metrics.get('auc', [])[index],
                    Combination=combo_label,
                )

                if is_mlp:
                    mlp_records.append(record)

        # Hand the MLP records to the plotting helper
        self._create_model_layer_visualization(mlp_records, 'MLP')
    
    def _create_model_layer_visualization(self, layer_data, model_type):
        """Draw per-layer accuracy and AUC bar charts for one model type and save them.

        Args:
            layer_data: List of per-layer records (dicts providing at least the
                keys 'Combination', 'Layer', 'Accuracy' and 'AUC').
            model_type: Name shown in the chart titles and, lower-cased, used in
                the output file name.

        Prints a notice and returns without plotting when ``layer_data`` is empty.
        """
        if not layer_data:
            print(f"No layer data available for {model_type}")
            return

        records = pd.DataFrame(layer_data)

        plt.figure(figsize=(14, 10))

        # One stacked panel per metric: accuracy on top, AUC below
        for panel_index, metric in enumerate(('Accuracy', 'AUC'), start=1):
            plt.subplot(2, 1, panel_index)
            sns.barplot(x='Combination', y=metric, hue='Layer', data=records)
            plt.title(f'{model_type} {metric} by Model Combination and Layer with OpenAI Embeddings')
            # Always show at least 0-0.8; extend when the best score needs more room
            upper_limit = max(0.8, records[metric].max() + 0.05)
            plt.ylim(0, upper_limit)
            plt.xticks(rotation=0)
            plt.tight_layout()

        # Write the figure next to the other summary charts
        output_file = os.path.join('OpenAI_MLP/visualizations_summary/COP', f"{model_type.lower()}_layer_performance.png")
        plt.savefig(output_file)
        print(f"{model_type} layer performance visualization saved as: {output_file}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Ensure visualization directories exist before the analysis runs
    output_dirs = (
        'OpenAI_MLP/visualizations_mlp/COP',
        'OpenAI_MLP/visualizations_summary/COP',
    )
    for directory in output_dirs:
        os.makedirs(directory, exist_ok=True)

    # Input CSV holding the news articles together with their OpenAI embeddings
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_COP_completed_openai.csv'

    predictor = ClimateNewsOpenAIPredictor(csv_path)

    # Pipeline: load the data, define the time windows, run every combination.
    # Each stage is invoked on whatever the previous stage returned.
    loaded = predictor.load_data()
    windowed = loaded.define_time_windows()
    windowed.run_all_combinations()

    print("\nAnalysis complete! Results saved to 'OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing OpenAI embedding vectors from string format...
Sample Title embedding shape: (1536,)
Sample Fulltext embedding shape: (1536,)
Loaded 929 climate change news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {0: 469, 1: 460}
Class distribution for long-term prediction: {1: 504, 0: 425}

Training MLP model for Title and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Title (S_label)
Model: "sequential_72"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 d

 dense_292 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_219 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_146 (Dropout)       (None, 512)               0         
                                                                 
 dense_293 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_220 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_147 (Dropout)       (None, 256)               0         
                                                                 
 dense_294 (Dense)           (None, 128)               32896     
          

 dense_299 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.69578, saving model to OpenAI_MLP/visualizations_mlp/COP\best_mlp_title_S_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.69578
Epoch 3/100
Epoch 3: val_loss did not improve from 0.69578
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69578
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69578
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69578
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69578
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69578
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69578
Epoch 10/100
Epoch 10: val_loss did not improve from 

Epoch 3/100
Epoch 3: val_loss did not improve from 0.69786
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69786
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69786
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69786
Epoch 7/100
Epoch 7: val_loss improved from 0.69786 to 0.69755, saving model to OpenAI_MLP/visualizations_mlp/COP\best_mlp_title_L_label_layer_1.weights.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.69755 to 0.69551, saving model to OpenAI_MLP/visualizations_mlp/COP\best_mlp_title_L_label_layer_1.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.69551 to 0.69368, saving model to OpenAI_MLP/visualizations_mlp/COP\best_mlp_title_L_label_layer_1.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.69368 to 0.69208, saving model to OpenAI_MLP/visualizations_mlp/COP\best_mlp_title_L_label_layer_1.weights.h5
Epoch 11/100
Epoch 11: val_loss improved from 0.69208 to 0.69051, saving model to OpenAI_MLP/visualizations_mlp/COP\best_mlp_title_L_label

_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_304 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_228 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_152 (Dropout)       (None, 512)               0         
                                                                 
 dense_305 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_229 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_153 (Dropout)       (None, 256)               0         
          

 batch_normalization_233 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_311 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.72223, saving model to OpenAI_MLP/visualizations_mlp/COP\best_mlp_title_L_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.72223
Epoch 3/100
Epoch 3: val_loss did not improve from 0.72223
Epoch 4/100
Epoch 4: val_loss did not improve from 0.72223
Epoch 5/100
Epoch 5: val_loss did not improve from 0.72223
Epoch 6/100
Epoch 6: val_loss did not improve from 0.72223
Epoch 7/100
Epoch 7: val_loss di

Epoch 2/100
Epoch 2: val_loss did not improve from 0.69371
Epoch 3/100
Epoch 3: val_loss did not improve from 0.69371
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69371
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69371
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69371
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69371
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69371
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69371
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69371
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69371
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69371
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69371
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69371
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69371
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69371
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_1.png
Test Accurac

Epoch 7/100
Epoch 7: val_loss did not improve from 0.68905
Epoch 8/100
Epoch 8: val_loss did not improve from 0.68905
Epoch 9/100
Epoch 9: val_loss did not improve from 0.68905
Epoch 10/100
Epoch 10: val_loss did not improve from 0.68905
Epoch 11/100
Epoch 11: val_loss did not improve from 0.68905
Epoch 12/100
Epoch 12: val_loss did not improve from 0.68905
Epoch 13/100
Epoch 13: val_loss did not improve from 0.68905
Epoch 14/100
Epoch 14: val_loss did not improve from 0.68905
Epoch 15/100
Epoch 15: val_loss did not improve from 0.68905
Epoch 16/100

Epoch 16: val_loss did not improve from 0.68905
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_2.png
Test Accuracy for Layer 2: 0.5278
Test AUC for Layer 2: 0.2907

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Embeddings shapes - Train: (751,

Epoch 11/100
Epoch 11: val_loss did not improve from 0.70678
Epoch 12/100
Epoch 12: val_loss did not improve from 0.70678
Epoch 13/100
Epoch 13: val_loss did not improve from 0.70678
Epoch 14/100
Epoch 14: val_loss did not improve from 0.70678
Epoch 15/100
Epoch 15: val_loss did not improve from 0.70678
Epoch 16/100

Epoch 16: val_loss did not improve from 0.70678
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_S_label_layer_3.png
Test Accuracy for Layer 3: 0.5507
Test AUC for Layer 3: 0.4516

Average Test Accuracy across all layers: 0.5110
Average Test AUC across all layers: 0.4224

Training MLP model for Full text and L_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model cr

Epoch 14/100
Epoch 14: val_loss did not improve from 0.68729
Epoch 15/100
Epoch 15: val_loss did not improve from 0.68729
Epoch 16/100
Epoch 16: val_loss did not improve from 0.68729
Epoch 17/100
Epoch 17: val_loss did not improve from 0.68729
Epoch 18/100

Epoch 18: val_loss did not improve from 0.68729
Epoch 18: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_1.png
Test Accuracy for Layer 1: 0.8333
Test AUC for Layer 1: 0.3521

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Embeddings shapes - Train: (685, 1536), Val: (66, 1536), Test: (108, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (L_label)
Model: "sequential_82"
_________________________________________________________________
 Layer (type)                Output Shape              Param #


Epoch 16: val_loss did not improve from 0.72310
Epoch 16: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_2.png
Test Accuracy for Layer 2: 0.3519
Test AUC for Layer 2: 0.4929

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Embeddings shapes - Train: (751, 1536), Val: (108, 1536), Test: (69, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Full text (L_label)
Model: "sequential_83"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_332 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_249 (B  (None, 512)               2048      
 atchNormalization)                                    

Epoch 17: val_loss did not improve from 0.64911
Epoch 18/100
Epoch 18: val_loss did not improve from 0.64911
Epoch 19/100
Epoch 19: val_loss did not improve from 0.64911
Epoch 20/100
Epoch 20: val_loss did not improve from 0.64911
Epoch 21/100
Epoch 21: val_loss did not improve from 0.64911
Epoch 22/100
Epoch 22: val_loss did not improve from 0.64911
Epoch 23/100
Epoch 23: val_loss did not improve from 0.64911
Epoch 24/100
Epoch 24: val_loss did not improve from 0.64911
Epoch 25/100
Epoch 25: val_loss did not improve from 0.64911
Epoch 26/100
Epoch 26: val_loss did not improve from 0.64911
Epoch 27/100

Epoch 27: val_loss did not improve from 0.64911
Epoch 27: early stopping
Saved learning curves: final_mlp_Full_text_L_label_layer_3.png
Test Accuracy for Layer 3: 0.4348
Test AUC for Layer 3: 0.3991

Average Test Accuracy across all layers: 0.5400
Average Test AUC across all layers: 0.4147

SUMMARY OF RESULTS （COP)

Combination: MLP with Title + S_label
Average Accuracy: 0.5110
Average 

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import gc
import ast  # For safely parsing string representations of arrays
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve
import warnings
warnings.filterwarnings('ignore')

# Custom callback that records accuracy and loss after each epoch
class PlotLearningCallback(Callback):
    """Keras callback that keeps per-epoch training/validation accuracy and loss.

    The values are appended to ``train_acc``, ``val_acc``, ``train_loss`` and
    ``val_loss``; this class itself does not draw any plot.  The directory
    given as ``plot_dir`` is created on construction.
    """

    def __init__(self, text_col, label_col, layer_num, plot_dir):
        super().__init__()
        # Identification of the run this callback belongs to
        self.text_col, self.label_col = text_col, label_col
        self.layer_num = layer_num
        self.plot_dir = plot_dir
        os.makedirs(plot_dir, exist_ok=True)
        # Per-epoch metric histories
        self.train_acc, self.val_acc = [], []
        self.train_loss, self.val_loss = [], []

    def on_epoch_end(self, epoch, logs=None):
        """Append this epoch's metrics; a metric absent from ``logs`` is stored as 0."""
        metrics = {} if logs is None else logs

        # Accuracy may be reported under the long or the short key name
        recorded = (
            (self.train_acc, metrics.get('accuracy', metrics.get('acc', 0))),
            (self.val_acc, metrics.get('val_accuracy', metrics.get('val_acc', 0))),
            (self.train_loss, metrics.get('loss', 0)),
            (self.val_loss, metrics.get('val_loss', 0)),
        )
        for series, value in recorded:
            series.append(value)

class MergedEmbeddingMLPPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis using merged OpenAI embeddings.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data with merged OpenAI embeddings
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        
        # Create directories for visualizations
        self.mlp_viz_dir = 'Merged_OpenAI_MLP/visualizations_mlp/COP'
        os.makedirs(self.mlp_viz_dir, exist_ok=True)
        os.makedirs('Merged_OpenAI_MLP/visualizations_summary/COP', exist_ok=True)
            
    def load_data(self):
        """Load the CSV, parse its dates and merged embeddings, and sort by date.

        Reads ``self.csv_path`` into ``self.data``, converts the columns
        'Publication date', 'Predicting date Short' and 'Predicting date Long'
        to datetimes (day-first), sorts the rows by publication date and parses
        the strings in 'Merged_embedding_vector' into float32 NumPy arrays stored
        in a new 'Merged_embedding' column.  Missing or unparsable embeddings
        are replaced by a float32 zero vector of length 1536.

        Returns:
            self, so calls can be chained.
        """
        print("Loading data...")
        self.data = pd.read_csv(self.csv_path)

        date_columns = ['Publication date', 'Predicting date Short', 'Predicting date Long']
        embedding_dim = 1536

        # Helper function to handle different date formats value by value
        def parse_date_column(column):
            result = []
            for date_str in column:
                try:
                    # Try to parse as DD/MM/YYYY
                    date = pd.to_datetime(date_str, format='%d/%m/%Y')
                except ValueError:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        # As a last resort, let pandas guess with dayfirst=True
                        date = pd.to_datetime(date_str, dayfirst=True)
                result.append(date)
            # Keep the source index so the assignment below aligns row by row
            return pd.Series(result, index=column.index)

        # Convert date strings to datetime objects
        try:
            # Try pandas' automatic date parsing first with dayfirst=True
            for name in date_columns:
                self.data[name] = pd.to_datetime(self.data[name], dayfirst=True)
        except (ValueError, TypeError):
            print("Automatic date parsing failed. Trying manual conversion...")
            for name in date_columns:
                self.data[name] = parse_date_column(self.data[name])

        # Sort by publication date
        self.data = self.data.sort_values('Publication date')

        # Parse embedding vectors from string format to numpy arrays
        print("Parsing merged OpenAI embedding vectors from string format...")

        # Function to safely convert string representation of array to numpy array
        def parse_embedding(embedding_str):
            if pd.isna(embedding_str):
                # Zero vector for a missing embedding; float32 like the parsed
                # vectors so a later np.stack does not upcast the whole matrix
                return np.zeros(embedding_dim, dtype=np.float32)
            try:
                # Try parsing as a string representation of a list
                return np.array(ast.literal_eval(embedding_str), dtype=np.float32)
            except (ValueError, SyntaxError):
                # str() keeps the message printable even for non-string cells
                print(f"Error parsing embedding: {str(embedding_str)[:50]}...")
                return np.zeros(embedding_dim, dtype=np.float32)

        # Convert merged embeddings to numpy arrays
        self.data['Merged_embedding'] = self.data['Merged_embedding_vector'].apply(parse_embedding)

        # Verify dimensions
        sample_merged_embedding = self.data['Merged_embedding'].iloc[0]
        print(f"Sample Merged embedding shape: {sample_merged_embedding.shape}")

        first_day = self.data['Publication date'].min().strftime('%d/%m/%Y')
        last_day = self.data['Publication date'].max().strftime('%d/%m/%Y')
        print(f"Loaded {len(self.data)} climate change news articles spanning from "
              f"{first_day} "
              f"to {last_day}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")

        return self
    
    def define_time_windows(self):
        """Define the three expanding train/validation/test windows.

        Every window trains from 2019-01-01; the training end, validation span
        and test span move forward from one layer to the next.  The windows are
        stored in ``self.layers`` as dicts of ``pd.Timestamp`` values.

        Returns:
            self, so calls can be chained.
        """
        boundary_names = ('train_start', 'train_end', 'val_start', 'val_end', 'test_start', 'test_end')

        # One row per layer, columns in the order of boundary_names
        boundary_dates = (
            # Layer 1: train to May 2021, validate Jun-Dec 2021, test Jan-May 2022
            ('2019-01-01', '2021-05-31', '2021-06-01', '2021-12-31', '2022-01-01', '2022-05-31'),
            # Layer 2: train to Dec 2021, validate Jan-May 2022, test Jun-Dec 2022
            ('2019-01-01', '2021-12-31', '2022-01-01', '2022-05-31', '2022-06-01', '2022-12-31'),
            # Layer 3: train to May 2022, validate Jun-Dec 2022, test Jan-May 2023
            ('2019-01-01', '2022-05-31', '2022-06-01', '2022-12-31', '2023-01-01', '2023-05-31'),
        )

        self.layers = [
            {name: pd.Timestamp(day) for name, day in zip(boundary_names, days)}
            for days in boundary_dates
        ]
        return self
    
    def split_data(self, layer, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        Extract the merged OpenAI embeddings for each set.

        Every window is inclusive on both ends, so an article published exactly
        on a window's first day belongs to that window.  A window that matches no
        article yields an embedding array with zero rows instead of raising.

        Args:
            layer: The time window layer (dict with 'train_start', 'train_end',
                'val_start', 'val_end', 'test_start' and 'test_end')
            label_col: The column containing the labels ('S_label' or 'L_label')

        Returns:
            (X_train, y_train), (X_val, y_val), (X_test, y_test),
            train_data, val_data, test_data
        """
        published = self.data['Publication date']

        def in_window(start_key, end_key):
            # Inclusive on both sides; a strict ">" on the start date would drop
            # articles published on the first day of the validation/test window
            # from every split.
            return (published >= layer[start_key]) & (published <= layer[end_key])

        train_data = self.data[in_window('train_start', 'train_end')]
        val_data = self.data[in_window('val_start', 'val_end')]
        test_data = self.data[in_window('test_start', 'test_end')]

        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")

        # Width used for the placeholder matrix of an empty split
        embedding_dim = len(self.data['Merged_embedding'].iloc[0]) if len(self.data) else 0

        def stack_embeddings(frame):
            # np.stack raises on an empty sequence; return a (0, dim) matrix so the
            # caller can decide what to do with an empty split.
            if frame.empty:
                return np.empty((0, embedding_dim), dtype=np.float32)
            return np.stack(frame['Merged_embedding'].values)

        # Extract embeddings
        X_train = stack_embeddings(train_data)
        X_val = stack_embeddings(val_data)
        X_test = stack_embeddings(test_data)

        # Get labels
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values

        print(f"Embeddings shapes - Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_mlp_model(self, input_shape):
        """
        Build and compile the feed-forward binary classifier for document-level embeddings.

        Layer stack: Dense(512, relu) -> BatchNormalization -> Dropout(0.3) ->
        Dense(256, relu) -> BatchNormalization -> Dropout(0.3) ->
        Dense(128, relu) -> BatchNormalization -> Dense(1, sigmoid).
        Compiled with Adam (learning rate 0.00005), binary cross-entropy loss
        and accuracy as the reported metric.

        Args:
            input_shape: Shape of the input data (1536,)

        Returns:
            Compiled MLP model
        """
        print(f"Creating MLP model with input shape: {input_shape}")

        network = Sequential()

        # First block: receives the embedding vector
        network.add(Dense(512, activation='relu', input_shape=input_shape))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Second block
        network.add(Dense(256, activation='relu'))
        network.add(BatchNormalization())
        network.add(Dropout(0.3))

        # Third block (no dropout before the output)
        network.add(Dense(128, activation='relu'))
        network.add(BatchNormalization())

        # Single sigmoid unit for the binary label
        network.add(Dense(1, activation='sigmoid'))

        optimizer = Adam(learning_rate=0.00005)
        network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

        print("MLP model created")
        return network
    
    def plot_final_learning_curves(self, history, label_col, layer_num, visualization_dir):
        """
        Draw the accuracy and loss curves of a finished MLP training run and save them.

        Args:
            history: Training history; its ``history`` attribute (a dict of
                per-epoch metric lists) is used when present
            label_col: Label column used
            layer_num: Layer number
            visualization_dir: Directory to save visualizations
        """
        fig, (acc_ax, loss_ax) = plt.subplots(1, 2, figsize=(15, 5))

        # Without a `.history` attribute there is nothing to draw; the axes stay empty
        curves = history.history if hasattr(history, 'history') else {}

        # Accuracy may be logged under the long or the short key name
        train_acc_key = 'accuracy' if 'accuracy' in curves else 'acc'
        val_acc_key = 'val_accuracy' if 'val_accuracy' in curves else 'val_acc'

        # (axis, training key, validation key, quantity name, legend position)
        panels = (
            (acc_ax, train_acc_key, val_acc_key, 'Accuracy', 'lower right'),
            (loss_ax, 'loss', 'val_loss', 'Loss', 'upper right'),
        )
        for axis, train_key, val_key, quantity, legend_loc in panels:
            # Curves are drawn only when both the training and validation series exist
            if train_key in curves and val_key in curves:
                axis.plot(curves[train_key], label=f'Training {quantity}', color='blue')
                axis.plot(curves[val_key], label=f'Validation {quantity}', color='orange')
            axis.set_title(f'Final {quantity} Curves: MLP with Merged Embeddings ({label_col}, Layer {layer_num})')
            axis.set_xlabel('Epochs')
            axis.set_ylabel(quantity)
            axis.legend(loc=legend_loc)
            axis.grid(True, linestyle='--', alpha=0.7)

        # Accuracy is shown on a fixed 0-1 scale; the loss axis keeps its automatic range
        acc_ax.set_ylim([0, 1])

        plt.tight_layout()
        filename = f"final_mlp_merged_{label_col}_layer_{layer_num}.png"
        plt.savefig(f"{visualization_dir}/{filename}")
        print(f"Saved learning curves: {filename}")
        plt.close()
    
    def train_and_evaluate_model(self, label_col):
        """
        Train and evaluate an MLP model for merged embeddings and a specific label column.

        One model is trained per entry of ``self.layers``. Each layer is split
        with ``self.split_data``, fitted with early stopping plus best-weights
        checkpointing, and scored on its test split. Per-layer accuracy/AUC and
        their averages are stored under
        ``self.results["MLP|Merged|<label_col>"]``. Layers with fewer than 10
        training samples, or with a single class in any split, are skipped.

        Args:
            label_col: The label column to use ('S_label' or 'L_label')

        Returns:
            self, to allow method chaining.
        """
        # Store results
        combination_key = f"MLP|Merged|{label_col}"
        combo_results = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        self.results[combination_key] = combo_results

        print(f"\n{'='*80}")
        print(f"Training MLP model for Merged Embeddings and {label_col}")
        print(f"{'='*80}")

        for i, layer in enumerate(self.layers):
            layer_num = i + 1
            print(f"\nLayer {layer_num}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")

            # Split data and get embeddings; the three trailing return values
            # are not needed for training, so they are discarded.
            (X_train, y_train), (X_val, y_val), (X_test, y_test), _, _, _ = self.split_data(
                layer, label_col)

            # Check if there are enough samples and classes (AUC is undefined
            # when a split contains a single class)
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {layer_num} due to insufficient data or class imbalance")
                continue

            # A new model is built for every layer; release the global Keras
            # state left behind by earlier models so memory use does not keep
            # growing across the loop.
            tf.keras.backend.clear_session()

            # Create and train model
            model = self.create_mlp_model((1536,))
            print(f"Created MLP model for Merged Embeddings ({label_col})")
            model.summary()

            # Setup callbacks
            plot_callback = PlotLearningCallback('Merged', label_col, layer_num, self.mlp_viz_dir)
            early_stopping = EarlyStopping(
                monitor='val_loss',
                patience=15,
                restore_best_weights=True,
                verbose=1
            )
            model_checkpoint = ModelCheckpoint(
                filepath=f"{self.mlp_viz_dir}/best_mlp_merged_{label_col}_layer_{layer_num}.weights.h5",
                monitor='val_loss',
                save_weights_only=True,
                save_best_only=True,
                verbose=1
            )

            # Train model
            print("Training MLP model...")
            batch_size = 32  # Larger batch size for embeddings
            history = model.fit(
                X_train, y_train,
                validation_data=(X_val, y_val),
                epochs=100,  # Increase max epochs, early stopping will prevent overfitting
                batch_size=batch_size,
                callbacks=[early_stopping, model_checkpoint, plot_callback],
                verbose=1
            )

            # Evaluate on test set. Flatten the predictions so both metrics
            # receive a 1-D score vector.
            y_pred_proba = model.predict(X_test).ravel()
            y_pred = (y_pred_proba > 0.5).astype(int)

            # Create final learning curve visualization
            self.plot_final_learning_curves(history, label_col, layer_num, self.mlp_viz_dir)

            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)

            combo_results['accuracy'].append(accuracy)
            combo_results['auc'].append(auc)

            print(f"Test Accuracy for Layer {layer_num}: {accuracy:.4f}")
            print(f"Test AUC for Layer {layer_num}: {auc:.4f}")

            # Store layer results for visualization; 'layer' keeps the real
            # layer number even when earlier layers were skipped.
            combo_results['layer_results'].append({
                'layer': layer_num,
                'accuracy': accuracy,
                'auc': auc,
                'history': history.history
            })

        # Calculate average metrics (0 when every layer was skipped)
        avg_accuracy = np.mean(combo_results['accuracy']) if combo_results['accuracy'] else 0
        avg_auc = np.mean(combo_results['auc']) if combo_results['auc'] else 0

        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")

        combo_results['avg_accuracy'] = avg_accuracy
        combo_results['avg_auc'] = avg_auc

        return self
    
    def run_all_combinations(self):
        """
        Run the analysis for merged embeddings with both short-term and long-term labels.

        Calls ``train_and_evaluate_model`` for 'S_label' and 'L_label', prints
        a textual summary of everything in ``self.results`` and passes the
        per-combination averages to ``create_summary_visualization``.

        Returns:
            self, to allow method chaining.
        """
        # Run analysis for each label column
        for label_col in ('S_label', 'L_label'):
            self.train_and_evaluate_model(label_col)

        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (COP)")
        print("="*80)

        # Create a summary table for easier comparison
        summary_data = []

        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')
            avg_accuracy = results.get('avg_accuracy', 0)
            avg_auc = results.get('avg_auc', 0)

            print(f"\nCombination: {model_type} with {text_col} + {label_col}")
            print(f"Average Accuracy: {avg_accuracy:.4f}")
            print(f"Average AUC: {avg_auc:.4f}")

            # Print layer-specific results. Training may skip layers, so the
            # position in the metric lists is not necessarily the layer
            # number; prefer the number recorded in 'layer_results' and fall
            # back to the position only when that record is unavailable.
            accuracies = results.get('accuracy', [])
            aucs = results.get('auc', [])
            layer_results = results.get('layer_results', [])
            if len(layer_results) == len(accuracies):
                layer_nums = [
                    entry.get('layer', position)
                    for position, entry in enumerate(layer_results, start=1)
                ]
            else:
                layer_nums = range(1, len(accuracies) + 1)

            for layer_num, accuracy, auc in zip(layer_nums, accuracies, aucs):
                print(f"  Layer {layer_num} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")

            summary_data.append({
                'Combination': f"{text_col} + {label_col}",
                'Avg Accuracy': avg_accuracy,
                'Avg AUC': avg_auc,
                'Label': label_col
            })

        # Create summary visualizations
        self.create_summary_visualization(summary_data)

        # Clean up memory
        gc.collect()

        return self
    
    def create_summary_visualization(self, summary_data):
        """
        Create a summary visualization comparing model combinations.

        Draws a grouped bar chart (average accuracy next to average AUC, one
        group per entry of ``summary_data``), saves it as a PNG and then calls
        ``create_layer_performance_visualizations``.

        Args:
            summary_data: List of dicts with the keys 'Avg Accuracy',
                'Avg AUC' and 'Label'. When empty, a message is printed and
                nothing is plotted.
        """
        if not summary_data:
            print("No data available for summary visualization")
            return

        # Create a DataFrame for easier plotting
        df = pd.DataFrame(summary_data)

        # Convert metrics to float for plotting
        df['Avg Accuracy'] = df['Avg Accuracy'].astype(float)
        df['Avg AUC'] = df['Avg AUC'].astype(float)

        # Performance comparison for MLP with merged embeddings
        plt.figure(figsize=(10, 6))

        # Plot accuracy and AUC bars side by side around each x position
        x = np.arange(len(df))
        width = 0.35

        plt.bar(x - width/2, df['Avg Accuracy'], width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, df['Avg AUC'], width, label='Average AUC', color='salmon')

        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of MLP Models with Merged OpenAI Embeddings')
        labels = [f"Merged + {label}" for label in df['Label']]
        plt.xticks(x, labels, rotation=0)
        plt.legend()

        # Add value labels on top of bars
        for i, v in enumerate(df['Avg Accuracy']):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)

        for i, v in enumerate(df['Avg AUC']):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)

        # Leave headroom above the tallest bar for its value label
        plt.ylim(0, max(df['Avg Accuracy'].max(), df['Avg AUC'].max()) + 0.1)
        plt.tight_layout()

        # Save the visualization. Create the target directory here as well so
        # the method does not depend on the caller having prepared it.
        output_dir = 'Merged_OpenAI_MLP/visualizations_summary/COP'
        os.makedirs(output_dir, exist_ok=True)
        save_path = os.path.join(output_dir, "mlp_merged_performance_comparison.png")
        plt.savefig(save_path)
        print(f"MLP summary comparison visualization saved as: {save_path}")
        plt.close()

        # Layer-specific performance visualizations
        self.create_layer_performance_visualizations()
    
    def create_layer_performance_visualizations(self):
        """
        Create visualizations showing performance across different layers for MLP models.

        Reads ``self.results`` and draws two stacked bar charts (accuracy and
        AUC per combination, coloured by layer) into one PNG. When no layer
        produced metrics, a message is printed and nothing is plotted.
        """
        # Prepare data for visualization
        layer_data = []

        for combination, results in self.results.items():
            model_type, text_col, label_col = combination.split('|')

            accuracies = results.get('accuracy', [])
            aucs = results.get('auc', [])
            layer_results = results.get('layer_results', [])

            # Training may skip layers, so the position in the metric lists is
            # not necessarily the layer number; prefer the number recorded in
            # 'layer_results' and fall back to the position otherwise.
            if len(layer_results) == len(accuracies):
                layer_nums = [
                    entry.get('layer', position)
                    for position, entry in enumerate(layer_results, start=1)
                ]
            else:
                layer_nums = range(1, len(accuracies) + 1)

            for layer_num, accuracy, auc in zip(layer_nums, accuracies, aucs):
                layer_data.append({
                    'Text': text_col,
                    'Label': label_col,
                    'Layer': f"Layer {layer_num}",
                    'Layer_num': layer_num,
                    'Accuracy': accuracy,
                    'AUC': auc,
                    'Combination': f"{text_col} + {label_col}"
                })

        if not layer_data:
            print("No layer data available")
            return

        # Create DataFrame
        df = pd.DataFrame(layer_data)

        # Keep the bars in numeric layer order even if the combinations do
        # not all contain the same layers.
        hue_order = [f"Layer {num}" for num in sorted(df['Layer_num'].unique())]

        # Create visualization - one figure, two stacked panels
        plt.figure(figsize=(10, 8))

        # 1. Accuracy by combination and layer
        plt.subplot(2, 1, 1)
        sns.barplot(x='Combination', y='Accuracy', hue='Layer', hue_order=hue_order, data=df)
        plt.title('MLP Accuracy by Model Combination and Layer with Merged OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['Accuracy'].max() + 0.05))
        plt.xticks(rotation=0)

        # 2. AUC by combination and layer
        plt.subplot(2, 1, 2)
        sns.barplot(x='Combination', y='AUC', hue='Layer', hue_order=hue_order, data=df)
        plt.title('MLP AUC by Model Combination and Layer with Merged OpenAI Embeddings')
        plt.ylim(0, max(0.8, df['AUC'].max() + 0.05))
        plt.xticks(rotation=0)

        plt.tight_layout()

        # Save the visualization. Create the target directory here as well so
        # the method does not depend on the caller having prepared it.
        output_dir = 'Merged_OpenAI_MLP/visualizations_summary/COP'
        os.makedirs(output_dir, exist_ok=True)
        save_path = os.path.join(output_dir, "mlp_merged_layer_performance.png")
        plt.savefig(save_path)
        print(f"MLP layer performance visualization saved as: {save_path}")
        plt.close()


# Main execution
if __name__ == "__main__":
    # Make sure both output directories exist before anything is written
    for directory in ('Merged_OpenAI_MLP/visualizations_mlp/COP',
                      'Merged_OpenAI_MLP/visualizations_summary/COP'):
        os.makedirs(directory, exist_ok=True)

    # Input CSV (absolute path on the author's machine)
    csv_path = 'E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_COP_completed_openai.csv'

    # Build the predictor for the merged OpenAI embeddings
    predictor = MergedEmbeddingMLPPredictor(csv_path)

    # Full pipeline as one fluent chain: load -> time windows -> train/evaluate
    (
        predictor
        .load_data()
        .define_time_windows()
        .run_all_combinations()
    )

    print("\nAnalysis complete! Results saved to 'Merged_OpenAI_MLP' directory.")

Loading data...
Automatic date parsing failed. Trying manual conversion...
Parsing merged OpenAI embedding vectors from string format...
Sample Merged embedding shape: (1536,)
Loaded 929 climate change news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {0: 469, 1: 460}
Class distribution for long-term prediction: {1: 504, 0: 425}

Training MLP model for Merged Embeddings and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Embeddings shapes - Train: (522, 1536), Val: (163, 1536), Test: (66, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (S_label)
Model: "sequential_84"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_336 

 dense_340 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_255 (B  (None, 512)               2048      
 atchNormalization)                                              
                                                                 
 dropout_170 (Dropout)       (None, 512)               0         
                                                                 
 dense_341 (Dense)           (None, 256)               131328    
                                                                 
 batch_normalization_256 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_171 (Dropout)       (None, 256)               0         
                                                                 
 dense_342 (Dense)           (None, 128)               32896     
          

                                                                 
 dense_346 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_260 (B  (None, 128)               512       
 atchNormalization)                                              
                                                                 
 dense_347 (Dense)           (None, 1)                 129       
                                                                 
Total params: 954881 (3.64 MB)
Trainable params: 953089 (3.64 MB)
Non-trainable params: 1792 (7.00 KB)
_________________________________________________________________
Training MLP model...
Epoch 1/100
Epoch 1: val_loss improved from inf to 0.70019, saving model to Merged_OpenAI_MLP/visualizations_mlp/COP\best_mlp_merged_S_label_layer_3.weights.h5
Epoch 2/100
Epoch 2: val_loss did not improve from 0.70019
Epoch 3/100
Epoch 3: val_loss did not improve from 0.70019
Epo

Epoch 2/100
Epoch 2: val_loss did not improve from 0.69588
Epoch 3/100
Epoch 3: val_loss did not improve from 0.69588
Epoch 4/100
Epoch 4: val_loss did not improve from 0.69588
Epoch 5/100
Epoch 5: val_loss did not improve from 0.69588
Epoch 6/100
Epoch 6: val_loss did not improve from 0.69588
Epoch 7/100
Epoch 7: val_loss did not improve from 0.69588
Epoch 8/100
Epoch 8: val_loss did not improve from 0.69588
Epoch 9/100
Epoch 9: val_loss did not improve from 0.69588
Epoch 10/100
Epoch 10: val_loss did not improve from 0.69588
Epoch 11/100
Epoch 11: val_loss did not improve from 0.69588
Epoch 12/100
Epoch 12: val_loss did not improve from 0.69588
Epoch 13/100
Epoch 13: val_loss did not improve from 0.69588
Epoch 14/100
Epoch 14: val_loss did not improve from 0.69588
Epoch 15/100
Epoch 15: val_loss did not improve from 0.69588
Epoch 16/100

Epoch 16: val_loss did not improve from 0.69588
Epoch 16: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_1.png
Test Accuracy f

Epoch 5/100
Epoch 5: val_loss improved from 0.64153 to 0.63253, saving model to Merged_OpenAI_MLP/visualizations_mlp/COP\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 6/100
Epoch 6: val_loss improved from 0.63253 to 0.62551, saving model to Merged_OpenAI_MLP/visualizations_mlp/COP\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 7/100
Epoch 7: val_loss improved from 0.62551 to 0.61753, saving model to Merged_OpenAI_MLP/visualizations_mlp/COP\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 8/100
Epoch 8: val_loss improved from 0.61753 to 0.61652, saving model to Merged_OpenAI_MLP/visualizations_mlp/COP\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 9/100
Epoch 9: val_loss improved from 0.61652 to 0.61475, saving model to Merged_OpenAI_MLP/visualizations_mlp/COP\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 10/100
Epoch 10: val_loss improved from 0.61475 to 0.61365, saving model to Merged_OpenAI_MLP/visualizations_mlp/COP\best_mlp_merged_L_label_layer_2.weights.h5
Epoch 11/100
E

Epoch 30: early stopping
Saved learning curves: final_mlp_merged_L_label_layer_2.png
Test Accuracy for Layer 2: 0.6667
Test AUC for Layer 2: 0.5256

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Embeddings shapes - Train: (751, 1536), Val: (108, 1536), Test: (69, 1536)
Creating MLP model with input shape: (1536,)
MLP model created
Created MLP model for Merged Embeddings (L_label)
Model: "sequential_89"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_356 (Dense)           (None, 512)               786944    
                                                                 
 batch_normalization_267 (B  (None, 512)               2048      
 atchNormalization)                                              
                                 

Epoch 19/100
Epoch 19: val_loss improved from 0.64683 to 0.64502, saving model to Merged_OpenAI_MLP/visualizations_mlp/COP\best_mlp_merged_L_label_layer_3.weights.h5
Epoch 20/100
Epoch 20: val_loss did not improve from 0.64502
Epoch 21/100
Epoch 21: val_loss did not improve from 0.64502
Epoch 22/100
Epoch 22: val_loss did not improve from 0.64502
Epoch 23/100
Epoch 23: val_loss did not improve from 0.64502
Epoch 24/100
Epoch 24: val_loss did not improve from 0.64502
Epoch 25/100
Epoch 25: val_loss did not improve from 0.64502
Epoch 26/100
Epoch 26: val_loss did not improve from 0.64502
Epoch 27/100
Epoch 27: val_loss did not improve from 0.64502
Epoch 28/100
Epoch 28: val_loss did not improve from 0.64502
Epoch 29/100
Epoch 29: val_loss did not improve from 0.64502
Epoch 30/100
Epoch 30: val_loss did not improve from 0.64502
Epoch 31/100
Epoch 31: val_loss did not improve from 0.64502
Epoch 32/100
Epoch 32: val_loss did not improve from 0.64502
Epoch 33/100
Epoch 33: val_loss did not i