In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import os
from datetime import datetime
import ast  # For parsing string representations of lists
import warnings
warnings.filterwarnings('ignore')

class MergedSentimentStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor using logistic regression for merged financial sentiment analysis.
        
        Args:
            csv_path: Path to the CSV file containing merged sentiment score vectors and stock labels
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        self.sentiment_approaches = [
            'McDonld', 'FinBERT_ProsusAI', 'FinBERT_yiyang', 
            'FinGPT', 'Majority_vote_mean', 'Fino1'
        ]
        self.scalers = {}
        
        # Create directories for visualizations
        os.makedirs('Merged_Sentiment_Logistic_Plots/COP/visualizations', exist_ok=True)
        os.makedirs('Merged_Sentiment_Logistic_Plots/COP/visualizations/confusion_matrices', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing merged sentiment scores and stock labels."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='%d/%m/%Y')
        self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='%d/%m/%Y')
        self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='%d/%m/%Y')
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Display initial data info
        print(f"Loaded {len(self.data)} financial news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        
        # Show class distribution for each label type
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        # Process merged sentiment score columns
        self._process_merged_scores()
        
        return self
    
    def _process_merged_scores(self):
        """
        Process and normalize the merged sentiment score columns.
        Each merged score is a vector [Title_score, Full_text_score]
        """
        print("\nProcessing merged sentiment scores...")
        
        # Get all merged sentiment score columns
        merged_columns = [f"Merged_{approach}_score" for approach in self.sentiment_approaches]
        
        # Check if these columns exist in the data
        existing_columns = [col for col in merged_columns if col in self.data.columns]
        
        if not existing_columns:
            raise ValueError("No merged sentiment score columns found in the data")
        
        # First, parse the string vectors into numerical arrays
        for col in existing_columns:
            # Check the format of the data to determine parsing method
            sample_value = self.data[col].iloc[0]
            
            # If values are already numerical arrays, we don't need to parse
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"Column {col} already contains numerical arrays")
                continue
                
            # If values are stored as strings, parse them
            try:
                # Try parsing as literal Python representation
                self.data[col] = self.data[col].apply(ast.literal_eval)
                print(f"Parsed {col} using ast.literal_eval")
            except (ValueError, SyntaxError):
                try:
                    # Alternative: try parsing as comma-separated values
                    self.data[col] = self.data[col].str.strip('[]').str.split(',').apply(
                        lambda x: [float(val.strip()) for val in x]
                    )
                    print(f"Parsed {col} as comma-separated values")
                except Exception as e:
                    raise ValueError(f"Could not parse sentiment vectors in column {col}: {e}")
        
        # Create two separate normalized versions of each element in the vector
        print("\nNormalizing merged sentiment scores...")
        for col in existing_columns:
            # Extract Title and Full_text components
            title_scores = np.array([vec[0] for vec in self.data[col]])
            fulltext_scores = np.array([vec[1] for vec in self.data[col]])
            
            # Display original score ranges
            print(f"{col} (Title component): min={title_scores.min():.4f}, max={title_scores.max():.4f}")
            print(f"{col} (Full_text component): min={fulltext_scores.min():.4f}, max={fulltext_scores.max():.4f}")
            
            # Normalize each component separately
            title_scaler = MinMaxScaler(feature_range=(-1, 1))
            fulltext_scaler = MinMaxScaler(feature_range=(-1, 1))
            
            title_norm = title_scaler.fit_transform(title_scores.reshape(-1, 1)).flatten()
            fulltext_norm = fulltext_scaler.fit_transform(fulltext_scores.reshape(-1, 1)).flatten()
            
            # Store scalers for potential later use
            self.scalers[f"{col}_title"] = title_scaler
            self.scalers[f"{col}_fulltext"] = fulltext_scaler
            
            # Create normalized vectors
            self.data[f"{col}_norm"] = [
                [title_norm[i], fulltext_norm[i]] for i in range(len(title_norm))
            ]
            
            # Display normalized score ranges
            print(f"{col}_norm (Title component): min={title_norm.min():.4f}, max={title_norm.max():.4f}")
            print(f"{col}_norm (Full_text component): min={fulltext_norm.min():.4f}, max={fulltext_norm.max():.4f}")
        
        # Print sample of normalized vectors
        print("\nSample of normalized merged sentiment vectors (first 3 rows):")
        for col in [f"{col}_norm" for col in existing_columns]:
            print(f"{col}: {self.data[col].iloc[:3].tolist()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, sentiment_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            sentiment_col: The column containing the merged sentiment score vectors
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            Split datasets and corresponding indices
        """
        # Create masks for each time period
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        # Get data for each period
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Additional debug information: class distribution in each split
        print(f"Train class distribution ({label_col}): {train_data[label_col].value_counts().to_dict()}")
        print(f"Val class distribution ({label_col}): {val_data[label_col].value_counts().to_dict()}")
        print(f"Test class distribution ({label_col}): {test_data[label_col].value_counts().to_dict()}")
        
        # Extract merged sentiment vectors and convert to numpy arrays
        X_train = np.array([vec for vec in train_data[sentiment_col]])
        X_val = np.array([vec for vec in val_data[sentiment_col]])
        X_test = np.array([vec for vec in test_data[sentiment_col]])
        
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        # Get indices for later reference
        train_indices = train_data.index
        val_indices = val_data.index
        test_indices = test_data.index
        
        return (X_train, y_train, train_indices), (X_val, y_val, val_indices), (X_test, y_test, test_indices)
    
    def create_logistic_model(self, C=1.0, solver='liblinear', max_iter=1000, class_weight=None):
        """
        Create a logistic regression model for prediction.
        
        Args:
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
            
        Returns:
            Logistic regression model
        """
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=class_weight,
            random_state=42
        )
    
    def train_and_evaluate(self, sentiment_column, label_column, C=1.0, solver='liblinear', 
                          max_iter=1000, class_weight=None):
        """
        Train and evaluate logistic regression model for a specific merged sentiment column and label column.
        
        Args:
            sentiment_column: The merged sentiment score column to use
            label_column: The label column to use ('S_label' or 'L_label')
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
        """
        # Determine sentiment approach
        approach = None
        for app in self.sentiment_approaches:
            if app in sentiment_column:
                approach = app
                break
        
        if approach is None:
            raise ValueError(f"Could not determine sentiment approach from column name: {sentiment_column}")
        
        # Determine label type
        if label_column == 'S_label':
            label_type = 'Short-term'
        else:
            label_type = 'Long-term'
        
        # Store results
        combination_key = f"{approach}_{label_type}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training logistic regression model for {approach} merged approach, {label_type}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data for this layer
            (X_train, y_train, train_indices), \
            (X_val, y_val, val_indices), \
            (X_test, y_test, test_indices) = self.split_data(layer, sentiment_column, label_column)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or \
               len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_logistic_model(
                C=C,
                solver=solver,
                max_iter=max_iter,
                class_weight=class_weight
            )
            
            # Debug: Print shapes
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            
            # Train model
            model.fit(X_train, y_train)
            
            # Use validation set to evaluate
            val_pred_prob = model.predict_proba(X_val)[:, 1]
            val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            
            # Evaluate on test set
            y_pred_prob = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Print the coefs - now we have two coefficients for [Title, Full_text]
            print(f"Model coefficients: {model.coef_[0]}")
            print(f"Model intercept: {model.intercept_[0]:.6f}")
            
            # Print confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            print("Confusion Matrix:")
            print(cm)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Predicted class distribution
            print(f"Predicted class distribution: {np.bincount(y_pred)}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_prob': y_pred_prob,
                'accuracy': accuracy,
                'auc': auc,
                'confusion_matrix': cm,
                'coefficients': model.coef_[0],
                'intercept': model.intercept_[0]
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize confusion matrix for this layer
            self.visualize_confusion_matrix(
                cm,
                approach,
                label_type,
                i+1
            )
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid layers to calculate average metrics")
            self.results[combination_key]['avg_accuracy'] = np.nan
            self.results[combination_key]['avg_auc'] = np.nan
        
        return self
    
    def visualize_confusion_matrix(self, cm, approach, label_type, layer_num):
        """
        Visualize confusion matrix for a specific model and layer.
        
        Args:
            cm: Confusion matrix
            approach: Sentiment analysis approach
            label_type: Label type (Short-term or Long-term)
            layer_num: Layer number
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"Confusion Matrix: {approach} - Merged - {label_type} (Layer {layer_num})")
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        
        # Save figure
        save_path = f'Merged_Sentiment_Logistic_Plots/COP/visualizations/confusion_matrices/{approach}_Merged_{label_type}_layer{layer_num}.png'
        plt.savefig(save_path)
        plt.close()
    
    def run_all_combinations(self, use_class_weights=False):
        """
        Run the logistic regression analysis for all combinations of merged sentiment approaches and label columns.
        
        Args:
            use_class_weights: Whether to use balanced class weights
        """
        # Define all combinations - now only looping through approaches and label types
        combinations = []
        
        for approach in self.sentiment_approaches:
            for label_type in ['S_label', 'L_label']:
                sentiment_col = f"Merged_{approach}_score_norm"
                if sentiment_col in self.data.columns:
                    combinations.append((sentiment_col, label_type))
        
        # Run analysis for each combination
        for sentiment_col, label_col in combinations:
            # Train model with or without class weights
            class_weight = 'balanced' if use_class_weights else None
            
            print(f"\n{'='*80}")
            print(f"TRAINING LOGISTIC REGRESSION MODEL FOR {sentiment_col} - {label_col}")
            print(f"{'='*80}")
            
            self.train_and_evaluate(
                sentiment_column=sentiment_col, 
                label_column=label_col,
                class_weight=class_weight
            )
        
        # Create summary visualizations
        self.create_summary_visualizations()
        
        # Print final summary
        self.print_summary()
        
        return self
    
    def create_summary_visualizations(self):
        """Create summary visualizations comparing model performances."""
        self._create_performance_by_approach_visualizations()
    
    def _create_performance_by_approach_visualizations(self):
        """Create visualizations comparing performance by approach."""
        # Collect all results
        approaches = []
        combinations = []
        accuracies = []
        aucs = []
        
        for combo_key, results in self.results.items():
            if 'avg_accuracy' in results:
                # Parse combination key
                parts = combo_key.split('_')
                approach = parts[0]
                label_type = '_'.join(parts[1:])
                
                approaches.append(approach)
                combinations.append(f"{approach}_{label_type}")
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            return
        
        # Create dataframe for plotting
        df = pd.DataFrame({
            'Approach': approaches,
            'Combination': combinations,
            'Accuracy': accuracies,
            'AUC': aucs
        })
        
        # Plot accuracy comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='Accuracy', hue='Combination', data=df)
        plt.title('Accuracy by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/COP/visualizations/Accuracy_performance_comparison.png')
        plt.close()
        
        # Plot AUC comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='AUC', hue='Combination', data=df)
        plt.title('AUC by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/COP/visualizations/AUC_performance_comparison.png')
        plt.close()
        
        # Also create a coefficient analysis visualization
        self._create_coefficient_analysis()
    
    def _create_coefficient_analysis(self):
        """Create visualization showing the relative importance of Title vs Full_text in each approach."""
        coef_data = []
        
        for combo_key, results in self.results.items():
            for layer_result in results.get('layer_results', []):
                if 'coefficients' in layer_result and len(layer_result['coefficients']) == 2:
                    parts = combo_key.split('_')
                    approach = parts[0]
                    label_type = '_'.join(parts[1:])
                    
                    coef_data.append({
                        'Approach': approach,
                        'Label Type': label_type,
                        'Layer': f"Layer {layer_result['layer']}",
                        'Title Coefficient': layer_result['coefficients'][0],
                        'Full_text Coefficient': layer_result['coefficients'][1],
                        'Title to Full_text Ratio': abs(layer_result['coefficients'][0] / 
                                                      (layer_result['coefficients'][1] 
                                                       if layer_result['coefficients'][1] != 0 else 1e-6))
                    })
        
        if not coef_data:
            return
        
        df = pd.DataFrame(coef_data)
        
        # Plot coefficient comparison
        plt.figure(figsize=(14, 10))
        
        plt.subplot(2, 1, 1)
        sns.barplot(x='Approach', y='Title Coefficient', hue='Label Type', data=df)
        plt.title('Title Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.subplot(2, 1, 2)
        sns.barplot(x='Approach', y='Full_text Coefficient', hue='Label Type', data=df)
        plt.title('Full_text Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.tight_layout()
        plt.savefig('Merged_Sentiment_Logistic_Plots/COP/visualizations/Coefficient_comparison.png')
        plt.close()
    
    def print_summary(self):
        """Print a summary of all results."""
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - COP)")
        print("="*80)
        
        # Organize results by approach
        for approach in self.sentiment_approaches:
            print(f"\nApproach: {approach}")
            print("-" * 40)
            
            for label_type in ['Short-term', 'Long-term']:
                label_col = 'S_label' if label_type == 'Short-term' else 'L_label'
                combination_key = f"{approach}_{label_type}"
                
                if combination_key in self.results and 'avg_accuracy' in self.results[combination_key]:
                    avg_accuracy = self.results[combination_key]['avg_accuracy']
                    avg_auc = self.results[combination_key]['avg_auc']
                    
                    print(f"Merged Vector + {label_type}:")
                    print(f"  Average Accuracy: {avg_accuracy:.4f}")
                    print(f"  Average AUC: {avg_auc:.4f}")
                    
                    # Print layer-specific results
                    for i, layer_result in enumerate(self.results[combination_key].get('layer_results', [])):
                        accuracy = layer_result['accuracy']
                        auc = layer_result['auc']
                        coefficients = layer_result.get('coefficients', 'N/A')
                        intercept = layer_result.get('intercept', 'N/A')
                        
                        # More readable coefficient display - showing Title and Full_text components
                        coef_str = "N/A"
                        if isinstance(coefficients, np.ndarray) and len(coefficients) == 2:
                            coef_str = f"Title: {coefficients[0]:.4f}, Full_text: {coefficients[1]:.4f}"
                        
                        print(f"    Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                        print(f"    Layer {i+1} - Coefficients: {coef_str}, Intercept: {intercept}")
            
            print()
        
        # Find best overall combination
        best_accuracy = 0
        best_auc = 0
        best_accuracy_combo = None
        best_auc_combo = None
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results:
                if results['avg_accuracy'] > best_accuracy:
                    best_accuracy = results['avg_accuracy']
                    best_accuracy_combo = combo
                
                if results['avg_auc'] > best_auc:
                    best_auc = results['avg_auc']
                    best_auc_combo = combo
        
        print("\nBest Overall Combinations:")
        print(f"Best Accuracy: {best_accuracy:.4f} - {best_accuracy_combo}")
        print(f"Best AUC: {best_auc:.4f} - {best_auc_combo}")
        
        return self

# Main execution
if __name__ == "__main__":
    # Initialize predictor with the new CSV file
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_sentiment_SP500_database/wall_street_news_sentiment_database_part2_Merge_COP.csv')
    
    # Run prediction pipeline
    # First run without class weights
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS")
    print("="*80)
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=False)
    
    # Then run with class weights to address potential class imbalance
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITH BALANCED CLASS WEIGHTS")
    print("="*80)
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_sentiment_SP500_database/wall_street_news_sentiment_database_part2_Merge_COP.csv')
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=True)


RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS
Loaded 838 financial news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 434, 1: 404}
Class distribution for long-term prediction: {1: 438, 0: 400}

Processing merged sentiment scores...
Parsed Merged_McDonld_score using ast.literal_eval
Parsed Merged_FinBERT_ProsusAI_score using ast.literal_eval
Parsed Merged_FinBERT_yiyang_score using ast.literal_eval
Parsed Merged_FinGPT_score using ast.literal_eval
Parsed Merged_Majority_vote_mean_score using ast.literal_eval
Parsed Merged_Fino1_score using ast.literal_eval

Normalizing merged sentiment scores...
Merged_McDonld_score (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score (Full_text component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_ProsusAI_score (Title component)


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (L_label): {1: 356, 0: 310}
Val class distribution (L_label): {0: 51, 1: 50}
Test class distribution (L_label): {0: 38, 1: 32}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.4950
Test Accuracy for Layer 3: 0.5000
Test AUC for Layer 3: 0.4572
Model coefficients: [0.11094941 0.23882796]
Model intercept: 0.252633
Confusion Matrix:
[[ 7 31]
 [ 4 28]]

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.18      0.29        38
           1       0.47      0.88      0.62        32

    accuracy                           0.50        70
   macro avg       0.56      0.53      0.45        70
weighted avg       0.56      0.50      0.44        70

Predicted class distribution: [11 59]

A


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (S_label): {0: 282, 1: 250}
Val class distribution (S_label): {1: 73, 0: 60}
Test class distribution (S_label): {1: 54, 0: 47}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.5338
Test Accuracy for Layer 2: 0.5248
Test AUC for Layer 2: 0.5725
Model coefficients: [ 0.34218383 -0.38460854]
Model intercept: -0.120250
Confusion Matrix:
[[37 10]
 [38 16]]

Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.79      0.61        47
           1       0.62      0.30      0.40        54

    accuracy                           0.52       101
   macro avg       0.55      0.54      0.50       101
weighted avg       0.56      0.52      0.50       101

Predicted class distribution: [75 26


Average Test Accuracy across all layers: 0.4737
Average Test AUC across all layers: 0.4745

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinGPT_score_norm - L_label

Training logistic regression model for FinGPT merged approach, Long-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (L_label): {0: 227, 1: 174}
Val class distribution (L_label): {1: 87, 0: 44}
Test class distribution (L_label): {1: 95, 0: 38}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.3359
Test Accuracy for Layer 1: 0.2857
Test AUC for Layer 1: 0.5090
Model coefficients: [-0.00680684 -0.20774345]
Model intercept: -0.433152
Confusion Matrix:
[[38  0]
 [95  0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.29      1.00      0.44        38
           1 


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (L_label): {1: 356, 0: 310}
Val class distribution (L_label): {0: 51, 1: 50}
Test class distribution (L_label): {0: 38, 1: 32}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.4950
Test Accuracy for Layer 3: 0.4571
Test AUC for Layer 3: 0.4827
Model coefficients: [0.09308901 0.07834031]
Model intercept: 0.173831
Confusion Matrix:
[[ 0 38]
 [ 0 32]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        38
           1       0.46      1.00      0.63        32

    accuracy                           0.46        70
   macro avg       0.23      0.50      0.31        70
weighted avg       0.21      0.46      0.29        70

Predicted class distribution: [ 0 70]

A


SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - COP)

Approach: McDonld
----------------------------------------
Merged Vector + Short-term:
  Average Accuracy: 0.5106
  Average AUC: 0.5325
    Layer 1 - Accuracy: 0.4962, AUC: 0.5773
    Layer 1 - Coefficients: Title: -0.0238, Full_text: 0.3045, Intercept: -0.06272425627602735
    Layer 2 - Accuracy: 0.4356, AUC: 0.4387
    Layer 2 - Coefficients: Title: -0.0983, Full_text: 0.4452, Intercept: 0.033371474415833165
    Layer 3 - Accuracy: 0.6000, AUC: 0.5814
    Layer 3 - Coefficients: Title: -0.1015, Full_text: 0.5897, Intercept: 0.13904347605992606
Merged Vector + Long-term:
  Average Accuracy: 0.4434
  Average AUC: 0.4869
    Layer 1 - Accuracy: 0.2857, AUC: 0.4824
    Layer 1 - Coefficients: Title: 0.1037, Full_text: -0.0713, Intercept: -0.270311581873593
    Layer 2 - Accuracy: 0.5446, AUC: 0.5210
    Layer 2 - Coefficients: Title: 0.1835, Full_text: 0.0744, Intercept: 0.03250767279307531
    Layer 3 - Accuracy: 0.5000, AUC: 0.457


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (S_label): {0: 282, 1: 250}
Val class distribution (S_label): {1: 73, 0: 60}
Test class distribution (S_label): {1: 54, 0: 47}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.6015
Test Accuracy for Layer 2: 0.4752
Test AUC for Layer 2: 0.4403
Model coefficients: [-0.09821656  0.44184083]
Model intercept: 0.151570
Confusion Matrix:
[[19 28]
 [25 29]]

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.40      0.42        47
           1       0.51      0.54      0.52        54

    accuracy                           0.48       101
   macro avg       0.47      0.47      0.47       101
weighted avg       0.47      0.48      0.47       101

Predicted class distribution: [44 57]


Average Test Accuracy across all layers: 0.4939
Average Test AUC across all layers: 0.4677

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinBERT_ProsusAI_score_norm - L_label

Training logistic regression model for FinBERT_ProsusAI merged approach, Long-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (L_label): {0: 227, 1: 174}
Val class distribution (L_label): {1: 87, 0: 44}
Test class distribution (L_label): {1: 95, 0: 38}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.4885
Test Accuracy for Layer 1: 0.4060
Test AUC for Layer 1: 0.4152
Model coefficients: [ 0.14120611 -0.26852144]
Model intercept: -0.049407
Confusion Matrix:
[[18 20]
 [59 36]]

Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.47      0.31    


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (L_label): {1: 356, 0: 310}
Val class distribution (L_label): {0: 51, 1: 50}
Test class distribution (L_label): {0: 38, 1: 32}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.4653
Test Accuracy for Layer 3: 0.5571
Test AUC for Layer 3: 0.5732
Model coefficients: [ 0.33153838 -0.37525294]
Model intercept: -0.000919
Confusion Matrix:
[[24 14]
 [17 15]]

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.63      0.61        38
           1       0.52      0.47      0.49        32

    accuracy                           0.56        70
   macro avg       0.55      0.55      0.55        70
weighted avg       0.55      0.56      0.55        70

Predicted class distribution: [41 29]


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (S_label): {0: 343, 1: 323}
Val class distribution (S_label): {1: 54, 0: 47}
Test class distribution (S_label): {0: 43, 1: 27}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.3762
Test Accuracy for Layer 3: 0.5714
Test AUC for Layer 3: 0.5805
Model coefficients: [-0.00301822  0.13363277]
Model intercept: 0.038669
Confusion Matrix:
[[28 15]
 [15 12]]

Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.65      0.65        43
           1       0.44      0.44      0.44        27

    accuracy                           0.57        70
   macro avg       0.55      0.55      0.55        70
weighted avg       0.57      0.57      0.57        70

Predicted class distribution: [43 27]



Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (L_label): {0: 271, 1: 261}
Val class distribution (L_label): {1: 95, 0: 38}
Test class distribution (L_label): {0: 51, 1: 50}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.3684
Test Accuracy for Layer 2: 0.5248
Test AUC for Layer 2: 0.5200
Model coefficients: [ 0.24554983 -0.22722808]
Model intercept: -0.029446
Confusion Matrix:
[[45  6]
 [42  8]]

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.88      0.65        51
           1       0.57      0.16      0.25        50

    accuracy                           0.52       101
   macro avg       0.54      0.52      0.45       101
weighted avg       0.54      0.52      0.45       101

Predicted class distribution: [87 14

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import os
from datetime import datetime
import ast  # For parsing string representations of lists
import warnings
warnings.filterwarnings('ignore')

class MergedSentimentStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor using logistic regression for merged financial sentiment analysis.
        
        Args:
            csv_path: Path to the CSV file containing merged sentiment score vectors and stock labels
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        self.sentiment_approaches = [
            'McDonld', 'FinBERT_ProsusAI', 'FinBERT_yiyang', 
            'FinGPT', 'Majority_vote_mean', 'Fino1'
        ]
        self.scalers = {}
        
        # Create directories for visualizations
        os.makedirs('Merged_Sentiment_Logistic_Plots/CVX/visualizations', exist_ok=True)
        os.makedirs('Merged_Sentiment_Logistic_Plots/CVX/visualizations/confusion_matrices', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing merged sentiment scores and stock labels."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='%d/%m/%Y')
        self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='%d/%m/%Y')
        self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='%d/%m/%Y')
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Display initial data info
        print(f"Loaded {len(self.data)} financial news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        
        # Show class distribution for each label type
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        # Process merged sentiment score columns
        self._process_merged_scores()
        
        return self
    
    def _process_merged_scores(self):
        """
        Process and normalize the merged sentiment score columns.
        Each merged score is a vector [Title_score, Full_text_score]
        """
        print("\nProcessing merged sentiment scores...")
        
        # Get all merged sentiment score columns
        merged_columns = [f"Merged_{approach}_score" for approach in self.sentiment_approaches]
        
        # Check if these columns exist in the data
        existing_columns = [col for col in merged_columns if col in self.data.columns]
        
        if not existing_columns:
            raise ValueError("No merged sentiment score columns found in the data")
        
        # First, parse the string vectors into numerical arrays
        for col in existing_columns:
            # Check the format of the data to determine parsing method
            sample_value = self.data[col].iloc[0]
            
            # If values are already numerical arrays, we don't need to parse
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"Column {col} already contains numerical arrays")
                continue
                
            # If values are stored as strings, parse them
            try:
                # Try parsing as literal Python representation
                self.data[col] = self.data[col].apply(ast.literal_eval)
                print(f"Parsed {col} using ast.literal_eval")
            except (ValueError, SyntaxError):
                try:
                    # Alternative: try parsing as comma-separated values
                    self.data[col] = self.data[col].str.strip('[]').str.split(',').apply(
                        lambda x: [float(val.strip()) for val in x]
                    )
                    print(f"Parsed {col} as comma-separated values")
                except Exception as e:
                    raise ValueError(f"Could not parse sentiment vectors in column {col}: {e}")
        
        # Create two separate normalized versions of each element in the vector
        print("\nNormalizing merged sentiment scores...")
        for col in existing_columns:
            # Extract Title and Full_text components
            title_scores = np.array([vec[0] for vec in self.data[col]])
            fulltext_scores = np.array([vec[1] for vec in self.data[col]])
            
            # Display original score ranges
            print(f"{col} (Title component): min={title_scores.min():.4f}, max={title_scores.max():.4f}")
            print(f"{col} (Full_text component): min={fulltext_scores.min():.4f}, max={fulltext_scores.max():.4f}")
            
            # Normalize each component separately
            title_scaler = MinMaxScaler(feature_range=(-1, 1))
            fulltext_scaler = MinMaxScaler(feature_range=(-1, 1))
            
            title_norm = title_scaler.fit_transform(title_scores.reshape(-1, 1)).flatten()
            fulltext_norm = fulltext_scaler.fit_transform(fulltext_scores.reshape(-1, 1)).flatten()
            
            # Store scalers for potential later use
            self.scalers[f"{col}_title"] = title_scaler
            self.scalers[f"{col}_fulltext"] = fulltext_scaler
            
            # Create normalized vectors
            self.data[f"{col}_norm"] = [
                [title_norm[i], fulltext_norm[i]] for i in range(len(title_norm))
            ]
            
            # Display normalized score ranges
            print(f"{col}_norm (Title component): min={title_norm.min():.4f}, max={title_norm.max():.4f}")
            print(f"{col}_norm (Full_text component): min={fulltext_norm.min():.4f}, max={fulltext_norm.max():.4f}")
        
        # Print sample of normalized vectors
        print("\nSample of normalized merged sentiment vectors (first 3 rows):")
        for col in [f"{col}_norm" for col in existing_columns]:
            print(f"{col}: {self.data[col].iloc[:3].tolist()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, sentiment_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            sentiment_col: The column containing the merged sentiment score vectors
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            Split datasets and corresponding indices
        """
        # Create masks for each time period
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        # Get data for each period
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Additional debug information: class distribution in each split
        print(f"Train class distribution ({label_col}): {train_data[label_col].value_counts().to_dict()}")
        print(f"Val class distribution ({label_col}): {val_data[label_col].value_counts().to_dict()}")
        print(f"Test class distribution ({label_col}): {test_data[label_col].value_counts().to_dict()}")
        
        # Extract merged sentiment vectors and convert to numpy arrays
        X_train = np.array([vec for vec in train_data[sentiment_col]])
        X_val = np.array([vec for vec in val_data[sentiment_col]])
        X_test = np.array([vec for vec in test_data[sentiment_col]])
        
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        # Get indices for later reference
        train_indices = train_data.index
        val_indices = val_data.index
        test_indices = test_data.index
        
        return (X_train, y_train, train_indices), (X_val, y_val, val_indices), (X_test, y_test, test_indices)
    
    def create_logistic_model(self, C=1.0, solver='liblinear', max_iter=1000, class_weight=None):
        """
        Create a logistic regression model for prediction.
        
        Args:
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
            
        Returns:
            Logistic regression model
        """
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=class_weight,
            random_state=42
        )
    
    def train_and_evaluate(self, sentiment_column, label_column, C=1.0, solver='liblinear', 
                          max_iter=1000, class_weight=None):
        """
        Train and evaluate logistic regression model for a specific merged sentiment column and label column.
        
        Args:
            sentiment_column: The merged sentiment score column to use
            label_column: The label column to use ('S_label' or 'L_label')
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
        """
        # Determine sentiment approach
        approach = None
        for app in self.sentiment_approaches:
            if app in sentiment_column:
                approach = app
                break
        
        if approach is None:
            raise ValueError(f"Could not determine sentiment approach from column name: {sentiment_column}")
        
        # Determine label type
        if label_column == 'S_label':
            label_type = 'Short-term'
        else:
            label_type = 'Long-term'
        
        # Store results
        combination_key = f"{approach}_{label_type}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training logistic regression model for {approach} merged approach, {label_type}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data for this layer
            (X_train, y_train, train_indices), \
            (X_val, y_val, val_indices), \
            (X_test, y_test, test_indices) = self.split_data(layer, sentiment_column, label_column)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or \
               len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_logistic_model(
                C=C,
                solver=solver,
                max_iter=max_iter,
                class_weight=class_weight
            )
            
            # Debug: Print shapes
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            
            # Train model
            model.fit(X_train, y_train)
            
            # Use validation set to evaluate
            val_pred_prob = model.predict_proba(X_val)[:, 1]
            val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            
            # Evaluate on test set
            y_pred_prob = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Print the coefs - now we have two coefficients for [Title, Full_text]
            print(f"Model coefficients: {model.coef_[0]}")
            print(f"Model intercept: {model.intercept_[0]:.6f}")
            
            # Print confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            print("Confusion Matrix:")
            print(cm)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Predicted class distribution
            print(f"Predicted class distribution: {np.bincount(y_pred)}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_prob': y_pred_prob,
                'accuracy': accuracy,
                'auc': auc,
                'confusion_matrix': cm,
                'coefficients': model.coef_[0],
                'intercept': model.intercept_[0]
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize confusion matrix for this layer
            self.visualize_confusion_matrix(
                cm,
                approach,
                label_type,
                i+1
            )
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid layers to calculate average metrics")
            self.results[combination_key]['avg_accuracy'] = np.nan
            self.results[combination_key]['avg_auc'] = np.nan
        
        return self
    
    def visualize_confusion_matrix(self, cm, approach, label_type, layer_num):
        """
        Visualize confusion matrix for a specific model and layer.
        
        Args:
            cm: Confusion matrix
            approach: Sentiment analysis approach
            label_type: Label type (Short-term or Long-term)
            layer_num: Layer number
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"Confusion Matrix: {approach} - Merged - {label_type} (Layer {layer_num})")
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        
        # Save figure
        save_path = f'Merged_Sentiment_Logistic_Plots/CVX/visualizations/confusion_matrices/{approach}_Merged_{label_type}_layer{layer_num}.png'
        plt.savefig(save_path)
        plt.close()
    
    def run_all_combinations(self, use_class_weights=False):
        """
        Run the logistic regression analysis for all combinations of merged sentiment approaches and label columns.
        
        Args:
            use_class_weights: Whether to use balanced class weights
        """
        # Define all combinations - now only looping through approaches and label types
        combinations = []
        
        for approach in self.sentiment_approaches:
            for label_type in ['S_label', 'L_label']:
                sentiment_col = f"Merged_{approach}_score_norm"
                if sentiment_col in self.data.columns:
                    combinations.append((sentiment_col, label_type))
        
        # Run analysis for each combination
        for sentiment_col, label_col in combinations:
            # Train model with or without class weights
            class_weight = 'balanced' if use_class_weights else None
            
            print(f"\n{'='*80}")
            print(f"TRAINING LOGISTIC REGRESSION MODEL FOR {sentiment_col} - {label_col}")
            print(f"{'='*80}")
            
            self.train_and_evaluate(
                sentiment_column=sentiment_col, 
                label_column=label_col,
                class_weight=class_weight
            )
        
        # Create summary visualizations
        self.create_summary_visualizations()
        
        # Print final summary
        self.print_summary()
        
        return self
    
    def create_summary_visualizations(self):
        """Create summary visualizations comparing model performances."""
        self._create_performance_by_approach_visualizations()
    
    def _create_performance_by_approach_visualizations(self):
        """Create visualizations comparing performance by approach."""
        # Collect all results
        approaches = []
        combinations = []
        accuracies = []
        aucs = []
        
        for combo_key, results in self.results.items():
            if 'avg_accuracy' in results:
                # Parse combination key
                parts = combo_key.split('_')
                approach = parts[0]
                label_type = '_'.join(parts[1:])
                
                approaches.append(approach)
                combinations.append(f"{approach}_{label_type}")
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            return
        
        # Create dataframe for plotting
        df = pd.DataFrame({
            'Approach': approaches,
            'Combination': combinations,
            'Accuracy': accuracies,
            'AUC': aucs
        })
        
        # Plot accuracy comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='Accuracy', hue='Combination', data=df)
        plt.title('Accuracy by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/CVX/visualizations/Accuracy_performance_comparison.png')
        plt.close()
        
        # Plot AUC comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='AUC', hue='Combination', data=df)
        plt.title('AUC by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/CVX/visualizations/AUC_performance_comparison.png')
        plt.close()
        
        # Also create a coefficient analysis visualization
        self._create_coefficient_analysis()
    
    def _create_coefficient_analysis(self):
        """Create visualization showing the relative importance of Title vs Full_text in each approach."""
        coef_data = []
        
        for combo_key, results in self.results.items():
            for layer_result in results.get('layer_results', []):
                if 'coefficients' in layer_result and len(layer_result['coefficients']) == 2:
                    parts = combo_key.split('_')
                    approach = parts[0]
                    label_type = '_'.join(parts[1:])
                    
                    coef_data.append({
                        'Approach': approach,
                        'Label Type': label_type,
                        'Layer': f"Layer {layer_result['layer']}",
                        'Title Coefficient': layer_result['coefficients'][0],
                        'Full_text Coefficient': layer_result['coefficients'][1],
                        'Title to Full_text Ratio': abs(layer_result['coefficients'][0] / 
                                                      (layer_result['coefficients'][1] 
                                                       if layer_result['coefficients'][1] != 0 else 1e-6))
                    })
        
        if not coef_data:
            return
        
        df = pd.DataFrame(coef_data)
        
        # Plot coefficient comparison
        plt.figure(figsize=(14, 10))
        
        plt.subplot(2, 1, 1)
        sns.barplot(x='Approach', y='Title Coefficient', hue='Label Type', data=df)
        plt.title('Title Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.subplot(2, 1, 2)
        sns.barplot(x='Approach', y='Full_text Coefficient', hue='Label Type', data=df)
        plt.title('Full_text Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.tight_layout()
        plt.savefig('Merged_Sentiment_Logistic_Plots/CVX/visualizations/Coefficient_comparison.png')
        plt.close()
    
    def print_summary(self):
        """Print a summary of all results."""
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - CVX)")
        print("="*80)
        
        # Organize results by approach
        for approach in self.sentiment_approaches:
            print(f"\nApproach: {approach}")
            print("-" * 40)
            
            for label_type in ['Short-term', 'Long-term']:
                label_col = 'S_label' if label_type == 'Short-term' else 'L_label'
                combination_key = f"{approach}_{label_type}"
                
                if combination_key in self.results and 'avg_accuracy' in self.results[combination_key]:
                    avg_accuracy = self.results[combination_key]['avg_accuracy']
                    avg_auc = self.results[combination_key]['avg_auc']
                    
                    print(f"Merged Vector + {label_type}:")
                    print(f"  Average Accuracy: {avg_accuracy:.4f}")
                    print(f"  Average AUC: {avg_auc:.4f}")
                    
                    # Print layer-specific results
                    for i, layer_result in enumerate(self.results[combination_key].get('layer_results', [])):
                        accuracy = layer_result['accuracy']
                        auc = layer_result['auc']
                        coefficients = layer_result.get('coefficients', 'N/A')
                        intercept = layer_result.get('intercept', 'N/A')
                        
                        # More readable coefficient display - showing Title and Full_text components
                        coef_str = "N/A"
                        if isinstance(coefficients, np.ndarray) and len(coefficients) == 2:
                            coef_str = f"Title: {coefficients[0]:.4f}, Full_text: {coefficients[1]:.4f}"
                        
                        print(f"    Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                        print(f"    Layer {i+1} - Coefficients: {coef_str}, Intercept: {intercept}")
            
            print()
        
        # Find best overall combination
        best_accuracy = 0
        best_auc = 0
        best_accuracy_combo = None
        best_auc_combo = None
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results:
                if results['avg_accuracy'] > best_accuracy:
                    best_accuracy = results['avg_accuracy']
                    best_accuracy_combo = combo
                
                if results['avg_auc'] > best_auc:
                    best_auc = results['avg_auc']
                    best_auc_combo = combo
        
        print("\nBest Overall Combinations:")
        print(f"Best Accuracy: {best_accuracy:.4f} - {best_accuracy_combo}")
        print(f"Best AUC: {best_auc:.4f} - {best_auc_combo}")
        
        return self

# Main execution
if __name__ == "__main__":
    # Initialize predictor with the new CSV file
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_sentiment_SP500_database/wall_street_news_sentiment_database_part2_Merge_CVX.csv')
    
    # Run prediction pipeline
    # First run without class weights
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS")
    print("="*80)
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=False)
    
    # Then run with class weights to address potential class imbalance
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITH BALANCED CLASS WEIGHTS")
    print("="*80)
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_sentiment_SP500_database/wall_street_news_sentiment_database_part2_Merge_CVX.csv')
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=True)


RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS
Loaded 838 financial news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 446, 0: 392}
Class distribution for long-term prediction: {1: 464, 0: 374}

Processing merged sentiment scores...
Parsed Merged_McDonld_score using ast.literal_eval
Parsed Merged_FinBERT_ProsusAI_score using ast.literal_eval
Parsed Merged_FinBERT_yiyang_score using ast.literal_eval
Parsed Merged_FinGPT_score using ast.literal_eval
Parsed Merged_Majority_vote_mean_score using ast.literal_eval
Parsed Merged_Fino1_score using ast.literal_eval

Normalizing merged sentiment scores...
Merged_McDonld_score (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score (Full_text component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_ProsusAI_score (Title component)


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (L_label): {1: 282, 0: 250}
Val class distribution (L_label): {1: 97, 0: 36}
Test class distribution (L_label): {0: 57, 1: 44}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.6391
Test Accuracy for Layer 2: 0.4653
Test AUC for Layer 2: 0.5381
Model coefficients: [0.08894628 0.36612057]
Model intercept: 0.283901
Confusion Matrix:
[[14 43]
 [11 33]]

Classification Report:
              precision    recall  f1-score   support

           0       0.56      0.25      0.34        57
           1       0.43      0.75      0.55        44

    accuracy                           0.47       101
   macro avg       0.50      0.50      0.45       101
weighted avg       0.51      0.47      0.43       101

Predicted class distribution: [25 76]




Average Test Accuracy across all layers: 0.4351
Average Test AUC across all layers: 0.4753

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinBERT_yiyang_score_norm - S_label

Training logistic regression model for FinBERT_yiyang merged approach, Short-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (S_label): {1: 206, 0: 195}
Val class distribution (S_label): {1: 69, 0: 62}
Test class distribution (S_label): {1: 81, 0: 52}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.4656
Test Accuracy for Layer 1: 0.5940
Test AUC for Layer 1: 0.5169
Model coefficients: [-0.10458802 -0.04859313]
Model intercept: 0.042673
Confusion Matrix:
[[10 42]
 [12 69]]

Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.19      0.27        


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (S_label): {1: 356, 0: 310}
Val class distribution (S_label): {1: 54, 0: 47}
Test class distribution (S_label): {1: 36, 0: 34}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.5347
Test Accuracy for Layer 3: 0.4857
Test AUC for Layer 3: 0.4400
Model coefficients: [-0.17581424  0.15153111]
Model intercept: 0.260100
Confusion Matrix:
[[ 0 34]
 [ 2 34]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        34
           1       0.50      0.94      0.65        36

    accuracy                           0.49        70
   macro avg       0.25      0.47      0.33        70
weighted avg       0.26      0.49      0.34        70

Predicted class distribution: [ 2 68]



Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (L_label): {1: 282, 0: 250}
Val class distribution (L_label): {1: 97, 0: 36}
Test class distribution (L_label): {0: 57, 1: 44}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.6316
Test Accuracy for Layer 2: 0.4356
Test AUC for Layer 2: 0.4737
Model coefficients: [0.19712818 0.35929778]
Model intercept: 0.253956
Confusion Matrix:
[[15 42]
 [15 29]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.26      0.34        57
           1       0.41      0.66      0.50        44

    accuracy                           0.44       101
   macro avg       0.45      0.46      0.42       101
weighted avg       0.46      0.44      0.41       101

Predicted class distribution: [30 71]




Average Test Accuracy across all layers: 0.4367
Average Test AUC across all layers: 0.5000

SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - CVX)

Approach: McDonld
----------------------------------------
Merged Vector + Short-term:
  Average Accuracy: 0.5306
  Average AUC: 0.5162
    Layer 1 - Accuracy: 0.5714, AUC: 0.5471
    Layer 1 - Coefficients: Title: -0.0064, Full_text: 0.3038, Intercept: 0.17880042410658056
    Layer 2 - Accuracy: 0.5347, AUC: 0.4951
    Layer 2 - Coefficients: Title: -0.1451, Full_text: 0.4108, Intercept: 0.19698430306279857
    Layer 3 - Accuracy: 0.4857, AUC: 0.5065
    Layer 3 - Coefficients: Title: -0.0938, Full_text: 0.4991, Intercept: 0.30510657204134095
Merged Vector + Long-term:
  Average Accuracy: 0.4962
  Average AUC: 0.5021
    Layer 1 - Accuracy: 0.4662, AUC: 0.4877
    Layer 1 - Coefficients: Title: 0.1603, Full_text: 0.1510, Intercept: 0.052845344319754184
    Layer 2 - Accuracy: 0.4653, AUC: 0.5381
    Layer 2 - Coefficients: Title: 0.0889, F


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (S_label): {1: 275, 0: 257}
Val class distribution (S_label): {1: 81, 0: 52}
Test class distribution (S_label): {1: 54, 0: 47}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.6015
Test Accuracy for Layer 2: 0.4950
Test AUC for Layer 2: 0.4951
Model coefficients: [-0.14502524  0.41348528]
Model intercept: 0.130903
Confusion Matrix:
[[21 26]
 [25 29]]

Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.45      0.45        47
           1       0.53      0.54      0.53        54

    accuracy                           0.50       101
   macro avg       0.49      0.49      0.49       101
weighted avg       0.49      0.50      0.49       101

Predicted class distribution: [46 55]


Average Test Accuracy across all layers: 0.4968
Average Test AUC across all layers: 0.4752

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinBERT_ProsusAI_score_norm - L_label

Training logistic regression model for FinBERT_ProsusAI merged approach, Long-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (L_label): {0: 205, 1: 196}
Val class distribution (L_label): {1: 86, 0: 45}
Test class distribution (L_label): {1: 97, 0: 36}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.5649
Test Accuracy for Layer 1: 0.5489
Test AUC for Layer 1: 0.5326
Model coefficients: [0.03552134 0.06858927]
Model intercept: 0.015367
Confusion Matrix:
[[18 18]
 [42 55]]

Classification Report:
              precision    recall  f1-score   support

           0       0.30      0.50      0.38       


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (L_label): {1: 282, 0: 250}
Val class distribution (L_label): {1: 97, 0: 36}
Test class distribution (L_label): {0: 57, 1: 44}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.6015
Test Accuracy for Layer 2: 0.4950
Test AUC for Layer 2: 0.5187
Model coefficients: [ 0.3373725  -0.28067341]
Model intercept: 0.008623
Confusion Matrix:
[[22 35]
 [16 28]]

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.39      0.46        57
           1       0.44      0.64      0.52        44

    accuracy                           0.50       101
   macro avg       0.51      0.51      0.49       101
weighted avg       0.52      0.50      0.49       101

Predicted class distribution: [38 63]


Average Test Accuracy across all layers: 0.4270
Average Test AUC across all layers: 0.4678

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_Majority_vote_mean_score_norm - S_label

Training logistic regression model for Majority_vote_mean merged approach, Short-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (S_label): {1: 206, 0: 195}
Val class distribution (S_label): {1: 69, 0: 62}
Test class distribution (S_label): {1: 81, 0: 52}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.5191
Test Accuracy for Layer 1: 0.4511
Test AUC for Layer 1: 0.4658
Model coefficients: [-0.13680905 -0.09589145]
Model intercept: -0.047059
Confusion Matrix:
[[24 28]
 [45 36]]

Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.46      0.4


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (S_label): {1: 356, 0: 310}
Val class distribution (S_label): {1: 54, 0: 47}
Test class distribution (S_label): {1: 36, 0: 34}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.4950
Test Accuracy for Layer 3: 0.5571
Test AUC for Layer 3: 0.5196
Model coefficients: [0.12242383 0.06962545]
Model intercept: -0.002188
Confusion Matrix:
[[ 8 26]
 [ 5 31]]

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.24      0.34        34
           1       0.54      0.86      0.67        36

    accuracy                           0.56        70
   macro avg       0.58      0.55      0.50        70
weighted avg       0.58      0.56      0.51        70

Predicted class distribution: [13 57]



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import os
from datetime import datetime
import ast  # For parsing string representations of lists
import warnings
warnings.filterwarnings('ignore')

class MergedSentimentStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor using logistic regression for merged financial sentiment analysis.
        
        Args:
            csv_path: Path to the CSV file containing merged sentiment score vectors and stock labels
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        self.sentiment_approaches = [
            'McDonld', 'FinBERT_ProsusAI', 'FinBERT_yiyang', 
            'FinGPT', 'Majority_vote_mean', 'Fino1'
        ]
        self.scalers = {}
        
        # Create directories for visualizations
        os.makedirs('Merged_Sentiment_Logistic_Plots/MPC/visualizations', exist_ok=True)
        os.makedirs('Merged_Sentiment_Logistic_Plots/MPC/visualizations/confusion_matrices', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing merged sentiment scores and stock labels."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='%d/%m/%Y')
        self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='%d/%m/%Y')
        self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='%d/%m/%Y')
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Display initial data info
        print(f"Loaded {len(self.data)} financial news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        
        # Show class distribution for each label type
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        # Process merged sentiment score columns
        self._process_merged_scores()
        
        return self
    
    def _process_merged_scores(self):
        """
        Process and normalize the merged sentiment score columns.
        Each merged score is a vector [Title_score, Full_text_score]
        """
        print("\nProcessing merged sentiment scores...")
        
        # Get all merged sentiment score columns
        merged_columns = [f"Merged_{approach}_score" for approach in self.sentiment_approaches]
        
        # Check if these columns exist in the data
        existing_columns = [col for col in merged_columns if col in self.data.columns]
        
        if not existing_columns:
            raise ValueError("No merged sentiment score columns found in the data")
        
        # First, parse the string vectors into numerical arrays
        for col in existing_columns:
            # Check the format of the data to determine parsing method
            sample_value = self.data[col].iloc[0]
            
            # If values are already numerical arrays, we don't need to parse
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"Column {col} already contains numerical arrays")
                continue
                
            # If values are stored as strings, parse them
            try:
                # Try parsing as literal Python representation
                self.data[col] = self.data[col].apply(ast.literal_eval)
                print(f"Parsed {col} using ast.literal_eval")
            except (ValueError, SyntaxError):
                try:
                    # Alternative: try parsing as comma-separated values
                    self.data[col] = self.data[col].str.strip('[]').str.split(',').apply(
                        lambda x: [float(val.strip()) for val in x]
                    )
                    print(f"Parsed {col} as comma-separated values")
                except Exception as e:
                    raise ValueError(f"Could not parse sentiment vectors in column {col}: {e}")
        
        # Create two separate normalized versions of each element in the vector
        print("\nNormalizing merged sentiment scores...")
        for col in existing_columns:
            # Extract Title and Full_text components
            title_scores = np.array([vec[0] for vec in self.data[col]])
            fulltext_scores = np.array([vec[1] for vec in self.data[col]])
            
            # Display original score ranges
            print(f"{col} (Title component): min={title_scores.min():.4f}, max={title_scores.max():.4f}")
            print(f"{col} (Full_text component): min={fulltext_scores.min():.4f}, max={fulltext_scores.max():.4f}")
            
            # Normalize each component separately
            title_scaler = MinMaxScaler(feature_range=(-1, 1))
            fulltext_scaler = MinMaxScaler(feature_range=(-1, 1))
            
            title_norm = title_scaler.fit_transform(title_scores.reshape(-1, 1)).flatten()
            fulltext_norm = fulltext_scaler.fit_transform(fulltext_scores.reshape(-1, 1)).flatten()
            
            # Store scalers for potential later use
            self.scalers[f"{col}_title"] = title_scaler
            self.scalers[f"{col}_fulltext"] = fulltext_scaler
            
            # Create normalized vectors
            self.data[f"{col}_norm"] = [
                [title_norm[i], fulltext_norm[i]] for i in range(len(title_norm))
            ]
            
            # Display normalized score ranges
            print(f"{col}_norm (Title component): min={title_norm.min():.4f}, max={title_norm.max():.4f}")
            print(f"{col}_norm (Full_text component): min={fulltext_norm.min():.4f}, max={fulltext_norm.max():.4f}")
        
        # Print sample of normalized vectors
        print("\nSample of normalized merged sentiment vectors (first 3 rows):")
        for col in [f"{col}_norm" for col in existing_columns]:
            print(f"{col}: {self.data[col].iloc[:3].tolist()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, sentiment_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            sentiment_col: The column containing the merged sentiment score vectors
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            Split datasets and corresponding indices
        """
        # Create masks for each time period
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        # Get data for each period
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Additional debug information: class distribution in each split
        print(f"Train class distribution ({label_col}): {train_data[label_col].value_counts().to_dict()}")
        print(f"Val class distribution ({label_col}): {val_data[label_col].value_counts().to_dict()}")
        print(f"Test class distribution ({label_col}): {test_data[label_col].value_counts().to_dict()}")
        
        # Extract merged sentiment vectors and convert to numpy arrays
        X_train = np.array([vec for vec in train_data[sentiment_col]])
        X_val = np.array([vec for vec in val_data[sentiment_col]])
        X_test = np.array([vec for vec in test_data[sentiment_col]])
        
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        # Get indices for later reference
        train_indices = train_data.index
        val_indices = val_data.index
        test_indices = test_data.index
        
        return (X_train, y_train, train_indices), (X_val, y_val, val_indices), (X_test, y_test, test_indices)
    
    def create_logistic_model(self, C=1.0, solver='liblinear', max_iter=1000, class_weight=None):
        """
        Create a logistic regression model for prediction.
        
        Args:
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
            
        Returns:
            Logistic regression model
        """
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=class_weight,
            random_state=42
        )
    
    def train_and_evaluate(self, sentiment_column, label_column, C=1.0, solver='liblinear', 
                          max_iter=1000, class_weight=None):
        """
        Train and evaluate logistic regression model for a specific merged sentiment column and label column.
        
        Args:
            sentiment_column: The merged sentiment score column to use
            label_column: The label column to use ('S_label' or 'L_label')
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
        """
        # Determine sentiment approach
        approach = None
        for app in self.sentiment_approaches:
            if app in sentiment_column:
                approach = app
                break
        
        if approach is None:
            raise ValueError(f"Could not determine sentiment approach from column name: {sentiment_column}")
        
        # Determine label type
        if label_column == 'S_label':
            label_type = 'Short-term'
        else:
            label_type = 'Long-term'
        
        # Store results
        combination_key = f"{approach}_{label_type}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training logistic regression model for {approach} merged approach, {label_type}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data for this layer
            (X_train, y_train, train_indices), \
            (X_val, y_val, val_indices), \
            (X_test, y_test, test_indices) = self.split_data(layer, sentiment_column, label_column)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or \
               len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_logistic_model(
                C=C,
                solver=solver,
                max_iter=max_iter,
                class_weight=class_weight
            )
            
            # Debug: Print shapes
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            
            # Train model
            model.fit(X_train, y_train)
            
            # Use validation set to evaluate
            val_pred_prob = model.predict_proba(X_val)[:, 1]
            val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            
            # Evaluate on test set
            y_pred_prob = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Print the coefs - now we have two coefficients for [Title, Full_text]
            print(f"Model coefficients: {model.coef_[0]}")
            print(f"Model intercept: {model.intercept_[0]:.6f}")
            
            # Print confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            print("Confusion Matrix:")
            print(cm)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Predicted class distribution
            print(f"Predicted class distribution: {np.bincount(y_pred)}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_prob': y_pred_prob,
                'accuracy': accuracy,
                'auc': auc,
                'confusion_matrix': cm,
                'coefficients': model.coef_[0],
                'intercept': model.intercept_[0]
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize confusion matrix for this layer
            self.visualize_confusion_matrix(
                cm,
                approach,
                label_type,
                i+1
            )
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid layers to calculate average metrics")
            self.results[combination_key]['avg_accuracy'] = np.nan
            self.results[combination_key]['avg_auc'] = np.nan
        
        return self
    
    def visualize_confusion_matrix(self, cm, approach, label_type, layer_num):
        """
        Visualize confusion matrix for a specific model and layer.
        
        Args:
            cm: Confusion matrix
            approach: Sentiment analysis approach
            label_type: Label type (Short-term or Long-term)
            layer_num: Layer number
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"Confusion Matrix: {approach} - Merged - {label_type} (Layer {layer_num})")
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        
        # Save figure
        save_path = f'Merged_Sentiment_Logistic_Plots/MPC/visualizations/confusion_matrices/{approach}_Merged_{label_type}_layer{layer_num}.png'
        plt.savefig(save_path)
        plt.close()
    
    def run_all_combinations(self, use_class_weights=False):
        """
        Run the logistic regression analysis for all combinations of merged sentiment approaches and label columns.
        
        Args:
            use_class_weights: Whether to use balanced class weights
        """
        # Define all combinations - now only looping through approaches and label types
        combinations = []
        
        for approach in self.sentiment_approaches:
            for label_type in ['S_label', 'L_label']:
                sentiment_col = f"Merged_{approach}_score_norm"
                if sentiment_col in self.data.columns:
                    combinations.append((sentiment_col, label_type))
        
        # Run analysis for each combination
        for sentiment_col, label_col in combinations:
            # Train model with or without class weights
            class_weight = 'balanced' if use_class_weights else None
            
            print(f"\n{'='*80}")
            print(f"TRAINING LOGISTIC REGRESSION MODEL FOR {sentiment_col} - {label_col}")
            print(f"{'='*80}")
            
            self.train_and_evaluate(
                sentiment_column=sentiment_col, 
                label_column=label_col,
                class_weight=class_weight
            )
        
        # Create summary visualizations
        self.create_summary_visualizations()
        
        # Print final summary
        self.print_summary()
        
        return self
    
    def create_summary_visualizations(self):
        """Create summary visualizations comparing model performances."""
        self._create_performance_by_approach_visualizations()
    
    def _create_performance_by_approach_visualizations(self):
        """Create visualizations comparing performance by approach."""
        # Collect all results
        approaches = []
        combinations = []
        accuracies = []
        aucs = []
        
        for combo_key, results in self.results.items():
            if 'avg_accuracy' in results:
                # Parse combination key
                parts = combo_key.split('_')
                approach = parts[0]
                label_type = '_'.join(parts[1:])
                
                approaches.append(approach)
                combinations.append(f"{approach}_{label_type}")
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            return
        
        # Create dataframe for plotting
        df = pd.DataFrame({
            'Approach': approaches,
            'Combination': combinations,
            'Accuracy': accuracies,
            'AUC': aucs
        })
        
        # Plot accuracy comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='Accuracy', hue='Combination', data=df)
        plt.title('Accuracy by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/MPC/visualizations/Accuracy_performance_comparison.png')
        plt.close()
        
        # Plot AUC comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='AUC', hue='Combination', data=df)
        plt.title('AUC by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/MPC/visualizations/AUC_performance_comparison.png')
        plt.close()
        
        # Also create a coefficient analysis visualization
        self._create_coefficient_analysis()
    
    def _create_coefficient_analysis(self):
        """Create visualization showing the relative importance of Title vs Full_text in each approach."""
        coef_data = []
        
        for combo_key, results in self.results.items():
            for layer_result in results.get('layer_results', []):
                if 'coefficients' in layer_result and len(layer_result['coefficients']) == 2:
                    parts = combo_key.split('_')
                    approach = parts[0]
                    label_type = '_'.join(parts[1:])
                    
                    coef_data.append({
                        'Approach': approach,
                        'Label Type': label_type,
                        'Layer': f"Layer {layer_result['layer']}",
                        'Title Coefficient': layer_result['coefficients'][0],
                        'Full_text Coefficient': layer_result['coefficients'][1],
                        'Title to Full_text Ratio': abs(layer_result['coefficients'][0] / 
                                                      (layer_result['coefficients'][1] 
                                                       if layer_result['coefficients'][1] != 0 else 1e-6))
                    })
        
        if not coef_data:
            return
        
        df = pd.DataFrame(coef_data)
        
        # Plot coefficient comparison
        plt.figure(figsize=(14, 10))
        
        plt.subplot(2, 1, 1)
        sns.barplot(x='Approach', y='Title Coefficient', hue='Label Type', data=df)
        plt.title('Title Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.subplot(2, 1, 2)
        sns.barplot(x='Approach', y='Full_text Coefficient', hue='Label Type', data=df)
        plt.title('Full_text Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.tight_layout()
        plt.savefig('Merged_Sentiment_Logistic_Plots/MPC/visualizations/Coefficient_comparison.png')
        plt.close()
    
    def print_summary(self):
        """Print a summary of all results."""
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - MPC)")
        print("="*80)
        
        # Organize results by approach
        for approach in self.sentiment_approaches:
            print(f"\nApproach: {approach}")
            print("-" * 40)
            
            for label_type in ['Short-term', 'Long-term']:
                label_col = 'S_label' if label_type == 'Short-term' else 'L_label'
                combination_key = f"{approach}_{label_type}"
                
                if combination_key in self.results and 'avg_accuracy' in self.results[combination_key]:
                    avg_accuracy = self.results[combination_key]['avg_accuracy']
                    avg_auc = self.results[combination_key]['avg_auc']
                    
                    print(f"Merged Vector + {label_type}:")
                    print(f"  Average Accuracy: {avg_accuracy:.4f}")
                    print(f"  Average AUC: {avg_auc:.4f}")
                    
                    # Print layer-specific results
                    for i, layer_result in enumerate(self.results[combination_key].get('layer_results', [])):
                        accuracy = layer_result['accuracy']
                        auc = layer_result['auc']
                        coefficients = layer_result.get('coefficients', 'N/A')
                        intercept = layer_result.get('intercept', 'N/A')
                        
                        # More readable coefficient display - showing Title and Full_text components
                        coef_str = "N/A"
                        if isinstance(coefficients, np.ndarray) and len(coefficients) == 2:
                            coef_str = f"Title: {coefficients[0]:.4f}, Full_text: {coefficients[1]:.4f}"
                        
                        print(f"    Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                        print(f"    Layer {i+1} - Coefficients: {coef_str}, Intercept: {intercept}")
            
            print()
        
        # Find best overall combination
        best_accuracy = 0
        best_auc = 0
        best_accuracy_combo = None
        best_auc_combo = None
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results:
                if results['avg_accuracy'] > best_accuracy:
                    best_accuracy = results['avg_accuracy']
                    best_accuracy_combo = combo
                
                if results['avg_auc'] > best_auc:
                    best_auc = results['avg_auc']
                    best_auc_combo = combo
        
        print("\nBest Overall Combinations:")
        print(f"Best Accuracy: {best_accuracy:.4f} - {best_accuracy_combo}")
        print(f"Best AUC: {best_auc:.4f} - {best_auc_combo}")
        
        return self

# Main execution
if __name__ == "__main__":
    # Initialize predictor with the new CSV file
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_sentiment_SP500_database/wall_street_news_sentiment_database_part2_Merge_MPC.csv')
    
    # Run prediction pipeline
    # First run without class weights
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS")
    print("="*80)
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=False)
    
    # Then run with class weights to address potential class imbalance
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITH BALANCED CLASS WEIGHTS")
    print("="*80)
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_sentiment_SP500_database/wall_street_news_sentiment_database_part2_Merge_MPC.csv')
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=True)


RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS
Loaded 838 financial news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 455, 0: 383}
Class distribution for long-term prediction: {1: 497, 0: 341}

Processing merged sentiment scores...
Parsed Merged_McDonld_score using ast.literal_eval
Parsed Merged_FinBERT_ProsusAI_score using ast.literal_eval
Parsed Merged_FinBERT_yiyang_score using ast.literal_eval
Parsed Merged_FinGPT_score using ast.literal_eval
Parsed Merged_Majority_vote_mean_score using ast.literal_eval
Parsed Merged_Fino1_score using ast.literal_eval

Normalizing merged sentiment scores...
Merged_McDonld_score (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score (Full_text component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_ProsusAI_score (Title component)


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (L_label): {1: 402, 0: 264}
Val class distribution (L_label): {1: 60, 0: 41}
Test class distribution (L_label): {0: 36, 1: 34}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.5941
Test Accuracy for Layer 3: 0.4714
Test AUC for Layer 3: 0.5658
Model coefficients: [0.10049872 0.55143055]
Model intercept: 0.651769
Confusion Matrix:
[[ 0 36]
 [ 1 33]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        36
           1       0.48      0.97      0.64        34

    accuracy                           0.47        70
   macro avg       0.24      0.49      0.32        70
weighted avg       0.23      0.47      0.31        70

Predicted class distribution: [ 1 69]

A


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (S_label): {1: 290, 0: 242}
Val class distribution (S_label): {1: 73, 0: 60}
Test class distribution (S_label): {1: 53, 0: 48}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.5489
Test Accuracy for Layer 2: 0.5248
Test AUC for Layer 2: 0.4811
Model coefficients: [ 0.14056078 -0.05640883]
Model intercept: 0.187389
Confusion Matrix:
[[ 0 48]
 [ 0 53]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        48
           1       0.52      1.00      0.69        53

    accuracy                           0.52       101
   macro avg       0.26      0.50      0.34       101
weighted avg       0.28      0.52      0.36       101

Predicted class distribution: [  0 10


Average Test Accuracy across all layers: 0.5479
Average Test AUC across all layers: 0.5358

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinGPT_score_norm - L_label

Training logistic regression model for FinGPT merged approach, Long-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (L_label): {1: 218, 0: 183}
Val class distribution (L_label): {1: 86, 0: 45}
Test class distribution (L_label): {1: 98, 0: 35}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.6412
Test Accuracy for Layer 1: 0.6842
Test AUC for Layer 1: 0.5058
Model coefficients: [0.60142368 0.47678854]
Model intercept: 0.578106
Confusion Matrix:
[[ 2 33]
 [ 9 89]]

Classification Report:
              precision    recall  f1-score   support

           0       0.18      0.06      0.09        35
           1    


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (L_label): {1: 304, 0: 228}
Val class distribution (L_label): {1: 98, 0: 35}
Test class distribution (L_label): {1: 60, 0: 41}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.6391
Test Accuracy for Layer 2: 0.5149
Test AUC for Layer 2: 0.5187
Model coefficients: [0.3186544  0.51243846]
Model intercept: 0.485391
Confusion Matrix:
[[ 5 36]
 [13 47]]

Classification Report:
              precision    recall  f1-score   support

           0       0.28      0.12      0.17        41
           1       0.57      0.78      0.66        60

    accuracy                           0.51       101
   macro avg       0.42      0.45      0.41       101
weighted avg       0.45      0.51      0.46       101

Predicted class distribution: [18 83]




Average Test Accuracy across all layers: 0.5938
Average Test AUC across all layers: 0.4761

SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - MPC)

Approach: McDonld
----------------------------------------
Merged Vector + Short-term:
  Average Accuracy: 0.5243
  Average AUC: 0.4994
    Layer 1 - Accuracy: 0.5338, AUC: 0.5588
    Layer 1 - Coefficients: Title: 0.0907, Full_text: 0.4806, Intercept: 0.3732049619561798
    Layer 2 - Accuracy: 0.5248, AUC: 0.4544
    Layer 2 - Coefficients: Title: -0.0011, Full_text: 0.4257, Intercept: 0.3476667690576358
    Layer 3 - Accuracy: 0.5143, AUC: 0.4851
    Layer 3 - Coefficients: Title: 0.0536, Full_text: 0.4372, Intercept: 0.35706167678483736
Merged Vector + Long-term:
  Average Accuracy: 0.5492
  Average AUC: 0.5520
    Layer 1 - Accuracy: 0.6316, AUC: 0.5472
    Layer 1 - Coefficients: Title: 0.1303, Full_text: 0.6104, Intercept: 0.4565235763262658
    Layer 2 - Accuracy: 0.5446, AUC: 0.5429
    Layer 2 - Coefficients: Title: 0.1517, Full_te


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (S_label): {1: 290, 0: 242}
Val class distribution (S_label): {1: 73, 0: 60}
Test class distribution (S_label): {1: 53, 0: 48}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.5338
Test Accuracy for Layer 2: 0.4554
Test AUC for Layer 2: 0.4548
Model coefficients: [-0.00169173  0.43140824]
Model intercept: 0.170308
Confusion Matrix:
[[23 25]
 [30 23]]

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.48      0.46        48
           1       0.48      0.43      0.46        53

    accuracy                           0.46       101
   macro avg       0.46      0.46      0.46       101
weighted avg       0.46      0.46      0.46       101

Predicted class distribution: [53 48]


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (L_label): {1: 304, 0: 228}
Val class distribution (L_label): {1: 98, 0: 35}
Test class distribution (L_label): {1: 60, 0: 41}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.5038
Test Accuracy for Layer 2: 0.5050
Test AUC for Layer 2: 0.4764
Model coefficients: [0.16699075 0.25660253]
Model intercept: 0.055082
Confusion Matrix:
[[21 20]
 [30 30]]

Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.51      0.46        41
           1       0.60      0.50      0.55        60

    accuracy                           0.50       101
   macro avg       0.51      0.51      0.50       101
weighted avg       0.52      0.50      0.51       101

Predicted class distribution: [51 50]




Average Test Accuracy across all layers: 0.5434
Average Test AUC across all layers: 0.5304

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinGPT_score_norm - S_label

Training logistic regression model for FinGPT merged approach, Short-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (S_label): {1: 216, 0: 185}
Val class distribution (S_label): {1: 74, 0: 57}
Test class distribution (S_label): {1: 73, 0: 60}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.4962
Test Accuracy for Layer 1: 0.4812
Test AUC for Layer 1: 0.5301
Model coefficients: [0.22203705 0.21324878]
Model intercept: 0.179216
Confusion Matrix:
[[54  6]
 [63 10]]

Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.90      0.61        60
           1   


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (S_label): {1: 363, 0: 303}
Val class distribution (S_label): {1: 53, 0: 48}
Test class distribution (S_label): {1: 39, 0: 31}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.3960
Test Accuracy for Layer 3: 0.4571
Test AUC for Layer 3: 0.5318
Model coefficients: [0.09375117 0.23628331]
Model intercept: 0.083041
Confusion Matrix:
[[15 16]
 [22 17]]

Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.48      0.44        31
           1       0.52      0.44      0.47        39

    accuracy                           0.46        70
   macro avg       0.46      0.46      0.46        70
weighted avg       0.47      0.46      0.46        70

Predicted class distribution: [37 33]

A


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (L_label): {1: 304, 0: 228}
Val class distribution (L_label): {1: 98, 0: 35}
Test class distribution (L_label): {1: 60, 0: 41}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.6541
Test Accuracy for Layer 2: 0.5446
Test AUC for Layer 2: 0.4671
Model coefficients: [0.51971198 0.19596287]
Model intercept: -0.089158
Confusion Matrix:
[[ 4 37]
 [ 9 51]]

Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.10      0.15        41
           1       0.58      0.85      0.69        60

    accuracy                           0.54       101
   macro avg       0.44      0.47      0.42       101
weighted avg       0.47      0.54      0.47       101

Predicted class distribution: [13 88]


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import os
from datetime import datetime
import ast  # For parsing string representations of lists
import warnings
warnings.filterwarnings('ignore')

class MergedSentimentStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor using logistic regression for merged financial sentiment analysis.
        
        Args:
            csv_path: Path to the CSV file containing merged sentiment score vectors and stock labels
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        self.sentiment_approaches = [
            'McDonld', 'FinBERT_ProsusAI', 'FinBERT_yiyang', 
            'FinGPT', 'Majority_vote_mean', 'Fino1'
        ]
        self.scalers = {}
        
        # Create directories for visualizations
        os.makedirs('Merged_Sentiment_Logistic_Plots/SLB/visualizations', exist_ok=True)
        os.makedirs('Merged_Sentiment_Logistic_Plots/SLB/visualizations/confusion_matrices', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing merged sentiment scores and stock labels."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='%d/%m/%Y')
        self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='%d/%m/%Y')
        self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='%d/%m/%Y')
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Display initial data info
        print(f"Loaded {len(self.data)} financial news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        
        # Show class distribution for each label type
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        # Process merged sentiment score columns
        self._process_merged_scores()
        
        return self
    
    def _process_merged_scores(self):
        """
        Process and normalize the merged sentiment score columns.
        Each merged score is a vector [Title_score, Full_text_score]
        """
        print("\nProcessing merged sentiment scores...")
        
        # Get all merged sentiment score columns
        merged_columns = [f"Merged_{approach}_score" for approach in self.sentiment_approaches]
        
        # Check if these columns exist in the data
        existing_columns = [col for col in merged_columns if col in self.data.columns]
        
        if not existing_columns:
            raise ValueError("No merged sentiment score columns found in the data")
        
        # First, parse the string vectors into numerical arrays
        for col in existing_columns:
            # Check the format of the data to determine parsing method
            sample_value = self.data[col].iloc[0]
            
            # If values are already numerical arrays, we don't need to parse
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"Column {col} already contains numerical arrays")
                continue
                
            # If values are stored as strings, parse them
            try:
                # Try parsing as literal Python representation
                self.data[col] = self.data[col].apply(ast.literal_eval)
                print(f"Parsed {col} using ast.literal_eval")
            except (ValueError, SyntaxError):
                try:
                    # Alternative: try parsing as comma-separated values
                    self.data[col] = self.data[col].str.strip('[]').str.split(',').apply(
                        lambda x: [float(val.strip()) for val in x]
                    )
                    print(f"Parsed {col} as comma-separated values")
                except Exception as e:
                    raise ValueError(f"Could not parse sentiment vectors in column {col}: {e}")
        
        # Create two separate normalized versions of each element in the vector
        print("\nNormalizing merged sentiment scores...")
        for col in existing_columns:
            # Extract Title and Full_text components
            title_scores = np.array([vec[0] for vec in self.data[col]])
            fulltext_scores = np.array([vec[1] for vec in self.data[col]])
            
            # Display original score ranges
            print(f"{col} (Title component): min={title_scores.min():.4f}, max={title_scores.max():.4f}")
            print(f"{col} (Full_text component): min={fulltext_scores.min():.4f}, max={fulltext_scores.max():.4f}")
            
            # Normalize each component separately
            title_scaler = MinMaxScaler(feature_range=(-1, 1))
            fulltext_scaler = MinMaxScaler(feature_range=(-1, 1))
            
            title_norm = title_scaler.fit_transform(title_scores.reshape(-1, 1)).flatten()
            fulltext_norm = fulltext_scaler.fit_transform(fulltext_scores.reshape(-1, 1)).flatten()
            
            # Store scalers for potential later use
            self.scalers[f"{col}_title"] = title_scaler
            self.scalers[f"{col}_fulltext"] = fulltext_scaler
            
            # Create normalized vectors
            self.data[f"{col}_norm"] = [
                [title_norm[i], fulltext_norm[i]] for i in range(len(title_norm))
            ]
            
            # Display normalized score ranges
            print(f"{col}_norm (Title component): min={title_norm.min():.4f}, max={title_norm.max():.4f}")
            print(f"{col}_norm (Full_text component): min={fulltext_norm.min():.4f}, max={fulltext_norm.max():.4f}")
        
        # Print sample of normalized vectors
        print("\nSample of normalized merged sentiment vectors (first 3 rows):")
        for col in [f"{col}_norm" for col in existing_columns]:
            print(f"{col}: {self.data[col].iloc[:3].tolist()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, sentiment_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            sentiment_col: The column containing the merged sentiment score vectors
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            Split datasets and corresponding indices
        """
        # Create masks for each time period
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        # Get data for each period
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Additional debug information: class distribution in each split
        print(f"Train class distribution ({label_col}): {train_data[label_col].value_counts().to_dict()}")
        print(f"Val class distribution ({label_col}): {val_data[label_col].value_counts().to_dict()}")
        print(f"Test class distribution ({label_col}): {test_data[label_col].value_counts().to_dict()}")
        
        # Extract merged sentiment vectors and convert to numpy arrays
        X_train = np.array([vec for vec in train_data[sentiment_col]])
        X_val = np.array([vec for vec in val_data[sentiment_col]])
        X_test = np.array([vec for vec in test_data[sentiment_col]])
        
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        # Get indices for later reference
        train_indices = train_data.index
        val_indices = val_data.index
        test_indices = test_data.index
        
        return (X_train, y_train, train_indices), (X_val, y_val, val_indices), (X_test, y_test, test_indices)
    
    def create_logistic_model(self, C=1.0, solver='liblinear', max_iter=1000, class_weight=None):
        """
        Create a logistic regression model for prediction.
        
        Args:
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
            
        Returns:
            Logistic regression model
        """
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=class_weight,
            random_state=42
        )
    
    def train_and_evaluate(self, sentiment_column, label_column, C=1.0, solver='liblinear', 
                          max_iter=1000, class_weight=None):
        """
        Train and evaluate logistic regression model for a specific merged sentiment column and label column.
        
        Args:
            sentiment_column: The merged sentiment score column to use
            label_column: The label column to use ('S_label' or 'L_label')
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
        """
        # Determine sentiment approach
        approach = None
        for app in self.sentiment_approaches:
            if app in sentiment_column:
                approach = app
                break
        
        if approach is None:
            raise ValueError(f"Could not determine sentiment approach from column name: {sentiment_column}")
        
        # Determine label type
        if label_column == 'S_label':
            label_type = 'Short-term'
        else:
            label_type = 'Long-term'
        
        # Store results
        combination_key = f"{approach}_{label_type}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training logistic regression model for {approach} merged approach, {label_type}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data for this layer
            (X_train, y_train, train_indices), \
            (X_val, y_val, val_indices), \
            (X_test, y_test, test_indices) = self.split_data(layer, sentiment_column, label_column)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or \
               len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_logistic_model(
                C=C,
                solver=solver,
                max_iter=max_iter,
                class_weight=class_weight
            )
            
            # Debug: Print shapes
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            
            # Train model
            model.fit(X_train, y_train)
            
            # Use validation set to evaluate
            val_pred_prob = model.predict_proba(X_val)[:, 1]
            val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            
            # Evaluate on test set
            y_pred_prob = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Print the coefs - now we have two coefficients for [Title, Full_text]
            print(f"Model coefficients: {model.coef_[0]}")
            print(f"Model intercept: {model.intercept_[0]:.6f}")
            
            # Print confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            print("Confusion Matrix:")
            print(cm)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Predicted class distribution
            print(f"Predicted class distribution: {np.bincount(y_pred)}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_prob': y_pred_prob,
                'accuracy': accuracy,
                'auc': auc,
                'confusion_matrix': cm,
                'coefficients': model.coef_[0],
                'intercept': model.intercept_[0]
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize confusion matrix for this layer
            self.visualize_confusion_matrix(
                cm,
                approach,
                label_type,
                i+1
            )
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid layers to calculate average metrics")
            self.results[combination_key]['avg_accuracy'] = np.nan
            self.results[combination_key]['avg_auc'] = np.nan
        
        return self
    
    def visualize_confusion_matrix(self, cm, approach, label_type, layer_num):
        """
        Visualize confusion matrix for a specific model and layer.
        
        Args:
            cm: Confusion matrix
            approach: Sentiment analysis approach
            label_type: Label type (Short-term or Long-term)
            layer_num: Layer number
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"Confusion Matrix: {approach} - Merged - {label_type} (Layer {layer_num})")
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        
        # Save figure
        save_path = f'Merged_Sentiment_Logistic_Plots/SLB/visualizations/confusion_matrices/{approach}_Merged_{label_type}_layer{layer_num}.png'
        plt.savefig(save_path)
        plt.close()
    
    def run_all_combinations(self, use_class_weights=False):
        """
        Run the logistic regression analysis for all combinations of merged sentiment approaches and label columns.
        
        Args:
            use_class_weights: Whether to use balanced class weights
        """
        # Define all combinations - now only looping through approaches and label types
        combinations = []
        
        for approach in self.sentiment_approaches:
            for label_type in ['S_label', 'L_label']:
                sentiment_col = f"Merged_{approach}_score_norm"
                if sentiment_col in self.data.columns:
                    combinations.append((sentiment_col, label_type))
        
        # Run analysis for each combination
        for sentiment_col, label_col in combinations:
            # Train model with or without class weights
            class_weight = 'balanced' if use_class_weights else None
            
            print(f"\n{'='*80}")
            print(f"TRAINING LOGISTIC REGRESSION MODEL FOR {sentiment_col} - {label_col}")
            print(f"{'='*80}")
            
            self.train_and_evaluate(
                sentiment_column=sentiment_col, 
                label_column=label_col,
                class_weight=class_weight
            )
        
        # Create summary visualizations
        self.create_summary_visualizations()
        
        # Print final summary
        self.print_summary()
        
        return self
    
    def create_summary_visualizations(self):
        """Create summary visualizations comparing model performances."""
        self._create_performance_by_approach_visualizations()
    
    def _create_performance_by_approach_visualizations(self):
        """Create visualizations comparing performance by approach."""
        # Collect all results
        approaches = []
        combinations = []
        accuracies = []
        aucs = []
        
        for combo_key, results in self.results.items():
            if 'avg_accuracy' in results:
                # Parse combination key
                parts = combo_key.split('_')
                approach = parts[0]
                label_type = '_'.join(parts[1:])
                
                approaches.append(approach)
                combinations.append(f"{approach}_{label_type}")
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            return
        
        # Create dataframe for plotting
        df = pd.DataFrame({
            'Approach': approaches,
            'Combination': combinations,
            'Accuracy': accuracies,
            'AUC': aucs
        })
        
        # Plot accuracy comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='Accuracy', hue='Combination', data=df)
        plt.title('Accuracy by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/SLB/visualizations/Accuracy_performance_comparison.png')
        plt.close()
        
        # Plot AUC comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='AUC', hue='Combination', data=df)
        plt.title('AUC by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/SLB/visualizations/AUC_performance_comparison.png')
        plt.close()
        
        # Also create a coefficient analysis visualization
        self._create_coefficient_analysis()
    
    def _create_coefficient_analysis(self):
        """Create visualization showing the relative importance of Title vs Full_text in each approach."""
        coef_data = []
        
        for combo_key, results in self.results.items():
            for layer_result in results.get('layer_results', []):
                if 'coefficients' in layer_result and len(layer_result['coefficients']) == 2:
                    parts = combo_key.split('_')
                    approach = parts[0]
                    label_type = '_'.join(parts[1:])
                    
                    coef_data.append({
                        'Approach': approach,
                        'Label Type': label_type,
                        'Layer': f"Layer {layer_result['layer']}",
                        'Title Coefficient': layer_result['coefficients'][0],
                        'Full_text Coefficient': layer_result['coefficients'][1],
                        'Title to Full_text Ratio': abs(layer_result['coefficients'][0] / 
                                                      (layer_result['coefficients'][1] 
                                                       if layer_result['coefficients'][1] != 0 else 1e-6))
                    })
        
        if not coef_data:
            return
        
        df = pd.DataFrame(coef_data)
        
        # Plot coefficient comparison
        plt.figure(figsize=(14, 10))
        
        plt.subplot(2, 1, 1)
        sns.barplot(x='Approach', y='Title Coefficient', hue='Label Type', data=df)
        plt.title('Title Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.subplot(2, 1, 2)
        sns.barplot(x='Approach', y='Full_text Coefficient', hue='Label Type', data=df)
        plt.title('Full_text Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.tight_layout()
        plt.savefig('Merged_Sentiment_Logistic_Plots/SLB/visualizations/Coefficient_comparison.png')
        plt.close()
    
    def print_summary(self):
        """Print a summary of all results."""
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - SLB)")
        print("="*80)
        
        # Organize results by approach
        for approach in self.sentiment_approaches:
            print(f"\nApproach: {approach}")
            print("-" * 40)
            
            for label_type in ['Short-term', 'Long-term']:
                label_col = 'S_label' if label_type == 'Short-term' else 'L_label'
                combination_key = f"{approach}_{label_type}"
                
                if combination_key in self.results and 'avg_accuracy' in self.results[combination_key]:
                    avg_accuracy = self.results[combination_key]['avg_accuracy']
                    avg_auc = self.results[combination_key]['avg_auc']
                    
                    print(f"Merged Vector + {label_type}:")
                    print(f"  Average Accuracy: {avg_accuracy:.4f}")
                    print(f"  Average AUC: {avg_auc:.4f}")
                    
                    # Print layer-specific results
                    for i, layer_result in enumerate(self.results[combination_key].get('layer_results', [])):
                        accuracy = layer_result['accuracy']
                        auc = layer_result['auc']
                        coefficients = layer_result.get('coefficients', 'N/A')
                        intercept = layer_result.get('intercept', 'N/A')
                        
                        # More readable coefficient display - showing Title and Full_text components
                        coef_str = "N/A"
                        if isinstance(coefficients, np.ndarray) and len(coefficients) == 2:
                            coef_str = f"Title: {coefficients[0]:.4f}, Full_text: {coefficients[1]:.4f}"
                        
                        print(f"    Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                        print(f"    Layer {i+1} - Coefficients: {coef_str}, Intercept: {intercept}")
            
            print()
        
        # Find best overall combination
        best_accuracy = 0
        best_auc = 0
        best_accuracy_combo = None
        best_auc_combo = None
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results:
                if results['avg_accuracy'] > best_accuracy:
                    best_accuracy = results['avg_accuracy']
                    best_accuracy_combo = combo
                
                if results['avg_auc'] > best_auc:
                    best_auc = results['avg_auc']
                    best_auc_combo = combo
        
        print("\nBest Overall Combinations:")
        print(f"Best Accuracy: {best_accuracy:.4f} - {best_accuracy_combo}")
        print(f"Best AUC: {best_auc:.4f} - {best_auc_combo}")
        
        return self

# Main execution
if __name__ == "__main__":
    # Initialize predictor with the new CSV file
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_sentiment_SP500_database/wall_street_news_sentiment_database_part2_Merge_SLB.csv')
    
    # Run prediction pipeline
    # First run without class weights
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS")
    print("="*80)
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=False)
    
    # Then run with class weights to address potential class imbalance
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITH BALANCED CLASS WEIGHTS")
    print("="*80)
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_sentiment_SP500_database/wall_street_news_sentiment_database_part2_Merge_SLB.csv')
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=True)


RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS
Loaded 838 financial news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 437, 1: 401}
Class distribution for long-term prediction: {0: 441, 1: 397}

Processing merged sentiment scores...
Parsed Merged_McDonld_score using ast.literal_eval
Parsed Merged_FinBERT_ProsusAI_score using ast.literal_eval
Parsed Merged_FinBERT_yiyang_score using ast.literal_eval
Parsed Merged_FinGPT_score using ast.literal_eval
Parsed Merged_Majority_vote_mean_score using ast.literal_eval
Parsed Merged_Fino1_score using ast.literal_eval

Normalizing merged sentiment scores...
Merged_McDonld_score (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score (Full_text component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_ProsusAI_score (Title component)


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (L_label): {0: 343, 1: 323}
Val class distribution (L_label): {0: 52, 1: 49}
Test class distribution (L_label): {0: 46, 1: 24}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.5149
Test Accuracy for Layer 3: 0.5286
Test AUC for Layer 3: 0.4547
Model coefficients: [0.08450863 0.41705299]
Model intercept: 0.115863
Confusion Matrix:
[[30 16]
 [17  7]]

Classification Report:
              precision    recall  f1-score   support

           0       0.64      0.65      0.65        46
           1       0.30      0.29      0.30        24

    accuracy                           0.53        70
   macro avg       0.47      0.47      0.47        70
weighted avg       0.52      0.53      0.53        70

Predicted class distribution: [47 23]

A


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (S_label): {0: 353, 1: 313}
Val class distribution (S_label): {1: 58, 0: 43}
Test class distribution (S_label): {0: 40, 1: 30}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.4554
Test Accuracy for Layer 3: 0.6143
Test AUC for Layer 3: 0.6258
Model coefficients: [ 0.18526452 -0.21635986]
Model intercept: -0.121623
Confusion Matrix:
[[38  2]
 [25  5]]

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.95      0.74        40
           1       0.71      0.17      0.27        30

    accuracy                           0.61        70
   macro avg       0.66      0.56      0.50        70
weighted avg       0.65      0.61      0.54        70

Predicted class distribution: [63  7]


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (L_label): {0: 292, 1: 240}
Val class distribution (L_label): {1: 83, 0: 50}
Test class distribution (L_label): {0: 52, 1: 49}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.4135
Test Accuracy for Layer 2: 0.5545
Test AUC for Layer 2: 0.5006
Model coefficients: [0.13658618 0.25160605]
Model intercept: 0.020542
Confusion Matrix:
[[52  0]
 [45  4]]

Classification Report:
              precision    recall  f1-score   support

           0       0.54      1.00      0.70        52
           1       1.00      0.08      0.15        49

    accuracy                           0.55       101
   macro avg       0.77      0.54      0.42       101
weighted avg       0.76      0.55      0.43       101

Predicted class distribution: [97  4]




Average Test Accuracy across all layers: 0.4865
Average Test AUC across all layers: 0.5003

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_Fino1_score_norm - S_label

Training logistic regression model for Fino1 merged approach, Short-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (S_label): {0: 220, 1: 181}
Val class distribution (S_label): {0: 66, 1: 65}
Test class distribution (S_label): {1: 67, 0: 66}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.5267
Test Accuracy for Layer 1: 0.5263
Test AUC for Layer 1: 0.5276
Model coefficients: [ 0.06293893 -0.37741914]
Model intercept: -0.275639
Confusion Matrix:
[[60  6]
 [57 10]]

Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.91      0.66        66
           1  


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (S_label): {0: 286, 1: 246}
Val class distribution (S_label): {1: 67, 0: 66}
Test class distribution (S_label): {1: 58, 0: 43}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.5714
Test Accuracy for Layer 2: 0.4455
Test AUC for Layer 2: 0.4130
Model coefficients: [-0.12217065  0.39356513]
Model intercept: 0.127191
Confusion Matrix:
[[17 26]
 [30 28]]

Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.40      0.38        43
           1       0.52      0.48      0.50        58

    accuracy                           0.45       101
   macro avg       0.44      0.44      0.44       101
weighted avg       0.45      0.45      0.45       101

Predicted class distribution: [47 54]


Average Test Accuracy across all layers: 0.5508
Average Test AUC across all layers: 0.5116

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinBERT_ProsusAI_score_norm - L_label

Training logistic regression model for FinBERT_ProsusAI merged approach, Long-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (L_label): {0: 238, 1: 163}
Val class distribution (L_label): {1: 77, 0: 54}
Test class distribution (L_label): {1: 83, 0: 50}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.4962
Test Accuracy for Layer 1: 0.5489
Test AUC for Layer 1: 0.5410
Model coefficients: [-0.00433673  0.11338436]
Model intercept: 0.022619
Confusion Matrix:
[[24 26]
 [34 49]]

Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.48      0.44     


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (L_label): {0: 343, 1: 323}
Val class distribution (L_label): {0: 52, 1: 49}
Test class distribution (L_label): {0: 46, 1: 24}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.4455
Test Accuracy for Layer 3: 0.4714
Test AUC for Layer 3: 0.5236
Model coefficients: [ 0.22643594 -0.08169705]
Model intercept: 0.010546
Confusion Matrix:
[[17 29]
 [ 8 16]]

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.37      0.48        46
           1       0.36      0.67      0.46        24

    accuracy                           0.47        70
   macro avg       0.52      0.52      0.47        70
weighted avg       0.57      0.47      0.47        70

Predicted class distribution: [25 45]



Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (S_label): {0: 286, 1: 246}
Val class distribution (S_label): {1: 67, 0: 66}
Test class distribution (S_label): {1: 58, 0: 43}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.5113
Test Accuracy for Layer 2: 0.6040
Test AUC for Layer 2: 0.6026
Model coefficients: [-0.08072473 -0.25899411]
Model intercept: -0.088355
Confusion Matrix:
[[21 22]
 [18 40]]

Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.49      0.51        43
           1       0.65      0.69      0.67        58

    accuracy                           0.60       101
   macro avg       0.59      0.59      0.59       101
weighted avg       0.60      0.60      0.60       101

Predicted class distribution: [39 62


Average Test Accuracy across all layers: 0.5492
Average Test AUC across all layers: 0.5537

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_Fino1_score_norm - L_label

Training logistic regression model for Fino1 merged approach, Long-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (L_label): {0: 238, 1: 163}
Val class distribution (L_label): {1: 77, 0: 54}
Test class distribution (L_label): {1: 83, 0: 50}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.5649
Test Accuracy for Layer 1: 0.5789
Test AUC for Layer 1: 0.5010
Model coefficients: [-0.27256084  0.1809685 ]
Model intercept: 0.115711
Confusion Matrix:
[[ 7 43]
 [13 70]]

Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.14      0.20        50
           1    

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import os
from datetime import datetime
import ast  # For parsing string representations of lists
import warnings
warnings.filterwarnings('ignore')

class MergedSentimentStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor using logistic regression for merged financial sentiment analysis.
        
        Args:
            csv_path: Path to the CSV file containing merged sentiment score vectors and stock labels
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        self.sentiment_approaches = [
            'McDonld', 'FinBERT_ProsusAI', 'FinBERT_yiyang', 
            'FinGPT', 'Majority_vote_mean', 'Fino1'
        ]
        self.scalers = {}
        
        # Create directories for visualizations
        os.makedirs('Merged_Sentiment_Logistic_Plots/XOM/visualizations', exist_ok=True)
        os.makedirs('Merged_Sentiment_Logistic_Plots/XOM/visualizations/confusion_matrices', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing merged sentiment scores and stock labels."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='%d/%m/%Y')
        self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='%d/%m/%Y')
        self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='%d/%m/%Y')
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Display initial data info
        print(f"Loaded {len(self.data)} financial news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        
        # Show class distribution for each label type
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        # Process merged sentiment score columns
        self._process_merged_scores()
        
        return self
    
    def _process_merged_scores(self):
        """
        Process and normalize the merged sentiment score columns.
        Each merged score is a vector [Title_score, Full_text_score]
        """
        print("\nProcessing merged sentiment scores...")
        
        # Get all merged sentiment score columns
        merged_columns = [f"Merged_{approach}_score" for approach in self.sentiment_approaches]
        
        # Check if these columns exist in the data
        existing_columns = [col for col in merged_columns if col in self.data.columns]
        
        if not existing_columns:
            raise ValueError("No merged sentiment score columns found in the data")
        
        # First, parse the string vectors into numerical arrays
        for col in existing_columns:
            # Check the format of the data to determine parsing method
            sample_value = self.data[col].iloc[0]
            
            # If values are already numerical arrays, we don't need to parse
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"Column {col} already contains numerical arrays")
                continue
                
            # If values are stored as strings, parse them
            try:
                # Try parsing as literal Python representation
                self.data[col] = self.data[col].apply(ast.literal_eval)
                print(f"Parsed {col} using ast.literal_eval")
            except (ValueError, SyntaxError):
                try:
                    # Alternative: try parsing as comma-separated values
                    self.data[col] = self.data[col].str.strip('[]').str.split(',').apply(
                        lambda x: [float(val.strip()) for val in x]
                    )
                    print(f"Parsed {col} as comma-separated values")
                except Exception as e:
                    raise ValueError(f"Could not parse sentiment vectors in column {col}: {e}")
        
        # Create two separate normalized versions of each element in the vector
        print("\nNormalizing merged sentiment scores...")
        for col in existing_columns:
            # Extract Title and Full_text components
            title_scores = np.array([vec[0] for vec in self.data[col]])
            fulltext_scores = np.array([vec[1] for vec in self.data[col]])
            
            # Display original score ranges
            print(f"{col} (Title component): min={title_scores.min():.4f}, max={title_scores.max():.4f}")
            print(f"{col} (Full_text component): min={fulltext_scores.min():.4f}, max={fulltext_scores.max():.4f}")
            
            # Normalize each component separately
            title_scaler = MinMaxScaler(feature_range=(-1, 1))
            fulltext_scaler = MinMaxScaler(feature_range=(-1, 1))
            
            title_norm = title_scaler.fit_transform(title_scores.reshape(-1, 1)).flatten()
            fulltext_norm = fulltext_scaler.fit_transform(fulltext_scores.reshape(-1, 1)).flatten()
            
            # Store scalers for potential later use
            self.scalers[f"{col}_title"] = title_scaler
            self.scalers[f"{col}_fulltext"] = fulltext_scaler
            
            # Create normalized vectors
            self.data[f"{col}_norm"] = [
                [title_norm[i], fulltext_norm[i]] for i in range(len(title_norm))
            ]
            
            # Display normalized score ranges
            print(f"{col}_norm (Title component): min={title_norm.min():.4f}, max={title_norm.max():.4f}")
            print(f"{col}_norm (Full_text component): min={fulltext_norm.min():.4f}, max={fulltext_norm.max():.4f}")
        
        # Print sample of normalized vectors
        print("\nSample of normalized merged sentiment vectors (first 3 rows):")
        for col in [f"{col}_norm" for col in existing_columns]:
            print(f"{col}: {self.data[col].iloc[:3].tolist()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, sentiment_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            sentiment_col: The column containing the merged sentiment score vectors
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            Split datasets and corresponding indices
        """
        # Create masks for each time period
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        # Get data for each period
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Additional debug information: class distribution in each split
        print(f"Train class distribution ({label_col}): {train_data[label_col].value_counts().to_dict()}")
        print(f"Val class distribution ({label_col}): {val_data[label_col].value_counts().to_dict()}")
        print(f"Test class distribution ({label_col}): {test_data[label_col].value_counts().to_dict()}")
        
        # Extract merged sentiment vectors and convert to numpy arrays
        X_train = np.array([vec for vec in train_data[sentiment_col]])
        X_val = np.array([vec for vec in val_data[sentiment_col]])
        X_test = np.array([vec for vec in test_data[sentiment_col]])
        
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        # Get indices for later reference
        train_indices = train_data.index
        val_indices = val_data.index
        test_indices = test_data.index
        
        return (X_train, y_train, train_indices), (X_val, y_val, val_indices), (X_test, y_test, test_indices)
    
    def create_logistic_model(self, C=1.0, solver='liblinear', max_iter=1000, class_weight=None):
        """
        Create a logistic regression model for prediction.
        
        Args:
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
            
        Returns:
            Logistic regression model
        """
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=class_weight,
            random_state=42
        )
    
    def train_and_evaluate(self, sentiment_column, label_column, C=1.0, solver='liblinear', 
                          max_iter=1000, class_weight=None):
        """
        Train and evaluate logistic regression model for a specific merged sentiment column and label column.
        
        Args:
            sentiment_column: The merged sentiment score column to use
            label_column: The label column to use ('S_label' or 'L_label')
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
        """
        # Determine sentiment approach
        approach = None
        for app in self.sentiment_approaches:
            if app in sentiment_column:
                approach = app
                break
        
        if approach is None:
            raise ValueError(f"Could not determine sentiment approach from column name: {sentiment_column}")
        
        # Determine label type
        if label_column == 'S_label':
            label_type = 'Short-term'
        else:
            label_type = 'Long-term'
        
        # Store results
        combination_key = f"{approach}_{label_type}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training logistic regression model for {approach} merged approach, {label_type}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data for this layer
            (X_train, y_train, train_indices), \
            (X_val, y_val, val_indices), \
            (X_test, y_test, test_indices) = self.split_data(layer, sentiment_column, label_column)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or \
               len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_logistic_model(
                C=C,
                solver=solver,
                max_iter=max_iter,
                class_weight=class_weight
            )
            
            # Debug: Print shapes
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            
            # Train model
            model.fit(X_train, y_train)
            
            # Use validation set to evaluate
            val_pred_prob = model.predict_proba(X_val)[:, 1]
            val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            
            # Evaluate on test set
            y_pred_prob = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Print the coefs - now we have two coefficients for [Title, Full_text]
            print(f"Model coefficients: {model.coef_[0]}")
            print(f"Model intercept: {model.intercept_[0]:.6f}")
            
            # Print confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            print("Confusion Matrix:")
            print(cm)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Predicted class distribution
            print(f"Predicted class distribution: {np.bincount(y_pred)}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_prob': y_pred_prob,
                'accuracy': accuracy,
                'auc': auc,
                'confusion_matrix': cm,
                'coefficients': model.coef_[0],
                'intercept': model.intercept_[0]
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize confusion matrix for this layer
            self.visualize_confusion_matrix(
                cm,
                approach,
                label_type,
                i+1
            )
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid layers to calculate average metrics")
            self.results[combination_key]['avg_accuracy'] = np.nan
            self.results[combination_key]['avg_auc'] = np.nan
        
        return self
    
    def visualize_confusion_matrix(self, cm, approach, label_type, layer_num):
        """
        Visualize confusion matrix for a specific model and layer.
        
        Args:
            cm: Confusion matrix
            approach: Sentiment analysis approach
            label_type: Label type (Short-term or Long-term)
            layer_num: Layer number
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"Confusion Matrix: {approach} - Merged - {label_type} (Layer {layer_num})")
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        
        # Save figure
        save_path = f'Merged_Sentiment_Logistic_Plots/XOM/visualizations/confusion_matrices/{approach}_Merged_{label_type}_layer{layer_num}.png'
        plt.savefig(save_path)
        plt.close()
    
    def run_all_combinations(self, use_class_weights=False):
        """
        Run the logistic regression analysis for all combinations of merged sentiment approaches and label columns.
        
        Args:
            use_class_weights: Whether to use balanced class weights
        """
        # Define all combinations - now only looping through approaches and label types
        combinations = []
        
        for approach in self.sentiment_approaches:
            for label_type in ['S_label', 'L_label']:
                sentiment_col = f"Merged_{approach}_score_norm"
                if sentiment_col in self.data.columns:
                    combinations.append((sentiment_col, label_type))
        
        # Run analysis for each combination
        for sentiment_col, label_col in combinations:
            # Train model with or without class weights
            class_weight = 'balanced' if use_class_weights else None
            
            print(f"\n{'='*80}")
            print(f"TRAINING LOGISTIC REGRESSION MODEL FOR {sentiment_col} - {label_col}")
            print(f"{'='*80}")
            
            self.train_and_evaluate(
                sentiment_column=sentiment_col, 
                label_column=label_col,
                class_weight=class_weight
            )
        
        # Create summary visualizations
        self.create_summary_visualizations()
        
        # Print final summary
        self.print_summary()
        
        return self
    
    def create_summary_visualizations(self):
        """Create summary visualizations comparing model performances."""
        self._create_performance_by_approach_visualizations()
    
    def _create_performance_by_approach_visualizations(self):
        """Create visualizations comparing performance by approach."""
        # Collect all results
        approaches = []
        combinations = []
        accuracies = []
        aucs = []
        
        for combo_key, results in self.results.items():
            if 'avg_accuracy' in results:
                # Parse combination key
                parts = combo_key.split('_')
                approach = parts[0]
                label_type = '_'.join(parts[1:])
                
                approaches.append(approach)
                combinations.append(f"{approach}_{label_type}")
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            return
        
        # Create dataframe for plotting
        df = pd.DataFrame({
            'Approach': approaches,
            'Combination': combinations,
            'Accuracy': accuracies,
            'AUC': aucs
        })
        
        # Plot accuracy comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='Accuracy', hue='Combination', data=df)
        plt.title('Accuracy by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/XOM/visualizations/Accuracy_performance_comparison.png')
        plt.close()
        
        # Plot AUC comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='AUC', hue='Combination', data=df)
        plt.title('AUC by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/XOM/visualizations/AUC_performance_comparison.png')
        plt.close()
        
        # Also create a coefficient analysis visualization
        self._create_coefficient_analysis()
    
    def _create_coefficient_analysis(self):
        """Create visualization showing the relative importance of Title vs Full_text in each approach."""
        coef_data = []
        
        for combo_key, results in self.results.items():
            for layer_result in results.get('layer_results', []):
                if 'coefficients' in layer_result and len(layer_result['coefficients']) == 2:
                    parts = combo_key.split('_')
                    approach = parts[0]
                    label_type = '_'.join(parts[1:])
                    
                    coef_data.append({
                        'Approach': approach,
                        'Label Type': label_type,
                        'Layer': f"Layer {layer_result['layer']}",
                        'Title Coefficient': layer_result['coefficients'][0],
                        'Full_text Coefficient': layer_result['coefficients'][1],
                        'Title to Full_text Ratio': abs(layer_result['coefficients'][0] / 
                                                      (layer_result['coefficients'][1] 
                                                       if layer_result['coefficients'][1] != 0 else 1e-6))
                    })
        
        if not coef_data:
            return
        
        df = pd.DataFrame(coef_data)
        
        # Plot coefficient comparison
        plt.figure(figsize=(14, 10))
        
        plt.subplot(2, 1, 1)
        sns.barplot(x='Approach', y='Title Coefficient', hue='Label Type', data=df)
        plt.title('Title Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.subplot(2, 1, 2)
        sns.barplot(x='Approach', y='Full_text Coefficient', hue='Label Type', data=df)
        plt.title('Full_text Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.tight_layout()
        plt.savefig('Merged_Sentiment_Logistic_Plots/XOM/visualizations/Coefficient_comparison.png')
        plt.close()
    
    def print_summary(self):
        """Print a summary of all results."""
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - XOM)")
        print("="*80)
        
        # Organize results by approach
        for approach in self.sentiment_approaches:
            print(f"\nApproach: {approach}")
            print("-" * 40)
            
            for label_type in ['Short-term', 'Long-term']:
                label_col = 'S_label' if label_type == 'Short-term' else 'L_label'
                combination_key = f"{approach}_{label_type}"
                
                if combination_key in self.results and 'avg_accuracy' in self.results[combination_key]:
                    avg_accuracy = self.results[combination_key]['avg_accuracy']
                    avg_auc = self.results[combination_key]['avg_auc']
                    
                    print(f"Merged Vector + {label_type}:")
                    print(f"  Average Accuracy: {avg_accuracy:.4f}")
                    print(f"  Average AUC: {avg_auc:.4f}")
                    
                    # Print layer-specific results
                    for i, layer_result in enumerate(self.results[combination_key].get('layer_results', [])):
                        accuracy = layer_result['accuracy']
                        auc = layer_result['auc']
                        coefficients = layer_result.get('coefficients', 'N/A')
                        intercept = layer_result.get('intercept', 'N/A')
                        
                        # More readable coefficient display - showing Title and Full_text components
                        coef_str = "N/A"
                        if isinstance(coefficients, np.ndarray) and len(coefficients) == 2:
                            coef_str = f"Title: {coefficients[0]:.4f}, Full_text: {coefficients[1]:.4f}"
                        
                        print(f"    Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                        print(f"    Layer {i+1} - Coefficients: {coef_str}, Intercept: {intercept}")
            
            print()
        
        # Find best overall combination
        best_accuracy = 0
        best_auc = 0
        best_accuracy_combo = None
        best_auc_combo = None
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results:
                if results['avg_accuracy'] > best_accuracy:
                    best_accuracy = results['avg_accuracy']
                    best_accuracy_combo = combo
                
                if results['avg_auc'] > best_auc:
                    best_auc = results['avg_auc']
                    best_auc_combo = combo
        
        print("\nBest Overall Combinations:")
        print(f"Best Accuracy: {best_accuracy:.4f} - {best_accuracy_combo}")
        print(f"Best AUC: {best_auc:.4f} - {best_auc_combo}")
        
        return self

# Main execution
if __name__ == "__main__":
    # Initialize predictor with the new CSV file
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_sentiment_SP500_database/wall_street_news_sentiment_database_part2_Merge_XOM.csv')
    
    # Run prediction pipeline
    # First run without class weights
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS")
    print("="*80)
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=False)
    
    # Then run with class weights to address potential class imbalance
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITH BALANCED CLASS WEIGHTS")
    print("="*80)
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_sentiment_SP500_database/wall_street_news_sentiment_database_part2_Merge_XOM.csv')
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=True)


RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS
Loaded 838 financial news articles spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 421, 0: 417}
Class distribution for long-term prediction: {1: 428, 0: 410}

Processing merged sentiment scores...
Parsed Merged_McDonld_score using ast.literal_eval
Parsed Merged_FinBERT_ProsusAI_score using ast.literal_eval
Parsed Merged_FinBERT_yiyang_score using ast.literal_eval
Parsed Merged_FinGPT_score using ast.literal_eval
Parsed Merged_Majority_vote_mean_score using ast.literal_eval
Parsed Merged_Fino1_score using ast.literal_eval

Normalizing merged sentiment scores...
Merged_McDonld_score (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score (Full_text component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_ProsusAI_score (Title component)


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (L_label): {1: 350, 0: 316}
Val class distribution (L_label): {0: 57, 1: 44}
Test class distribution (L_label): {0: 36, 1: 34}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.4455
Test Accuracy for Layer 3: 0.5000
Test AUC for Layer 3: 0.4796
Model coefficients: [0.01775315 0.40368601]
Model intercept: 0.257877
Confusion Matrix:
[[11 25]
 [10 24]]

Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.31      0.39        36
           1       0.49      0.71      0.58        34

    accuracy                           0.50        70
   macro avg       0.51      0.51      0.48        70
weighted avg       0.51      0.50      0.48        70

Predicted class distribution: [21 49]

A


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (S_label): {0: 271, 1: 261}
Val class distribution (S_label): {1: 77, 0: 56}
Test class distribution (S_label): {1: 52, 0: 49}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.4662
Test Accuracy for Layer 2: 0.5050
Test AUC for Layer 2: 0.5428
Model coefficients: [ 0.15508543 -0.11736629]
Model intercept: -0.033020
Confusion Matrix:
[[37 12]
 [38 14]]

Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.76      0.60        49
           1       0.54      0.27      0.36        52

    accuracy                           0.50       101
   macro avg       0.52      0.51      0.48       101
weighted avg       0.52      0.50      0.47       101

Predicted class distribution: [75 26

Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (L_label): {0: 235, 1: 166}
Val class distribution (L_label): {1: 89, 0: 42}
Test class distribution (L_label): {1: 95, 0: 38}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.3206
Test Accuracy for Layer 1: 0.2932
Test AUC for Layer 1: 0.4931
Model coefficients: [0.21341585 0.33441531]
Model intercept: -0.068400
Confusion Matrix:
[[38  0]
 [94  1]]

Classification Report:
              precision    recall  f1-score   support

           0       0.29      1.00      0.45        38
           1       1.00      0.01      0.02        95

    accuracy                           0.29       133
   macro avg       0.64      0.51      0.23       133
weighted avg       0.80      0.29      0.14       133

Predicted class distribution: [132   1]

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/202


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (L_label): {1: 350, 0: 316}
Val class distribution (L_label): {0: 57, 1: 44}
Test class distribution (L_label): {0: 36, 1: 34}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.4059
Test Accuracy for Layer 3: 0.4857
Test AUC for Layer 3: 0.5057
Model coefficients: [0.09992308 0.47846858]
Model intercept: 0.257064
Confusion Matrix:
[[11 25]
 [11 23]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.31      0.38        36
           1       0.48      0.68      0.56        34

    accuracy                           0.49        70
   macro avg       0.49      0.49      0.47        70
weighted avg       0.49      0.49      0.47        70

Predicted class distribution: [22 48]

A


SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - XOM)

Approach: McDonld
----------------------------------------
Merged Vector + Short-term:
  Average Accuracy: 0.5080
  Average AUC: 0.5173
    Layer 1 - Accuracy: 0.5489, AUC: 0.5712
    Layer 1 - Coefficients: Title: 0.1651, Full_text: 0.4519, Intercept: 0.14842814694480044
    Layer 2 - Accuracy: 0.4752, AUC: 0.4370
    Layer 2 - Coefficients: Title: 0.1349, Full_text: 0.4809, Intercept: 0.18170025977244417
    Layer 3 - Accuracy: 0.5000, AUC: 0.5438
    Layer 3 - Coefficients: Title: 0.1184, Full_text: 0.6184, Intercept: 0.28974043483403944
Merged Vector + Long-term:
  Average Accuracy: 0.4500
  Average AUC: 0.5000
    Layer 1 - Accuracy: 0.2857, AUC: 0.5183
    Layer 1 - Coefficients: Title: 0.0489, Full_text: 0.1051, Intercept: -0.2900991290983406
    Layer 2 - Accuracy: 0.5644, AUC: 0.5022
    Layer 2 - Coefficients: Title: 0.0110, Full_text: 0.3427, Intercept: 0.05495760264197114
    Layer 3 - Accuracy: 0.5000, AUC: 0.4796
   


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (S_label): {0: 271, 1: 261}
Val class distribution (S_label): {1: 77, 0: 56}
Test class distribution (S_label): {1: 52, 0: 49}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.5714
Test Accuracy for Layer 2: 0.4851
Test AUC for Layer 2: 0.4370
Model coefficients: [0.13517323 0.48024669]
Model intercept: 0.218780
Confusion Matrix:
[[24 25]
 [27 25]]

Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.49      0.48        49
           1       0.50      0.48      0.49        52

    accuracy                           0.49       101
   macro avg       0.49      0.49      0.49       101
weighted avg       0.49      0.49      0.49       101

Predicted class distribution: [51 50]




Average Test Accuracy across all layers: 0.5522
Average Test AUC across all layers: 0.5624

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinBERT_ProsusAI_score_norm - L_label

Training logistic regression model for FinBERT_ProsusAI merged approach, Long-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (L_label): {0: 235, 1: 166}
Val class distribution (L_label): {1: 89, 0: 42}
Test class distribution (L_label): {1: 95, 0: 38}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.5573
Test Accuracy for Layer 1: 0.4286
Test AUC for Layer 1: 0.3928
Model coefficients: [ 0.42797521 -0.10160921]
Model intercept: -0.006250
Confusion Matrix:
[[11 27]
 [49 46]]

Classification Report:
              precision    recall  f1-score   support

           0       0.18      0.29      0.22    


Layer 3:
Training period: 01/11/2014 - 31/10/2022
Validation period: 01/11/2022 - 31/10/2023
Testing period: 01/11/2023 - 01/11/2024
Training data: 666 samples
Validation data: 101 samples
Test data: 70 samples
Train class distribution (L_label): {1: 350, 0: 316}
Val class distribution (L_label): {0: 57, 1: 44}
Test class distribution (L_label): {0: 36, 1: 34}
X_train shape: (666, 2), y_train shape: (666,)
Validation Accuracy: 0.5347
Test Accuracy for Layer 3: 0.5429
Test AUC for Layer 3: 0.5131
Model coefficients: [ 0.14894519 -0.22606804]
Model intercept: -0.004782
Confusion Matrix:
[[26 10]
 [22 12]]

Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.72      0.62        36
           1       0.55      0.35      0.43        34

    accuracy                           0.54        70
   macro avg       0.54      0.54      0.52        70
weighted avg       0.54      0.54      0.53        70

Predicted class distribution: [48 22]


Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Train class distribution (S_label): {0: 271, 1: 261}
Val class distribution (S_label): {1: 77, 0: 56}
Test class distribution (S_label): {1: 52, 0: 49}
X_train shape: (532, 2), y_train shape: (532,)
Validation Accuracy: 0.5188
Test Accuracy for Layer 2: 0.5050
Test AUC for Layer 2: 0.5153
Model coefficients: [ 0.27936963 -0.21021686]
Model intercept: -0.024790
Confusion Matrix:
[[26 23]
 [27 25]]

Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.53      0.51        49
           1       0.52      0.48      0.50        52

    accuracy                           0.50       101
   macro avg       0.51      0.51      0.50       101
weighted avg       0.51      0.50      0.50       101

Predicted class distribution: [53 48


Average Test Accuracy across all layers: 0.4607
Average Test AUC across all layers: 0.4607

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_Fino1_score_norm - L_label

Training logistic regression model for Fino1 merged approach, Long-term

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Train class distribution (L_label): {0: 235, 1: 166}
Val class distribution (L_label): {1: 89, 0: 42}
Test class distribution (L_label): {1: 95, 0: 38}
X_train shape: (401, 2), y_train shape: (401,)
Validation Accuracy: 0.4046
Test Accuracy for Layer 1: 0.3308
Test AUC for Layer 1: 0.4677
Model coefficients: [ 0.00726486 -0.02657687]
Model intercept: -0.005532
Confusion Matrix:
[[32  6]
 [83 12]]

Classification Report:
              precision    recall  f1-score   support

           0       0.28      0.84      0.42        38
           1   

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import os
from datetime import datetime
import ast  # For parsing string representations of lists
import warnings
warnings.filterwarnings('ignore')

class MergedSentimentStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor using logistic regression for merged financial sentiment analysis.
        
        Args:
            csv_path: Path to the CSV file containing merged sentiment score vectors and stock labels
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        self.sentiment_approaches = [
            'McDonld', 'FinBERT_ProsusAI', 'FinBERT_yiyang', 
            'FinGPT', 'Majority_vote_mean', 'Fino1'
        ]
        self.scalers = {}
        
        # Create directories for visualizations
        os.makedirs('Merged_Sentiment_Logistic_Plots/COP/visualizations', exist_ok=True)
        os.makedirs('Merged_Sentiment_Logistic_Plots/COP/visualizations/confusion_matrices', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing merged sentiment scores and stock labels."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='%d/%m/%Y')
        self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='%d/%m/%Y')
        self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='%d/%m/%Y')
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Display initial data info
        print(f"Loaded {len(self.data)} financial news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        
        # Show class distribution for each label type
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        # Process merged sentiment score columns
        self._process_merged_scores()
        
        return self
    
    def _process_merged_scores(self):
        """
        Process and normalize the merged sentiment score columns.
        Each merged score is a vector [Title_score, Full_text_score]
        """
        print("\nProcessing merged sentiment scores...")
        
        # Get all merged sentiment score columns
        merged_columns = [f"Merged_{approach}_score" for approach in self.sentiment_approaches]
        
        # Check if these columns exist in the data
        existing_columns = [col for col in merged_columns if col in self.data.columns]
        
        if not existing_columns:
            raise ValueError("No merged sentiment score columns found in the data")
        
        # First, parse the string vectors into numerical arrays
        for col in existing_columns:
            # Check the format of the data to determine parsing method
            sample_value = self.data[col].iloc[0]
            
            # If values are already numerical arrays, we don't need to parse
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"Column {col} already contains numerical arrays")
                continue
                
            # If values are stored as strings, parse them
            try:
                # Try parsing as literal Python representation
                self.data[col] = self.data[col].apply(ast.literal_eval)
                print(f"Parsed {col} using ast.literal_eval")
            except (ValueError, SyntaxError):
                try:
                    # Alternative: try parsing as comma-separated values
                    self.data[col] = self.data[col].str.strip('[]').str.split(',').apply(
                        lambda x: [float(val.strip()) for val in x]
                    )
                    print(f"Parsed {col} as comma-separated values")
                except Exception as e:
                    raise ValueError(f"Could not parse sentiment vectors in column {col}: {e}")
        
        # Create two separate normalized versions of each element in the vector
        print("\nNormalizing merged sentiment scores...")
        for col in existing_columns:
            # Extract Title and Full_text components
            title_scores = np.array([vec[0] for vec in self.data[col]])
            fulltext_scores = np.array([vec[1] for vec in self.data[col]])
            
            # Display original score ranges
            print(f"{col} (Title component): min={title_scores.min():.4f}, max={title_scores.max():.4f}")
            print(f"{col} (Full_text component): min={fulltext_scores.min():.4f}, max={fulltext_scores.max():.4f}")
            
            # Normalize each component separately
            title_scaler = MinMaxScaler(feature_range=(-1, 1))
            fulltext_scaler = MinMaxScaler(feature_range=(-1, 1))
            
            title_norm = title_scaler.fit_transform(title_scores.reshape(-1, 1)).flatten()
            fulltext_norm = fulltext_scaler.fit_transform(fulltext_scores.reshape(-1, 1)).flatten()
            
            # Store scalers for potential later use
            self.scalers[f"{col}_title"] = title_scaler
            self.scalers[f"{col}_fulltext"] = fulltext_scaler
            
            # Create normalized vectors
            self.data[f"{col}_norm"] = [
                [title_norm[i], fulltext_norm[i]] for i in range(len(title_norm))
            ]
            
            # Display normalized score ranges
            print(f"{col}_norm (Title component): min={title_norm.min():.4f}, max={title_norm.max():.4f}")
            print(f"{col}_norm (Full_text component): min={fulltext_norm.min():.4f}, max={fulltext_norm.max():.4f}")
        
        # Print sample of normalized vectors
        print("\nSample of normalized merged sentiment vectors (first 3 rows):")
        for col in [f"{col}_norm" for col in existing_columns]:
            print(f"{col}: {self.data[col].iloc[:3].tolist()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, sentiment_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            sentiment_col: The column containing the merged sentiment score vectors
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            Split datasets and corresponding indices
        """
        # Create masks for each time period
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        # Get data for each period
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Additional debug information: class distribution in each split
        print(f"Train class distribution ({label_col}): {train_data[label_col].value_counts().to_dict()}")
        print(f"Val class distribution ({label_col}): {val_data[label_col].value_counts().to_dict()}")
        print(f"Test class distribution ({label_col}): {test_data[label_col].value_counts().to_dict()}")
        
        # Extract merged sentiment vectors and convert to numpy arrays
        X_train = np.array([vec for vec in train_data[sentiment_col]])
        X_val = np.array([vec for vec in val_data[sentiment_col]])
        X_test = np.array([vec for vec in test_data[sentiment_col]])
        
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        # Get indices for later reference
        train_indices = train_data.index
        val_indices = val_data.index
        test_indices = test_data.index
        
        return (X_train, y_train, train_indices), (X_val, y_val, val_indices), (X_test, y_test, test_indices)
    
    def create_logistic_model(self, C=1.0, solver='liblinear', max_iter=1000, class_weight=None):
        """
        Create a logistic regression model for prediction.
        
        Args:
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
            
        Returns:
            Logistic regression model
        """
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=class_weight,
            random_state=42
        )
    
    def train_and_evaluate(self, sentiment_column, label_column, C=1.0, solver='liblinear', 
                          max_iter=1000, class_weight=None):
        """
        Train and evaluate logistic regression model for a specific merged sentiment column and label column.
        
        Args:
            sentiment_column: The merged sentiment score column to use
            label_column: The label column to use ('S_label' or 'L_label')
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
        """
        # Determine sentiment approach
        approach = None
        for app in self.sentiment_approaches:
            if app in sentiment_column:
                approach = app
                break
        
        if approach is None:
            raise ValueError(f"Could not determine sentiment approach from column name: {sentiment_column}")
        
        # Determine label type
        if label_column == 'S_label':
            label_type = 'Short-term'
        else:
            label_type = 'Long-term'
        
        # Store results
        combination_key = f"{approach}_{label_type}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training logistic regression model for {approach} merged approach, {label_type}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data for this layer
            (X_train, y_train, train_indices), \
            (X_val, y_val, val_indices), \
            (X_test, y_test, test_indices) = self.split_data(layer, sentiment_column, label_column)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or \
               len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_logistic_model(
                C=C,
                solver=solver,
                max_iter=max_iter,
                class_weight=class_weight
            )
            
            # Debug: Print shapes
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            
            # Train model
            model.fit(X_train, y_train)
            
            # Use validation set to evaluate
            val_pred_prob = model.predict_proba(X_val)[:, 1]
            val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            
            # Evaluate on test set
            y_pred_prob = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Print the coefs - now we have two coefficients for [Title, Full_text]
            print(f"Model coefficients: {model.coef_[0]}")
            print(f"Model intercept: {model.intercept_[0]:.6f}")
            
            # Print confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            print("Confusion Matrix:")
            print(cm)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Predicted class distribution
            print(f"Predicted class distribution: {np.bincount(y_pred)}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_prob': y_pred_prob,
                'accuracy': accuracy,
                'auc': auc,
                'confusion_matrix': cm,
                'coefficients': model.coef_[0],
                'intercept': model.intercept_[0]
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize confusion matrix for this layer
            self.visualize_confusion_matrix(
                cm,
                approach,
                label_type,
                i+1
            )
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid layers to calculate average metrics")
            self.results[combination_key]['avg_accuracy'] = np.nan
            self.results[combination_key]['avg_auc'] = np.nan
        
        return self
    
    def visualize_confusion_matrix(self, cm, approach, label_type, layer_num):
        """
        Visualize confusion matrix for a specific model and layer.
        
        Args:
            cm: Confusion matrix
            approach: Sentiment analysis approach
            label_type: Label type (Short-term or Long-term)
            layer_num: Layer number
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"Confusion Matrix: {approach} - Merged - {label_type} (Layer {layer_num})")
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        
        # Save figure
        save_path = f'Merged_Sentiment_Logistic_Plots/COP/visualizations/confusion_matrices/{approach}_Merged_{label_type}_layer{layer_num}.png'
        plt.savefig(save_path)
        plt.close()
    
    def run_all_combinations(self, use_class_weights=False):
        """
        Run the logistic regression analysis for all combinations of merged sentiment approaches and label columns.
        
        Args:
            use_class_weights: Whether to use balanced class weights
        """
        # Define all combinations - now only looping through approaches and label types
        combinations = []
        
        for approach in self.sentiment_approaches:
            for label_type in ['S_label', 'L_label']:
                sentiment_col = f"Merged_{approach}_score_norm"
                if sentiment_col in self.data.columns:
                    combinations.append((sentiment_col, label_type))
        
        # Run analysis for each combination
        for sentiment_col, label_col in combinations:
            # Train model with or without class weights
            class_weight = 'balanced' if use_class_weights else None
            
            print(f"\n{'='*80}")
            print(f"TRAINING LOGISTIC REGRESSION MODEL FOR {sentiment_col} - {label_col}")
            print(f"{'='*80}")
            
            self.train_and_evaluate(
                sentiment_column=sentiment_col, 
                label_column=label_col,
                class_weight=class_weight
            )
        
        # Create summary visualizations
        self.create_summary_visualizations()
        
        # Print final summary
        self.print_summary()
        
        return self
    
    def create_summary_visualizations(self):
        """Create summary visualizations comparing model performances."""
        self._create_performance_by_approach_visualizations()
    
    def _create_performance_by_approach_visualizations(self):
        """Create visualizations comparing performance by approach."""
        # Collect all results
        approaches = []
        combinations = []
        accuracies = []
        aucs = []
        
        for combo_key, results in self.results.items():
            if 'avg_accuracy' in results:
                # Parse combination key
                parts = combo_key.split('_')
                approach = parts[0]
                label_type = '_'.join(parts[1:])
                
                approaches.append(approach)
                combinations.append(f"{approach}_{label_type}")
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            return
        
        # Create dataframe for plotting
        df = pd.DataFrame({
            'Approach': approaches,
            'Combination': combinations,
            'Accuracy': accuracies,
            'AUC': aucs
        })
        
        # Plot accuracy comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='Accuracy', hue='Combination', data=df)
        plt.title('Accuracy by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/COP/visualizations/Accuracy_performance_comparison.png')
        plt.close()
        
        # Plot AUC comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='AUC', hue='Combination', data=df)
        plt.title('AUC by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/COP/visualizations/AUC_performance_comparison.png')
        plt.close()
        
        # Also create a coefficient analysis visualization
        self._create_coefficient_analysis()
    
    def _create_coefficient_analysis(self):
        """Create visualization showing the relative importance of Title vs Full_text in each approach."""
        coef_data = []
        
        for combo_key, results in self.results.items():
            for layer_result in results.get('layer_results', []):
                if 'coefficients' in layer_result and len(layer_result['coefficients']) == 2:
                    parts = combo_key.split('_')
                    approach = parts[0]
                    label_type = '_'.join(parts[1:])
                    
                    coef_data.append({
                        'Approach': approach,
                        'Label Type': label_type,
                        'Layer': f"Layer {layer_result['layer']}",
                        'Title Coefficient': layer_result['coefficients'][0],
                        'Full_text Coefficient': layer_result['coefficients'][1],
                        'Title to Full_text Ratio': abs(layer_result['coefficients'][0] / 
                                                      (layer_result['coefficients'][1] 
                                                       if layer_result['coefficients'][1] != 0 else 1e-6))
                    })
        
        if not coef_data:
            return
        
        df = pd.DataFrame(coef_data)
        
        # Plot coefficient comparison
        plt.figure(figsize=(14, 10))
        
        plt.subplot(2, 1, 1)
        sns.barplot(x='Approach', y='Title Coefficient', hue='Label Type', data=df)
        plt.title('Title Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.subplot(2, 1, 2)
        sns.barplot(x='Approach', y='Full_text Coefficient', hue='Label Type', data=df)
        plt.title('Full_text Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.tight_layout()
        plt.savefig('Merged_Sentiment_Logistic_Plots/COP/visualizations/Coefficient_comparison.png')
        plt.close()
    
    def print_summary(self):
        """Print a summary of all results."""
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - COP)")
        print("="*80)
        
        # Organize results by approach
        for approach in self.sentiment_approaches:
            print(f"\nApproach: {approach}")
            print("-" * 40)
            
            for label_type in ['Short-term', 'Long-term']:
                label_col = 'S_label' if label_type == 'Short-term' else 'L_label'
                combination_key = f"{approach}_{label_type}"
                
                if combination_key in self.results and 'avg_accuracy' in self.results[combination_key]:
                    avg_accuracy = self.results[combination_key]['avg_accuracy']
                    avg_auc = self.results[combination_key]['avg_auc']
                    
                    print(f"Merged Vector + {label_type}:")
                    print(f"  Average Accuracy: {avg_accuracy:.4f}")
                    print(f"  Average AUC: {avg_auc:.4f}")
                    
                    # Print layer-specific results
                    for i, layer_result in enumerate(self.results[combination_key].get('layer_results', [])):
                        accuracy = layer_result['accuracy']
                        auc = layer_result['auc']
                        coefficients = layer_result.get('coefficients', 'N/A')
                        intercept = layer_result.get('intercept', 'N/A')
                        
                        # More readable coefficient display - showing Title and Full_text components
                        coef_str = "N/A"
                        if isinstance(coefficients, np.ndarray) and len(coefficients) == 2:
                            coef_str = f"Title: {coefficients[0]:.4f}, Full_text: {coefficients[1]:.4f}"
                        
                        print(f"    Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                        print(f"    Layer {i+1} - Coefficients: {coef_str}, Intercept: {intercept}")
            
            print()
        
        # Find best overall combination
        best_accuracy = 0
        best_auc = 0
        best_accuracy_combo = None
        best_auc_combo = None
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results:
                if results['avg_accuracy'] > best_accuracy:
                    best_accuracy = results['avg_accuracy']
                    best_accuracy_combo = combo
                
                if results['avg_auc'] > best_auc:
                    best_auc = results['avg_auc']
                    best_auc_combo = combo
        
        print("\nBest Overall Combinations:")
        print(f"Best Accuracy: {best_accuracy:.4f} - {best_accuracy_combo}")
        print(f"Best AUC: {best_auc:.4f} - {best_auc_combo}")
        
        return self

# Main execution
if __name__ == "__main__":
    # Initialize predictor with the new CSV file
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_sentiment/us_news_sentiment_database_part1_COP.csv')
    
    # Run prediction pipeline
    # First run without class weights
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS")
    print("="*80)
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=False)
    
    # Then run with class weights to address potential class imbalance
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITH BALANCED CLASS WEIGHTS")
    print("="*80)
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_sentiment/us_news_sentiment_database_part1_COP.csv')
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=True)


RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS
Loaded 929 financial news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {0: 479, 1: 450}
Class distribution for long-term prediction: {1: 494, 0: 435}

Processing merged sentiment scores...
Parsed Merged_McDonld_score using ast.literal_eval
Parsed Merged_FinBERT_ProsusAI_score using ast.literal_eval
Parsed Merged_FinBERT_yiyang_score using ast.literal_eval
Parsed Merged_FinGPT_score using ast.literal_eval
Parsed Merged_Majority_vote_mean_score using ast.literal_eval
Parsed Merged_Fino1_score using ast.literal_eval

Normalizing merged sentiment scores...
Merged_McDonld_score (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score (Full_text component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_ProsusAI_score (Title component)


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (L_label): {1: 397, 0: 354}
Val class distribution (L_label): {1: 67, 0: 41}
Test class distribution (L_label): {0: 40, 1: 29}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.6111
Test Accuracy for Layer 3: 0.4638
Test AUC for Layer 3: 0.4746
Model coefficients: [-0.10566966 -0.15888277]
Model intercept: 0.009556
Confusion Matrix:
[[ 4 36]
 [ 1 28]]

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.10      0.18        40
           1       0.44      0.97      0.60        29

    accuracy                           0.46        69
   macro avg       0.62      0.53      0.39        69
weighted avg       0.65      0.46      0.36        69

Predicted class distribution: [ 5 64]



Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (S_label): {0: 356, 1: 329}
Val class distribution (S_label): {1: 36, 0: 30}
Test class distribution (S_label): {1: 55, 0: 53}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.4545
Test Accuracy for Layer 2: 0.5093
Test AUC for Layer 2: 0.4902
Model coefficients: [ 0.27625587 -0.16776366]
Model intercept: -0.044971
Confusion Matrix:
[[46  7]
 [46  9]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.87      0.63        53
           1       0.56      0.16      0.25        55

    accuracy                           0.51       108
   macro avg       0.53      0.52      0.44       108
weighted avg       0.53      0.51      0.44       108

Predicted class distribution: [92 16]


Average Test Accuracy across all layers: 0.4996
Average Test AUC across all layers: 0.4612

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinGPT_score_norm - L_label

Training logistic regression model for FinGPT merged approach, Long-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (L_label): {0: 265, 1: 257}
Val class distribution (L_label): {1: 85, 0: 78}
Test class distribution (L_label): {1: 55, 0: 11}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.4724
Test Accuracy for Layer 1: 0.1667
Test AUC for Layer 1: 0.4595
Model coefficients: [-0.00784612  0.62662724]
Model intercept: 0.560745
Confusion Matrix:
[[10  1]
 [54  1]]

Classification Report:
              precision    recall  f1-score   support

           0       0.16      0.91      0.27        11
           1   


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (L_label): {1: 397, 0: 354}
Val class distribution (L_label): {1: 67, 0: 41}
Test class distribution (L_label): {0: 40, 1: 29}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.6204
Test Accuracy for Layer 3: 0.4348
Test AUC for Layer 3: 0.4888
Model coefficients: [-0.13862968  0.23038747]
Model intercept: 0.175570
Confusion Matrix:
[[ 1 39]
 [ 0 29]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.03      0.05        40
           1       0.43      1.00      0.60        29

    accuracy                           0.43        69
   macro avg       0.71      0.51      0.32        69
weighted avg       0.76      0.43      0.28        69

Predicted class distribution: [ 1 68]



SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - COP)

Approach: McDonld
----------------------------------------
Merged Vector + Short-term:
  Average Accuracy: 0.4937
  Average AUC: 0.4697
    Layer 1 - Accuracy: 0.4848, AUC: 0.5074
    Layer 1 - Coefficients: Title: -0.3212, Full_text: 0.3609, Intercept: -0.03270101344798935
    Layer 2 - Accuracy: 0.4167, AUC: 0.3528
    Layer 2 - Coefficients: Title: -0.2457, Full_text: 0.2749, Intercept: -0.03471255183676921
    Layer 3 - Accuracy: 0.5797, AUC: 0.5487
    Layer 3 - Coefficients: Title: -0.2110, Full_text: 0.3615, Intercept: 0.03959541955573644
Merged Vector + Long-term:
  Average Accuracy: 0.4635
  Average AUC: 0.4922
    Layer 1 - Accuracy: 0.4545, AUC: 0.4975
    Layer 1 - Coefficients: Title: 0.0133, Full_text: -0.1894, Intercept: -0.10699916915638599
    Layer 2 - Accuracy: 0.4722, AUC: 0.5044
    Layer 2 - Coefficients: Title: -0.1351, Full_text: -0.1536, Intercept: -0.11479290118387274
    Layer 3 - Accuracy: 0.4638, AUC: 


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (S_label): {0: 356, 1: 329}
Val class distribution (S_label): {1: 36, 0: 30}
Test class distribution (S_label): {1: 55, 0: 53}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.4697
Test Accuracy for Layer 2: 0.3519
Test AUC for Layer 2: 0.3528
Model coefficients: [-0.24591771  0.27229233]
Model intercept: 0.042422
Confusion Matrix:
[[19 34]
 [36 19]]

Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.36      0.35        53
           1       0.36      0.35      0.35        55

    accuracy                           0.35       108
   macro avg       0.35      0.35      0.35       108
weighted avg       0.35      0.35      0.35       108

Predicted class distribution: [55 53]



Average Test Accuracy across all layers: 0.4705
Average Test AUC across all layers: 0.5057

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinBERT_ProsusAI_score_norm - L_label

Training logistic regression model for FinBERT_ProsusAI merged approach, Long-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (L_label): {0: 265, 1: 257}
Val class distribution (L_label): {1: 85, 0: 78}
Test class distribution (L_label): {1: 55, 0: 11}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.5092
Test Accuracy for Layer 1: 0.4091
Test AUC for Layer 1: 0.4909
Model coefficients: [0.08148849 0.20574206]
Model intercept: 0.019858
Confusion Matrix:
[[ 5  6]
 [33 22]]

Classification Report:
              precision    recall  f1-score   support

           0       0.13      0.45      0.20        


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (L_label): {1: 397, 0: 354}
Val class distribution (L_label): {1: 67, 0: 41}
Test class distribution (L_label): {0: 40, 1: 29}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.5278
Test Accuracy for Layer 3: 0.4928
Test AUC for Layer 3: 0.4681
Model coefficients: [0.11531328 0.11726909]
Model intercept: 0.037236
Confusion Matrix:
[[13 27]
 [ 8 21]]

Classification Report:
              precision    recall  f1-score   support

           0       0.62      0.33      0.43        40
           1       0.44      0.72      0.55        29

    accuracy                           0.49        69
   macro avg       0.53      0.52      0.49        69
weighted avg       0.54      0.49      0.48        69

Predicted class distribution: [21 48]

A


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (S_label): {0: 386, 1: 365}
Val class distribution (S_label): {1: 55, 0: 53}
Test class distribution (S_label): {0: 39, 1: 30}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.4722
Test Accuracy for Layer 3: 0.5072
Test AUC for Layer 3: 0.5231
Model coefficients: [-0.0754791  0.1140765]
Model intercept: 0.028938
Confusion Matrix:
[[18 21]
 [13 17]]

Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.46      0.51        39
           1       0.45      0.57      0.50        30

    accuracy                           0.51        69
   macro avg       0.51      0.51      0.51        69
weighted avg       0.52      0.51      0.51        69

Predicted class distribution: [31 38]

A


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (L_label): {0: 343, 1: 342}
Val class distribution (L_label): {1: 55, 0: 11}
Test class distribution (L_label): {1: 67, 0: 41}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.3939
Test Accuracy for Layer 2: 0.4907
Test AUC for Layer 2: 0.5490
Model coefficients: [ 0.00648513 -0.22689401]
Model intercept: -0.047836
Confusion Matrix:
[[30 11]
 [44 23]]

Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.73      0.52        41
           1       0.68      0.34      0.46        67

    accuracy                           0.49       108
   macro avg       0.54      0.54      0.49       108
weighted avg       0.57      0.49      0.48       108

Predicted class distribution: [74 34]

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import os
from datetime import datetime
import ast  # For parsing string representations of lists
import warnings
warnings.filterwarnings('ignore')

class MergedSentimentStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor using logistic regression for merged financial sentiment analysis.
        
        Args:
            csv_path: Path to the CSV file containing merged sentiment score vectors and stock labels
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        self.sentiment_approaches = [
            'McDonld', 'FinBERT_ProsusAI', 'FinBERT_yiyang', 
            'FinGPT', 'Majority_vote_mean', 'Fino1'
        ]
        self.scalers = {}
        
        # Create directories for visualizations
        os.makedirs('Merged_Sentiment_Logistic_Plots/CVX/visualizations', exist_ok=True)
        os.makedirs('Merged_Sentiment_Logistic_Plots/CVX/visualizations/confusion_matrices', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing merged sentiment scores and stock labels."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='%d/%m/%Y')
        self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='%d/%m/%Y')
        self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='%d/%m/%Y')
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Display initial data info
        print(f"Loaded {len(self.data)} financial news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        
        # Show class distribution for each label type
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        # Process merged sentiment score columns
        self._process_merged_scores()
        
        return self
    
    def _process_merged_scores(self):
        """
        Process and normalize the merged sentiment score columns.
        Each merged score is a vector [Title_score, Full_text_score]
        """
        print("\nProcessing merged sentiment scores...")
        
        # Get all merged sentiment score columns
        merged_columns = [f"Merged_{approach}_score" for approach in self.sentiment_approaches]
        
        # Check if these columns exist in the data
        existing_columns = [col for col in merged_columns if col in self.data.columns]
        
        if not existing_columns:
            raise ValueError("No merged sentiment score columns found in the data")
        
        # First, parse the string vectors into numerical arrays
        for col in existing_columns:
            # Check the format of the data to determine parsing method
            sample_value = self.data[col].iloc[0]
            
            # If values are already numerical arrays, we don't need to parse
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"Column {col} already contains numerical arrays")
                continue
                
            # If values are stored as strings, parse them
            try:
                # Try parsing as literal Python representation
                self.data[col] = self.data[col].apply(ast.literal_eval)
                print(f"Parsed {col} using ast.literal_eval")
            except (ValueError, SyntaxError):
                try:
                    # Alternative: try parsing as comma-separated values
                    self.data[col] = self.data[col].str.strip('[]').str.split(',').apply(
                        lambda x: [float(val.strip()) for val in x]
                    )
                    print(f"Parsed {col} as comma-separated values")
                except Exception as e:
                    raise ValueError(f"Could not parse sentiment vectors in column {col}: {e}")
        
        # Create two separate normalized versions of each element in the vector
        print("\nNormalizing merged sentiment scores...")
        for col in existing_columns:
            # Extract Title and Full_text components
            title_scores = np.array([vec[0] for vec in self.data[col]])
            fulltext_scores = np.array([vec[1] for vec in self.data[col]])
            
            # Display original score ranges
            print(f"{col} (Title component): min={title_scores.min():.4f}, max={title_scores.max():.4f}")
            print(f"{col} (Full_text component): min={fulltext_scores.min():.4f}, max={fulltext_scores.max():.4f}")
            
            # Normalize each component separately
            title_scaler = MinMaxScaler(feature_range=(-1, 1))
            fulltext_scaler = MinMaxScaler(feature_range=(-1, 1))
            
            title_norm = title_scaler.fit_transform(title_scores.reshape(-1, 1)).flatten()
            fulltext_norm = fulltext_scaler.fit_transform(fulltext_scores.reshape(-1, 1)).flatten()
            
            # Store scalers for potential later use
            self.scalers[f"{col}_title"] = title_scaler
            self.scalers[f"{col}_fulltext"] = fulltext_scaler
            
            # Create normalized vectors
            self.data[f"{col}_norm"] = [
                [title_norm[i], fulltext_norm[i]] for i in range(len(title_norm))
            ]
            
            # Display normalized score ranges
            print(f"{col}_norm (Title component): min={title_norm.min():.4f}, max={title_norm.max():.4f}")
            print(f"{col}_norm (Full_text component): min={fulltext_norm.min():.4f}, max={fulltext_norm.max():.4f}")
        
        # Print sample of normalized vectors
        print("\nSample of normalized merged sentiment vectors (first 3 rows):")
        for col in [f"{col}_norm" for col in existing_columns]:
            print(f"{col}: {self.data[col].iloc[:3].tolist()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, sentiment_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            sentiment_col: The column containing the merged sentiment score vectors
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            Split datasets and corresponding indices
        """
        # Create masks for each time period
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        # Get data for each period
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Additional debug information: class distribution in each split
        print(f"Train class distribution ({label_col}): {train_data[label_col].value_counts().to_dict()}")
        print(f"Val class distribution ({label_col}): {val_data[label_col].value_counts().to_dict()}")
        print(f"Test class distribution ({label_col}): {test_data[label_col].value_counts().to_dict()}")
        
        # Extract merged sentiment vectors and convert to numpy arrays
        X_train = np.array([vec for vec in train_data[sentiment_col]])
        X_val = np.array([vec for vec in val_data[sentiment_col]])
        X_test = np.array([vec for vec in test_data[sentiment_col]])
        
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        # Get indices for later reference
        train_indices = train_data.index
        val_indices = val_data.index
        test_indices = test_data.index
        
        return (X_train, y_train, train_indices), (X_val, y_val, val_indices), (X_test, y_test, test_indices)
    
    def create_logistic_model(self, C=1.0, solver='liblinear', max_iter=1000, class_weight=None):
        """
        Create a logistic regression model for prediction.
        
        Args:
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
            
        Returns:
            Logistic regression model
        """
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=class_weight,
            random_state=42
        )
    
    def train_and_evaluate(self, sentiment_column, label_column, C=1.0, solver='liblinear', 
                          max_iter=1000, class_weight=None):
        """
        Train and evaluate logistic regression model for a specific merged sentiment column and label column.
        
        Args:
            sentiment_column: The merged sentiment score column to use
            label_column: The label column to use ('S_label' or 'L_label')
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
        """
        # Determine sentiment approach
        approach = None
        for app in self.sentiment_approaches:
            if app in sentiment_column:
                approach = app
                break
        
        if approach is None:
            raise ValueError(f"Could not determine sentiment approach from column name: {sentiment_column}")
        
        # Determine label type
        if label_column == 'S_label':
            label_type = 'Short-term'
        else:
            label_type = 'Long-term'
        
        # Store results
        combination_key = f"{approach}_{label_type}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training logistic regression model for {approach} merged approach, {label_type}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data for this layer
            (X_train, y_train, train_indices), \
            (X_val, y_val, val_indices), \
            (X_test, y_test, test_indices) = self.split_data(layer, sentiment_column, label_column)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or \
               len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_logistic_model(
                C=C,
                solver=solver,
                max_iter=max_iter,
                class_weight=class_weight
            )
            
            # Debug: Print shapes
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            
            # Train model
            model.fit(X_train, y_train)
            
            # Use validation set to evaluate
            val_pred_prob = model.predict_proba(X_val)[:, 1]
            val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            
            # Evaluate on test set
            y_pred_prob = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Print the coefs - now we have two coefficients for [Title, Full_text]
            print(f"Model coefficients: {model.coef_[0]}")
            print(f"Model intercept: {model.intercept_[0]:.6f}")
            
            # Print confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            print("Confusion Matrix:")
            print(cm)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Predicted class distribution
            print(f"Predicted class distribution: {np.bincount(y_pred)}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_prob': y_pred_prob,
                'accuracy': accuracy,
                'auc': auc,
                'confusion_matrix': cm,
                'coefficients': model.coef_[0],
                'intercept': model.intercept_[0]
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize confusion matrix for this layer
            self.visualize_confusion_matrix(
                cm,
                approach,
                label_type,
                i+1
            )
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid layers to calculate average metrics")
            self.results[combination_key]['avg_accuracy'] = np.nan
            self.results[combination_key]['avg_auc'] = np.nan
        
        return self
    
    def visualize_confusion_matrix(self, cm, approach, label_type, layer_num):
        """
        Visualize confusion matrix for a specific model and layer.
        
        Args:
            cm: Confusion matrix
            approach: Sentiment analysis approach
            label_type: Label type (Short-term or Long-term)
            layer_num: Layer number
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"Confusion Matrix: {approach} - Merged - {label_type} (Layer {layer_num})")
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        
        # Save figure
        save_path = f'Merged_Sentiment_Logistic_Plots/CVX/visualizations/confusion_matrices/{approach}_Merged_{label_type}_layer{layer_num}.png'
        plt.savefig(save_path)
        plt.close()
    
    def run_all_combinations(self, use_class_weights=False):
        """
        Run the logistic regression analysis for all combinations of merged sentiment approaches and label columns.
        
        Args:
            use_class_weights: Whether to use balanced class weights
        """
        # Define all combinations - now only looping through approaches and label types
        combinations = []
        
        for approach in self.sentiment_approaches:
            for label_type in ['S_label', 'L_label']:
                sentiment_col = f"Merged_{approach}_score_norm"
                if sentiment_col in self.data.columns:
                    combinations.append((sentiment_col, label_type))
        
        # Run analysis for each combination
        for sentiment_col, label_col in combinations:
            # Train model with or without class weights
            class_weight = 'balanced' if use_class_weights else None
            
            print(f"\n{'='*80}")
            print(f"TRAINING LOGISTIC REGRESSION MODEL FOR {sentiment_col} - {label_col}")
            print(f"{'='*80}")
            
            self.train_and_evaluate(
                sentiment_column=sentiment_col, 
                label_column=label_col,
                class_weight=class_weight
            )
        
        # Create summary visualizations
        self.create_summary_visualizations()
        
        # Print final summary
        self.print_summary()
        
        return self
    
    def create_summary_visualizations(self):
        """Create summary visualizations comparing model performances."""
        self._create_performance_by_approach_visualizations()
    
    def _create_performance_by_approach_visualizations(self):
        """Create visualizations comparing performance by approach."""
        # Collect all results
        approaches = []
        combinations = []
        accuracies = []
        aucs = []
        
        for combo_key, results in self.results.items():
            if 'avg_accuracy' in results:
                # Parse combination key
                parts = combo_key.split('_')
                approach = parts[0]
                label_type = '_'.join(parts[1:])
                
                approaches.append(approach)
                combinations.append(f"{approach}_{label_type}")
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            return
        
        # Create dataframe for plotting
        df = pd.DataFrame({
            'Approach': approaches,
            'Combination': combinations,
            'Accuracy': accuracies,
            'AUC': aucs
        })
        
        # Plot accuracy comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='Accuracy', hue='Combination', data=df)
        plt.title('Accuracy by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/CVX/visualizations/Accuracy_performance_comparison.png')
        plt.close()
        
        # Plot AUC comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='AUC', hue='Combination', data=df)
        plt.title('AUC by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/CVX/visualizations/AUC_performance_comparison.png')
        plt.close()
        
        # Also create a coefficient analysis visualization
        self._create_coefficient_analysis()
    
    def _create_coefficient_analysis(self):
        """Create visualization showing the relative importance of Title vs Full_text in each approach."""
        coef_data = []
        
        for combo_key, results in self.results.items():
            for layer_result in results.get('layer_results', []):
                if 'coefficients' in layer_result and len(layer_result['coefficients']) == 2:
                    parts = combo_key.split('_')
                    approach = parts[0]
                    label_type = '_'.join(parts[1:])
                    
                    coef_data.append({
                        'Approach': approach,
                        'Label Type': label_type,
                        'Layer': f"Layer {layer_result['layer']}",
                        'Title Coefficient': layer_result['coefficients'][0],
                        'Full_text Coefficient': layer_result['coefficients'][1],
                        'Title to Full_text Ratio': abs(layer_result['coefficients'][0] / 
                                                      (layer_result['coefficients'][1] 
                                                       if layer_result['coefficients'][1] != 0 else 1e-6))
                    })
        
        if not coef_data:
            return
        
        df = pd.DataFrame(coef_data)
        
        # Plot coefficient comparison
        plt.figure(figsize=(14, 10))
        
        plt.subplot(2, 1, 1)
        sns.barplot(x='Approach', y='Title Coefficient', hue='Label Type', data=df)
        plt.title('Title Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.subplot(2, 1, 2)
        sns.barplot(x='Approach', y='Full_text Coefficient', hue='Label Type', data=df)
        plt.title('Full_text Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.tight_layout()
        plt.savefig('Merged_Sentiment_Logistic_Plots/CVX/visualizations/Coefficient_comparison.png')
        plt.close()
    
    def print_summary(self):
        """Print a summary of all results."""
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - CVX)")
        print("="*80)
        
        # Organize results by approach
        for approach in self.sentiment_approaches:
            print(f"\nApproach: {approach}")
            print("-" * 40)
            
            for label_type in ['Short-term', 'Long-term']:
                label_col = 'S_label' if label_type == 'Short-term' else 'L_label'
                combination_key = f"{approach}_{label_type}"
                
                if combination_key in self.results and 'avg_accuracy' in self.results[combination_key]:
                    avg_accuracy = self.results[combination_key]['avg_accuracy']
                    avg_auc = self.results[combination_key]['avg_auc']
                    
                    print(f"Merged Vector + {label_type}:")
                    print(f"  Average Accuracy: {avg_accuracy:.4f}")
                    print(f"  Average AUC: {avg_auc:.4f}")
                    
                    # Print layer-specific results
                    for i, layer_result in enumerate(self.results[combination_key].get('layer_results', [])):
                        accuracy = layer_result['accuracy']
                        auc = layer_result['auc']
                        coefficients = layer_result.get('coefficients', 'N/A')
                        intercept = layer_result.get('intercept', 'N/A')
                        
                        # More readable coefficient display - showing Title and Full_text components
                        coef_str = "N/A"
                        if isinstance(coefficients, np.ndarray) and len(coefficients) == 2:
                            coef_str = f"Title: {coefficients[0]:.4f}, Full_text: {coefficients[1]:.4f}"
                        
                        print(f"    Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                        print(f"    Layer {i+1} - Coefficients: {coef_str}, Intercept: {intercept}")
            
            print()
        
        # Find best overall combination
        best_accuracy = 0
        best_auc = 0
        best_accuracy_combo = None
        best_auc_combo = None
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results:
                if results['avg_accuracy'] > best_accuracy:
                    best_accuracy = results['avg_accuracy']
                    best_accuracy_combo = combo
                
                if results['avg_auc'] > best_auc:
                    best_auc = results['avg_auc']
                    best_auc_combo = combo
        
        print("\nBest Overall Combinations:")
        print(f"Best Accuracy: {best_accuracy:.4f} - {best_accuracy_combo}")
        print(f"Best AUC: {best_auc:.4f} - {best_auc_combo}")
        
        return self

# Main execution
if __name__ == "__main__":
    # Initialize predictor with the new CSV file
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_sentiment/us_news_sentiment_database_part1_CVX.csv')
    
    # Run prediction pipeline
    # First run without class weights
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS")
    print("="*80)
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=False)
    
    # Then run with class weights to address potential class imbalance
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITH BALANCED CLASS WEIGHTS")
    print("="*80)
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_sentiment/us_news_sentiment_database_part1_CVX.csv')
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=True)


RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS
Loaded 929 financial news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 487, 0: 442}
Class distribution for long-term prediction: {1: 512, 0: 417}

Processing merged sentiment scores...
Parsed Merged_McDonld_score using ast.literal_eval
Parsed Merged_FinBERT_ProsusAI_score using ast.literal_eval
Parsed Merged_FinBERT_yiyang_score using ast.literal_eval
Parsed Merged_FinGPT_score using ast.literal_eval
Parsed Merged_Majority_vote_mean_score using ast.literal_eval
Parsed Merged_Fino1_score using ast.literal_eval

Normalizing merged sentiment scores...
Merged_McDonld_score (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score (Full_text component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_ProsusAI_score (Title component)


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (L_label): {1: 377, 0: 308}
Val class distribution (L_label): {1: 51, 0: 15}
Test class distribution (L_label): {1: 66, 0: 42}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.7727
Test Accuracy for Layer 2: 0.6111
Test AUC for Layer 2: 0.4760
Model coefficients: [-0.07185638  0.02510221]
Model intercept: 0.189237
Confusion Matrix:
[[ 0 42]
 [ 0 66]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.61      1.00      0.76        66

    accuracy                           0.61       108
   macro avg       0.31      0.50      0.38       108
weighted avg       0.37      0.61      0.46       108

Predicted class distribution: [  0 108


Average Test Accuracy across all layers: 0.5564
Average Test AUC across all layers: 0.5406

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinBERT_yiyang_score_norm - S_label

Training logistic regression model for FinBERT_yiyang merged approach, Short-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (S_label): {1: 268, 0: 254}
Val class distribution (S_label): {1: 86, 0: 77}
Test class distribution (S_label): {1: 41, 0: 25}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.6012
Test Accuracy for Layer 1: 0.5758
Test AUC for Layer 1: 0.4468
Model coefficients: [ 0.19017253 -0.29601403]
Model intercept: 0.055003
Confusion Matrix:
[[ 8 17]
 [11 30]]

Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.32      0.36        2

Validation Accuracy: 0.6212
Test Accuracy for Layer 2: 0.5648
Test AUC for Layer 2: 0.5403
Model coefficients: [-0.13712476  0.43311534]
Model intercept: 0.467846
Confusion Matrix:
[[ 0 47]
 [ 0 61]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        47
           1       0.56      1.00      0.72        61

    accuracy                           0.56       108
   macro avg       0.28      0.50      0.36       108
weighted avg       0.32      0.56      0.41       108

Predicted class distribution: [  0 108]

Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (S_label): {1: 395, 0: 356}
Val class distribution (S_label): {1: 61, 0: 47}
Test class distribution (S_label): {0: 38, 1: 31}
X_train shape: (751, 2), y_train shape: (751,


Average Test Accuracy across all layers: 0.5213
Average Test AUC across all layers: 0.4670

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_Majority_vote_mean_score_norm - L_label

Training logistic regression model for Majority_vote_mean merged approach, Long-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (L_label): {1: 277, 0: 245}
Val class distribution (L_label): {1: 100, 0: 63}
Test class distribution (L_label): {1: 51, 0: 15}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.5706
Test Accuracy for Layer 1: 0.6667
Test AUC for Layer 1: 0.5869
Model coefficients: [-0.19582024  0.37036113]
Model intercept: 0.222423
Confusion Matrix:
[[ 4 11]
 [11 40]]

Classification Report:
              precision    recall  f1-score   support

           0       0.27      0.27      0.27 


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (L_label): {1: 428, 0: 323}
Val class distribution (L_label): {1: 66, 0: 42}
Test class distribution (L_label): {0: 51, 1: 18}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.6111
Test Accuracy for Layer 3: 0.2609
Test AUC for Layer 3: 0.5163
Model coefficients: [ 0.05710643 -0.09116148]
Model intercept: 0.260294
Confusion Matrix:
[[ 0 51]
 [ 0 18]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        51
           1       0.26      1.00      0.41        18

    accuracy                           0.26        69
   macro avg       0.13      0.50      0.21        69
weighted avg       0.07      0.26      0.11        69

Predicted class distribution: [ 0 69]



Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (S_label): {1: 354, 0: 331}
Val class distribution (S_label): {1: 41, 0: 25}
Test class distribution (S_label): {1: 61, 0: 47}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.5758
Test Accuracy for Layer 2: 0.4074
Test AUC for Layer 2: 0.3887
Model coefficients: [-0.11495065  0.32654874]
Model intercept: 0.109201
Confusion Matrix:
[[19 28]
 [36 25]]

Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.40      0.37        47
           1       0.47      0.41      0.44        61

    accuracy                           0.41       108
   macro avg       0.41      0.41      0.41       108
weighted avg       0.42      0.41      0.41       108

Predicted class distribution: [55 53]



Average Test Accuracy across all layers: 0.4450
Average Test AUC across all layers: 0.4381

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinBERT_ProsusAI_score_norm - L_label

Training logistic regression model for FinBERT_ProsusAI merged approach, Long-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (L_label): {1: 277, 0: 245}
Val class distribution (L_label): {1: 100, 0: 63}
Test class distribution (L_label): {1: 51, 0: 15}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.4908
Test Accuracy for Layer 1: 0.4848
Test AUC for Layer 1: 0.6327
Model coefficients: [0.04846929 0.25563562]
Model intercept: 0.021423
Confusion Matrix:
[[ 9  6]
 [28 23]]

Classification Report:
              precision    recall  f1-score   support

           0       0.24      0.60      0.35       


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (L_label): {1: 428, 0: 323}
Val class distribution (L_label): {1: 66, 0: 42}
Test class distribution (L_label): {0: 51, 1: 18}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.5556
Test Accuracy for Layer 3: 0.4638
Test AUC for Layer 3: 0.5545
Model coefficients: [0.14299547 0.07952096]
Model intercept: 0.038304
Confusion Matrix:
[[18 33]
 [ 4 14]]

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.35      0.49        51
           1       0.30      0.78      0.43        18

    accuracy                           0.46        69
   macro avg       0.56      0.57      0.46        69
weighted avg       0.68      0.46      0.48        69

Predicted class distribution: [22 47]

A


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (S_label): {1: 354, 0: 331}
Val class distribution (S_label): {1: 41, 0: 25}
Test class distribution (S_label): {1: 61, 0: 47}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.4545
Test Accuracy for Layer 2: 0.5278
Test AUC for Layer 2: 0.5183
Model coefficients: [ 0.16699006 -0.29103277]
Model intercept: -0.077867
Confusion Matrix:
[[20 27]
 [24 37]]

Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.43      0.44        47
           1       0.58      0.61      0.59        61

    accuracy                           0.53       108
   macro avg       0.52      0.52      0.52       108
weighted avg       0.52      0.53      0.53       108

Predicted class distribution: [44 64]


Average Test Accuracy across all layers: 0.4980
Average Test AUC across all layers: 0.4683

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_Fino1_score_norm - L_label

Training logistic regression model for Fino1 merged approach, Long-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (L_label): {1: 277, 0: 245}
Val class distribution (L_label): {1: 100, 0: 63}
Test class distribution (L_label): {1: 51, 0: 15}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.3804
Test Accuracy for Layer 1: 0.2879
Test AUC for Layer 1: 0.4569
Model coefficients: [-0.16310203  0.03443958]
Model intercept: 0.016320
Confusion Matrix:
[[14  1]
 [46  5]]

Classification Report:
              precision    recall  f1-score   support

           0       0.23      0.93      0.37        15
           1    

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import os
from datetime import datetime
import ast  # For parsing string representations of lists
import warnings
warnings.filterwarnings('ignore')

class MergedSentimentStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor using logistic regression for merged financial sentiment analysis.
        
        Args:
            csv_path: Path to the CSV file containing merged sentiment score vectors and stock labels
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        self.sentiment_approaches = [
            'McDonld', 'FinBERT_ProsusAI', 'FinBERT_yiyang', 
            'FinGPT', 'Majority_vote_mean', 'Fino1'
        ]
        self.scalers = {}
        
        # Create directories for visualizations
        os.makedirs('Merged_Sentiment_Logistic_Plots/MPC/visualizations', exist_ok=True)
        os.makedirs('Merged_Sentiment_Logistic_Plots/MPC/visualizations/confusion_matrices', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing merged sentiment scores and stock labels."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='%d/%m/%Y')
        self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='%d/%m/%Y')
        self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='%d/%m/%Y')
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Display initial data info
        print(f"Loaded {len(self.data)} financial news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        
        # Show class distribution for each label type
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        # Process merged sentiment score columns
        self._process_merged_scores()
        
        return self
    
    def _process_merged_scores(self):
        """
        Process and normalize the merged sentiment score columns.
        Each merged score is a vector [Title_score, Full_text_score]
        """
        print("\nProcessing merged sentiment scores...")
        
        # Get all merged sentiment score columns
        merged_columns = [f"Merged_{approach}_score" for approach in self.sentiment_approaches]
        
        # Check if these columns exist in the data
        existing_columns = [col for col in merged_columns if col in self.data.columns]
        
        if not existing_columns:
            raise ValueError("No merged sentiment score columns found in the data")
        
        # First, parse the string vectors into numerical arrays
        for col in existing_columns:
            # Check the format of the data to determine parsing method
            sample_value = self.data[col].iloc[0]
            
            # If values are already numerical arrays, we don't need to parse
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"Column {col} already contains numerical arrays")
                continue
                
            # If values are stored as strings, parse them
            try:
                # Try parsing as literal Python representation
                self.data[col] = self.data[col].apply(ast.literal_eval)
                print(f"Parsed {col} using ast.literal_eval")
            except (ValueError, SyntaxError):
                try:
                    # Alternative: try parsing as comma-separated values
                    self.data[col] = self.data[col].str.strip('[]').str.split(',').apply(
                        lambda x: [float(val.strip()) for val in x]
                    )
                    print(f"Parsed {col} as comma-separated values")
                except Exception as e:
                    raise ValueError(f"Could not parse sentiment vectors in column {col}: {e}")
        
        # Create two separate normalized versions of each element in the vector
        print("\nNormalizing merged sentiment scores...")
        for col in existing_columns:
            # Extract Title and Full_text components
            title_scores = np.array([vec[0] for vec in self.data[col]])
            fulltext_scores = np.array([vec[1] for vec in self.data[col]])
            
            # Display original score ranges
            print(f"{col} (Title component): min={title_scores.min():.4f}, max={title_scores.max():.4f}")
            print(f"{col} (Full_text component): min={fulltext_scores.min():.4f}, max={fulltext_scores.max():.4f}")
            
            # Normalize each component separately
            title_scaler = MinMaxScaler(feature_range=(-1, 1))
            fulltext_scaler = MinMaxScaler(feature_range=(-1, 1))
            
            title_norm = title_scaler.fit_transform(title_scores.reshape(-1, 1)).flatten()
            fulltext_norm = fulltext_scaler.fit_transform(fulltext_scores.reshape(-1, 1)).flatten()
            
            # Store scalers for potential later use
            self.scalers[f"{col}_title"] = title_scaler
            self.scalers[f"{col}_fulltext"] = fulltext_scaler
            
            # Create normalized vectors
            self.data[f"{col}_norm"] = [
                [title_norm[i], fulltext_norm[i]] for i in range(len(title_norm))
            ]
            
            # Display normalized score ranges
            print(f"{col}_norm (Title component): min={title_norm.min():.4f}, max={title_norm.max():.4f}")
            print(f"{col}_norm (Full_text component): min={fulltext_norm.min():.4f}, max={fulltext_norm.max():.4f}")
        
        # Print sample of normalized vectors
        print("\nSample of normalized merged sentiment vectors (first 3 rows):")
        for col in [f"{col}_norm" for col in existing_columns]:
            print(f"{col}: {self.data[col].iloc[:3].tolist()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, sentiment_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            sentiment_col: The column containing the merged sentiment score vectors
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            Split datasets and corresponding indices
        """
        # Create masks for each time period
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        # Get data for each period
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Additional debug information: class distribution in each split
        print(f"Train class distribution ({label_col}): {train_data[label_col].value_counts().to_dict()}")
        print(f"Val class distribution ({label_col}): {val_data[label_col].value_counts().to_dict()}")
        print(f"Test class distribution ({label_col}): {test_data[label_col].value_counts().to_dict()}")
        
        # Extract merged sentiment vectors and convert to numpy arrays
        X_train = np.array([vec for vec in train_data[sentiment_col]])
        X_val = np.array([vec for vec in val_data[sentiment_col]])
        X_test = np.array([vec for vec in test_data[sentiment_col]])
        
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        # Get indices for later reference
        train_indices = train_data.index
        val_indices = val_data.index
        test_indices = test_data.index
        
        return (X_train, y_train, train_indices), (X_val, y_val, val_indices), (X_test, y_test, test_indices)
    
    def create_logistic_model(self, C=1.0, solver='liblinear', max_iter=1000, class_weight=None):
        """
        Create a logistic regression model for prediction.
        
        Args:
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
            
        Returns:
            Logistic regression model
        """
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=class_weight,
            random_state=42
        )
    
    def train_and_evaluate(self, sentiment_column, label_column, C=1.0, solver='liblinear', 
                          max_iter=1000, class_weight=None):
        """
        Train and evaluate logistic regression model for a specific merged sentiment column and label column.
        
        Args:
            sentiment_column: The merged sentiment score column to use
            label_column: The label column to use ('S_label' or 'L_label')
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
        """
        # Determine sentiment approach
        approach = None
        for app in self.sentiment_approaches:
            if app in sentiment_column:
                approach = app
                break
        
        if approach is None:
            raise ValueError(f"Could not determine sentiment approach from column name: {sentiment_column}")
        
        # Determine label type
        if label_column == 'S_label':
            label_type = 'Short-term'
        else:
            label_type = 'Long-term'
        
        # Store results
        combination_key = f"{approach}_{label_type}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training logistic regression model for {approach} merged approach, {label_type}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data for this layer
            (X_train, y_train, train_indices), \
            (X_val, y_val, val_indices), \
            (X_test, y_test, test_indices) = self.split_data(layer, sentiment_column, label_column)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or \
               len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_logistic_model(
                C=C,
                solver=solver,
                max_iter=max_iter,
                class_weight=class_weight
            )
            
            # Debug: Print shapes
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            
            # Train model
            model.fit(X_train, y_train)
            
            # Use validation set to evaluate
            val_pred_prob = model.predict_proba(X_val)[:, 1]
            val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            
            # Evaluate on test set
            y_pred_prob = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Print the coefs - now we have two coefficients for [Title, Full_text]
            print(f"Model coefficients: {model.coef_[0]}")
            print(f"Model intercept: {model.intercept_[0]:.6f}")
            
            # Print confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            print("Confusion Matrix:")
            print(cm)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Predicted class distribution
            print(f"Predicted class distribution: {np.bincount(y_pred)}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_prob': y_pred_prob,
                'accuracy': accuracy,
                'auc': auc,
                'confusion_matrix': cm,
                'coefficients': model.coef_[0],
                'intercept': model.intercept_[0]
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize confusion matrix for this layer
            self.visualize_confusion_matrix(
                cm,
                approach,
                label_type,
                i+1
            )
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid layers to calculate average metrics")
            self.results[combination_key]['avg_accuracy'] = np.nan
            self.results[combination_key]['avg_auc'] = np.nan
        
        return self
    
    def visualize_confusion_matrix(self, cm, approach, label_type, layer_num):
        """
        Visualize confusion matrix for a specific model and layer.
        
        Args:
            cm: Confusion matrix
            approach: Sentiment analysis approach
            label_type: Label type (Short-term or Long-term)
            layer_num: Layer number
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"Confusion Matrix: {approach} - Merged - {label_type} (Layer {layer_num})")
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        
        # Save figure
        save_path = f'Merged_Sentiment_Logistic_Plots/MPC/visualizations/confusion_matrices/{approach}_Merged_{label_type}_layer{layer_num}.png'
        plt.savefig(save_path)
        plt.close()
    
    def run_all_combinations(self, use_class_weights=False):
        """
        Run the logistic regression analysis for all combinations of merged sentiment approaches and label columns.
        
        Args:
            use_class_weights: Whether to use balanced class weights
        """
        # Define all combinations - now only looping through approaches and label types
        combinations = []
        
        for approach in self.sentiment_approaches:
            for label_type in ['S_label', 'L_label']:
                sentiment_col = f"Merged_{approach}_score_norm"
                if sentiment_col in self.data.columns:
                    combinations.append((sentiment_col, label_type))
        
        # Run analysis for each combination
        for sentiment_col, label_col in combinations:
            # Train model with or without class weights
            class_weight = 'balanced' if use_class_weights else None
            
            print(f"\n{'='*80}")
            print(f"TRAINING LOGISTIC REGRESSION MODEL FOR {sentiment_col} - {label_col}")
            print(f"{'='*80}")
            
            self.train_and_evaluate(
                sentiment_column=sentiment_col, 
                label_column=label_col,
                class_weight=class_weight
            )
        
        # Create summary visualizations
        self.create_summary_visualizations()
        
        # Print final summary
        self.print_summary()
        
        return self
    
    def create_summary_visualizations(self):
        """Create summary visualizations comparing model performances."""
        self._create_performance_by_approach_visualizations()
    
    def _create_performance_by_approach_visualizations(self):
        """Create visualizations comparing performance by approach."""
        # Collect all results
        approaches = []
        combinations = []
        accuracies = []
        aucs = []
        
        for combo_key, results in self.results.items():
            if 'avg_accuracy' in results:
                # Parse combination key
                parts = combo_key.split('_')
                approach = parts[0]
                label_type = '_'.join(parts[1:])
                
                approaches.append(approach)
                combinations.append(f"{approach}_{label_type}")
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            return
        
        # Create dataframe for plotting
        df = pd.DataFrame({
            'Approach': approaches,
            'Combination': combinations,
            'Accuracy': accuracies,
            'AUC': aucs
        })
        
        # Plot accuracy comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='Accuracy', hue='Combination', data=df)
        plt.title('Accuracy by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/MPC/visualizations/Accuracy_performance_comparison.png')
        plt.close()
        
        # Plot AUC comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='AUC', hue='Combination', data=df)
        plt.title('AUC by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/MPC/visualizations/AUC_performance_comparison.png')
        plt.close()
        
        # Also create a coefficient analysis visualization
        self._create_coefficient_analysis()
    
    def _create_coefficient_analysis(self):
        """Create visualization showing the relative importance of Title vs Full_text in each approach."""
        coef_data = []
        
        for combo_key, results in self.results.items():
            for layer_result in results.get('layer_results', []):
                if 'coefficients' in layer_result and len(layer_result['coefficients']) == 2:
                    parts = combo_key.split('_')
                    approach = parts[0]
                    label_type = '_'.join(parts[1:])
                    
                    coef_data.append({
                        'Approach': approach,
                        'Label Type': label_type,
                        'Layer': f"Layer {layer_result['layer']}",
                        'Title Coefficient': layer_result['coefficients'][0],
                        'Full_text Coefficient': layer_result['coefficients'][1],
                        'Title to Full_text Ratio': abs(layer_result['coefficients'][0] / 
                                                      (layer_result['coefficients'][1] 
                                                       if layer_result['coefficients'][1] != 0 else 1e-6))
                    })
        
        if not coef_data:
            return
        
        df = pd.DataFrame(coef_data)
        
        # Plot coefficient comparison
        plt.figure(figsize=(14, 10))
        
        plt.subplot(2, 1, 1)
        sns.barplot(x='Approach', y='Title Coefficient', hue='Label Type', data=df)
        plt.title('Title Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.subplot(2, 1, 2)
        sns.barplot(x='Approach', y='Full_text Coefficient', hue='Label Type', data=df)
        plt.title('Full_text Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.tight_layout()
        plt.savefig('Merged_Sentiment_Logistic_Plots/MPC/visualizations/Coefficient_comparison.png')
        plt.close()
    
    def print_summary(self):
        """Print a summary of all results."""
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - MPC)")
        print("="*80)
        
        # Organize results by approach
        for approach in self.sentiment_approaches:
            print(f"\nApproach: {approach}")
            print("-" * 40)
            
            for label_type in ['Short-term', 'Long-term']:
                label_col = 'S_label' if label_type == 'Short-term' else 'L_label'
                combination_key = f"{approach}_{label_type}"
                
                if combination_key in self.results and 'avg_accuracy' in self.results[combination_key]:
                    avg_accuracy = self.results[combination_key]['avg_accuracy']
                    avg_auc = self.results[combination_key]['avg_auc']
                    
                    print(f"Merged Vector + {label_type}:")
                    print(f"  Average Accuracy: {avg_accuracy:.4f}")
                    print(f"  Average AUC: {avg_auc:.4f}")
                    
                    # Print layer-specific results
                    for i, layer_result in enumerate(self.results[combination_key].get('layer_results', [])):
                        accuracy = layer_result['accuracy']
                        auc = layer_result['auc']
                        coefficients = layer_result.get('coefficients', 'N/A')
                        intercept = layer_result.get('intercept', 'N/A')
                        
                        # More readable coefficient display - showing Title and Full_text components
                        coef_str = "N/A"
                        if isinstance(coefficients, np.ndarray) and len(coefficients) == 2:
                            coef_str = f"Title: {coefficients[0]:.4f}, Full_text: {coefficients[1]:.4f}"
                        
                        print(f"    Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                        print(f"    Layer {i+1} - Coefficients: {coef_str}, Intercept: {intercept}")
            
            print()
        
        # Find best overall combination
        best_accuracy = 0
        best_auc = 0
        best_accuracy_combo = None
        best_auc_combo = None
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results:
                if results['avg_accuracy'] > best_accuracy:
                    best_accuracy = results['avg_accuracy']
                    best_accuracy_combo = combo
                
                if results['avg_auc'] > best_auc:
                    best_auc = results['avg_auc']
                    best_auc_combo = combo
        
        print("\nBest Overall Combinations:")
        print(f"Best Accuracy: {best_accuracy:.4f} - {best_accuracy_combo}")
        print(f"Best AUC: {best_auc:.4f} - {best_auc_combo}")
        
        return self

# Main execution
if __name__ == "__main__":
    # Initialize predictor with the new CSV file
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_sentiment/us_news_sentiment_database_part1_MPC.csv')
    
    # Run prediction pipeline
    # First run without class weights
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS")
    print("="*80)
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=False)
    
    # Then run with class weights to address potential class imbalance
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITH BALANCED CLASS WEIGHTS")
    print("="*80)
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_sentiment/us_news_sentiment_database_part1_MPC.csv')
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=True)


RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS
Loaded 929 financial news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 474, 0: 455}
Class distribution for long-term prediction: {1: 536, 0: 393}

Processing merged sentiment scores...
Parsed Merged_McDonld_score using ast.literal_eval
Parsed Merged_FinBERT_ProsusAI_score using ast.literal_eval
Parsed Merged_FinBERT_yiyang_score using ast.literal_eval
Parsed Merged_FinGPT_score using ast.literal_eval
Parsed Merged_Majority_vote_mean_score using ast.literal_eval
Parsed Merged_Fino1_score using ast.literal_eval

Normalizing merged sentiment scores...
Merged_McDonld_score (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score (Full_text component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_ProsusAI_score (Title component)


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (L_label): {1: 427, 0: 324}
Val class distribution (L_label): {1: 75, 0: 33}
Test class distribution (L_label): {0: 36, 1: 33}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.6852
Test Accuracy for Layer 3: 0.4783
Test AUC for Layer 3: 0.5404
Model coefficients: [-0.08674216 -0.28378946]
Model intercept: 0.120562
Confusion Matrix:
[[ 1 35]
 [ 1 32]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.03      0.05        36
           1       0.48      0.97      0.64        33

    accuracy                           0.48        69
   macro avg       0.49      0.50      0.35        69
weighted avg       0.49      0.48      0.33        69

Predicted class distribution: [ 2 67]



Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (S_label): {0: 347, 1: 338}
Val class distribution (S_label): {1: 39, 0: 27}
Test class distribution (S_label): {1: 61, 0: 47}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.4848
Test Accuracy for Layer 2: 0.4815
Test AUC for Layer 2: 0.4656
Model coefficients: [ 0.23689508 -0.06461779]
Model intercept: 0.012306
Confusion Matrix:
[[15 32]
 [24 37]]

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.32      0.35        47
           1       0.54      0.61      0.57        61

    accuracy                           0.48       108
   macro avg       0.46      0.46      0.46       108
weighted avg       0.47      0.48      0.47       108

Predicted class distribution: [39 69]



Average Test Accuracy across all layers: 0.4317
Average Test AUC across all layers: 0.4607

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinGPT_score_norm - L_label

Training logistic regression model for FinGPT merged approach, Long-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (L_label): {1: 275, 0: 247}
Val class distribution (L_label): {1: 95, 0: 68}
Test class distribution (L_label): {1: 57, 0: 9}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.5828
Test Accuracy for Layer 1: 0.8636
Test AUC for Layer 1: 0.4474
Model coefficients: [0.03630876 0.02118867]
Model intercept: 0.129238
Confusion Matrix:
[[ 0  9]
 [ 0 57]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1      


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (L_label): {1: 370, 0: 315}
Val class distribution (L_label): {1: 57, 0: 9}
Test class distribution (L_label): {1: 75, 0: 33}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.8333
Test Accuracy for Layer 2: 0.6481
Test AUC for Layer 2: 0.4756
Model coefficients: [ 0.16647627 -0.4228803 ]
Model intercept: 0.030997
Confusion Matrix:
[[ 0 33]
 [ 5 70]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        33
           1       0.68      0.93      0.79        75

    accuracy                           0.65       108
   macro avg       0.34      0.47      0.39       108
weighted avg       0.47      0.65      0.55       108

Predicted class distribution: [  5 103]


Average Test Accuracy across all layers: 0.6401
Average Test AUC across all layers: 0.5329

SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - MPC)

Approach: McDonld
----------------------------------------
Merged Vector + Short-term:
  Average Accuracy: 0.5208
  Average AUC: 0.5152
    Layer 1 - Accuracy: 0.5000, AUC: 0.5385
    Layer 1 - Coefficients: Title: -0.1493, Full_text: 0.1696, Intercept: 0.020767981626001162
    Layer 2 - Accuracy: 0.4537, AUC: 0.4288
    Layer 2 - Coefficients: Title: -0.0543, Full_text: 0.1275, Intercept: 0.013403012807583322
    Layer 3 - Accuracy: 0.6087, AUC: 0.5783
    Layer 3 - Coefficients: Title: -0.0509, Full_text: 0.2172, Intercept: 0.08926938396064579
Merged Vector + Long-term:
  Average Accuracy: 0.6524
  Average AUC: 0.5356
    Layer 1 - Accuracy: 0.8030, AUC: 0.6140
    Layer 1 - Coefficients: Title: -0.0416, Full_text: -0.2903, Intercept: -0.02864162142836628
    Layer 2 - Accuracy: 0.6759, AUC: 0.4523
    Layer 2 - Coefficients: Title: -0.11

Merged_FinBERT_yiyang_score (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_yiyang_score_norm (Title component): min=-1.0000, max=1.0000
Merged_FinBERT_yiyang_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinGPT_score (Title component): min=-1.0000, max=1.0000
Merged_FinGPT_score (Full_text component): min=-1.0000, max=1.0000
Merged_FinGPT_score_norm (Title component): min=-1.0000, max=1.0000
Merged_FinGPT_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_Majority_vote_mean_score (Title component): min=-1.0000, max=0.9680
Merged_Majority_vote_mean_score (Full_text component): min=-0.9559, max=0.6690
Merged_Majority_vote_mean_score_norm (Title component): min=-1.0000, max=1.0000
Merged_Majority_vote_mean_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_Fino1_score (Title component): min=-0.8000, max=0.0000
Merged_Fino1_score (Full_text component): min=-1.0000, max=1.0000
Merged_Fino1_score_norm (Title component): min=-1.000


Average Test Accuracy across all layers: 0.5401
Average Test AUC across all layers: 0.5346

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinBERT_ProsusAI_score_norm - S_label

Training logistic regression model for FinBERT_ProsusAI merged approach, Short-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (S_label): {0: 262, 1: 260}
Val class distribution (S_label): {0: 85, 1: 78}
Test class distribution (S_label): {1: 39, 0: 27}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.5583
Test Accuracy for Layer 1: 0.3485
Test AUC for Layer 1: 0.3523
Model coefficients: [ 0.01535782 -0.06321856]
Model intercept: -0.003329
Confusion Matrix:
[[ 6 21]
 [22 17]]

Classification Report:
              precision    recall  f1-score   support

           0       0.21      0.22      0.22    


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (S_label): {1: 377, 0: 374}
Val class distribution (S_label): {1: 61, 0: 47}
Test class distribution (S_label): {1: 36, 0: 33}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.4630
Test Accuracy for Layer 3: 0.4203
Test AUC for Layer 3: 0.4798
Model coefficients: [ 0.20239081 -0.02531682]
Model intercept: 0.036618
Confusion Matrix:
[[ 8 25]
 [15 21]]

Classification Report:
              precision    recall  f1-score   support

           0       0.35      0.24      0.29        33
           1       0.46      0.58      0.51        36

    accuracy                           0.42        69
   macro avg       0.40      0.41      0.40        69
weighted avg       0.40      0.42      0.40        69

Predicted class distribution: [23 46]



Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (L_label): {1: 427, 0: 324}
Val class distribution (L_label): {1: 75, 0: 33}
Test class distribution (L_label): {0: 36, 1: 33}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.3704
Test Accuracy for Layer 3: 0.4783
Test AUC for Layer 3: 0.4865
Model coefficients: [-0.14163941 -0.11650782]
Model intercept: -0.121143
Confusion Matrix:
[[30  6]
 [30  3]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.83      0.62        36
           1       0.33      0.09      0.14        33

    accuracy                           0.48        69
   macro avg       0.42      0.46      0.38        69
weighted avg       0.42      0.48      0.39        69

Predicted class distribution: [60  9]


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (S_label): {0: 347, 1: 338}
Val class distribution (S_label): {1: 39, 0: 27}
Test class distribution (S_label): {1: 61, 0: 47}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.4242
Test Accuracy for Layer 2: 0.4537
Test AUC for Layer 2: 0.4667
Model coefficients: [ 0.1093974  -0.36101236]
Model intercept: -0.153059
Confusion Matrix:
[[33 14]
 [45 16]]

Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.70      0.53        47
           1       0.53      0.26      0.35        61

    accuracy                           0.45       108
   macro avg       0.48      0.48      0.44       108
weighted avg       0.49      0.45      0.43       108

Predicted class distribution: [78 30]

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import os
from datetime import datetime
import ast  # For parsing string representations of lists
import warnings
warnings.filterwarnings('ignore')

class MergedSentimentStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor using logistic regression for merged financial sentiment analysis.
        
        Args:
            csv_path: Path to the CSV file containing merged sentiment score vectors and stock labels
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        self.sentiment_approaches = [
            'McDonld', 'FinBERT_ProsusAI', 'FinBERT_yiyang', 
            'FinGPT', 'Majority_vote_mean', 'Fino1'
        ]
        self.scalers = {}
        
        # Create directories for visualizations
        os.makedirs('Merged_Sentiment_Logistic_Plots/SLB/visualizations', exist_ok=True)
        os.makedirs('Merged_Sentiment_Logistic_Plots/SLB/visualizations/confusion_matrices', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing merged sentiment scores and stock labels."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='%d/%m/%Y')
        self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='%d/%m/%Y')
        self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='%d/%m/%Y')
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Display initial data info
        print(f"Loaded {len(self.data)} financial news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        
        # Show class distribution for each label type
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        # Process merged sentiment score columns
        self._process_merged_scores()
        
        return self
    
    def _process_merged_scores(self):
        """
        Process and normalize the merged sentiment score columns.
        Each merged score is a vector [Title_score, Full_text_score]
        """
        print("\nProcessing merged sentiment scores...")
        
        # Get all merged sentiment score columns
        merged_columns = [f"Merged_{approach}_score" for approach in self.sentiment_approaches]
        
        # Check if these columns exist in the data
        existing_columns = [col for col in merged_columns if col in self.data.columns]
        
        if not existing_columns:
            raise ValueError("No merged sentiment score columns found in the data")
        
        # First, parse the string vectors into numerical arrays
        for col in existing_columns:
            # Check the format of the data to determine parsing method
            sample_value = self.data[col].iloc[0]
            
            # If values are already numerical arrays, we don't need to parse
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"Column {col} already contains numerical arrays")
                continue
                
            # If values are stored as strings, parse them
            try:
                # Try parsing as literal Python representation
                self.data[col] = self.data[col].apply(ast.literal_eval)
                print(f"Parsed {col} using ast.literal_eval")
            except (ValueError, SyntaxError):
                try:
                    # Alternative: try parsing as comma-separated values
                    self.data[col] = self.data[col].str.strip('[]').str.split(',').apply(
                        lambda x: [float(val.strip()) for val in x]
                    )
                    print(f"Parsed {col} as comma-separated values")
                except Exception as e:
                    raise ValueError(f"Could not parse sentiment vectors in column {col}: {e}")
        
        # Create two separate normalized versions of each element in the vector
        print("\nNormalizing merged sentiment scores...")
        for col in existing_columns:
            # Extract Title and Full_text components
            title_scores = np.array([vec[0] for vec in self.data[col]])
            fulltext_scores = np.array([vec[1] for vec in self.data[col]])
            
            # Display original score ranges
            print(f"{col} (Title component): min={title_scores.min():.4f}, max={title_scores.max():.4f}")
            print(f"{col} (Full_text component): min={fulltext_scores.min():.4f}, max={fulltext_scores.max():.4f}")
            
            # Normalize each component separately
            title_scaler = MinMaxScaler(feature_range=(-1, 1))
            fulltext_scaler = MinMaxScaler(feature_range=(-1, 1))
            
            title_norm = title_scaler.fit_transform(title_scores.reshape(-1, 1)).flatten()
            fulltext_norm = fulltext_scaler.fit_transform(fulltext_scores.reshape(-1, 1)).flatten()
            
            # Store scalers for potential later use
            self.scalers[f"{col}_title"] = title_scaler
            self.scalers[f"{col}_fulltext"] = fulltext_scaler
            
            # Create normalized vectors
            self.data[f"{col}_norm"] = [
                [title_norm[i], fulltext_norm[i]] for i in range(len(title_norm))
            ]
            
            # Display normalized score ranges
            print(f"{col}_norm (Title component): min={title_norm.min():.4f}, max={title_norm.max():.4f}")
            print(f"{col}_norm (Full_text component): min={fulltext_norm.min():.4f}, max={fulltext_norm.max():.4f}")
        
        # Print sample of normalized vectors
        print("\nSample of normalized merged sentiment vectors (first 3 rows):")
        for col in [f"{col}_norm" for col in existing_columns]:
            print(f"{col}: {self.data[col].iloc[:3].tolist()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, sentiment_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            sentiment_col: The column containing the merged sentiment score vectors
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            Split datasets and corresponding indices
        """
        # Create masks for each time period
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        # Get data for each period
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Additional debug information: class distribution in each split
        print(f"Train class distribution ({label_col}): {train_data[label_col].value_counts().to_dict()}")
        print(f"Val class distribution ({label_col}): {val_data[label_col].value_counts().to_dict()}")
        print(f"Test class distribution ({label_col}): {test_data[label_col].value_counts().to_dict()}")
        
        # Extract merged sentiment vectors and convert to numpy arrays
        X_train = np.array([vec for vec in train_data[sentiment_col]])
        X_val = np.array([vec for vec in val_data[sentiment_col]])
        X_test = np.array([vec for vec in test_data[sentiment_col]])
        
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        # Get indices for later reference
        train_indices = train_data.index
        val_indices = val_data.index
        test_indices = test_data.index
        
        return (X_train, y_train, train_indices), (X_val, y_val, val_indices), (X_test, y_test, test_indices)
    
    def create_logistic_model(self, C=1.0, solver='liblinear', max_iter=1000, class_weight=None):
        """
        Create a logistic regression model for prediction.
        
        Args:
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
            
        Returns:
            Logistic regression model
        """
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=class_weight,
            random_state=42
        )
    
    def train_and_evaluate(self, sentiment_column, label_column, C=1.0, solver='liblinear', 
                          max_iter=1000, class_weight=None):
        """
        Train and evaluate logistic regression model for a specific merged sentiment column and label column.
        
        Args:
            sentiment_column: The merged sentiment score column to use
            label_column: The label column to use ('S_label' or 'L_label')
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
        """
        # Determine sentiment approach
        approach = None
        for app in self.sentiment_approaches:
            if app in sentiment_column:
                approach = app
                break
        
        if approach is None:
            raise ValueError(f"Could not determine sentiment approach from column name: {sentiment_column}")
        
        # Determine label type
        if label_column == 'S_label':
            label_type = 'Short-term'
        else:
            label_type = 'Long-term'
        
        # Store results
        combination_key = f"{approach}_{label_type}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training logistic regression model for {approach} merged approach, {label_type}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data for this layer
            (X_train, y_train, train_indices), \
            (X_val, y_val, val_indices), \
            (X_test, y_test, test_indices) = self.split_data(layer, sentiment_column, label_column)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or \
               len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_logistic_model(
                C=C,
                solver=solver,
                max_iter=max_iter,
                class_weight=class_weight
            )
            
            # Debug: Print shapes
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            
            # Train model
            model.fit(X_train, y_train)
            
            # Use validation set to evaluate
            val_pred_prob = model.predict_proba(X_val)[:, 1]
            val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            
            # Evaluate on test set
            y_pred_prob = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Print the coefs - now we have two coefficients for [Title, Full_text]
            print(f"Model coefficients: {model.coef_[0]}")
            print(f"Model intercept: {model.intercept_[0]:.6f}")
            
            # Print confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            print("Confusion Matrix:")
            print(cm)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Predicted class distribution
            print(f"Predicted class distribution: {np.bincount(y_pred)}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_prob': y_pred_prob,
                'accuracy': accuracy,
                'auc': auc,
                'confusion_matrix': cm,
                'coefficients': model.coef_[0],
                'intercept': model.intercept_[0]
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize confusion matrix for this layer
            self.visualize_confusion_matrix(
                cm,
                approach,
                label_type,
                i+1
            )
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid layers to calculate average metrics")
            self.results[combination_key]['avg_accuracy'] = np.nan
            self.results[combination_key]['avg_auc'] = np.nan
        
        return self
    
    def visualize_confusion_matrix(self, cm, approach, label_type, layer_num):
        """
        Visualize confusion matrix for a specific model and layer.
        
        Args:
            cm: Confusion matrix
            approach: Sentiment analysis approach
            label_type: Label type (Short-term or Long-term)
            layer_num: Layer number
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"Confusion Matrix: {approach} - Merged - {label_type} (Layer {layer_num})")
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        
        # Save figure
        save_path = f'Merged_Sentiment_Logistic_Plots/SLB/visualizations/confusion_matrices/{approach}_Merged_{label_type}_layer{layer_num}.png'
        plt.savefig(save_path)
        plt.close()
    
    def run_all_combinations(self, use_class_weights=False):
        """
        Run the logistic regression analysis for all combinations of merged sentiment approaches and label columns.
        
        Args:
            use_class_weights: Whether to use balanced class weights
        """
        # Define all combinations - now only looping through approaches and label types
        combinations = []
        
        for approach in self.sentiment_approaches:
            for label_type in ['S_label', 'L_label']:
                sentiment_col = f"Merged_{approach}_score_norm"
                if sentiment_col in self.data.columns:
                    combinations.append((sentiment_col, label_type))
        
        # Run analysis for each combination
        for sentiment_col, label_col in combinations:
            # Train model with or without class weights
            class_weight = 'balanced' if use_class_weights else None
            
            print(f"\n{'='*80}")
            print(f"TRAINING LOGISTIC REGRESSION MODEL FOR {sentiment_col} - {label_col}")
            print(f"{'='*80}")
            
            self.train_and_evaluate(
                sentiment_column=sentiment_col, 
                label_column=label_col,
                class_weight=class_weight
            )
        
        # Create summary visualizations
        self.create_summary_visualizations()
        
        # Print final summary
        self.print_summary()
        
        return self
    
    def create_summary_visualizations(self):
        """Create summary visualizations comparing model performances."""
        self._create_performance_by_approach_visualizations()
    
    def _create_performance_by_approach_visualizations(self):
        """Create visualizations comparing performance by approach."""
        # Collect all results
        approaches = []
        combinations = []
        accuracies = []
        aucs = []
        
        for combo_key, results in self.results.items():
            if 'avg_accuracy' in results:
                # Parse combination key
                parts = combo_key.split('_')
                approach = parts[0]
                label_type = '_'.join(parts[1:])
                
                approaches.append(approach)
                combinations.append(f"{approach}_{label_type}")
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            return
        
        # Create dataframe for plotting
        df = pd.DataFrame({
            'Approach': approaches,
            'Combination': combinations,
            'Accuracy': accuracies,
            'AUC': aucs
        })
        
        # Plot accuracy comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='Accuracy', hue='Combination', data=df)
        plt.title('Accuracy by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/SLB/visualizations/Accuracy_performance_comparison.png')
        plt.close()
        
        # Plot AUC comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='AUC', hue='Combination', data=df)
        plt.title('AUC by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/SLB/visualizations/AUC_performance_comparison.png')
        plt.close()
        
        # Also create a coefficient analysis visualization
        self._create_coefficient_analysis()
    
    def _create_coefficient_analysis(self):
        """Create visualization showing the relative importance of Title vs Full_text in each approach."""
        coef_data = []
        
        for combo_key, results in self.results.items():
            for layer_result in results.get('layer_results', []):
                if 'coefficients' in layer_result and len(layer_result['coefficients']) == 2:
                    parts = combo_key.split('_')
                    approach = parts[0]
                    label_type = '_'.join(parts[1:])
                    
                    coef_data.append({
                        'Approach': approach,
                        'Label Type': label_type,
                        'Layer': f"Layer {layer_result['layer']}",
                        'Title Coefficient': layer_result['coefficients'][0],
                        'Full_text Coefficient': layer_result['coefficients'][1],
                        'Title to Full_text Ratio': abs(layer_result['coefficients'][0] / 
                                                      (layer_result['coefficients'][1] 
                                                       if layer_result['coefficients'][1] != 0 else 1e-6))
                    })
        
        if not coef_data:
            return
        
        df = pd.DataFrame(coef_data)
        
        # Plot coefficient comparison
        plt.figure(figsize=(14, 10))
        
        plt.subplot(2, 1, 1)
        sns.barplot(x='Approach', y='Title Coefficient', hue='Label Type', data=df)
        plt.title('Title Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.subplot(2, 1, 2)
        sns.barplot(x='Approach', y='Full_text Coefficient', hue='Label Type', data=df)
        plt.title('Full_text Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.tight_layout()
        plt.savefig('Merged_Sentiment_Logistic_Plots/SLB/visualizations/Coefficient_comparison.png')
        plt.close()
    
    def print_summary(self):
        """Print a summary of all results."""
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - SLB)")
        print("="*80)
        
        # Organize results by approach
        for approach in self.sentiment_approaches:
            print(f"\nApproach: {approach}")
            print("-" * 40)
            
            for label_type in ['Short-term', 'Long-term']:
                label_col = 'S_label' if label_type == 'Short-term' else 'L_label'
                combination_key = f"{approach}_{label_type}"
                
                if combination_key in self.results and 'avg_accuracy' in self.results[combination_key]:
                    avg_accuracy = self.results[combination_key]['avg_accuracy']
                    avg_auc = self.results[combination_key]['avg_auc']
                    
                    print(f"Merged Vector + {label_type}:")
                    print(f"  Average Accuracy: {avg_accuracy:.4f}")
                    print(f"  Average AUC: {avg_auc:.4f}")
                    
                    # Print layer-specific results
                    for i, layer_result in enumerate(self.results[combination_key].get('layer_results', [])):
                        accuracy = layer_result['accuracy']
                        auc = layer_result['auc']
                        coefficients = layer_result.get('coefficients', 'N/A')
                        intercept = layer_result.get('intercept', 'N/A')
                        
                        # More readable coefficient display - showing Title and Full_text components
                        coef_str = "N/A"
                        if isinstance(coefficients, np.ndarray) and len(coefficients) == 2:
                            coef_str = f"Title: {coefficients[0]:.4f}, Full_text: {coefficients[1]:.4f}"
                        
                        print(f"    Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                        print(f"    Layer {i+1} - Coefficients: {coef_str}, Intercept: {intercept}")
            
            print()
        
        # Find best overall combination
        best_accuracy = 0
        best_auc = 0
        best_accuracy_combo = None
        best_auc_combo = None
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results:
                if results['avg_accuracy'] > best_accuracy:
                    best_accuracy = results['avg_accuracy']
                    best_accuracy_combo = combo
                
                if results['avg_auc'] > best_auc:
                    best_auc = results['avg_auc']
                    best_auc_combo = combo
        
        print("\nBest Overall Combinations:")
        print(f"Best Accuracy: {best_accuracy:.4f} - {best_accuracy_combo}")
        print(f"Best AUC: {best_auc:.4f} - {best_auc_combo}")
        
        return self

# Main execution
if __name__ == "__main__":
    # Initialize predictor with the new CSV file
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_sentiment/us_news_sentiment_database_part1_SLB.csv')
    
    # Run prediction pipeline
    # First run without class weights
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS")
    print("="*80)
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=False)
    
    # Then run with class weights to address potential class imbalance
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITH BALANCED CLASS WEIGHTS")
    print("="*80)
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_sentiment/us_news_sentiment_database_part1_SLB.csv')
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=True)


RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS
Loaded 929 financial news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 472, 0: 457}
Class distribution for long-term prediction: {1: 484, 0: 445}

Processing merged sentiment scores...
Parsed Merged_McDonld_score using ast.literal_eval
Parsed Merged_FinBERT_ProsusAI_score using ast.literal_eval
Parsed Merged_FinBERT_yiyang_score using ast.literal_eval
Parsed Merged_FinGPT_score using ast.literal_eval
Parsed Merged_Majority_vote_mean_score using ast.literal_eval
Parsed Merged_Fino1_score using ast.literal_eval

Normalizing merged sentiment scores...
Merged_McDonld_score (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score (Full_text component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_ProsusAI_score (Title component)


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (L_label): {1: 391, 0: 360}
Val class distribution (L_label): {1: 72, 0: 36}
Test class distribution (L_label): {0: 49, 1: 20}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.6296
Test Accuracy for Layer 3: 0.2899
Test AUC for Layer 3: 0.3714
Model coefficients: [-0.13668636  0.09809186]
Model intercept: 0.083064
Confusion Matrix:
[[ 2 47]
 [ 2 18]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.04      0.08        49
           1       0.28      0.90      0.42        20

    accuracy                           0.29        69
   macro avg       0.39      0.47      0.25        69
weighted avg       0.44      0.29      0.18        69

Predicted class distribution: [ 4 65]



Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (S_label): {0: 343, 1: 342}
Val class distribution (S_label): {0: 34, 1: 32}
Test class distribution (S_label): {1: 58, 0: 50}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.5000
Test Accuracy for Layer 2: 0.4537
Test AUC for Layer 2: 0.4348
Model coefficients: [ 0.40717238 -0.07459759]
Model intercept: 0.067620
Confusion Matrix:
[[10 40]
 [19 39]]

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.20      0.25        50
           1       0.49      0.67      0.57        58

    accuracy                           0.45       108
   macro avg       0.42      0.44      0.41       108
weighted avg       0.42      0.45      0.42       108

Predicted class distribution: [29 79]


Validation Accuracy: 0.5278
Test Accuracy for Layer 3: 0.5072
Test AUC for Layer 3: 0.4685
Model coefficients: [0.18176357 0.00328432]
Model intercept: 0.008862
Confusion Matrix:
[[ 2 27]
 [ 7 33]]

Classification Report:
              precision    recall  f1-score   support

           0       0.22      0.07      0.11        29
           1       0.55      0.82      0.66        40

    accuracy                           0.51        69
   macro avg       0.39      0.45      0.38        69
weighted avg       0.41      0.51      0.43        69

Predicted class distribution: [ 9 60]

Average Test Accuracy across all layers: 0.5066
Average Test AUC across all layers: 0.4748

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinGPT_score_norm - L_label

Training logistic regression model for FinGPT merged approach, Long-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation 


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (L_label): {1: 350, 0: 335}
Val class distribution (L_label): {1: 41, 0: 25}
Test class distribution (L_label): {1: 72, 0: 36}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.5909
Test Accuracy for Layer 2: 0.6759
Test AUC for Layer 2: 0.4996
Model coefficients: [-0.07039875  0.12627479]
Model intercept: 0.077835
Confusion Matrix:
[[ 6 30]
 [ 5 67]]

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.17      0.26        36
           1       0.69      0.93      0.79        72

    accuracy                           0.68       108
   macro avg       0.62      0.55      0.52       108
weighted avg       0.64      0.68      0.61       108

Predicted class distribution: [11 97]



Average Test Accuracy across all layers: 0.4854
Average Test AUC across all layers: 0.4966

SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - SLB)

Approach: McDonld
----------------------------------------
Merged Vector + Short-term:
  Average Accuracy: 0.5136
  Average AUC: 0.5240
    Layer 1 - Accuracy: 0.5152, AUC: 0.5772
    Layer 1 - Coefficients: Title: 0.1693, Full_text: 0.2691, Intercept: 0.21724368815127693
    Layer 2 - Accuracy: 0.5185, AUC: 0.5267
    Layer 2 - Coefficients: Title: 0.2749, Full_text: 0.3096, Intercept: 0.22350576227684849
    Layer 3 - Accuracy: 0.5072, AUC: 0.4681
    Layer 3 - Coefficients: Title: 0.2012, Full_text: 0.4761, Intercept: 0.2693580061715589
Merged Vector + Long-term:
  Average Accuracy: 0.4459
  Average AUC: 0.3933
    Layer 1 - Accuracy: 0.5758, AUC: 0.3620
    Layer 1 - Coefficients: Title: -0.0094, Full_text: -0.2708, Intercept: 0.011903773817910877
    Layer 2 - Accuracy: 0.4722, AUC: 0.4466
    Layer 2 - Coefficients: Title: -0.1677, Fu


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (S_label): {0: 343, 1: 342}
Val class distribution (S_label): {0: 34, 1: 32}
Test class distribution (S_label): {1: 58, 0: 50}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.5152
Test Accuracy for Layer 2: 0.5185
Test AUC for Layer 2: 0.5267
Model coefficients: [0.27490321 0.3095421 ]
Model intercept: 0.226384
Confusion Matrix:
[[25 25]
 [27 31]]

Classification Report:
              precision    recall  f1-score   support

           0       0.48      0.50      0.49        50
           1       0.55      0.53      0.54        58

    accuracy                           0.52       108
   macro avg       0.52      0.52      0.52       108
weighted avg       0.52      0.52      0.52       108

Predicted class distribution: [52 56]

L


Average Test Accuracy across all layers: 0.5233
Average Test AUC across all layers: 0.5106

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinBERT_ProsusAI_score_norm - L_label

Training logistic regression model for FinBERT_ProsusAI merged approach, Long-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (L_label): {1: 278, 0: 244}
Val class distribution (L_label): {0: 91, 1: 72}
Test class distribution (L_label): {1: 41, 0: 25}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.4663
Test Accuracy for Layer 1: 0.5758
Test AUC for Layer 1: 0.6263
Model coefficients: [ 0.28270034 -0.00638018]
Model intercept: 0.019536
Confusion Matrix:
[[13 12]
 [16 25]]

Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.52      0.48      


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (L_label): {1: 350, 0: 335}
Val class distribution (L_label): {1: 41, 0: 25}
Test class distribution (L_label): {1: 72, 0: 36}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.6061
Test Accuracy for Layer 2: 0.5463
Test AUC for Layer 2: 0.4738
Model coefficients: [ 0.20970939 -0.0910232 ]
Model intercept: 0.030131
Confusion Matrix:
[[ 9 27]
 [22 50]]

Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.25      0.27        36
           1       0.65      0.69      0.67        72

    accuracy                           0.55       108
   macro avg       0.47      0.47      0.47       108
weighted avg       0.53      0.55      0.54       108

Predicted class distribution: [31 77]



Average Test Accuracy across all layers: 0.4592
Average Test AUC across all layers: 0.4786

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_Majority_vote_mean_score_norm - S_label

Training logistic regression model for Majority_vote_mean merged approach, Short-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (S_label): {1: 268, 0: 254}
Val class distribution (S_label): {0: 89, 1: 74}
Test class distribution (S_label): {0: 34, 1: 32}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.4724
Test Accuracy for Layer 1: 0.5909
Test AUC for Layer 1: 0.6342
Model coefficients: [0.00990782 0.29818715]
Model intercept: 0.113170
Confusion Matrix:
[[24 10]
 [17 15]]

Classification Report:
              precision    recall  f1-score   support

           0       0.59      0.71      0.64   


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (S_label): {0: 377, 1: 374}
Val class distribution (S_label): {1: 58, 0: 50}
Test class distribution (S_label): {1: 40, 0: 29}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.5463
Test Accuracy for Layer 3: 0.4928
Test AUC for Layer 3: 0.4405
Model coefficients: [0.25311709 0.49291268]
Model intercept: -0.156450
Confusion Matrix:
[[ 6 23]
 [12 28]]

Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.21      0.26        29
           1       0.55      0.70      0.62        40

    accuracy                           0.49        69
   macro avg       0.44      0.45      0.44        69
weighted avg       0.46      0.49      0.46        69

Predicted class distribution: [18 51]



In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report
import os
from datetime import datetime
import ast  # For parsing string representations of lists
import warnings
warnings.filterwarnings('ignore')

class MergedSentimentStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor using logistic regression for merged financial sentiment analysis.
        
        Args:
            csv_path: Path to the CSV file containing merged sentiment score vectors and stock labels
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.results = {}
        self.sentiment_approaches = [
            'McDonld', 'FinBERT_ProsusAI', 'FinBERT_yiyang', 
            'FinGPT', 'Majority_vote_mean', 'Fino1'
        ]
        self.scalers = {}
        
        # Create directories for visualizations
        os.makedirs('Merged_Sentiment_Logistic_Plots/XOM/visualizations', exist_ok=True)
        os.makedirs('Merged_Sentiment_Logistic_Plots/XOM/visualizations/confusion_matrices', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing merged sentiment scores and stock labels."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects
        self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='%d/%m/%Y')
        self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='%d/%m/%Y')
        self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='%d/%m/%Y')
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        # Display initial data info
        print(f"Loaded {len(self.data)} financial news articles spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        
        # Show class distribution for each label type
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        # Process merged sentiment score columns
        self._process_merged_scores()
        
        return self
    
    def _process_merged_scores(self):
        """
        Process and normalize the merged sentiment score columns.
        Each merged score is a vector [Title_score, Full_text_score]
        """
        print("\nProcessing merged sentiment scores...")
        
        # Get all merged sentiment score columns
        merged_columns = [f"Merged_{approach}_score" for approach in self.sentiment_approaches]
        
        # Check if these columns exist in the data
        existing_columns = [col for col in merged_columns if col in self.data.columns]
        
        if not existing_columns:
            raise ValueError("No merged sentiment score columns found in the data")
        
        # First, parse the string vectors into numerical arrays
        for col in existing_columns:
            # Check the format of the data to determine parsing method
            sample_value = self.data[col].iloc[0]
            
            # If values are already numerical arrays, we don't need to parse
            if isinstance(sample_value, (list, np.ndarray)):
                print(f"Column {col} already contains numerical arrays")
                continue
                
            # If values are stored as strings, parse them
            try:
                # Try parsing as literal Python representation
                self.data[col] = self.data[col].apply(ast.literal_eval)
                print(f"Parsed {col} using ast.literal_eval")
            except (ValueError, SyntaxError):
                try:
                    # Alternative: try parsing as comma-separated values
                    self.data[col] = self.data[col].str.strip('[]').str.split(',').apply(
                        lambda x: [float(val.strip()) for val in x]
                    )
                    print(f"Parsed {col} as comma-separated values")
                except Exception as e:
                    raise ValueError(f"Could not parse sentiment vectors in column {col}: {e}")
        
        # Create two separate normalized versions of each element in the vector
        print("\nNormalizing merged sentiment scores...")
        for col in existing_columns:
            # Extract Title and Full_text components
            title_scores = np.array([vec[0] for vec in self.data[col]])
            fulltext_scores = np.array([vec[1] for vec in self.data[col]])
            
            # Display original score ranges
            print(f"{col} (Title component): min={title_scores.min():.4f}, max={title_scores.max():.4f}")
            print(f"{col} (Full_text component): min={fulltext_scores.min():.4f}, max={fulltext_scores.max():.4f}")
            
            # Normalize each component separately
            title_scaler = MinMaxScaler(feature_range=(-1, 1))
            fulltext_scaler = MinMaxScaler(feature_range=(-1, 1))
            
            title_norm = title_scaler.fit_transform(title_scores.reshape(-1, 1)).flatten()
            fulltext_norm = fulltext_scaler.fit_transform(fulltext_scores.reshape(-1, 1)).flatten()
            
            # Store scalers for potential later use
            self.scalers[f"{col}_title"] = title_scaler
            self.scalers[f"{col}_fulltext"] = fulltext_scaler
            
            # Create normalized vectors
            self.data[f"{col}_norm"] = [
                [title_norm[i], fulltext_norm[i]] for i in range(len(title_norm))
            ]
            
            # Display normalized score ranges
            print(f"{col}_norm (Title component): min={title_norm.min():.4f}, max={title_norm.max():.4f}")
            print(f"{col}_norm (Full_text component): min={fulltext_norm.min():.4f}, max={fulltext_norm.max():.4f}")
        
        # Print sample of normalized vectors
        print("\nSample of normalized merged sentiment vectors (first 3 rows):")
        for col in [f"{col}_norm" for col in existing_columns]:
            print(f"{col}: {self.data[col].iloc[:3].tolist()}")
        
        return self
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, sentiment_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            sentiment_col: The column containing the merged sentiment score vectors
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            Split datasets and corresponding indices
        """
        # Create masks for each time period
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        # Get data for each period
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Additional debug information: class distribution in each split
        print(f"Train class distribution ({label_col}): {train_data[label_col].value_counts().to_dict()}")
        print(f"Val class distribution ({label_col}): {val_data[label_col].value_counts().to_dict()}")
        print(f"Test class distribution ({label_col}): {test_data[label_col].value_counts().to_dict()}")
        
        # Extract merged sentiment vectors and convert to numpy arrays
        X_train = np.array([vec for vec in train_data[sentiment_col]])
        X_val = np.array([vec for vec in val_data[sentiment_col]])
        X_test = np.array([vec for vec in test_data[sentiment_col]])
        
        y_train = train_data[label_col].values
        y_val = val_data[label_col].values
        y_test = test_data[label_col].values
        
        # Get indices for later reference
        train_indices = train_data.index
        val_indices = val_data.index
        test_indices = test_data.index
        
        return (X_train, y_train, train_indices), (X_val, y_val, val_indices), (X_test, y_test, test_indices)
    
    def create_logistic_model(self, C=1.0, solver='liblinear', max_iter=1000, class_weight=None):
        """
        Create a logistic regression model for prediction.
        
        Args:
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
            
        Returns:
            Logistic regression model
        """
        return LogisticRegression(
            C=C,
            solver=solver,
            max_iter=max_iter,
            class_weight=class_weight,
            random_state=42
        )
    
    def train_and_evaluate(self, sentiment_column, label_column, C=1.0, solver='liblinear', 
                          max_iter=1000, class_weight=None):
        """
        Train and evaluate logistic regression model for a specific merged sentiment column and label column.
        
        Args:
            sentiment_column: The merged sentiment score column to use
            label_column: The label column to use ('S_label' or 'L_label')
            C: Inverse of regularization strength
            solver: Algorithm for optimization
            max_iter: Maximum number of iterations
            class_weight: Weights for classes (None, 'balanced', or dict)
        """
        # Determine sentiment approach
        approach = None
        for app in self.sentiment_approaches:
            if app in sentiment_column:
                approach = app
                break
        
        if approach is None:
            raise ValueError(f"Could not determine sentiment approach from column name: {sentiment_column}")
        
        # Determine label type
        if label_column == 'S_label':
            label_type = 'Short-term'
        else:
            label_type = 'Long-term'
        
        # Store results
        combination_key = f"{approach}_{label_type}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training logistic regression model for {approach} merged approach, {label_type}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data for this layer
            (X_train, y_train, train_indices), \
            (X_val, y_val, val_indices), \
            (X_test, y_test, test_indices) = self.split_data(layer, sentiment_column, label_column)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or \
               len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            model = self.create_logistic_model(
                C=C,
                solver=solver,
                max_iter=max_iter,
                class_weight=class_weight
            )
            
            # Debug: Print shapes
            print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
            
            # Train model
            model.fit(X_train, y_train)
            
            # Use validation set to evaluate
            val_pred_prob = model.predict_proba(X_val)[:, 1]
            val_pred = model.predict(X_val)
            val_accuracy = accuracy_score(y_val, val_pred)
            
            print(f"Validation Accuracy: {val_accuracy:.4f}")
            
            # Evaluate on test set
            y_pred_prob = model.predict_proba(X_test)[:, 1]
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_prob)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Print the coefs - now we have two coefficients for [Title, Full_text]
            print(f"Model coefficients: {model.coef_[0]}")
            print(f"Model intercept: {model.intercept_[0]:.6f}")
            
            # Print confusion matrix
            cm = confusion_matrix(y_test, y_pred)
            print("Confusion Matrix:")
            print(cm)
            
            # Print classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Predicted class distribution
            print(f"Predicted class distribution: {np.bincount(y_pred)}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_prob': y_pred_prob,
                'accuracy': accuracy,
                'auc': auc,
                'confusion_matrix': cm,
                'coefficients': model.coef_[0],
                'intercept': model.intercept_[0]
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize confusion matrix for this layer
            self.visualize_confusion_matrix(
                cm,
                approach,
                label_type,
                i+1
            )
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid layers to calculate average metrics")
            self.results[combination_key]['avg_accuracy'] = np.nan
            self.results[combination_key]['avg_auc'] = np.nan
        
        return self
    
    def visualize_confusion_matrix(self, cm, approach, label_type, layer_num):
        """
        Visualize confusion matrix for a specific model and layer.
        
        Args:
            cm: Confusion matrix
            approach: Sentiment analysis approach
            label_type: Label type (Short-term or Long-term)
            layer_num: Layer number
        """
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
        plt.title(f"Confusion Matrix: {approach} - Merged - {label_type} (Layer {layer_num})")
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        
        # Save figure
        save_path = f'Merged_Sentiment_Logistic_Plots/XOM/visualizations/confusion_matrices/{approach}_Merged_{label_type}_layer{layer_num}.png'
        plt.savefig(save_path)
        plt.close()
    
    def run_all_combinations(self, use_class_weights=False):
        """
        Run the logistic regression analysis for all combinations of merged sentiment approaches and label columns.
        
        Args:
            use_class_weights: Whether to use balanced class weights
        """
        # Define all combinations - now only looping through approaches and label types
        combinations = []
        
        for approach in self.sentiment_approaches:
            for label_type in ['S_label', 'L_label']:
                sentiment_col = f"Merged_{approach}_score_norm"
                if sentiment_col in self.data.columns:
                    combinations.append((sentiment_col, label_type))
        
        # Run analysis for each combination
        for sentiment_col, label_col in combinations:
            # Train model with or without class weights
            class_weight = 'balanced' if use_class_weights else None
            
            print(f"\n{'='*80}")
            print(f"TRAINING LOGISTIC REGRESSION MODEL FOR {sentiment_col} - {label_col}")
            print(f"{'='*80}")
            
            self.train_and_evaluate(
                sentiment_column=sentiment_col, 
                label_column=label_col,
                class_weight=class_weight
            )
        
        # Create summary visualizations
        self.create_summary_visualizations()
        
        # Print final summary
        self.print_summary()
        
        return self
    
    def create_summary_visualizations(self):
        """Create summary visualizations comparing model performances."""
        self._create_performance_by_approach_visualizations()
    
    def _create_performance_by_approach_visualizations(self):
        """Create visualizations comparing performance by approach."""
        # Collect all results
        approaches = []
        combinations = []
        accuracies = []
        aucs = []
        
        for combo_key, results in self.results.items():
            if 'avg_accuracy' in results:
                # Parse combination key
                parts = combo_key.split('_')
                approach = parts[0]
                label_type = '_'.join(parts[1:])
                
                approaches.append(approach)
                combinations.append(f"{approach}_{label_type}")
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            return
        
        # Create dataframe for plotting
        df = pd.DataFrame({
            'Approach': approaches,
            'Combination': combinations,
            'Accuracy': accuracies,
            'AUC': aucs
        })
        
        # Plot accuracy comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='Accuracy', hue='Combination', data=df)
        plt.title('Accuracy by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/XOM/visualizations/Accuracy_performance_comparison.png')
        plt.close()
        
        # Plot AUC comparison
        plt.figure(figsize=(14, 8))
        sns.barplot(x='Approach', y='AUC', hue='Combination', data=df)
        plt.title('AUC by Merged Approach and Prediction Term')
        plt.ylim(0, 0.8)
        plt.xticks(rotation=0)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.savefig('Merged_Sentiment_Logistic_Plots/XOM/visualizations/AUC_performance_comparison.png')
        plt.close()
        
        # Also create a coefficient analysis visualization
        self._create_coefficient_analysis()
    
    def _create_coefficient_analysis(self):
        """Create visualization showing the relative importance of Title vs Full_text in each approach."""
        coef_data = []
        
        for combo_key, results in self.results.items():
            for layer_result in results.get('layer_results', []):
                if 'coefficients' in layer_result and len(layer_result['coefficients']) == 2:
                    parts = combo_key.split('_')
                    approach = parts[0]
                    label_type = '_'.join(parts[1:])
                    
                    coef_data.append({
                        'Approach': approach,
                        'Label Type': label_type,
                        'Layer': f"Layer {layer_result['layer']}",
                        'Title Coefficient': layer_result['coefficients'][0],
                        'Full_text Coefficient': layer_result['coefficients'][1],
                        'Title to Full_text Ratio': abs(layer_result['coefficients'][0] / 
                                                      (layer_result['coefficients'][1] 
                                                       if layer_result['coefficients'][1] != 0 else 1e-6))
                    })
        
        if not coef_data:
            return
        
        df = pd.DataFrame(coef_data)
        
        # Plot coefficient comparison
        plt.figure(figsize=(14, 10))
        
        plt.subplot(2, 1, 1)
        sns.barplot(x='Approach', y='Title Coefficient', hue='Label Type', data=df)
        plt.title('Title Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.subplot(2, 1, 2)
        sns.barplot(x='Approach', y='Full_text Coefficient', hue='Label Type', data=df)
        plt.title('Full_text Component Coefficient by Approach')
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.legend(title='Prediction Term')
        
        plt.tight_layout()
        plt.savefig('Merged_Sentiment_Logistic_Plots/XOM/visualizations/Coefficient_comparison.png')
        plt.close()
    
    def print_summary(self):
        """Print a summary of all results."""
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - XOM)")
        print("="*80)
        
        # Organize results by approach
        for approach in self.sentiment_approaches:
            print(f"\nApproach: {approach}")
            print("-" * 40)
            
            for label_type in ['Short-term', 'Long-term']:
                label_col = 'S_label' if label_type == 'Short-term' else 'L_label'
                combination_key = f"{approach}_{label_type}"
                
                if combination_key in self.results and 'avg_accuracy' in self.results[combination_key]:
                    avg_accuracy = self.results[combination_key]['avg_accuracy']
                    avg_auc = self.results[combination_key]['avg_auc']
                    
                    print(f"Merged Vector + {label_type}:")
                    print(f"  Average Accuracy: {avg_accuracy:.4f}")
                    print(f"  Average AUC: {avg_auc:.4f}")
                    
                    # Print layer-specific results
                    for i, layer_result in enumerate(self.results[combination_key].get('layer_results', [])):
                        accuracy = layer_result['accuracy']
                        auc = layer_result['auc']
                        coefficients = layer_result.get('coefficients', 'N/A')
                        intercept = layer_result.get('intercept', 'N/A')
                        
                        # More readable coefficient display - showing Title and Full_text components
                        coef_str = "N/A"
                        if isinstance(coefficients, np.ndarray) and len(coefficients) == 2:
                            coef_str = f"Title: {coefficients[0]:.4f}, Full_text: {coefficients[1]:.4f}"
                        
                        print(f"    Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                        print(f"    Layer {i+1} - Coefficients: {coef_str}, Intercept: {intercept}")
            
            print()
        
        # Find best overall combination
        best_accuracy = 0
        best_auc = 0
        best_accuracy_combo = None
        best_auc_combo = None
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results:
                if results['avg_accuracy'] > best_accuracy:
                    best_accuracy = results['avg_accuracy']
                    best_accuracy_combo = combo
                
                if results['avg_auc'] > best_auc:
                    best_auc = results['avg_auc']
                    best_auc_combo = combo
        
        print("\nBest Overall Combinations:")
        print(f"Best Accuracy: {best_accuracy:.4f} - {best_accuracy_combo}")
        print(f"Best AUC: {best_auc:.4f} - {best_auc_combo}")
        
        return self

# Main execution
if __name__ == "__main__":
    # Initialize predictor with the new CSV file
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_sentiment/us_news_sentiment_database_part1_XOM.csv')
    
    # Run prediction pipeline
    # First run without class weights
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS")
    print("="*80)
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=False)
    
    # Then run with class weights to address potential class imbalance
    print("\n" + "="*80)
    print("RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITH BALANCED CLASS WEIGHTS")
    print("="*80)
    predictor = MergedSentimentStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_sentiment/us_news_sentiment_database_part1_XOM.csv')
    predictor.load_data().define_time_windows().run_all_combinations(use_class_weights=True)


RUNNING LOGISTIC REGRESSION WITH MERGED VECTORS WITHOUT CLASS WEIGHTS
Loaded 929 financial news articles spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {0: 471, 1: 458}
Class distribution for long-term prediction: {1: 498, 0: 431}

Processing merged sentiment scores...
Parsed Merged_McDonld_score using ast.literal_eval
Parsed Merged_FinBERT_ProsusAI_score using ast.literal_eval
Parsed Merged_FinBERT_yiyang_score using ast.literal_eval
Parsed Merged_FinGPT_score using ast.literal_eval
Parsed Merged_Majority_vote_mean_score using ast.literal_eval
Parsed Merged_Fino1_score using ast.literal_eval

Normalizing merged sentiment scores...
Merged_McDonld_score (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score (Full_text component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Title component): min=-1.0000, max=1.0000
Merged_McDonld_score_norm (Full_text component): min=-1.0000, max=1.0000
Merged_FinBERT_ProsusAI_score (Title component)


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (L_label): {1: 401, 0: 350}
Val class distribution (L_label): {1: 69, 0: 39}
Test class distribution (L_label): {0: 42, 1: 27}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.6296
Test Accuracy for Layer 3: 0.3913
Test AUC for Layer 3: 0.4224
Model coefficients: [-0.14318764  0.08590462]
Model intercept: 0.128774
Confusion Matrix:
[[ 1 41]
 [ 1 26]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.02      0.05        42
           1       0.39      0.96      0.55        27

    accuracy                           0.39        69
   macro avg       0.44      0.49      0.30        69
weighted avg       0.46      0.39      0.24        69

Predicted class distribution: [ 2 67]



Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (S_label): {0: 359, 1: 326}
Val class distribution (S_label): {1: 39, 0: 27}
Test class distribution (S_label): {1: 60, 0: 48}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.4091
Test Accuracy for Layer 2: 0.4352
Test AUC for Layer 2: 0.3969
Model coefficients: [ 0.22651149 -0.10367932]
Model intercept: -0.064359
Confusion Matrix:
[[41  7]
 [54  6]]

Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.85      0.57        48
           1       0.46      0.10      0.16        60

    accuracy                           0.44       108
   macro avg       0.45      0.48      0.37       108
weighted avg       0.45      0.44      0.35       108

Predicted class distribution: [95 13]


Average Test Accuracy across all layers: 0.4938
Average Test AUC across all layers: 0.5341

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinGPT_score_norm - L_label

Training logistic regression model for FinGPT merged approach, Long-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (L_label): {1: 267, 0: 255}
Val class distribution (L_label): {0: 87, 1: 76}
Test class distribution (L_label): {1: 58, 0: 8}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.4663
Test Accuracy for Layer 1: 0.8788
Test AUC for Layer 1: 0.4903
Model coefficients: [-0.07585713  0.08176739]
Model intercept: 0.117128
Confusion Matrix:
[[ 0  8]
 [ 0 58]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1    


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (L_label): {1: 401, 0: 350}
Val class distribution (L_label): {1: 69, 0: 39}
Test class distribution (L_label): {0: 42, 1: 27}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.6389
Test Accuracy for Layer 3: 0.3913
Test AUC for Layer 3: 0.5291
Model coefficients: [-0.00341647  0.08919437]
Model intercept: 0.169674
Confusion Matrix:
[[ 0 42]
 [ 0 27]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        42
           1       0.39      1.00      0.56        27

    accuracy                           0.39        69
   macro avg       0.20      0.50      0.28        69
weighted avg       0.15      0.39      0.22        69

Predicted class distribution: [ 0 69]



SUMMARY OF RESULTS (MERGED LOGISTIC REGRESSION - XOM)

Approach: McDonld
----------------------------------------
Merged Vector + Short-term:
  Average Accuracy: 0.5230
  Average AUC: 0.5203
    Layer 1 - Accuracy: 0.5303, AUC: 0.6002
    Layer 1 - Coefficients: Title: -0.2156, Full_text: 0.4448, Intercept: 0.003431216447908189
    Layer 2 - Accuracy: 0.4444, AUC: 0.3967
    Layer 2 - Coefficients: Title: -0.1363, Full_text: 0.3980, Intercept: 0.037971682112764804
    Layer 3 - Accuracy: 0.5942, AUC: 0.5640
    Layer 3 - Coefficients: Title: -0.1686, Full_text: 0.4304, Intercept: 0.08392848327253828
Merged Vector + Long-term:
  Average Accuracy: 0.5645
  Average AUC: 0.4699
    Layer 1 - Accuracy: 0.8485, AUC: 0.5129
    Layer 1 - Coefficients: Title: -0.0716, Full_text: 0.0118, Intercept: 0.02972561475518476
    Layer 2 - Accuracy: 0.4537, AUC: 0.4744
    Layer 2 - Coefficients: Title: -0.1720, Full_text: 0.1036, Intercept: -0.0059395289573154495
    Layer 3 - Accuracy: 0.3913, AUC: 


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (S_label): {0: 359, 1: 326}
Val class distribution (S_label): {1: 39, 0: 27}
Test class distribution (S_label): {1: 60, 0: 48}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.6061
Test Accuracy for Layer 2: 0.4259
Test AUC for Layer 2: 0.3967
Model coefficients: [-0.13581875  0.39585926]
Model intercept: 0.133001
Confusion Matrix:
[[21 27]
 [35 25]]

Classification Report:
              precision    recall  f1-score   support

           0       0.38      0.44      0.40        48
           1       0.48      0.42      0.45        60

    accuracy                           0.43       108
   macro avg       0.43      0.43      0.43       108
weighted avg       0.43      0.43      0.43       108

Predicted class distribution: [56 52]



Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (L_label): {1: 343, 0: 342}
Val class distribution (L_label): {1: 58, 0: 8}
Test class distribution (L_label): {1: 69, 0: 39}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.5606
Test Accuracy for Layer 2: 0.5370
Test AUC for Layer 2: 0.4281
Model coefficients: [0.37498246 0.01498612]
Model intercept: 0.029017
Confusion Matrix:
[[14 25]
 [25 44]]

Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.36      0.36        39
           1       0.64      0.64      0.64        69

    accuracy                           0.54       108
   macro avg       0.50      0.50      0.50       108
weighted avg       0.54      0.54      0.54       108

Predicted class distribution: [39 69]

La


Average Test Accuracy across all layers: 0.5447
Average Test AUC across all layers: 0.4373

TRAINING LOGISTIC REGRESSION MODEL FOR Merged_FinGPT_score_norm - S_label

Training logistic regression model for FinGPT merged approach, Short-term

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Train class distribution (S_label): {0: 277, 1: 245}
Val class distribution (S_label): {0: 82, 1: 81}
Test class distribution (S_label): {1: 39, 0: 27}
X_train shape: (522, 2), y_train shape: (522,)
Validation Accuracy: 0.4969
Test Accuracy for Layer 1: 0.4091
Test AUC for Layer 1: 0.4772
Model coefficients: [-0.18844468  0.10512943]
Model intercept: 0.085055
Confusion Matrix:
[[23  4]
 [35  4]]

Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.85      0.54        27
           1  


Layer 3:
Training period: 01/01/2019 - 31/05/2022
Validation period: 01/06/2022 - 31/12/2022
Testing period: 01/01/2023 - 31/05/2023
Training data: 751 samples
Validation data: 108 samples
Test data: 69 samples
Train class distribution (S_label): {0: 386, 1: 365}
Val class distribution (S_label): {1: 60, 0: 48}
Test class distribution (S_label): {0: 36, 1: 33}
X_train shape: (751, 2), y_train shape: (751,)
Validation Accuracy: 0.4815
Test Accuracy for Layer 3: 0.5362
Test AUC for Layer 3: 0.5000
Model coefficients: [-0.15946196  0.07929487]
Model intercept: -0.002474
Confusion Matrix:
[[21 15]
 [17 16]]

Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.58      0.57        36
           1       0.52      0.48      0.50        33

    accuracy                           0.54        69
   macro avg       0.53      0.53      0.53        69
weighted avg       0.54      0.54      0.54        69

Predicted class distribution: [38 31]


Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Train class distribution (L_label): {1: 343, 0: 342}
Val class distribution (L_label): {1: 58, 0: 8}
Test class distribution (L_label): {1: 69, 0: 39}
X_train shape: (685, 2), y_train shape: (685,)
Validation Accuracy: 0.5758
Test Accuracy for Layer 2: 0.5093
Test AUC for Layer 2: 0.4879
Model coefficients: [0.16149115 0.04920827]
Model intercept: -0.139101
Confusion Matrix:
[[15 24]
 [29 40]]

Classification Report:
              precision    recall  f1-score   support

           0       0.34      0.38      0.36        39
           1       0.62      0.58      0.60        69

    accuracy                           0.51       108
   macro avg       0.48      0.48      0.48       108
weighted avg       0.52      0.51      0.51       108

Predicted class distribution: [44 64]

L