In [1]:
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
                                              0.0/294.9 kB ? eta -:--:--
     -----------------------------          225.3/294.9 kB 6.7 MB/s eta 0:00:01
     -------------------------------------- 294.9/294.9 kB 4.5 MB/s eta 0:00:00
Installing collected packages: seaborn
Successfully installed seaborn-0.13.2



[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [12]:
pip list

Package                      Version
---------------------------- ----------
absl-py                      1.4.0
accelerate                   1.4.0.dev0
aiohappyeyeballs             2.4.4
aiohttp                      3.11.11
aiosignal                    1.3.2
asttokens                    3.0.0
astunparse                   1.6.3
attrs                        25.1.0
baidu-aip                    4.16.13
bitsandbytes                 0.45.1
cachetools                   5.3.1
certifi                      2023.7.22
charset-normalizer           3.2.0
click                        8.1.8
colorama                     0.4.6
comm                         0.2.2
contourpy                    1.1.0
cycler                       0.11.0
datasets                     3.2.0
debugpy                      1.8.12
decorator                    5.1.1
dill                         0.3.8
et-xmlfile                   1.1.0
executing                    2.2.0
filelock                     3.12.4
flatbuffers                  2


[notice] A new release of pip is available: 23.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize the top features and learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # 1. Top Features
        plt.subplot(1, 2, 1)
        tfidf_vectorizer = layer_result['model'].named_steps['tfidf']
        classifier = layer_result['model'].named_steps['classifier']
        
        # Get feature names
        feature_names = tfidf_vectorizer.get_feature_names_out()
        
        # Get feature importances
        coefs = classifier.coef_[0]
        
        # Get top positive and negative features
        top_positive_idx = np.argsort(coefs)[-15:]
        top_negative_idx = np.argsort(coefs)[:15]
        
        # Combine top features
        top_idx = np.concatenate([top_negative_idx, top_positive_idx])
        top_features = [feature_names[i] for i in top_idx]
        top_coefs = coefs[top_idx]
        
        # Plot
        plt.barh(range(len(top_features)), top_coefs)
        plt.yticks(range(len(top_features)), top_features)
        plt.title('Top Predictive Features')
        plt.xlabel('Coefficient (Negative = Bearish, Positive = Bullish)')
        
        # 2. Learning Curve (using accuracy on different dataset sizes)
        plt.subplot(1, 2, 2)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            print(f"\nCombination: {text_col} + {label_col}")
            print(f"Average Accuracy: {results.get('avg_accuracy', 'N/A'):.4f}")
            print(f"Average AUC: {results.get('avg_auc', 'N/A'):.4f}")
            
            # Print layer-specific results
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                best_params = results.get('best_params', [])[i]
                print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                print(f"  Layer {i+1} - Best Parameters: {best_params}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_COP_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Loaded 890 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0.0: 455, 1.0: 435}
Class distribution for long-term prediction: {1.0: 463, 0.0: 427}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 0.1, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4599
Test AUC for Layer 1: 0.5086
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameters for 

ValueError: too many values to unpack (expected 2)

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=1000,  # Limit to top 1000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize the top features and learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # 1. Top Features
        plt.subplot(1, 2, 1)
        tfidf_vectorizer = layer_result['model'].named_steps['tfidf']
        classifier = layer_result['model'].named_steps['classifier']
        
        # Get feature names
        feature_names = tfidf_vectorizer.get_feature_names_out()
        
        # Get feature importances
        coefs = classifier.coef_[0]
        
        # Get top positive and negative features
        top_positive_idx = np.argsort(coefs)[-15:]
        top_negative_idx = np.argsort(coefs)[:15]
        
        # Combine top features
        top_idx = np.concatenate([top_negative_idx, top_positive_idx])
        top_features = [feature_names[i] for i in top_idx]
        top_coefs = coefs[top_idx]
        
        # Plot
        plt.barh(range(len(top_features)), top_coefs)
        plt.yticks(range(len(top_features)), top_features)
        plt.title('Top Predictive Features')
        plt.xlabel('Coefficient (Negative = Bearish, Positive = Bullish)')
        
        # 2. Learning Curve (using accuracy on different dataset sizes)
        plt.subplot(1, 2, 2)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            print(f"\nCombination: {text_col} + {label_col}")
            print(f"Average Accuracy: {results.get('avg_accuracy', 'N/A'):.4f}")
            print(f"Average AUC: {results.get('avg_auc', 'N/A'):.4f}")
            
            # Print layer-specific results
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                best_params = results.get('best_params', [])[i]
                print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                print(f"  Layer {i+1} - Best Parameters: {best_params}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_COP_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 890 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0.0: 455, 1.0: 435}
Class distribution for long-term prediction: {1.0: 463, 0.0: 427}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 0.1, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4818
Test AUC for Layer 1: 0.4976
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameter

ValueError: not enough values to unpack (expected 2, got 1)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=10000,  # Limit to top 10000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize the top features and learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # 1. Top Features
        plt.subplot(1, 2, 1)
        tfidf_vectorizer = layer_result['model'].named_steps['tfidf']
        classifier = layer_result['model'].named_steps['classifier']
        
        # Get feature names
        feature_names = tfidf_vectorizer.get_feature_names_out()
        
        # Get feature importances
        coefs = classifier.coef_[0]
        
        # Get top positive and negative features
        top_positive_idx = np.argsort(coefs)[-15:]
        top_negative_idx = np.argsort(coefs)[:15]
        
        # Combine top features
        top_idx = np.concatenate([top_negative_idx, top_positive_idx])
        top_features = [feature_names[i] for i in top_idx]
        top_coefs = coefs[top_idx]
        
        # Plot
        plt.barh(range(len(top_features)), top_coefs)
        plt.yticks(range(len(top_features)), top_features)
        plt.title('Top Predictive Features')
        plt.xlabel('Coefficient (Negative = Bearish, Positive = Bullish)')
        
        # 2. Learning Curve (using accuracy on different dataset sizes)
        plt.subplot(1, 2, 2)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            print(f"\nCombination: {text_col} + {label_col}")
            print(f"Average Accuracy: {results.get('avg_accuracy', 'N/A'):.4f}")
            print(f"Average AUC: {results.get('avg_auc', 'N/A'):.4f}")
            
            # Print layer-specific results
            for i, accuracy in enumerate(results.get('accuracy', [])):
                auc = results.get('auc', [])[i]
                best_params = results.get('best_params', [])[i]
                print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                print(f"  Layer {i+1} - Best Parameters: {best_params}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_COP_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 890 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0.0: 455, 1.0: 435}
Class distribution for long-term prediction: {1.0: 463, 0.0: 427}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 0.1, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4818
Test AUC for Layer 1: 0.4976
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameter

ValueError: not enough values to unpack (expected 2, got 1)

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_CVX_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 890 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1.0: 476, 0.0: 414}
Class distribution for long-term prediction: {1.0: 489, 0.0: 401}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 100, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4745
Test AUC for Layer 1: 0.5236
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameters for 

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_MPC_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 890 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1.0: 485, 0.0: 405}
Class distribution for long-term prediction: {1.0: 531, 0.0: 359}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4453
Test AUC for Layer 1: 0.5000
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameters for

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_SLB_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 890 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0.0: 462, 1.0: 428}
Class distribution for long-term prediction: {0.0: 465, 1.0: 425}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.4818
Test AUC for Layer 1: 0.4982
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameters for

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_XOM_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 890 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1.0: 451, 0.0: 439}
Class distribution for long-term prediction: {1.0: 449, 0.0: 441}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 100, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.4672
Test AUC for Layer 1: 0.4873
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameter

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_COP_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 889 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0.0: 455, 1.0: 434}
Class distribution for long-term prediction: {1.0: 463, 0.0: 426}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4599
Test AUC for Layer 1: 0.5000
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameters for

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_CVX_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 889 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1.0: 476, 0.0: 413}
Class distribution for long-term prediction: {1.0: 489, 0.0: 400}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.5839
Test AUC for Layer 1: 0.5615
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameters for

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_MPC_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 889 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1.0: 485, 0.0: 404}
Class distribution for long-term prediction: {1.0: 531, 0.0: 358}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4453
Test AUC for Layer 1: 0.5000
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameters for

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_SLB_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 889 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0.0: 461, 1.0: 428}
Class distribution for long-term prediction: {0.0: 464, 1.0: 425}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.5036
Test AUC for Layer 1: 0.4811
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best paramete

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_XOM_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 889 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1.0: 451, 0.0: 438}
Class distribution for long-term prediction: {1.0: 449, 0.0: 440}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 437 samples
Validation data: 139 samples
Test data: 137 samples
Best parameters for Layer 1: {'classifier__C': 0.1, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.5255
Test AUC for Layer 1: 0.5518
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 576 samples
Validation data: 137 samples
Test data: 103 samples
Best parameters for 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_COP_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 431, 1: 407}
Class distribution for long-term prediction: {1: 440, 0: 398}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Best parameters for Layer 1: {'classifier__C': 0.1, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.5038
Test AUC for Layer 1: 0.5116
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Best parameters for La

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_CVX_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 452, 0: 386}
Class distribution for long-term prediction: {1: 464, 0: 374}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.3910
Test AUC for Layer 1: 0.5000
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Best parameters for Layer 2

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_MPC_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 460, 0: 378}
Class distribution for long-term prediction: {1: 503, 0: 335}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4511
Test AUC for Layer 1: 0.5000
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Best parameters for Layer 2

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_SLB_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 433, 1: 405}
Class distribution for long-term prediction: {0: 437, 1: 401}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.4887
Test AUC for Layer 1: 0.4893
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Best parameters for Layer 2

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_XOM_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 427, 0: 411}
Class distribution for long-term prediction: {1: 428, 0: 410}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4436
Test AUC for Layer 1: 0.5000
Visualization saved as: visualization_Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Best parameters for Layer 2

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Merged news', 'S_label'),     # Merged news + short-term prediction
            ('Merged news', 'L_label'),     # Merged news + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_title_full_text_SP500_database/semantic/wall_street_news_semantics_COP_completed_Merged.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 431, 1: 407}
Class distribution for long-term prediction: {1: 440, 0: 398}

Training model for Merged news and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Best parameters for Layer 1: {'classifier__C': 1, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4962
Test AUC for Layer 1: 0.5143
Visualization saved as: visualization_Merged news_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Best parameters fo

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Merged news', 'S_label'),     # Merged news + short-term prediction
            ('Merged news', 'L_label'),     # Merged news + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_title_full_text_SP500_database/semantic/wall_street_news_semantics_CVX_completed_Merged.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 452, 0: 386}
Class distribution for long-term prediction: {1: 464, 0: 374}

Training model for Merged news and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Best parameters for Layer 1: {'classifier__C': 10, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.5564
Test AUC for Layer 1: 0.4919
Visualization saved as: visualization_Merged news_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Best parame

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Merged news', 'S_label'),     # Merged news + short-term prediction
            ('Merged news', 'L_label'),     # Merged news + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_title_full_text_SP500_database/semantic/wall_street_news_semantics_MPC_completed_Merged.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 460, 0: 378}
Class distribution for long-term prediction: {1: 503, 0: 335}

Training model for Merged news and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4511
Test AUC for Layer 1: 0.5000
Visualization saved as: visualization_Merged news_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Best parameters

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Merged news', 'S_label'),     # Merged news + short-term prediction
            ('Merged news', 'L_label'),     # Merged news + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_title_full_text_SP500_database/semantic/wall_street_news_semantics_SLB_completed_Merged.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 433, 1: 405}
Class distribution for long-term prediction: {0: 437, 1: 401}

Training model for Merged news and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Best parameters for Layer 1: {'classifier__C': 10, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.5564
Test AUC for Layer 1: 0.5776
Visualization saved as: visualization_Merged news_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Best parame

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Merged news', 'S_label'),     # Merged news + short-term prediction
            ('Merged news', 'L_label'),     # Merged news + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_title_full_text_SP500_database/semantic/wall_street_news_semantics_XOM_completed_Merged.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 427, 0: 411}
Class distribution for long-term prediction: {1: 428, 0: 410}

Training model for Merged news and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Best parameters for Layer 1: {'classifier__C': 100, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4511
Test AUC for Layer 1: 0.4118
Visualization saved as: visualization_Merged news_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 samples
Validation data: 133 samples
Test data: 101 samples
Best param

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
        # Dictionary to track visualizations - new addition
        self.visualizations = {
            'learning_curves': {},  # Will store paths to learning curve plots
            'feature_importance': {},  # Will store paths to feature importance plots
            'summary': {}  # Will store paths to summary visualizations
        }
        
        # Create visualization directory if it doesn't exist
        os.makedirs('TF-IDF_SVM_Plots/COP/visualizations', exist_ok=True)
        os.makedirs('TF-IDF_SVM_Plots/COP/visualizations/learning_curves', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self, text_col):
        """
        Create a TF-IDF + SVM pipeline model with max_features adjusted based on input type.
        
        Args:
            text_col: The text column being processed ('Title' or 'Full text')
            
        Returns:
            A GridSearchCV model with the appropriate parameters
        """
        # Set max_features based on the text column type - new implementation
        if text_col == 'Title':
            max_features = 1500  # Reduced feature set for titles
            print(f"Using {max_features} features for Title inputs")
        else:  # 'Full text'
            max_features = 4000  # Larger feature set for full text
            print(f"Using {max_features} features for Full text inputs")
        
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,  # Dynamic max_features based on input type
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        # Initialize visualization tracking for this combination
        self.visualizations['learning_curves'][combination_key] = {}
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model - now passing text_col to determine max_features
            model = self.create_model(text_col)
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer
            viz_path = self.visualize_layer_results(layer_result, text_col, label_col, i+1)
            
            # Store visualization path in our dictionary - new implementation
            self.visualizations['learning_curves'][combination_key][i+1] = viz_path
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid results to calculate average metrics")
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
            
        Returns:
            Path to the saved visualization
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Create organized visualization directory structure
        viz_dir = f"TF-IDF_SVM_Plots/COP/visualizations/learning_curves/{text_col.replace(' ', '_')}"
        os.makedirs(viz_dir, exist_ok=True)
        
        # Save figure with a more organized naming pattern
        viz_path = f"{viz_dir}/{text_col.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(viz_path)
        print(f"Visualization saved as: {viz_path}")
        plt.close()
        
        return viz_path
    
    def create_summary_visualization(self):
        """
        Create a summary visualization comparing all model combinations.
        """
        # Prepare data for visualization
        combinations = []
        accuracies = []
        aucs = []
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results and 'avg_auc' in results:
                combinations.append(combo)
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            print("No valid results to create summary visualization")
            return
        
        # Create figure
        plt.figure(figsize=(12, 6))
        
        # Set up bar positions
        x = np.arange(len(combinations))
        width = 0.35
        
        # Plot accuracy and AUC bars
        plt.bar(x - width/2, accuracies, width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, aucs, width, label='Average AUC', color='salmon')
        
        # Add labels and title
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of Different Model Combinations')
        plt.xticks(x, combinations, rotation=45, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(accuracies):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(aucs):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(max(accuracies), max(aucs)) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        summary_path = "TF-IDF_SVM_Plots/COP/visualizations/summary_performance.png"
        plt.savefig(summary_path)
        print(f"Summary comparison visualization saved as: {summary_path}")
        plt.close()
        
        # Store visualization path
        self.visualizations['summary']['overall_performance'] = summary_path
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (COP)")
        print("="*80)
        
        # Create comprehensive summary table
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            if 'avg_accuracy' in results:
                print(f"\nCombination: {text_col} + {label_col}")
                print(f"Average Accuracy: {results['avg_accuracy']:.4f}")
                print(f"Average AUC: {results['avg_auc']:.4f}")
                
                # Print layer-specific results
                for i, accuracy in enumerate(results.get('accuracy', [])):
                    auc = results.get('auc', [])[i]
                    best_params = results.get('best_params', [])[i]
                    print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                    print(f"  Layer {i+1} - Best Parameters: {best_params}")
                    # Print path to visualization for this layer
                    viz_path = self.visualizations['learning_curves'][combination].get(i+1, "No visualization available")
                    print(f"  Layer {i+1} - Visualization: {viz_path}")
        
        # Create summary visualization
        self.create_summary_visualization()
        
        # Print visualization paths summary
        print("\n" + "="*80)
        print("VISUALIZATION PATHS SUMMARY")
        print("="*80)
        print(f"Summary visualization: {self.visualizations['summary'].get('overall_performance', 'Not created')}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_COP_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 431, 1: 407}
Class distribution for long-term prediction: {1: 440, 0: 398}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Using 1500 features for Title inputs
Best parameters for Layer 1: {'classifier__C': 0.1, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.5038
Test AUC for Layer 1: 0.5116
Visualization saved as: TF-IDF_SVM_Plots/COP/visualizations/learning_curves/Title/Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 s

ValueError: too many values to unpack (expected 2)

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
        # Dictionary to track visualizations - new addition
        self.visualizations = {
            'learning_curves': {},  # Will store paths to learning curve plots
            'feature_importance': {},  # Will store paths to feature importance plots
            'summary': {}  # Will store paths to summary visualizations
        }
        
        # Create visualization directory if it doesn't exist
        os.makedirs('TF-IDF_SVM_Plots/CVX/visualizations', exist_ok=True)
        os.makedirs('TF-IDF_SVM_Plots/CVX/visualizations/learning_curves', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self, text_col):
        """
        Create a TF-IDF + SVM pipeline model with max_features adjusted based on input type.
        
        Args:
            text_col: The text column being processed ('Title' or 'Full text')
            
        Returns:
            A GridSearchCV model with the appropriate parameters
        """
        # Set max_features based on the text column type - new implementation
        if text_col == 'Title':
            max_features = 1500  # Reduced feature set for titles
            print(f"Using {max_features} features for Title inputs")
        else:  # 'Full text'
            max_features = 4000  # Larger feature set for full text
            print(f"Using {max_features} features for Full text inputs")
        
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,  # Dynamic max_features based on input type
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        # Initialize visualization tracking for this combination
        self.visualizations['learning_curves'][combination_key] = {}
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model - now passing text_col to determine max_features
            model = self.create_model(text_col)
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer
            viz_path = self.visualize_layer_results(layer_result, text_col, label_col, i+1)
            
            # Store visualization path in our dictionary - new implementation
            self.visualizations['learning_curves'][combination_key][i+1] = viz_path
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid results to calculate average metrics")
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
            
        Returns:
            Path to the saved visualization
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Create organized visualization directory structure
        viz_dir = f"TF-IDF_SVM_Plots/CVX/visualizations/learning_curves/{text_col.replace(' ', '_')}"
        os.makedirs(viz_dir, exist_ok=True)
        
        # Save figure with a more organized naming pattern
        viz_path = f"{viz_dir}/{text_col.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(viz_path)
        print(f"Visualization saved as: {viz_path}")
        plt.close()
        
        return viz_path
    
    def create_summary_visualization(self):
        """
        Create a summary visualization comparing all model combinations.
        """
        # Prepare data for visualization
        combinations = []
        accuracies = []
        aucs = []
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results and 'avg_auc' in results:
                combinations.append(combo)
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            print("No valid results to create summary visualization")
            return
        
        # Create figure
        plt.figure(figsize=(12, 6))
        
        # Set up bar positions
        x = np.arange(len(combinations))
        width = 0.35
        
        # Plot accuracy and AUC bars
        plt.bar(x - width/2, accuracies, width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, aucs, width, label='Average AUC', color='salmon')
        
        # Add labels and title
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of Different Model Combinations')
        plt.xticks(x, combinations, rotation=45, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(accuracies):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(aucs):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(max(accuracies), max(aucs)) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        summary_path = "TF-IDF_SVM_Plots/CVX/visualizations/summary_performance.png"
        plt.savefig(summary_path)
        print(f"Summary comparison visualization saved as: {summary_path}")
        plt.close()
        
        # Store visualization path
        self.visualizations['summary']['overall_performance'] = summary_path
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (CVX)")
        print("="*80)
        
        # Create comprehensive summary table
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            if 'avg_accuracy' in results:
                print(f"\nCombination: {text_col} + {label_col}")
                print(f"Average Accuracy: {results['avg_accuracy']:.4f}")
                print(f"Average AUC: {results['avg_auc']:.4f}")
                
                # Print layer-specific results
                for i, accuracy in enumerate(results.get('accuracy', [])):
                    auc = results.get('auc', [])[i]
                    best_params = results.get('best_params', [])[i]
                    print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                    print(f"  Layer {i+1} - Best Parameters: {best_params}")
                    # Print path to visualization for this layer
                    viz_path = self.visualizations['learning_curves'][combination].get(i+1, "No visualization available")
                    print(f"  Layer {i+1} - Visualization: {viz_path}")
        
        # Create summary visualization
        self.create_summary_visualization()
        
        # Print visualization paths summary
        print("\n" + "="*80)
        print("VISUALIZATION PATHS SUMMARY")
        print("="*80)
        print(f"Summary visualization: {self.visualizations['summary'].get('overall_performance', 'Not created')}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_CVX_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 452, 0: 386}
Class distribution for long-term prediction: {1: 464, 0: 374}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Using 1500 features for Title inputs
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.3910
Test AUC for Layer 1: 0.5000
Visualization saved as: TF-IDF_SVM_Plots/CVX/visualizations/learning_curves/Title/Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 sample

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
        # Dictionary to track visualizations - new addition
        self.visualizations = {
            'learning_curves': {},  # Will store paths to learning curve plots
            'feature_importance': {},  # Will store paths to feature importance plots
            'summary': {}  # Will store paths to summary visualizations
        }
        
        # Create visualization directory if it doesn't exist
        os.makedirs('TF-IDF_SVM_Plots/MPC/visualizations', exist_ok=True)
        os.makedirs('TF-IDF_SVM_Plots/MPC/visualizations/learning_curves', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self, text_col):
        """
        Create a TF-IDF + SVM pipeline model with max_features adjusted based on input type.
        
        Args:
            text_col: The text column being processed ('Title' or 'Full text')
            
        Returns:
            A GridSearchCV model with the appropriate parameters
        """
        # Set max_features based on the text column type - new implementation
        if text_col == 'Title':
            max_features = 1500  # Reduced feature set for titles
            print(f"Using {max_features} features for Title inputs")
        else:  # 'Full text'
            max_features = 4000  # Larger feature set for full text
            print(f"Using {max_features} features for Full text inputs")
        
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,  # Dynamic max_features based on input type
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        # Initialize visualization tracking for this combination
        self.visualizations['learning_curves'][combination_key] = {}
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model - now passing text_col to determine max_features
            model = self.create_model(text_col)
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer
            viz_path = self.visualize_layer_results(layer_result, text_col, label_col, i+1)
            
            # Store visualization path in our dictionary - new implementation
            self.visualizations['learning_curves'][combination_key][i+1] = viz_path
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid results to calculate average metrics")
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
            
        Returns:
            Path to the saved visualization
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Create organized visualization directory structure
        viz_dir = f"TF-IDF_SVM_Plots/MPC/visualizations/learning_curves/{text_col.replace(' ', '_')}"
        os.makedirs(viz_dir, exist_ok=True)
        
        # Save figure with a more organized naming pattern
        viz_path = f"{viz_dir}/{text_col.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(viz_path)
        print(f"Visualization saved as: {viz_path}")
        plt.close()
        
        return viz_path
    
    def create_summary_visualization(self):
        """
        Create a summary visualization comparing all model combinations.
        """
        # Prepare data for visualization
        combinations = []
        accuracies = []
        aucs = []
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results and 'avg_auc' in results:
                combinations.append(combo)
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            print("No valid results to create summary visualization")
            return
        
        # Create figure
        plt.figure(figsize=(12, 6))
        
        # Set up bar positions
        x = np.arange(len(combinations))
        width = 0.35
        
        # Plot accuracy and AUC bars
        plt.bar(x - width/2, accuracies, width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, aucs, width, label='Average AUC', color='salmon')
        
        # Add labels and title
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of Different Model Combinations')
        plt.xticks(x, combinations, rotation=45, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(accuracies):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(aucs):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(max(accuracies), max(aucs)) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        summary_path = "TF-IDF_SVM_Plots/MPC/visualizations/summary_performance.png"
        plt.savefig(summary_path)
        print(f"Summary comparison visualization saved as: {summary_path}")
        plt.close()
        
        # Store visualization path
        self.visualizations['summary']['overall_performance'] = summary_path
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MPC)")
        print("="*80)
        
        # Create comprehensive summary table
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            if 'avg_accuracy' in results:
                print(f"\nCombination: {text_col} + {label_col}")
                print(f"Average Accuracy: {results['avg_accuracy']:.4f}")
                print(f"Average AUC: {results['avg_auc']:.4f}")
                
                # Print layer-specific results
                for i, accuracy in enumerate(results.get('accuracy', [])):
                    auc = results.get('auc', [])[i]
                    best_params = results.get('best_params', [])[i]
                    print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                    print(f"  Layer {i+1} - Best Parameters: {best_params}")
                    # Print path to visualization for this layer
                    viz_path = self.visualizations['learning_curves'][combination].get(i+1, "No visualization available")
                    print(f"  Layer {i+1} - Visualization: {viz_path}")
        
        # Create summary visualization
        self.create_summary_visualization()
        
        # Print visualization paths summary
        print("\n" + "="*80)
        print("VISUALIZATION PATHS SUMMARY")
        print("="*80)
        print(f"Summary visualization: {self.visualizations['summary'].get('overall_performance', 'Not created')}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_MPC_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 460, 0: 378}
Class distribution for long-term prediction: {1: 503, 0: 335}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Using 1500 features for Title inputs
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4511
Test AUC for Layer 1: 0.5000
Visualization saved as: TF-IDF_SVM_Plots/MPC/visualizations/learning_curves/Title/Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 sample

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
        # Dictionary to track visualizations - new addition
        self.visualizations = {
            'learning_curves': {},  # Will store paths to learning curve plots
            'feature_importance': {},  # Will store paths to feature importance plots
            'summary': {}  # Will store paths to summary visualizations
        }
        
        # Create visualization directory if it doesn't exist
        os.makedirs('TF-IDF_SVM_Plots/SLB/visualizations', exist_ok=True)
        os.makedirs('TF-IDF_SVM_Plots/SLB/visualizations/learning_curves', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self, text_col):
        """
        Create a TF-IDF + SVM pipeline model with max_features adjusted based on input type.
        
        Args:
            text_col: The text column being processed ('Title' or 'Full text')
            
        Returns:
            A GridSearchCV model with the appropriate parameters
        """
        # Set max_features based on the text column type - new implementation
        if text_col == 'Title':
            max_features = 1500  # Reduced feature set for titles
            print(f"Using {max_features} features for Title inputs")
        else:  # 'Full text'
            max_features = 4000  # Larger feature set for full text
            print(f"Using {max_features} features for Full text inputs")
        
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,  # Dynamic max_features based on input type
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        # Initialize visualization tracking for this combination
        self.visualizations['learning_curves'][combination_key] = {}
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model - now passing text_col to determine max_features
            model = self.create_model(text_col)
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer
            viz_path = self.visualize_layer_results(layer_result, text_col, label_col, i+1)
            
            # Store visualization path in our dictionary - new implementation
            self.visualizations['learning_curves'][combination_key][i+1] = viz_path
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid results to calculate average metrics")
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
            
        Returns:
            Path to the saved visualization
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Create organized visualization directory structure
        viz_dir = f"TF-IDF_SVM_Plots/SLB/visualizations/learning_curves/{text_col.replace(' ', '_')}"
        os.makedirs(viz_dir, exist_ok=True)
        
        # Save figure with a more organized naming pattern
        viz_path = f"{viz_dir}/{text_col.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(viz_path)
        print(f"Visualization saved as: {viz_path}")
        plt.close()
        
        return viz_path
    
    def create_summary_visualization(self):
        """
        Create a summary visualization comparing all model combinations.
        """
        # Prepare data for visualization
        combinations = []
        accuracies = []
        aucs = []
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results and 'avg_auc' in results:
                combinations.append(combo)
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            print("No valid results to create summary visualization")
            return
        
        # Create figure
        plt.figure(figsize=(12, 6))
        
        # Set up bar positions
        x = np.arange(len(combinations))
        width = 0.35
        
        # Plot accuracy and AUC bars
        plt.bar(x - width/2, accuracies, width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, aucs, width, label='Average AUC', color='salmon')
        
        # Add labels and title
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of Different Model Combinations')
        plt.xticks(x, combinations, rotation=45, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(accuracies):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(aucs):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(max(accuracies), max(aucs)) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        summary_path = "TF-IDF_SVM_Plots/SLB/visualizations/summary_performance.png"
        plt.savefig(summary_path)
        print(f"Summary comparison visualization saved as: {summary_path}")
        plt.close()
        
        # Store visualization path
        self.visualizations['summary']['overall_performance'] = summary_path
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (SLB)")
        print("="*80)
        
        # Create comprehensive summary table
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            if 'avg_accuracy' in results:
                print(f"\nCombination: {text_col} + {label_col}")
                print(f"Average Accuracy: {results['avg_accuracy']:.4f}")
                print(f"Average AUC: {results['avg_auc']:.4f}")
                
                # Print layer-specific results
                for i, accuracy in enumerate(results.get('accuracy', [])):
                    auc = results.get('auc', [])[i]
                    best_params = results.get('best_params', [])[i]
                    print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                    print(f"  Layer {i+1} - Best Parameters: {best_params}")
                    # Print path to visualization for this layer
                    viz_path = self.visualizations['learning_curves'][combination].get(i+1, "No visualization available")
                    print(f"  Layer {i+1} - Visualization: {viz_path}")
        
        # Create summary visualization
        self.create_summary_visualization()
        
        # Print visualization paths summary
        print("\n" + "="*80)
        print("VISUALIZATION PATHS SUMMARY")
        print("="*80)
        print(f"Summary visualization: {self.visualizations['summary'].get('overall_performance', 'Not created')}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_SLB_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {0: 433, 1: 405}
Class distribution for long-term prediction: {0: 437, 1: 401}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Using 1500 features for Title inputs
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.4887
Test AUC for Layer 1: 0.4893
Visualization saved as: TF-IDF_SVM_Plots/CVX/visualizations/learning_curves/Title/Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 sample

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
        # Dictionary to track visualizations - new addition
        self.visualizations = {
            'learning_curves': {},  # Will store paths to learning curve plots
            'feature_importance': {},  # Will store paths to feature importance plots
            'summary': {}  # Will store paths to summary visualizations
        }
        
        # Create visualization directory if it doesn't exist
        os.makedirs('TF-IDF_SVM_Plots/XOM/visualizations', exist_ok=True)
        os.makedirs('TF-IDF_SVM_Plots/XOM/visualizations/learning_curves', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2014-2020), validation (2020-2021), test (2021-2022)
        layer1 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2020-10-31'),
            'val_start': pd.Timestamp('2020-11-01'),
            'val_end': pd.Timestamp('2021-10-31'),
            'test_start': pd.Timestamp('2021-11-01'),
            'test_end': pd.Timestamp('2022-10-31')
        }
        
        # Second layer: train (2014-2021), validation (2021-2022), test (2022-2023)
        layer2 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2021-10-31'),
            'val_start': pd.Timestamp('2021-11-01'),
            'val_end': pd.Timestamp('2022-10-31'),
            'test_start': pd.Timestamp('2022-11-01'),
            'test_end': pd.Timestamp('2023-10-31')
        }
        
        # Third layer: train (2014-2022), validation (2022-2023), test (2023-2024)
        layer3 = {
            'train_start': pd.Timestamp('2014-11-01'),
            'train_end': pd.Timestamp('2022-10-31'),
            'val_start': pd.Timestamp('2022-11-01'),
            'val_end': pd.Timestamp('2023-10-31'),
            'test_start': pd.Timestamp('2023-11-01'),
            'test_end': pd.Timestamp('2024-11-01')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self, text_col):
        """
        Create a TF-IDF + SVM pipeline model with max_features adjusted based on input type.
        
        Args:
            text_col: The text column being processed ('Title' or 'Full text')
            
        Returns:
            A GridSearchCV model with the appropriate parameters
        """
        # Set max_features based on the text column type - new implementation
        if text_col == 'Title':
            max_features = 1500  # Reduced feature set for titles
            print(f"Using {max_features} features for Title inputs")
        else:  # 'Full text'
            max_features = 4000  # Larger feature set for full text
            print(f"Using {max_features} features for Full text inputs")
        
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,  # Dynamic max_features based on input type
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        # Initialize visualization tracking for this combination
        self.visualizations['learning_curves'][combination_key] = {}
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model - now passing text_col to determine max_features
            model = self.create_model(text_col)
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer
            viz_path = self.visualize_layer_results(layer_result, text_col, label_col, i+1)
            
            # Store visualization path in our dictionary - new implementation
            self.visualizations['learning_curves'][combination_key][i+1] = viz_path
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid results to calculate average metrics")
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
            
        Returns:
            Path to the saved visualization
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Create organized visualization directory structure
        viz_dir = f"TF-IDF_SVM_Plots/XOM/visualizations/learning_curves/{text_col.replace(' ', '_')}"
        os.makedirs(viz_dir, exist_ok=True)
        
        # Save figure with a more organized naming pattern
        viz_path = f"{viz_dir}/{text_col.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(viz_path)
        print(f"Visualization saved as: {viz_path}")
        plt.close()
        
        return viz_path
    
    def create_summary_visualization(self):
        """
        Create a summary visualization comparing all model combinations.
        """
        # Prepare data for visualization
        combinations = []
        accuracies = []
        aucs = []
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results and 'avg_auc' in results:
                combinations.append(combo)
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            print("No valid results to create summary visualization")
            return
        
        # Create figure
        plt.figure(figsize=(12, 6))
        
        # Set up bar positions
        x = np.arange(len(combinations))
        width = 0.35
        
        # Plot accuracy and AUC bars
        plt.bar(x - width/2, accuracies, width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, aucs, width, label='Average AUC', color='salmon')
        
        # Add labels and title
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of Different Model Combinations')
        plt.xticks(x, combinations, rotation=45, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(accuracies):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(aucs):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(max(accuracies), max(aucs)) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        summary_path = "TF-IDF_SVM_Plots/XOM/visualizations/summary_performance.png"
        plt.savefig(summary_path)
        print(f"Summary comparison visualization saved as: {summary_path}")
        plt.close()
        
        # Store visualization path
        self.visualizations['summary']['overall_performance'] = summary_path
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (XOM)")
        print("="*80)
        
        # Create comprehensive summary table
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            if 'avg_accuracy' in results:
                print(f"\nCombination: {text_col} + {label_col}")
                print(f"Average Accuracy: {results['avg_accuracy']:.4f}")
                print(f"Average AUC: {results['avg_auc']:.4f}")
                
                # Print layer-specific results
                for i, accuracy in enumerate(results.get('accuracy', [])):
                    auc = results.get('auc', [])[i]
                    best_params = results.get('best_params', [])[i]
                    print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                    print(f"  Layer {i+1} - Best Parameters: {best_params}")
                    # Print path to visualization for this layer
                    viz_path = self.visualizations['learning_curves'][combination].get(i+1, "No visualization available")
                    print(f"  Layer {i+1} - Visualization: {viz_path}")
        
        # Create summary visualization
        self.create_summary_visualization()
        
        # Print visualization paths summary
        print("\n" + "="*80)
        print("VISUALIZATION PATHS SUMMARY")
        print("="*80)
        print(f"Summary visualization: {self.visualizations['summary'].get('overall_performance', 'Not created')}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/wall_street_news_semantics_SP500_database/wall_street_news_semantics_XOM_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 838 climate change news articles from Wall Street Journal spanning from 07/11/2014 to 24/09/2024
Class distribution for short-term prediction: {1: 427, 0: 411}
Class distribution for long-term prediction: {1: 428, 0: 410}

Training model for Title and S_label

Layer 1:
Training period: 01/11/2014 - 31/10/2020
Validation period: 01/11/2020 - 31/10/2021
Testing period: 01/11/2021 - 31/10/2022
Training data: 401 samples
Validation data: 131 samples
Test data: 133 samples
Using 1500 features for Title inputs
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4436
Test AUC for Layer 1: 0.5000
Visualization saved as: TF-IDF_SVM_Plots/XOM/visualizations/learning_curves/Title/Title_S_label_layer_1.png

Layer 2:
Training period: 01/11/2014 - 31/10/2021
Validation period: 01/11/2021 - 31/10/2022
Testing period: 01/11/2022 - 31/10/2023
Training data: 532 sample

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
        # Dictionary to track visualizations - new addition
        self.visualizations = {
            'learning_curves': {},  # Will store paths to learning curve plots
            'feature_importance': {},  # Will store paths to feature importance plots
            'summary': {}  # Will store paths to summary visualizations
        }
        
        # Create visualization directory if it doesn't exist
        os.makedirs('TF-IDF_SVM_Plots/COP/visualizations', exist_ok=True)
        os.makedirs('TF-IDF_SVM_Plots/COP/visualizations/learning_curves', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self, text_col):
        """
        Create a TF-IDF + SVM pipeline model with max_features adjusted based on input type.
        
        Args:
            text_col: The text column being processed ('Title' or 'Full text')
            
        Returns:
            A GridSearchCV model with the appropriate parameters
        """
        # Set max_features based on the text column type - new implementation
        if text_col == 'Title':
            max_features = 1500  # Reduced feature set for titles
            print(f"Using {max_features} features for Title inputs")
        else:  # 'Full text'
            max_features = 4000  # Larger feature set for full text
            print(f"Using {max_features} features for Full text inputs")
        
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,  # Dynamic max_features based on input type
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        # Initialize visualization tracking for this combination
        self.visualizations['learning_curves'][combination_key] = {}
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model - now passing text_col to determine max_features
            model = self.create_model(text_col)
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer
            viz_path = self.visualize_layer_results(layer_result, text_col, label_col, i+1)
            
            # Store visualization path in our dictionary - new implementation
            self.visualizations['learning_curves'][combination_key][i+1] = viz_path
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid results to calculate average metrics")
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
            
        Returns:
            Path to the saved visualization
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Create organized visualization directory structure
        viz_dir = f"TF-IDF_SVM_Plots/COP/visualizations/learning_curves/{text_col.replace(' ', '_')}"
        os.makedirs(viz_dir, exist_ok=True)
        
        # Save figure with a more organized naming pattern
        viz_path = f"{viz_dir}/{text_col.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(viz_path)
        print(f"Visualization saved as: {viz_path}")
        plt.close()
        
        return viz_path
    
    def create_summary_visualization(self):
        """
        Create a summary visualization comparing all model combinations.
        """
        # Prepare data for visualization
        combinations = []
        accuracies = []
        aucs = []
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results and 'avg_auc' in results:
                combinations.append(combo)
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            print("No valid results to create summary visualization")
            return
        
        # Create figure
        plt.figure(figsize=(12, 6))
        
        # Set up bar positions
        x = np.arange(len(combinations))
        width = 0.35
        
        # Plot accuracy and AUC bars
        plt.bar(x - width/2, accuracies, width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, aucs, width, label='Average AUC', color='salmon')
        
        # Add labels and title
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of Different Model Combinations')
        plt.xticks(x, combinations, rotation=45, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(accuracies):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(aucs):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(max(accuracies), max(aucs)) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        summary_path = "TF-IDF_SVM_Plots/COP/visualizations/summary_performance.png"
        plt.savefig(summary_path)
        print(f"Summary comparison visualization saved as: {summary_path}")
        plt.close()
        
        # Store visualization path
        self.visualizations['summary']['overall_performance'] = summary_path
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (COP)")
        print("="*80)
        
        # Create comprehensive summary table
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            if 'avg_accuracy' in results:
                print(f"\nCombination: {text_col} + {label_col}")
                print(f"Average Accuracy: {results['avg_accuracy']:.4f}")
                print(f"Average AUC: {results['avg_auc']:.4f}")
                
                # Print layer-specific results
                for i, accuracy in enumerate(results.get('accuracy', [])):
                    auc = results.get('auc', [])[i]
                    best_params = results.get('best_params', [])[i]
                    print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                    print(f"  Layer {i+1} - Best Parameters: {best_params}")
                    # Print path to visualization for this layer
                    viz_path = self.visualizations['learning_curves'][combination].get(i+1, "No visualization available")
                    print(f"  Layer {i+1} - Visualization: {viz_path}")
        
        # Create summary visualization
        self.create_summary_visualization()
        
        # Print visualization paths summary
        print("\n" + "="*80)
        print("VISUALIZATION PATHS SUMMARY")
        print("="*80)
        print(f"Summary visualization: {self.visualizations['summary'].get('overall_performance', 'Not created')}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_COP_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 929 climate change news articles from Wall Street Journal spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {0: 469, 1: 460}
Class distribution for long-term prediction: {1: 504, 0: 425}

Training model for Title and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Using 1500 features for Title inputs
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.4697
Test AUC for Layer 1: 0.4773
Visualization saved as: TF-IDF_SVM_Plots/COP/visualizations/learning_curves/Title/Title_S_label_layer_1.png

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 s

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Merged news', 'S_label'),     # Merged news + short-term prediction
            ('Merged news', 'L_label'),     # Merged news + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (COP)")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_COP_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 929 climate change news articles from Wall Street Journal spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {0: 469, 1: 460}
Class distribution for long-term prediction: {1: 504, 0: 425}

Training model for Merged news and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.4545
Test AUC for Layer 1: 0.4750
Visualization saved as: visualization_Merged news_S_label_layer_1.png

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Best parame

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
        # Dictionary to track visualizations - new addition
        self.visualizations = {
            'learning_curves': {},  # Will store paths to learning curve plots
            'feature_importance': {},  # Will store paths to feature importance plots
            'summary': {}  # Will store paths to summary visualizations
        }
        
        # Create visualization directory if it doesn't exist
        os.makedirs('TF-IDF_SVM_Plots/CVX/visualizations', exist_ok=True)
        os.makedirs('TF-IDF_SVM_Plots/CVX/visualizations/learning_curves', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self, text_col):
        """
        Create a TF-IDF + SVM pipeline model with max_features adjusted based on input type.
        
        Args:
            text_col: The text column being processed ('Title' or 'Full text')
            
        Returns:
            A GridSearchCV model with the appropriate parameters
        """
        # Set max_features based on the text column type - new implementation
        if text_col == 'Title':
            max_features = 1500  # Reduced feature set for titles
            print(f"Using {max_features} features for Title inputs")
        else:  # 'Full text'
            max_features = 4000  # Larger feature set for full text
            print(f"Using {max_features} features for Full text inputs")
        
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,  # Dynamic max_features based on input type
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        # Initialize visualization tracking for this combination
        self.visualizations['learning_curves'][combination_key] = {}
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model - now passing text_col to determine max_features
            model = self.create_model(text_col)
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer
            viz_path = self.visualize_layer_results(layer_result, text_col, label_col, i+1)
            
            # Store visualization path in our dictionary - new implementation
            self.visualizations['learning_curves'][combination_key][i+1] = viz_path
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid results to calculate average metrics")
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
            
        Returns:
            Path to the saved visualization
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Create organized visualization directory structure
        viz_dir = f"TF-IDF_SVM_Plots/CVX/visualizations/learning_curves/{text_col.replace(' ', '_')}"
        os.makedirs(viz_dir, exist_ok=True)
        
        # Save figure with a more organized naming pattern
        viz_path = f"{viz_dir}/{text_col.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(viz_path)
        print(f"Visualization saved as: {viz_path}")
        plt.close()
        
        return viz_path
    
    def create_summary_visualization(self):
        """
        Create a summary visualization comparing all model combinations.
        """
        # Prepare data for visualization
        combinations = []
        accuracies = []
        aucs = []
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results and 'avg_auc' in results:
                combinations.append(combo)
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            print("No valid results to create summary visualization")
            return
        
        # Create figure
        plt.figure(figsize=(12, 6))
        
        # Set up bar positions
        x = np.arange(len(combinations))
        width = 0.35
        
        # Plot accuracy and AUC bars
        plt.bar(x - width/2, accuracies, width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, aucs, width, label='Average AUC', color='salmon')
        
        # Add labels and title
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of Different Model Combinations')
        plt.xticks(x, combinations, rotation=45, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(accuracies):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(aucs):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(max(accuracies), max(aucs)) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        summary_path = "TF-IDF_SVM_Plots/CVX/visualizations/summary_performance.png"
        plt.savefig(summary_path)
        print(f"Summary comparison visualization saved as: {summary_path}")
        plt.close()
        
        # Store visualization path
        self.visualizations['summary']['overall_performance'] = summary_path
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (CVX)")
        print("="*80)
        
        # Create comprehensive summary table
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            if 'avg_accuracy' in results:
                print(f"\nCombination: {text_col} + {label_col}")
                print(f"Average Accuracy: {results['avg_accuracy']:.4f}")
                print(f"Average AUC: {results['avg_auc']:.4f}")
                
                # Print layer-specific results
                for i, accuracy in enumerate(results.get('accuracy', [])):
                    auc = results.get('auc', [])[i]
                    best_params = results.get('best_params', [])[i]
                    print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                    print(f"  Layer {i+1} - Best Parameters: {best_params}")
                    # Print path to visualization for this layer
                    viz_path = self.visualizations['learning_curves'][combination].get(i+1, "No visualization available")
                    print(f"  Layer {i+1} - Visualization: {viz_path}")
        
        # Create summary visualization
        self.create_summary_visualization()
        
        # Print visualization paths summary
        print("\n" + "="*80)
        print("VISUALIZATION PATHS SUMMARY")
        print("="*80)
        print(f"Summary visualization: {self.visualizations['summary'].get('overall_performance', 'Not created')}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_CVX_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

Loaded 929 climate change news articles from Wall Street Journal spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 499, 0: 430}
Class distribution for long-term prediction: {1: 521, 0: 408}

Training model for Title and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Using 1500 features for Title inputs
Best parameters for Layer 1: {'classifier__C': 1, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.4545
Test AUC for Layer 1: 0.4676
Visualization saved as: TF-IDF_SVM_Plots/CVX/visualizations/learning_curves/Title/Title_S_label_layer_1.png

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Using 1500 features for Title inputs
Best parameters for Layer 2: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 2: 0.3981
Test AUC for Layer 2: 0.5000
Visualization saved as: TF-IDF_SVM_Plots/CVX/visualizations/learning_curves/Title/Title_S_label_lay

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"F:/Python/my jupyter notebook/Merged_TF-IDF_SVM_Plots/CVX/visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Merged news', 'S_label'),     # Merged news + short-term prediction
            ('Merged news', 'L_label'),     # Merged news + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (CVX)")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_CVX_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 929 climate change news articles from Wall Street Journal spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 499, 0: 430}
Class distribution for long-term prediction: {1: 521, 0: 408}

Training model for Merged news and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.5000
Test AUC for Layer 1: 0.4681
Visualization saved as: visualization_Merged news_S_label_layer_1.png

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Best parame

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
        # Dictionary to track visualizations - new addition
        self.visualizations = {
            'learning_curves': {},  # Will store paths to learning curve plots
            'feature_importance': {},  # Will store paths to feature importance plots
            'summary': {}  # Will store paths to summary visualizations
        }
        
        # Create visualization directory if it doesn't exist
        os.makedirs('TF-IDF_SVM_Plots/MPC/visualizations', exist_ok=True)
        os.makedirs('TF-IDF_SVM_Plots/MPC/visualizations/learning_curves', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self, text_col):
        """
        Create a TF-IDF + SVM pipeline model with max_features adjusted based on input type.
        
        Args:
            text_col: The text column being processed ('Title' or 'Full text')
            
        Returns:
            A GridSearchCV model with the appropriate parameters
        """
        # Set max_features based on the text column type - new implementation
        if text_col == 'Title':
            max_features = 1500  # Reduced feature set for titles
            print(f"Using {max_features} features for Title inputs")
        else:  # 'Full text'
            max_features = 4000  # Larger feature set for full text
            print(f"Using {max_features} features for Full text inputs")
        
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,  # Dynamic max_features based on input type
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        # Initialize visualization tracking for this combination
        self.visualizations['learning_curves'][combination_key] = {}
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model - now passing text_col to determine max_features
            model = self.create_model(text_col)
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer
            viz_path = self.visualize_layer_results(layer_result, text_col, label_col, i+1)
            
            # Store visualization path in our dictionary - new implementation
            self.visualizations['learning_curves'][combination_key][i+1] = viz_path
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid results to calculate average metrics")
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
            
        Returns:
            Path to the saved visualization
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Create organized visualization directory structure
        viz_dir = f"TF-IDF_SVM_Plots/MPC/visualizations/learning_curves/{text_col.replace(' ', '_')}"
        os.makedirs(viz_dir, exist_ok=True)
        
        # Save figure with a more organized naming pattern
        viz_path = f"{viz_dir}/{text_col.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(viz_path)
        print(f"Visualization saved as: {viz_path}")
        plt.close()
        
        return viz_path
    
    def create_summary_visualization(self):
        """
        Create a summary visualization comparing all model combinations.
        """
        # Prepare data for visualization
        combinations = []
        accuracies = []
        aucs = []
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results and 'avg_auc' in results:
                combinations.append(combo)
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            print("No valid results to create summary visualization")
            return
        
        # Create figure
        plt.figure(figsize=(12, 6))
        
        # Set up bar positions
        x = np.arange(len(combinations))
        width = 0.35
        
        # Plot accuracy and AUC bars
        plt.bar(x - width/2, accuracies, width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, aucs, width, label='Average AUC', color='salmon')
        
        # Add labels and title
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of Different Model Combinations')
        plt.xticks(x, combinations, rotation=45, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(accuracies):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(aucs):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(max(accuracies), max(aucs)) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        summary_path = "TF-IDF_SVM_Plots/MPC/visualizations/summary_performance.png"
        plt.savefig(summary_path)
        print(f"Summary comparison visualization saved as: {summary_path}")
        plt.close()
        
        # Store visualization path
        self.visualizations['summary']['overall_performance'] = summary_path
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MPC)")
        print("="*80)
        
        # Create comprehensive summary table
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            if 'avg_accuracy' in results:
                print(f"\nCombination: {text_col} + {label_col}")
                print(f"Average Accuracy: {results['avg_accuracy']:.4f}")
                print(f"Average AUC: {results['avg_auc']:.4f}")
                
                # Print layer-specific results
                for i, accuracy in enumerate(results.get('accuracy', [])):
                    auc = results.get('auc', [])[i]
                    best_params = results.get('best_params', [])[i]
                    print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                    print(f"  Layer {i+1} - Best Parameters: {best_params}")
                    # Print path to visualization for this layer
                    viz_path = self.visualizations['learning_curves'][combination].get(i+1, "No visualization available")
                    print(f"  Layer {i+1} - Visualization: {viz_path}")
        
        # Create summary visualization
        self.create_summary_visualization()
        
        # Print visualization paths summary
        print("\n" + "="*80)
        print("VISUALIZATION PATHS SUMMARY")
        print("="*80)
        print(f"Summary visualization: {self.visualizations['summary'].get('overall_performance', 'Not created')}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_MPC_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 929 climate change news articles from Wall Street Journal spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 482, 0: 447}
Class distribution for long-term prediction: {1: 555, 0: 374}

Training model for Title and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Using 1500 features for Title inputs
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.3939
Test AUC for Layer 1: 0.5000
Visualization saved as: TF-IDF_SVM_Plots/MPC/visualizations/learning_curves/Title/Title_S_label_layer_1.png

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"F:/Python/my jupyter notebook/Merged_TF-IDF_SVM_Plots/MPC/visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Merged news', 'S_label'),     # Merged news + short-term prediction
            ('Merged news', 'L_label'),     # Merged news + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (MPC)")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_MPC_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 929 climate change news articles from Wall Street Journal spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 482, 0: 447}
Class distribution for long-term prediction: {1: 555, 0: 374}

Training model for Merged news and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Best parameters for Layer 1: {'classifier__C': 10, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.5303
Test AUC for Layer 1: 0.5587
Visualization saved as: visualization_Merged news_S_label_layer_1.png

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Best parameters for

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
        # Dictionary to track visualizations - new addition
        self.visualizations = {
            'learning_curves': {},  # Will store paths to learning curve plots
            'feature_importance': {},  # Will store paths to feature importance plots
            'summary': {}  # Will store paths to summary visualizations
        }
        
        # Create visualization directory if it doesn't exist
        os.makedirs('TF-IDF_SVM_Plots/SLB/visualizations', exist_ok=True)
        os.makedirs('TF-IDF_SVM_Plots/SLB/visualizations/learning_curves', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self, text_col):
        """
        Create a TF-IDF + SVM pipeline model with max_features adjusted based on input type.
        
        Args:
            text_col: The text column being processed ('Title' or 'Full text')
            
        Returns:
            A GridSearchCV model with the appropriate parameters
        """
        # Set max_features based on the text column type - new implementation
        if text_col == 'Title':
            max_features = 1500  # Reduced feature set for titles
            print(f"Using {max_features} features for Title inputs")
        else:  # 'Full text'
            max_features = 4000  # Larger feature set for full text
            print(f"Using {max_features} features for Full text inputs")
        
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,  # Dynamic max_features based on input type
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        # Initialize visualization tracking for this combination
        self.visualizations['learning_curves'][combination_key] = {}
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model - now passing text_col to determine max_features
            model = self.create_model(text_col)
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer
            viz_path = self.visualize_layer_results(layer_result, text_col, label_col, i+1)
            
            # Store visualization path in our dictionary - new implementation
            self.visualizations['learning_curves'][combination_key][i+1] = viz_path
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid results to calculate average metrics")
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
            
        Returns:
            Path to the saved visualization
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Create organized visualization directory structure
        viz_dir = f"TF-IDF_SVM_Plots/SLB/visualizations/learning_curves/{text_col.replace(' ', '_')}"
        os.makedirs(viz_dir, exist_ok=True)
        
        # Save figure with a more organized naming pattern
        viz_path = f"{viz_dir}/{text_col.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(viz_path)
        print(f"Visualization saved as: {viz_path}")
        plt.close()
        
        return viz_path
    
    def create_summary_visualization(self):
        """
        Create a summary visualization comparing all model combinations.
        """
        # Prepare data for visualization
        combinations = []
        accuracies = []
        aucs = []
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results and 'avg_auc' in results:
                combinations.append(combo)
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            print("No valid results to create summary visualization")
            return
        
        # Create figure
        plt.figure(figsize=(12, 6))
        
        # Set up bar positions
        x = np.arange(len(combinations))
        width = 0.35
        
        # Plot accuracy and AUC bars
        plt.bar(x - width/2, accuracies, width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, aucs, width, label='Average AUC', color='salmon')
        
        # Add labels and title
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of Different Model Combinations')
        plt.xticks(x, combinations, rotation=45, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(accuracies):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(aucs):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(max(accuracies), max(aucs)) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        summary_path = "TF-IDF_SVM_Plots/SLB/visualizations/summary_performance.png"
        plt.savefig(summary_path)
        print(f"Summary comparison visualization saved as: {summary_path}")
        plt.close()
        
        # Store visualization path
        self.visualizations['summary']['overall_performance'] = summary_path
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (SLB)")
        print("="*80)
        
        # Create comprehensive summary table
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            if 'avg_accuracy' in results:
                print(f"\nCombination: {text_col} + {label_col}")
                print(f"Average Accuracy: {results['avg_accuracy']:.4f}")
                print(f"Average AUC: {results['avg_auc']:.4f}")
                
                # Print layer-specific results
                for i, accuracy in enumerate(results.get('accuracy', [])):
                    auc = results.get('auc', [])[i]
                    best_params = results.get('best_params', [])[i]
                    print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                    print(f"  Layer {i+1} - Best Parameters: {best_params}")
                    # Print path to visualization for this layer
                    viz_path = self.visualizations['learning_curves'][combination].get(i+1, "No visualization available")
                    print(f"  Layer {i+1} - Visualization: {viz_path}")
        
        # Create summary visualization
        self.create_summary_visualization()
        
        # Print visualization paths summary
        print("\n" + "="*80)
        print("VISUALIZATION PATHS SUMMARY")
        print("="*80)
        print(f"Summary visualization: {self.visualizations['summary'].get('overall_performance', 'Not created')}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_SLB_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 929 climate change news articles from Wall Street Journal spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 472, 0: 457}
Class distribution for long-term prediction: {1: 494, 0: 435}

Training model for Title and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Using 1500 features for Title inputs
Best parameters for Layer 1: {'classifier__C': 0.01, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.4848
Test AUC for Layer 1: 0.5000
Visualization saved as: TF-IDF_SVM_Plots/SLB/visualizations/learning_curves/Title/Title_S_label_layer_1.png

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"F:/Python/my jupyter notebook/Merged_TF-IDF_SVM_Plots/SLB/visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Merged news', 'S_label'),     # Merged news + short-term prediction
            ('Merged news', 'L_label'),     # Merged news + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (SLB)")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_SLB_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 929 climate change news articles from Wall Street Journal spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {1: 472, 0: 457}
Class distribution for long-term prediction: {1: 494, 0: 435}

Training model for Merged news and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Best parameters for Layer 1: {'classifier__C': 1, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l1'}
Test Accuracy for Layer 1: 0.5455
Test AUC for Layer 1: 0.6094
Visualization saved as: visualization_Merged news_S_label_layer_1.png

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Best parameter

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import os
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
        # Dictionary to track visualizations - new addition
        self.visualizations = {
            'learning_curves': {},  # Will store paths to learning curve plots
            'feature_importance': {},  # Will store paths to feature importance plots
            'summary': {}  # Will store paths to summary visualizations
        }
        
        # Create visualization directory if it doesn't exist
        os.makedirs('TF-IDF_SVM_Plots/XOM/visualizations', exist_ok=True)
        os.makedirs('TF-IDF_SVM_Plots/XOM/visualizations/learning_curves', exist_ok=True)
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self, text_col):
        """
        Create a TF-IDF + SVM pipeline model with max_features adjusted based on input type.
        
        Args:
            text_col: The text column being processed ('Title' or 'Full text')
            
        Returns:
            A GridSearchCV model with the appropriate parameters
        """
        # Set max_features based on the text column type - new implementation
        if text_col == 'Title':
            max_features = 1500  # Reduced feature set for titles
            print(f"Using {max_features} features for Title inputs")
        else:  # 'Full text'
            max_features = 4000  # Larger feature set for full text
            print(f"Using {max_features} features for Full text inputs")
        
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=max_features,  # Dynamic max_features based on input type
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}|{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        # Initialize visualization tracking for this combination
        self.visualizations['learning_curves'][combination_key] = {}
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model - now passing text_col to determine max_features
            model = self.create_model(text_col)
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer
            viz_path = self.visualize_layer_results(layer_result, text_col, label_col, i+1)
            
            # Store visualization path in our dictionary - new implementation
            self.visualizations['learning_curves'][combination_key][i+1] = viz_path
        
        # Calculate average metrics
        if self.results[combination_key]['accuracy']:
            avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
            avg_auc = np.mean(self.results[combination_key]['auc'])
            
            print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
            print(f"Average Test AUC across all layers: {avg_auc:.4f}")
            
            self.results[combination_key]['avg_accuracy'] = avg_accuracy
            self.results[combination_key]['avg_auc'] = avg_auc
        else:
            print("\nNo valid results to calculate average metrics")
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
            
        Returns:
            Path to the saved visualization
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        
        # Create organized visualization directory structure
        viz_dir = f"TF-IDF_SVM_Plots/XOM/visualizations/learning_curves/{text_col.replace(' ', '_')}"
        os.makedirs(viz_dir, exist_ok=True)
        
        # Save figure with a more organized naming pattern
        viz_path = f"{viz_dir}/{text_col.replace(' ', '_')}_{label_col}_layer_{layer_num}.png"
        plt.savefig(viz_path)
        print(f"Visualization saved as: {viz_path}")
        plt.close()
        
        return viz_path
    
    def create_summary_visualization(self):
        """
        Create a summary visualization comparing all model combinations.
        """
        # Prepare data for visualization
        combinations = []
        accuracies = []
        aucs = []
        
        for combo, results in self.results.items():
            if 'avg_accuracy' in results and 'avg_auc' in results:
                combinations.append(combo)
                accuracies.append(results['avg_accuracy'])
                aucs.append(results['avg_auc'])
        
        if not combinations:
            print("No valid results to create summary visualization")
            return
        
        # Create figure
        plt.figure(figsize=(12, 6))
        
        # Set up bar positions
        x = np.arange(len(combinations))
        width = 0.35
        
        # Plot accuracy and AUC bars
        plt.bar(x - width/2, accuracies, width, label='Average Accuracy', color='skyblue')
        plt.bar(x + width/2, aucs, width, label='Average AUC', color='salmon')
        
        # Add labels and title
        plt.xlabel('Model Combination')
        plt.ylabel('Score')
        plt.title('Performance Comparison of Different Model Combinations')
        plt.xticks(x, combinations, rotation=45, ha='right')
        plt.legend()
        
        # Add value labels on top of bars
        for i, v in enumerate(accuracies):
            plt.text(i - width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        for i, v in enumerate(aucs):
            plt.text(i + width/2, v + 0.01, f"{v:.3f}", ha='center', va='bottom', fontsize=9)
        
        plt.ylim(0, max(max(accuracies), max(aucs)) + 0.1)
        plt.tight_layout()
        
        # Save the visualization
        summary_path = "TF-IDF_SVM_Plots/XOM/visualizations/summary_performance.png"
        plt.savefig(summary_path)
        print(f"Summary comparison visualization saved as: {summary_path}")
        plt.close()
        
        # Store visualization path
        self.visualizations['summary']['overall_performance'] = summary_path
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Title', 'S_label'),     # news headline + short-term prediction
            ('Title', 'L_label'),     # news headline + long-term prediction
            ('Full text', 'S_label'), # news body + short-term prediction
            ('Full text', 'L_label')  # news body + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (XOM)")
        print("="*80)
        
        # Create comprehensive summary table
        for combination, results in self.results.items():
            text_col, label_col = combination.split('|')
            if 'avg_accuracy' in results:
                print(f"\nCombination: {text_col} + {label_col}")
                print(f"Average Accuracy: {results['avg_accuracy']:.4f}")
                print(f"Average AUC: {results['avg_auc']:.4f}")
                
                # Print layer-specific results
                for i, accuracy in enumerate(results.get('accuracy', [])):
                    auc = results.get('auc', [])[i]
                    best_params = results.get('best_params', [])[i]
                    print(f"  Layer {i+1} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
                    print(f"  Layer {i+1} - Best Parameters: {best_params}")
                    # Print path to visualization for this layer
                    viz_path = self.visualizations['learning_curves'][combination].get(i+1, "No visualization available")
                    print(f"  Layer {i+1} - Visualization: {viz_path}")
        
        # Create summary visualization
        self.create_summary_visualization()
        
        # Print visualization paths summary
        print("\n" + "="*80)
        print("VISUALIZATION PATHS SUMMARY")
        print("="*80)
        print(f"Summary visualization: {self.visualizations['summary'].get('overall_performance', 'Not created')}")
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_XOM_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 929 climate change news articles from Wall Street Journal spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {0: 466, 1: 463}
Class distribution for long-term prediction: {1: 507, 0: 422}

Training model for Title and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Using 1500 features for Title inputs
Best parameters for Layer 1: {'classifier__C': 0.1, 'classifier__class_weight': None, 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.4697
Test AUC for Layer 1: 0.5215
Visualization saved as: TF-IDF_SVM_Plots/XOM/visualizations/learning_curves/Title/Title_S_label_layer_1.png

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples


In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import warnings
warnings.filterwarnings('ignore')

# Download required NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')

# Define climate-finance specific stopwords to remove in addition to regular stopwords
FINANCIAL_STOPWORDS = {
    'said', 'inc', 'corp', 'company', 'companies', 'reuters', 'news', 'press', 'release',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 
    'october', 'november', 'december', 'wall', 'street', 'journal'
}

# Define climate-finance terms to standardize
CLIMATE_FINANCIAL_TERMS = {
    'q1': 'first_quarter',
    'q2': 'second_quarter',
    'q3': 'third_quarter',
    'q4': 'fourth_quarter',
    'esg': 'environmental_social_governance',
    'ghg': 'greenhouse_gas',
    'co2': 'carbon_dioxide',
    'carbon': 'carbon_emissions',
    'renewable': 'renewable_energy',
    'climate': 'climate_change',
    'global warming': 'climate_change',
    'green': 'green_energy',
    'sustainable': 'sustainability',
    'paris agreement': 'paris_climate_accord',
    'emissions': 'carbon_emissions',
    'environmental': 'environmental_impact',
    'solar': 'renewable_energy_solar',
    'wind': 'renewable_energy_wind',
    'net zero': 'net_zero_emissions',
    'clean energy': 'renewable_energy',
    'fossil': 'fossil_fuel',
    'coal': 'fossil_fuel_coal'
}

class ClimateNewsStockPredictor:
    def __init__(self, csv_path):
        """
        Initialize the stock predictor for climate change news analysis.
        
        Args:
            csv_path: Path to the CSV file containing climate change news data from Wall Street Journal
        """
        self.csv_path = csv_path
        self.data = None
        self.layers = []
        self.lemmatizer = WordNetLemmatizer()
        self.stopwords = set(stopwords.words('english')).union(FINANCIAL_STOPWORDS)
        self.results = {}
        
    def load_data(self):
        """Load and preprocess the CSV data containing climate change news from Wall Street Journal."""
        self.data = pd.read_csv(self.csv_path)
        
        # Convert date strings to datetime objects with mixed format detection
        # Setting dayfirst=True helps correctly parse dates like DD/MM/YYYY
        try:
            self.data['Publication date'] = pd.to_datetime(self.data['Publication date'], format='mixed', dayfirst=True)
            self.data['Predicting date Short'] = pd.to_datetime(self.data['Predicting date Short'], format='mixed', dayfirst=True)
            self.data['Predicting date Long'] = pd.to_datetime(self.data['Predicting date Long'], format='mixed', dayfirst=True)
        except (ValueError, TypeError):
            # If 'mixed' isn't supported in your pandas version, try a manual approach
            print("Mixed format not supported in your pandas version. Trying manual conversion...")
            
            # Helper function to handle different date formats
            def parse_date_column(column):
                result = []
                for date_str in column:
                    try:
                        # Try to parse as YYYY-MM-DD
                        date = pd.to_datetime(date_str, format='%Y-%m-%d')
                    except ValueError:
                        try:
                            # Try to parse as DD/MM/YYYY
                            date = pd.to_datetime(date_str, format='%d/%m/%Y')
                        except ValueError:
                            # As a last resort, let pandas guess
                            date = pd.to_datetime(date_str, dayfirst=True)
                    result.append(date)
                return pd.Series(result)
            
            self.data['Publication date'] = parse_date_column(self.data['Publication date'])
            self.data['Predicting date Short'] = parse_date_column(self.data['Predicting date Short'])
            self.data['Predicting date Long'] = parse_date_column(self.data['Predicting date Long'])
        
        # Sort by publication date
        self.data = self.data.sort_values('Publication date')
        
        print(f"Loaded {len(self.data)} climate change news articles from Wall Street Journal spanning from "
              f"{self.data['Publication date'].min().strftime('%d/%m/%Y')} "
              f"to {self.data['Publication date'].max().strftime('%d/%m/%Y')}")
        print(f"Class distribution for short-term prediction: {self.data['S_label'].value_counts().to_dict()}")
        print(f"Class distribution for long-term prediction: {self.data['L_label'].value_counts().to_dict()}")
        
        return self
    
    def preprocess_text(self, text):
        """
        Preprocess climate change news text by applying various NLP techniques.
        
        Args:
            text: The text to preprocess
            
        Returns:
            Preprocessed text
        """
        if pd.isna(text):
            return ""
        
        # Convert to lowercase
        text = text.lower()
        
        # Standardize climate-finance terms
        for term, replacement in CLIMATE_FINANCIAL_TERMS.items():
            text = re.sub(r'\b' + term + r'\b', replacement, text)
        
        # Standardize numerical representations with units
        text = re.sub(r'\$(\d+)([kmbt])', lambda m: m.group(1) + '_' + 
                      {'k': 'thousand', 'm': 'million', 'b': 'billion', 't': 'trillion'}[m.group(2).lower()], text)
        
        # Remove punctuation except $ and %
        text = re.sub(r'[^\w\s$%]', ' ', text)
        
        # Remove non-alphanumeric characters but preserve meaningful financial symbols
        text = re.sub(r'[^\w\s$%.-]', ' ', text)
        
        # Tokenize
        tokens = nltk.word_tokenize(text)
        
        # Remove stopwords and lemmatize
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens if token not in self.stopwords]
        
        # Join tokens back into a string
        return ' '.join(tokens)
    
    def define_time_windows(self):
        """Define the time windows for the sliding window approach."""
        # First layer: train (2019-2021), validation (2021-2021), test (2022-2022)
        layer1 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-05-31'),
            'val_start': pd.Timestamp('2021-06-01'),
            'val_end': pd.Timestamp('2021-12-31'),
            'test_start': pd.Timestamp('2022-01-01'),
            'test_end': pd.Timestamp('2022-05-31')
        }
        
        # Second layer: train (2019-2021), validation (2022-2022), test (2022-2022)
        layer2 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2021-12-31'),
            'val_start': pd.Timestamp('2022-01-01'),
            'val_end': pd.Timestamp('2022-05-31'),
            'test_start': pd.Timestamp('2022-06-01'),
            'test_end': pd.Timestamp('2022-12-31')
        }
        
        # Third layer: train (2019-2022), validation (2022-2022), test (2023-2023)
        layer3 = {
            'train_start': pd.Timestamp('2019-01-01'),
            'train_end': pd.Timestamp('2022-05-31'),
            'val_start': pd.Timestamp('2022-06-01'),
            'val_end': pd.Timestamp('2022-12-31'),
            'test_start': pd.Timestamp('2023-01-01'),
            'test_end': pd.Timestamp('2023-05-31')
        }
        
        self.layers = [layer1, layer2, layer3]
        return self
    
    def split_data(self, layer, text_col, label_col):
        """
        Split the data into training, validation, and test sets based on the defined time windows.
        
        Args:
            layer: The time window layer
            text_col: The column containing the text data ('Title' or 'Full text')
            label_col: The column containing the labels ('S_label' or 'L_label')
            
        Returns:
            train_data, val_data, test_data
        """
        train_mask = (self.data['Publication date'] >= layer['train_start']) & (self.data['Publication date'] <= layer['train_end'])
        val_mask = (self.data['Publication date'] > layer['val_start']) & (self.data['Publication date'] <= layer['val_end'])
        test_mask = (self.data['Publication date'] > layer['test_start']) & (self.data['Publication date'] <= layer['test_end'])
        
        train_data = self.data[train_mask]
        val_data = self.data[val_mask]
        test_data = self.data[test_mask]
        
        print(f"Training data: {len(train_data)} samples")
        print(f"Validation data: {len(val_data)} samples")
        print(f"Test data: {len(test_data)} samples")
        
        # Preprocess text
        X_train = train_data[text_col].apply(self.preprocess_text)
        X_val = val_data[text_col].apply(self.preprocess_text)
        X_test = test_data[text_col].apply(self.preprocess_text)
        
        y_train = train_data[label_col]
        y_val = val_data[label_col]
        y_test = test_data[label_col]
        
        return (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data
    
    def create_model(self):
        """
        Create a TF-IDF + SVM pipeline model.

        """
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(
                max_features=5000,  # Limit to top 5000 features
                min_df=5,           # Remove terms that appear in fewer than 5 documents
                max_df=0.85,        # Remove terms that appear in more than 85% of documents
                ngram_range=(1, 3), # Include unigrams, bigrams, and trigrams
                sublinear_tf=True   # Apply sublinear tf scaling (1 + log(tf))
            )),
            ('classifier', LinearSVC(
                random_state=42,
                max_iter=2000,
                dual=False          # Better performance when n_samples > n_features
            ))
        ])
        
        # Define parameters for grid search
        param_grid = {
            'classifier__C': [0.01, 0.1, 1, 10, 100],          # Regularization parameter
            'classifier__loss': ['hinge', 'squared_hinge'],    # Loss function
            'classifier__penalty': ['l1', 'l2'],               # Penalty norm
            'classifier__class_weight': [None, 'balanced']     # Weight classes inversely proportional to frequencies
        }
        
        # Create grid search model - this is done for each layer independently
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,                # 3-fold cross-validation
            scoring='roc_auc',   # Optimize for ROC AUC
            verbose=0,
            n_jobs=-1            # Use all available cores
        )
        
        return grid_search
    
    def train_and_evaluate(self, text_col, label_col):
        """
        Train and evaluate the model for a specific text column and label column.
        
        Args:
            text_col: The text column to use ('Title' or 'Full text')
            label_col: The label column to use ('S_label' or 'L_label')
        """
        # Store results
        combination_key = f"{text_col}_{label_col}"
        self.results[combination_key] = {
            'accuracy': [],
            'auc': [],
            'best_params': [],
            'layer_results': []
        }
        
        print(f"\n{'='*80}")
        print(f"Training model for {text_col} and {label_col}")
        print(f"{'='*80}")
        
        for i, layer in enumerate(self.layers):
            print(f"\nLayer {i+1}:")
            print(f"Training period: {layer['train_start'].strftime('%d/%m/%Y')} - {layer['train_end'].strftime('%d/%m/%Y')}")
            print(f"Validation period: {layer['val_start'].strftime('%d/%m/%Y')} - {layer['val_end'].strftime('%d/%m/%Y')}")
            print(f"Testing period: {layer['test_start'].strftime('%d/%m/%Y')} - {layer['test_end'].strftime('%d/%m/%Y')}")
            
            # Split data
            (X_train, y_train), (X_val, y_val), (X_test, y_test), train_data, val_data, test_data = self.split_data(layer, text_col, label_col)
            
            # Check if there are enough samples and classes
            if len(X_train) < 10 or len(np.unique(y_train)) < 2 or len(np.unique(y_val)) < 2 or len(np.unique(y_test)) < 2:
                print(f"Skipping layer {i+1} due to insufficient data or class imbalance")
                continue
            
            # Create and train model
            # Grid search is performed for each layer independently
            model = self.create_model()
            model.fit(X_train, y_train)
            
            # Get best model and parameters for this specific layer
            best_model = model.best_estimator_
            best_params = model.best_params_
            
            # Save best parameters for this layer
            self.results[combination_key]['best_params'].append(best_params)
            print(f"Best parameters for Layer {i+1}: {best_params}")
            
            # Evaluate on test set
            y_pred = best_model.predict(X_test)
            y_pred_proba = best_model.decision_function(X_test)
            
            accuracy = accuracy_score(y_test, y_pred)
            auc = roc_auc_score(y_test, y_pred_proba)
            
            self.results[combination_key]['accuracy'].append(accuracy)
            self.results[combination_key]['auc'].append(auc)
            
            print(f"Test Accuracy for Layer {i+1}: {accuracy:.4f}")
            print(f"Test AUC for Layer {i+1}: {auc:.4f}")
            
            # Store layer results for visualization
            layer_result = {
                'layer': i+1,
                'model': best_model,
                'train_data': train_data,
                'val_data': val_data,
                'test_data': test_data,
                'X_train': X_train,
                'y_train': y_train,
                'X_val': X_val,
                'y_val': y_val,
                'X_test': X_test,
                'y_test': y_test,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba,
                'accuracy': accuracy,
                'auc': auc,
                'best_params': best_params  # Store best parameters for this layer
            }
            
            self.results[combination_key]['layer_results'].append(layer_result)
            
            # Visualize results for this layer - only top features and learning curve
            self.visualize_layer_results(layer_result, text_col, label_col, i+1)
        
        # Calculate average metrics
        avg_accuracy = np.mean(self.results[combination_key]['accuracy'])
        avg_auc = np.mean(self.results[combination_key]['auc'])
        
        print(f"\nAverage Test Accuracy across all layers: {avg_accuracy:.4f}")
        print(f"Average Test AUC across all layers: {avg_auc:.4f}")
        
        self.results[combination_key]['avg_accuracy'] = avg_accuracy
        self.results[combination_key]['avg_auc'] = avg_auc
        
        return self
    
    def visualize_layer_results(self, layer_result, text_col, label_col, layer_num):
        """
        Visualize learning curve for a specific layer.
        
        Args:
            layer_result: The results for the layer
            text_col: The text column used
            label_col: The label column used
            layer_num: The layer number
        """
        plt.figure(figsize=(15, 10))
        plt.suptitle(f"Model Evaluation: {text_col} + {label_col} (Layer {layer_num})", fontsize=16)
        
        # Learning Curve (using accuracy on different dataset sizes)
        
        # Get train data
        X_train = layer_result['X_train']
        y_train = layer_result['y_train']
        
        # Create different training set sizes
        train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
        train_acc = []
        val_acc = []
        
        for size in train_sizes:
            # Get subset of training data
            n_samples = int(len(X_train) * size)
            X_train_subset = X_train.iloc[:n_samples]
            y_train_subset = y_train.iloc[:n_samples]
            
            # Train a new model
            model_clone = clone(layer_result['model'])
            model_clone.fit(X_train_subset, y_train_subset)
            
            # Evaluate on training and validation sets
            y_train_pred = model_clone.predict(X_train_subset)
            y_val_pred = model_clone.predict(layer_result['X_val'])
            
            train_acc.append(accuracy_score(y_train_subset, y_train_pred))
            val_acc.append(accuracy_score(layer_result['y_val'], y_val_pred))
        
        plt.plot(train_sizes, train_acc, 'o-', label='Training Accuracy')
        plt.plot(train_sizes, val_acc, 'o-', label='Validation Accuracy')
        plt.title('Learning Curve (Overfitting Detection)')
        plt.xlabel('Training Set Size (%)')
        plt.ylabel('Accuracy')
        plt.legend(loc='lower right')
        plt.grid(True, linestyle='--', alpha=0.7)
        
        plt.tight_layout(rect=[0, 0, 1, 0.95])
        plt.savefig(f"F:/Python/my jupyter notebook/Merged_TF-IDF_SVM_Plots/XOM/visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        print(f"Visualization saved as: visualization_{text_col}_{label_col}_layer_{layer_num}.png")
        plt.close()
    
    def run_all_combinations(self):
        """Run the analysis for all combinations of text and label columns."""
        # Define all combinations
        combinations = [
            ('Merged news', 'S_label'),     # Merged news + short-term prediction
            ('Merged news', 'L_label'),     # Merged news + long-term prediction
        ]
        
        # Run analysis for each combination
        for text_col, label_col in combinations:
            self.train_and_evaluate(text_col, label_col)
        
        # Print summary
        print("\n" + "="*80)
        print("SUMMARY OF RESULTS (XOM)")
        print("="*80)
        
        return self

# Main execution
if __name__ == "__main__":
    predictor = ClimateNewsStockPredictor('E:/User/Documents/Documents (UK)/Cardiff (PhD)/First Year/climate change/US_news/SP500_semantic/us_news_semantics_XOM_completed.csv')
    predictor.load_data().define_time_windows().run_all_combinations()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loaded 929 climate change news articles from Wall Street Journal spanning from 02/01/2019 to 07/05/2023
Class distribution for short-term prediction: {0: 466, 1: 463}
Class distribution for long-term prediction: {1: 507, 0: 422}

Training model for Merged news and S_label

Layer 1:
Training period: 01/01/2019 - 31/05/2021
Validation period: 01/06/2021 - 31/12/2021
Testing period: 01/01/2022 - 31/05/2022
Training data: 522 samples
Validation data: 163 samples
Test data: 66 samples
Best parameters for Layer 1: {'classifier__C': 1, 'classifier__class_weight': 'balanced', 'classifier__loss': 'squared_hinge', 'classifier__penalty': 'l2'}
Test Accuracy for Layer 1: 0.4242
Test AUC for Layer 1: 0.4937
Visualization saved as: visualization_Merged news_S_label_layer_1.png

Layer 2:
Training period: 01/01/2019 - 31/12/2021
Validation period: 01/01/2022 - 31/05/2022
Testing period: 01/06/2022 - 31/12/2022
Training data: 685 samples
Validation data: 66 samples
Test data: 108 samples
Best parameter