In [None]:
import json
import pandas as pd

def load_sentiment_scores(files, keywords_dict):
    """Build a DataFrame with one combined sentiment score per JSON file.

    For each file the final score is the sum of every
    ``sentiment[*].document.score`` plus the ``sentiment.score`` of every
    ``keywords[*]`` entry whose ``text`` is listed for that file in
    *keywords_dict*.

    Args:
        files: Iterable of paths to JSON files.
        keywords_dict: Mapping of file path -> collection of keyword texts
            whose sentiment should be counted. A file without an entry
            simply contributes no keyword score.

    Returns:
        A DataFrame with the columns ``ref_date``, ``file`` and
        ``final_sentiment_score``, one row per file that was processed.
        ``ref_date`` is taken from the last document entry that provides
        one, otherwise it is None. Files that cannot be opened, decoded or
        parsed are reported on stdout and skipped; if no file succeeds the
        frame is empty but still has the three columns.
    """
    columns = ['ref_date', 'file', 'final_sentiment_score']
    sentiment_scores = []

    for file in files:
        try:
            with open(file, 'r', encoding='utf-8') as f:
                sentiment_data = json.load(f)

            total_sentiment_score = 0
            keyword_sentiment_score = 0
            ref_date = None

            # Sum the document-level scores, remembering any ref_date seen
            if 'sentiment' in sentiment_data and isinstance(sentiment_data['sentiment'], list):
                for entry in sentiment_data['sentiment']:
                    if 'document' in entry and 'score' in entry['document']:
                        document = entry['document']
                        total_sentiment_score += document['score']
                        ref_date = document.get('ref_date', ref_date)

            # A file missing from keywords_dict must not lose its document
            # score, so fall back to "no keywords" instead of raising KeyError
            wanted_keywords = keywords_dict.get(file, ())

            # Sum the scores of the keywords selected for this file
            if 'keywords' in sentiment_data and isinstance(sentiment_data['keywords'], list):
                for entry in sentiment_data['keywords']:
                    if 'text' in entry and 'sentiment' in entry and entry['text'] in wanted_keywords:
                        keyword_sentiment_score += entry['sentiment'].get('score', 0)

            sentiment_scores.append({
                'ref_date': ref_date,  # May be None; callers can fill it in later
                'file': file,
                'final_sentiment_score': total_sentiment_score + keyword_sentiment_score
            })
        except (OSError, UnicodeDecodeError, json.JSONDecodeError, KeyError, TypeError) as e:
            # OSError covers missing/unreadable files so one bad path does
            # not abort the whole batch
            print(f"Error processing file {file}: {e}")

    # Explicit columns keep the schema stable even when nothing was loaded
    return pd.DataFrame(sentiment_scores, columns=columns)

# JSON files that hold the sentiment analysis output
sentiment_files = [
    'analysis19_output.json',
    'analysis21_output.json',
    'analysis22_output.json',
    'analysis24_output.json',
]

# Per-file keyword texts whose sentiment is added to the document score
keywords_dict = {
    'analysis19_output.json': [
        'international graduates',
        'work force',
    ],
    'analysis21_output.json': [
        'international graduates - Canada.ca Skip',
        'workplace Immigration',
    ],
    'analysis22_output.json': [
        'Immigration - Canada.ca Skip',
        'work permit holders',
    ],
    'analysis24_output.json': [
        'International Students - Canada.ca Skip',
        'temporary work',
    ],
}

# Hand-entered dates, used only when a file yields no ref_date of its own
file_dates = {
    'analysis19_output.json': '2019-01-01',
    'analysis21_output.json': '2021-01-01',
    'analysis22_output.json': '2022-01-01',
    'analysis24_output.json': '2024-01-01',
}

# One row per successfully processed file
sentiment_df = load_sentiment_scores(sentiment_files, keywords_dict)

# Keep a ref_date found in the data; otherwise look up the manual date
sentiment_df['ref_date'] = sentiment_df.apply(
    lambda row: (
        file_dates.get(row['file'], None)
        if pd.isnull(row['ref_date'])
        else row['ref_date']
    ),
    axis=1,
)

# Parse the dates; errors='coerce' turns unparseable values into NaT
sentiment_df['ref_date'] = pd.to_datetime(
    sentiment_df['ref_date'],
    errors='coerce',
)

print(sentiment_df)
