In [None]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from Parquet. Later cells read its 'text' and
# 'short_description' columns.
data_path = 'selected_data.parquet'
data = pd.read_parquet(path=data_path)

In [None]:
# Per-currency matching configuration.
#   'keywords' - phrases that mark a text as relevant to the currency
#   'positive' - terms that add to the currency's context score
#   'negative' - terms that subtract from the currency's context score
keywords_context = {
    'USD': {
        'keywords': [
            'Federal Reserve',
            'interest rate hike',
            'US inflation data',
            'US GDP growth',
            'US unemployment rate',
            'US trade balance',
            'Federal budget deficit',
            'US monetary policy',
        ],
        'positive': ['hike', 'strong', 'growth', 'surplus', 'tighten', 'rally'],
        'negative': ['cut', 'weak', 'decline', 'deficit', 'loosen', 'slump'],
    },
    'SGD': {
        'keywords': [
            'Monetary Authority of Singapore',
            'SGD interest rates',
            'Singapore GDP growth',
            'Singapore inflation rate',
            'Singapore trade data',
            'Singapore government budget',
        ],
        'positive': ['raise', 'strong', 'growth', 'surplus', 'tighten', 'advance'],
        'negative': ['cut', 'weak', 'decline', 'deficit', 'loosen', 'retract'],
    },
}

In [None]:
def contains_keywords(text, keywords, positive, negative):
    """Score a text by its positive vs. negative terms, gated on keyword presence.

    Matching is case-insensitive substring matching: the text and every
    keyword/term are lowercased before comparison.

    Args:
        text: The text to inspect. Anything that is not a ``str`` (e.g. None
            or NaN from a missing cell) is treated as containing no keywords.
        keywords: Phrases of which at least one must occur in the text for it
            to be scored at all.
        positive: Terms whose occurrences each add 1 to the score.
        negative: Terms whose occurrences each subtract 1 from the score.

    Returns:
        int: ``positive occurrences - negative occurrences``, or 0 when no
        keyword occurs in the text or the text is not a string.
    """
    if not isinstance(text, str):
        return 0  # Missing / non-text value: nothing to score

    text_lower = text.lower()
    # Keywords must be lowercased as well, otherwise mixed-case entries such as
    # 'Federal Reserve' can never match the lowercased text.
    keyword_hits = any(word.lower() in text_lower for word in keywords)
    if not keyword_hits:
        return 0  # No keywords found, skip processing

    pos_count = sum(text_lower.count(pos.lower()) for pos in positive)
    neg_count = sum(text_lower.count(neg.lower()) for neg in negative)
    return pos_count - neg_count

In [None]:
# Score every text against each currency's keyword context. The result lands
# in a per-currency column named '<currency>_context'.
for key, context in keywords_context.items():
    data[f'{key}_context'] = data['text'].apply(
        lambda x: contains_keywords(x, context['keywords'], context['positive'], context['negative'])
    )

# VADER compound polarity of each short description.
analyzer = SentimentIntensityAnalyzer()
data['sentiment'] = data['short_description'].apply(lambda x: analyzer.polarity_scores(x)['compound'])

# Defaults for rows that no currency context applies to; sentiment_impact
# compares candidate scores against this starting impact_score.
data['impact_currency'] = 'Neutral'
data['impact_score'] = 0.0

In [None]:
def sentiment_impact(row):
    """Pick the currency with the strongest sentiment impact for one row.

    For each of 'USD' and 'SGD' whose ``<currency>_context`` value is non-zero,
    the candidate score is ``sentiment * context``; the SGD candidate has its
    sign flipped. The candidate with the largest absolute value (which must
    also exceed the row's incoming ``impact_score``) wins, and its score is
    stored clamped to [-1, 1].

    Args:
        row: A mapping-like row (e.g. a pandas Series from
            ``DataFrame.apply(axis=1)``) providing 'USD_context',
            'SGD_context', 'sentiment', 'impact_currency' and 'impact_score'.

    Returns:
        A copy of ``row`` with 'impact_currency' and 'impact_score' updated.
        The input row itself is left untouched.
    """
    # Work on a copy: functions passed to DataFrame.apply should not mutate
    # the object they receive.
    row = row.copy()

    # Track the unclamped magnitude of the current winner. Comparing against
    # the clamped stored score would let a weaker candidate replace a stronger
    # one whenever the stronger one had been clipped to +/-1.
    best_magnitude = abs(row['impact_score'])

    for currency in ['USD', 'SGD']:
        context_score = row[f'{currency}_context']
        if context_score != 0:
            adjusted_score = row['sentiment'] * context_score
            if currency == 'SGD':
                adjusted_score *= -1
            if abs(adjusted_score) > best_magnitude:  # Only update if the new score is more significant
                best_magnitude = abs(adjusted_score)
                row['impact_currency'] = currency
                row['impact_score'] = min(1, max(-1, adjusted_score))
    return row

In [None]:
# Assign an impact currency and score to every row.
data = data.apply(sentiment_impact, axis=1)

# Mean impact score per impacted currency (including 'Neutral').
average_sentiment = data.groupby('impact_currency')['impact_score'].mean()
print(average_sentiment)

print(data.head())

# sentiment_impact clamps scores to [-1, 1], so the extreme examples are the
# rows sitting exactly on those bounds. (Filtering with `<= 1` / `>= -1`
# would match every row.)
impact_score_1 = data[data['impact_score'] >= 1].head(5)
impact_score_minus_1 = data[data['impact_score'] <= -1].head(5)

print("Examples with impact_score = 1:")
print(impact_score_1[['short_description', 'impact_currency', 'impact_score']])

print("\nExamples with impact_score = -1:")
print(impact_score_minus_1[['short_description', 'impact_currency', 'impact_score']])

In [None]:
# Export up to 1000 rows whose impact score is non-zero. The filter is on
# 'impact_score' (not 'sentiment') so the contents match the variable and
# file name.
non_zero_impact_scores = data[data['impact_score'] != 0].head(1000)
csv_output_path = 'non_zero_impact_score_examples.csv'
non_zero_impact_scores.to_csv(csv_output_path, index=False)