In [23]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [32]:
# Step 1: Load the Loughran-McDonald Dictionary
lm_dict = pd.read_csv('Loughran-McDonald_MasterDictionary_1993-2023.csv', sep=',', encoding='UTF-8')
df_portfolio_1 = pd.read_csv("portfolio_articles_1_translated.csv", sep='|', encoding='UTF-8')
df_portfolio_2 = pd.read_csv("portfolio_articles_2_translated.csv", sep='|', encoding='UTF-8')
df_portfolio_3 = pd.read_csv("portfolio_articles_3_translated.csv", sep='|', encoding='UTF-8')
df = pd.concat([df_portfolio_1, df_portfolio_2,df_portfolio_3], axis=0)

In [29]:
# Convert dictionary to a usable format
lm_dict['Word'] = lm_dict['Word'].str.lower()
positive_words = set(lm_dict[lm_dict['Positive'] != 0]['Word'])
negative_words = set(lm_dict[lm_dict['Negative'] != 0]['Word'])

In [38]:
# Step 2: Text Preprocessing Function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Step 3: Sentiment Scoring Function
def calculate_sentiment(text, pos_words, neg_words):
    tokens = preprocess_text(text)
    # Count positive and negative words
    pos_count = sum(1 for word in tokens if word in pos_words)
    neg_count = sum(1 for word in tokens if word in neg_words)
    total_words = len(tokens)
    
    # Avoid division by zero
    if total_words == 0:
        return 0
    # sentiment score: (pos - neg) / total_words
    sentiment_score = (pos_count - neg_count) / total_words
    return sentiment_score

In [39]:
# Step 4: Apply to DataFrame
df['sentiment_score'] = df['translated_content'].apply(
    lambda x: calculate_sentiment(x, positive_words, negative_words)
)


df_final = df.loc[:,['translated_content', 'date', 'sentiment_score']]
df_final.to_csv("sentiment_scores_LMD_portfolio.csv", sep="|", encoding='utf-8', index=False)

In [41]:
df_final

Unnamed: 0,translated_content,date,sentiment_score
0,The first Friday of the year ends with an acci...,2025/01/03,-0.104651
1,The government's decision regarding guest work...,2024/12/24,-0.005076
2,The speeding is over: Barcelona is cracking do...,2025/01/03,-0.009662
3,Ursula von der Leyen has a serious case of pne...,2025/01/03,-0.106061
4,Unusual weather phenomenon developed at Lake B...,2025/01/03,0.011494
...,...,...,...
15136,The Greeks are getting money. The finance mini...,2018/01/23,-0.041420
15137,Media mogul attacks Facebook and Google. Ameri...,2018/01/23,-0.026087
15138,"The US government shutdown has ended, but it c...",2018/01/23,-0.048866
15139,The next installment of the Greek bailout pack...,2018/01/22,-0.059701
