In [176]:
import pandas as pd
import numpy as np
import re
import ast
from collections import Counter
from sklearn.preprocessing import MinMaxScaler

# Load the master table of scraped articles. The cells below read its
# 'Cycle_month' and 'Text' columns.
text_source = pd.read_csv('./Text_data/scrapped_articles_master.csv')

# Distinct cycle-month labels; the per-month loops further down iterate over
# this array.
cycle_month_column = text_source['Cycle_month']
cycles_ref = cycle_month_column.unique()

def extract_cyclemonth(df, cyclemonth):
    """
    Collect the cleaned article texts that belong to one cycle month.

    Rows of ``df`` whose 'Cycle_month' equals ``cyclemonth`` are selected. Each
    text has all leading and trailing non-letter characters (anything outside
    a-z / A-Z, i.e. digits, punctuation, whitespace) removed and is then
    wrapped in one newline character on each side. Rows whose 'Text' is
    missing (NaN/None, e.g. an empty CSV cell) are skipped; passing them to
    the regex would raise a TypeError.

    Parameters:
        df (pd.DataFrame): table with at least the columns 'Cycle_month' and 'Text'.
        cyclemonth: value compared for equality against the 'Cycle_month' column.

    Returns:
        list[str]: one cleaned, newline-wrapped string per non-missing article,
        in row order. Empty list when no row matches.
    """
    month_texts = df.loc[df['Cycle_month'] == cyclemonth, 'Text'].dropna()

    # Anchored at both ends only, so characters inside the text stay untouched.
    # After the substitution the text is empty or starts and ends with a
    # letter, hence no additional whitespace stripping is needed.
    edge_non_letters = re.compile(r'^[^a-zA-Z]*|[^a-zA-Z]*$')

    return ["\n" + edge_non_letters.sub('', text) + "\n" for text in month_texts.tolist()]

# Map every cycle month to the list of its cleaned article texts.
articles = {cycle: extract_cyclemonth(text_source, cycle) for cycle in cycles_ref}


In [177]:
# Per-month sentiment lexicon. The 'positive' and 'negative' columns hold
# string-encoded Python literals (expected to be lists of words) and the
# 'Unnamed: 0' column holds the cycle month used as dictionary key below.
topwords = pd.read_csv("./Sentiment_data/top_sentiment_words.csv")


def _parse_word_lists(column):
    """Parse one lexicon column of ``topwords`` into {cycle month: parsed literal}."""
    return {
        month: ast.literal_eval(cell)
        for month, cell in zip(topwords['Unnamed: 0'], topwords[column])
    }


def _position_ranks(words_by_month):
    """Rank every word by its position in the month's list.

    A word at position p (0-based) of a list with n entries gets rank n - p,
    so the first entry has the largest rank and the last entry has rank 1.
    """
    ranks = {}
    for month, words in words_by_month.items():
        total = len(words)
        ranks[month] = {word: total - position for position, word in enumerate(words)}
    return ranks


#store the parsed lists in dictionaries with the cycle month as the key
monthly_positive_words = _parse_word_lists('positive')
monthly_negative_words = _parse_word_lists('negative')

#word -> rank dictionaries per month, derived from the list order
pos_ranks = _position_ranks(monthly_positive_words)
neg_ranks = _position_ranks(monthly_negative_words)

In [183]:
#Calculating the sentiment scores

def calculate_sentiment_with_ranking(text, pos_words, neg_words, pos_ranks, neg_ranks):
    """
    Score one text against a ranked positive and a ranked negative word list.

    The text is lower-cased and split on whitespace. Every distinct token that
    is contained in ``pos_words`` adds ``(its rank / largest positive rank) *
    number of occurrences`` to a positive total; the negative total is built
    the same way from ``neg_words`` and ``neg_ranks``. The returned score is
    the positive total minus the negative total.

    Parameters:
        text (str): string of words separated by whitespace.
        pos_words (set): positive words that are allowed to contribute.
        neg_words (set): negative words that are allowed to contribute.
        pos_ranks (dict): word -> rank for positive words; a larger rank gives a larger weight.
            A word in ``pos_words`` without an entry here contributes 0.
        neg_ranks (dict): word -> rank for negative words; a larger rank gives a larger weight.
            A word in ``neg_words`` without an entry here contributes 0.

    Returns:
        int or float: raw score (positive total minus negative total). Word
        weights are divided by the largest rank, but the score itself is NOT
        scaled to a fixed interval here; the calling loop does the scaling.
        The result is the int 0 when no listed word occurs in the text.

    NOTE(review): tokens keep any attached punctuation ("growth," is a
    different token than "growth") - confirm this matches the way the word
    lists were built.
    """
    token_counts = Counter(text.lower().split())

    # Largest rank on each side; the default of 1 is used when a ranking
    # dictionary is empty.
    top_pos_rank = max(pos_ranks.values(), default=1)
    top_neg_rank = max(neg_ranks.values(), default=1)

    positive_total = 0
    negative_total = 0
    for token, occurrences in token_counts.items():
        if token in pos_words:
            positive_total += (pos_ranks.get(token, 0) / top_pos_rank) * occurrences
        if token in neg_words:
            negative_total += (neg_ranks.get(token, 0) / top_neg_rank) * occurrences

    return positive_total - negative_total


#define list for monthly sentiment scores (one averaged, scaled score per cycle month)
scaled_sentiment_scores = []

for month in cycles_ref:
    month_texts = articles[month]

    #check if the month has any article at all; if not, record a score of 0
    if not month_texts:
        scaled_sentiment_scores.append(0)
        continue

    #positive and negative words of the month as sets (fast membership test) and the
    #matching word: rank dictionaries; they are identical for every article of the
    #month, so they are built once per month instead of once per article
    pos_words = set(monthly_positive_words[month])
    neg_words = set(monthly_negative_words[month])
    pranks = pos_ranks[month]
    nranks = neg_ranks[month]

    #raw score of each article: rank-weighted positive total minus rank-weighted negative total
    month_scores = [
        calculate_sentiment_with_ranking(text, pos_words, neg_words, pranks, nranks)
        for text in month_texts
    ]

    #apply scikit MinMaxScaler for scaling the article scores of this month into the interval [-1,1]
    #NOTE(review): the scaler is fitted per month, so when all articles of a month share the
    #same raw score (including a month with a single article) the scaled values appear to
    #collapse to the lower bound -1 - confirm this is the intended behaviour
    scaler = MinMaxScaler(feature_range=(-1, 1))
    score_column = np.array(month_scores).reshape(-1, 1)
    scaled_scores = scaler.fit_transform(score_column).flatten()

    #append the average scaled score of the month to the sentiment score list
    scaled_sentiment_scores.append(np.mean(scaled_scores))

#data frame with one row per cycle month, built in one step from the finished list
senti = pd.DataFrame({"month": cycles_ref, "scaled_scores": scaled_sentiment_scores})
#NOTE(review): the row index is written as an unnamed first column (to_csv default);
#kept as is because readers of this file are not visible here
senti.to_csv("./Sentiment_data/sentiment_scores.csv")

-0.8014862914862916

In [169]:

#Example application of calculate_sentiment_with_ranking function using month 201201
#(presumably the first month in the data)
example_month = 201201

#lexicon of the example month, built once instead of once per article
example_pos_words = set(monthly_positive_words[example_month])
example_neg_words = set(monthly_negative_words[example_month])

#raw (unscaled) score of every article of the example month
m = [
    calculate_sentiment_with_ranking(
        text,
        example_pos_words,
        example_neg_words,
        pos_ranks[example_month],
        neg_ranks[example_month],
    )
    for text in articles[example_month]
]

#average once, after all scores are collected; NaN when the month has no article,
#so that no stale `score` left over from another cell gets printed
score = sum(m) / len(m) if m else float('nan')
print(score)


0.33238795903776525
