In [1]:
import pandas as pd
import numpy as np
import os
import re
import contractions
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import opinion_lexicon

In [2]:
path_name = "L:/year2_sem3/Pembroke/supervision/Russia-Ukraine war"
NEWS_FILE = os.listdir(path_name)
NEWS_FILE = [file for file in NEWS_FILE if file != 'stock price']
NEWS_FILE

['Baltic states appeal for Nato help after Russia’s assault on Ukraine_2022-02-24.txt',
 'EU holds emergency summit after Russia attacks Ukraine_2022-02-24.txt',
 'Putin opens a dark new chapter in Europe_2022-02-24.txt',
 'Putin’s distortion of Ukraine’s history lays ground for further operations_2022-02-22.txt',
 'Russia’s aggression tests limits of Ukraine’s restraint_2022-02-22.txt',
 'Russia’s invasion of Ukraine rattles markets_2022-02-24.txt',
 'UK imposes package of sanctions on Russia in response to Ukraine invasion_2022-02-24.txt',
 'Ukrainefood prices war is a punch in the breadbasket for poor importers_2022-02-24.txt',
 'Vladimir Putin abandons hopes of Ukraine deal and shifts to land-grab strategy_2022-02-24.txt',
 'Vladimir Putin orders troops into eastern Ukraine after recognising breakaway republics_2022-02-22.txt',
 "Volodymyr Zelensky’s appeal to Russians ‘The people of Ukraine want peace'_2022-02-24.txt",
 'Zelensky steps up in crisis to become Ukraine’s wartime comm

# Text Normalization

In [3]:
stop_words = set(stopwords.words('english'))
stop_words.discard('not')

In [4]:
def normalize(text, default_replace = ""):
    #handle URL
    text = re.sub("(http|https):\/\/\S+", default_replace, text)
    
    #remove upper capitalization
    text = text.lower()
    
    #word contraction
    for k,v in contractions.contractions_dict.items():
        text = text.replace(k,v)
     
    #tokenization
    token_list = word_tokenize(text)
    
    token_list = [token for token in token_list if token not in string.punctuation]
    token_list = [token for token in token_list if token.isalpha()]
    token_list = [token for token in token_list if not token in stop_words]

    return token_list

# Stemming

In [5]:
stemmer = SnowballStemmer("english")

In [6]:
def stem_tokens(tokens, stemmer):
    token_list = []
    for token in tokens:
        token_list.append(stemmer.stem(token))
    return token_list

In [7]:
def process_text(file):
    with open(f'{path_name}/{file}', encoding="utf8") as f:
        text = f.read()
    
    tokens = normalize(text)
    stem = stem_tokens(tokens, stemmer)

    return stem   

# Sentiment analysis 

In [8]:
positive_lexiceon = set(opinion_lexicon.positive())
negative_lexiceon = set(opinion_lexicon.negative())

In [9]:
sentiment_scores = {}
for i in range(len(NEWS_FILE)):
    date = os.path.splitext(NEWS_FILE[i])[0].split('_')[-1].split('-')[-1]
    if date == "24":
        cleaned_words = process_text(NEWS_FILE[i])
        positive_sentiment = 0
        negative_sentiment = 0
        for word in cleaned_words:
            if word in positive_lexiceon:
                positive_sentiment += 1
            elif word in negative_lexiceon:
                negative_sentiment += 1
        
        phi_pos = positive_sentiment / len(cleaned_words)
        phi_neg = negative_sentiment / len(cleaned_words)
        
        phi_npt = (phi_pos - phi_neg) / (phi_pos + phi_neg)
        
        sentiment_scores[f"news{i+1}"] = [phi_pos, phi_neg, phi_npt]

sentiment_scores

{'news1': [0.019867549668874173, 0.033112582781456956, -0.25],
 'news2': [0.03436807095343681, 0.041019955654102, -0.08823529411764706],
 'news3': [0.024029574861367836, 0.07024029574861368, -0.4901960784313726],
 'news6': [0.03048780487804878, 0.03048780487804878, 0.0],
 'news7': [0.0182648401826484, 0.0273972602739726, -0.2],
 'news8': [0.014634146341463415, 0.03902439024390244, -0.4545454545454546],
 'news9': [0.026845637583892617, 0.05145413870246085, -0.3142857142857143],
 'news11': [0.02181818181818182, 0.03636363636363636, -0.24999999999999994],
 'news12': [0.04225352112676056, 0.056338028169014086, -0.14285714285714288],
 'news13': [0.02097902097902098, 0.06468531468531469, -0.5102040816326531]}

In [10]:
sentiment = pd.DataFrame(sentiment_scores, index = ['positive','negative','npt_positive'] )
sentiment

Unnamed: 0,news1,news2,news3,news6,news7,news8,news9,news11,news12,news13
positive,0.019868,0.034368,0.02403,0.030488,0.018265,0.014634,0.026846,0.021818,0.042254,0.020979
negative,0.033113,0.04102,0.07024,0.030488,0.027397,0.039024,0.051454,0.036364,0.056338,0.064685
npt_positive,-0.25,-0.088235,-0.490196,0.0,-0.2,-0.454545,-0.314286,-0.25,-0.142857,-0.510204
