In [73]:
# Importing libraries
import pandas as pd
import regex as re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import Dataset
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from langdetect import detect
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import spacy


In [74]:
# Load data
data = pd.read_csv('all_ECB_speeches.csv',sep = '|')

In [75]:
# Remove non-english speeches

# Function to detect language of a given text
def detect_language(text):
    try:
        lang = detect(text)
    except:
        lang = "Unknown"
    return lang

# Apply the language detection function to each text in your DataFrame
data['language'] = data['contents'].apply(detect_language)

# Filter out the texts that are not in English
data = data[data['language'] == 'en']



In [76]:
# Drop null values
data = data.dropna(subset=['speakers','contents'])
data = data.reset_index(drop=True)

# Remove introduction
for row in range (data.shape[0]):
  speech = re.split(
          " \d+ (January|February|March|April|May|June|July|August|September|October|November|December) \d{4} ",
          data.loc[row, "contents"])
  data.loc[row, "contents"] = speech[-1]

In [77]:
# Split into sentences
sentences_list = []
for row in range (data.shape[0]):
    text = data.loc[row, "contents"]
    sentences = re.split(r'[.!?]', text)
    sentences = [sentence.strip() for sentence in sentences if sentence.strip()]
    sentences_list.append(sentences)
data['sentences'] = sentences_list


In [78]:
# Remove stopwords and punctuation
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text, re.UNICODE)
    text = text.lower()
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

sentences_list = []
for row in range (data.shape[0]):
    sentences = data.loc[row, "sentences"]
    sentences = [preprocess_text(sentence) for sentence in sentences]
    sentences_list.append(sentences)

data['sentences'] = sentences_list


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lavin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lavin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Lavin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [79]:
# Vectorize the data
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['sentences'].apply(lambda x: ' '.join(x)))

In [80]:
def get_top_n_words(corpus, n=10):
        vec = CountVectorizer().fit(corpus)
        bag_of_words = vec.transform(corpus)
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
        words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
        return words_freq[:n]

def KMeans_clustering(X,data,num_clusters):
# Determine the number of clusters
    # Execute K-means clustering
    kmeans = KMeans(n_clusters=num_clusters)
    kmeans.fit(X)

    # Get cluster labels for each text
    cluster_labels = kmeans.labels_

    # Add cluster labels to the DataFrame
    data['cluster'] = cluster_labels

    # Identify the most frequent words in each cluster
    for i in range(num_clusters):
        print(f"Cluster {i}:")
        corpus = data[data['cluster'] == i]['sentences'].apply(lambda x: ' '.join(x))
        common_words = get_top_n_words(corpus, 10)
        print(common_words,'\n')
    return data

    


In [81]:
KMeans_clustering(X,data,3)



Cluster 0:
[('bank', 7706), ('euro', 6126), ('payment', 5546), ('market', 4863), ('european', 4853), ('financial', 4827), ('ha', 4711), ('central', 4399), ('also', 4196), ('area', 3916)] 

Cluster 1:
[('policy', 30596), ('euro', 26732), ('monetary', 24631), ('area', 21951), ('rate', 17605), ('inflation', 16960), ('price', 16513), ('ha', 16133), ('economic', 14708), ('market', 14349)] 

Cluster 2:
[('financial', 22244), ('market', 16655), ('bank', 15616), ('risk', 9120), ('policy', 8829), ('ha', 8539), ('euro', 8515), ('area', 8264), ('banking', 6554), ('crisis', 6127)] 



Unnamed: 0,date,speakers,title,subtitle,contents,language,sentences,cluster
0,2024-02-26,Christine Lagarde,European Parliament plenary debate on the ECB ...,"Speech by Christine Lagarde, President of the ...",As we approach the end of this legislative ter...,en,[approach end legislative term wish sincerely ...,1
1,2024-02-16,Isabel Schnabel,From laggard to leader? Closing the euro area’...,Inaugural lecture of the EMU Lab by Isabel Sch...,"More than 30 years after its inception, Econom...",en,[30 year inception economic monetary union wid...,1
2,2024-02-15,Christine Lagarde,Hearing of the Committee on Economic and Monet...,"Speech by Christine Lagarde, President of the ...",Today’s hearing is our last before the end o...,en,"[today hearing last end legislative term, let ...",1
3,2024-02-14,Piero Cipollone,Preserving people’s freedom to use a public me...,"Introductory statement by Piero Cipollone, Mem...",on the digital euro (CON/2023/34)”. See Ar...,en,"[digital euro con202334, see article 52 applic...",0
4,2024-02-14,Luis de Guindos,"Monetary policy, financial stability and mediu...","Speech by Luis de Guindos, Vice-President of t...",Over the past few years the euro area economy ...,en,[past year euro area economy ha experienced un...,1
...,...,...,...,...,...,...,...,...
2519,1997-05-13,Alexandre Lamfalussy,The European Central Bank: independent and acc...,Keynote speech delivered by Alexandre Lamfalus...,Against a background of both historical experi...,en,[background historical experience evolution po...,1
2520,1997-04-30,Alexandre Lamfalussy,The operation of monetary policy in stage thre...,"Address by Alexandre Lamfalussy, President of ...",Introduction I am delighted to be here today...,en,[introduction delighted today new york context...,1
2521,1997-04-22,Alexandre Lamfalussy,Convergence and the role of the European Centr...,"Remarks by Alexandre Lamfalussy, President of ...",These remarks will touch on the following topi...,en,[remark touch following topic first progress t...,1
2522,1997-03-10,Alexandre Lamfalussy,Securing the benefits of EMU,"Address by Alexandre Lamfalussy, President of ...",It is a great pleasure to be with you today he...,en,[great pleasure today london august institutio...,1


In [82]:
KMeans_clustering(X,data,4)



Cluster 0:
[('euro', 20752), ('area', 15718), ('policy', 12427), ('country', 9898), ('ha', 9801), ('market', 9762), ('monetary', 9652), ('economic', 9319), ('growth', 8157), ('european', 7208)] 

Cluster 1:
[('policy', 20695), ('monetary', 16865), ('inflation', 13694), ('rate', 11561), ('price', 11002), ('bank', 10322), ('euro', 8950), ('area', 8580), ('ha', 8228), ('market', 7028)] 

Cluster 2:
[('bank', 6891), ('payment', 5499), ('euro', 5171), ('market', 4471), ('financial', 4427), ('european', 4077), ('ha', 4025), ('central', 3999), ('also', 3670), ('area', 3313)] 

Cluster 3:
[('financial', 20117), ('market', 14606), ('bank', 14394), ('risk', 8537), ('ha', 7329), ('policy', 6911), ('area', 6520), ('euro', 6500), ('banking', 6098), ('also', 5368)] 



Unnamed: 0,date,speakers,title,subtitle,contents,language,sentences,cluster
0,2024-02-26,Christine Lagarde,European Parliament plenary debate on the ECB ...,"Speech by Christine Lagarde, President of the ...",As we approach the end of this legislative ter...,en,[approach end legislative term wish sincerely ...,1
1,2024-02-16,Isabel Schnabel,From laggard to leader? Closing the euro area’...,Inaugural lecture of the EMU Lab by Isabel Sch...,"More than 30 years after its inception, Econom...",en,[30 year inception economic monetary union wid...,0
2,2024-02-15,Christine Lagarde,Hearing of the Committee on Economic and Monet...,"Speech by Christine Lagarde, President of the ...",Today’s hearing is our last before the end o...,en,"[today hearing last end legislative term, let ...",1
3,2024-02-14,Piero Cipollone,Preserving people’s freedom to use a public me...,"Introductory statement by Piero Cipollone, Mem...",on the digital euro (CON/2023/34)”. See Ar...,en,"[digital euro con202334, see article 52 applic...",2
4,2024-02-14,Luis de Guindos,"Monetary policy, financial stability and mediu...","Speech by Luis de Guindos, Vice-President of t...",Over the past few years the euro area economy ...,en,[past year euro area economy ha experienced un...,1
...,...,...,...,...,...,...,...,...
2519,1997-05-13,Alexandre Lamfalussy,The European Central Bank: independent and acc...,Keynote speech delivered by Alexandre Lamfalus...,Against a background of both historical experi...,en,[background historical experience evolution po...,1
2520,1997-04-30,Alexandre Lamfalussy,The operation of monetary policy in stage thre...,"Address by Alexandre Lamfalussy, President of ...",Introduction I am delighted to be here today...,en,[introduction delighted today new york context...,1
2521,1997-04-22,Alexandre Lamfalussy,Convergence and the role of the European Centr...,"Remarks by Alexandre Lamfalussy, President of ...",These remarks will touch on the following topi...,en,[remark touch following topic first progress t...,0
2522,1997-03-10,Alexandre Lamfalussy,Securing the benefits of EMU,"Address by Alexandre Lamfalussy, President of ...",It is a great pleasure to be with you today he...,en,[great pleasure today london august institutio...,0


In [83]:
KMeans_clustering(X,data,5)



Cluster 0:
[('euro', 20337), ('area', 15390), ('policy', 13173), ('monetary', 10490), ('ha', 9628), ('country', 9478), ('economic', 9391), ('market', 9361), ('growth', 7998), ('rate', 7190)] 

Cluster 1:
[('payment', 4920), ('bank', 2579), ('euro', 1829), ('market', 1708), ('sepa', 1607), ('service', 1364), ('system', 1318), ('central', 1307), ('european', 1209), ('digital', 1193)] 

Cluster 2:
[('financial', 20758), ('market', 15185), ('bank', 14788), ('risk', 8713), ('ha', 7836), ('policy', 7643), ('euro', 7154), ('area', 7124), ('banking', 6238), ('also', 5636)] 

Cluster 3:
[('policy', 18890), ('monetary', 15274), ('inflation', 13325), ('rate', 10947), ('price', 10023), ('bank', 9372), ('euro', 8159), ('area', 7929), ('ha', 7548), ('market', 6385)] 

Cluster 4:
[('bank', 4850), ('euro', 3894), ('financial', 3680), ('ha', 3259), ('market', 3228), ('european', 3209), ('also', 3055), ('policy', 2976), ('central', 2864), ('area', 2812)] 



Unnamed: 0,date,speakers,title,subtitle,contents,language,sentences,cluster
0,2024-02-26,Christine Lagarde,European Parliament plenary debate on the ECB ...,"Speech by Christine Lagarde, President of the ...",As we approach the end of this legislative ter...,en,[approach end legislative term wish sincerely ...,3
1,2024-02-16,Isabel Schnabel,From laggard to leader? Closing the euro area’...,Inaugural lecture of the EMU Lab by Isabel Sch...,"More than 30 years after its inception, Econom...",en,[30 year inception economic monetary union wid...,0
2,2024-02-15,Christine Lagarde,Hearing of the Committee on Economic and Monet...,"Speech by Christine Lagarde, President of the ...",Today’s hearing is our last before the end o...,en,"[today hearing last end legislative term, let ...",3
3,2024-02-14,Piero Cipollone,Preserving people’s freedom to use a public me...,"Introductory statement by Piero Cipollone, Mem...",on the digital euro (CON/2023/34)”. See Ar...,en,"[digital euro con202334, see article 52 applic...",4
4,2024-02-14,Luis de Guindos,"Monetary policy, financial stability and mediu...","Speech by Luis de Guindos, Vice-President of t...",Over the past few years the euro area economy ...,en,[past year euro area economy ha experienced un...,3
...,...,...,...,...,...,...,...,...
2519,1997-05-13,Alexandre Lamfalussy,The European Central Bank: independent and acc...,Keynote speech delivered by Alexandre Lamfalus...,Against a background of both historical experi...,en,[background historical experience evolution po...,0
2520,1997-04-30,Alexandre Lamfalussy,The operation of monetary policy in stage thre...,"Address by Alexandre Lamfalussy, President of ...",Introduction I am delighted to be here today...,en,[introduction delighted today new york context...,3
2521,1997-04-22,Alexandre Lamfalussy,Convergence and the role of the European Centr...,"Remarks by Alexandre Lamfalussy, President of ...",These remarks will touch on the following topi...,en,[remark touch following topic first progress t...,0
2522,1997-03-10,Alexandre Lamfalussy,Securing the benefits of EMU,"Address by Alexandre Lamfalussy, President of ...",It is a great pleasure to be with you today he...,en,[great pleasure today london august institutio...,0


In [84]:
# Classifier using diccionarys
# Dictionary of words related to interest rates
interest_rates_dict = {
    'interest': 1,
    'rate': 1,
    'central bank': 1,
    'monetary policy': 1,
    'European Central Bank': 1,
    'policy rate': 1,
    'benchmark rate': 1,
    'ECB': 1,
    'ECB President': 1,
    'ECB Governing Council': 1,
    'yield curve': 1,
    'nominal rate': 1,
    'real rate': 1,
    'prime rate': 1,
    'discount rate': 1,
    'base rate': 1
}

# Dictionary of words related to inflation
inflation_dict = {
    'inflation': 1,
    'price level': 1,
    'HICP': 1,  # Harmonized Index of Consumer Prices
    'consumer price index': 1,
    'deflation': 1,
    'hyperinflation': 1,
    'core inflation': 1,
    'inflation target': 1,
    'inflation rate': 1,
    'inflationary pressures': 1,
    'cost-push inflation': 1,
    'demand-pull inflation': 1
}

# Dictionary of words related to economic activity (GDP)
economic_activity_dict = {
    'GDP': 1,
    'economic growth': 1,
    'output': 1,
    'economic indicators': 1,
    'business cycle': 1,
    'recession': 1,
    'expansion': 1,
    'unemployment rate': 1,
    'labor market': 1,
    'productivity': 1,
    'investment': 1,
    'consumption': 1,
    'exports': 1,
    'imports': 1,
    'trade balance': 1,
    'eurozone': 1,
    'European Union': 1,
    'euro area': 1,
    'Eurostat': 1,
    'eurozone economy': 1,
    'EU economic policy': 1,
    'EU fiscal policy': 1
}

In [85]:
# Cluster the texts based on the dictionaries
def classify_text(text):
    scores = [0, 0, 0]
    for sentence in text:
        interest_rates_score = sum([interest_rates_dict.get(word, 0) for word in sentence.split()])
        inflation_score = sum([inflation_dict.get(word, 0) for word in sentence.split()])
        economic_activity_score = sum([economic_activity_dict.get(word, 0) for word in sentence.split()])
        scores[0] += interest_rates_score
        scores[1] += inflation_score
        scores[2] += economic_activity_score
    if sum(scores) == 0:
        return scores, -1
    return scores, np.argmax(scores)

def Dictionary_clustering(data):
    data['scores'] = data['sentences'].apply(lambda x: classify_text(x)[0])
    data['cluster'] = data['sentences'].apply(lambda x: classify_text(x)[1])

    for i in range(3):
        print(f"Cluster {i}:")
        corpus = data[data['cluster'] == i]['sentences'].apply(lambda x: ' '.join(x))
        common_words = get_top_n_words(corpus, 10)
        print(common_words,'\n')
    return data


In [86]:
Dictionary_clustering(data)

Cluster 0:
[('policy', 27508), ('euro', 26830), ('bank', 24109), ('financial', 23747), ('market', 23598), ('monetary', 21580), ('area', 20798), ('ha', 18110), ('rate', 17839), ('central', 12778)] 

Cluster 1:
[('inflation', 10737), ('policy', 9994), ('monetary', 8559), ('price', 7162), ('euro', 6832), ('area', 6262), ('ha', 5111), ('bank', 4862), ('economic', 4463), ('financial', 3893)] 

Cluster 2:
[('financial', 8855), ('market', 7314), ('euro', 6580), ('area', 6422), ('bank', 5652), ('ha', 5259), ('policy', 5123), ('risk', 4503), ('growth', 4377), ('european', 4005)] 



Unnamed: 0,date,speakers,title,subtitle,contents,language,sentences,cluster,scores
0,2024-02-26,Christine Lagarde,European Parliament plenary debate on the ECB ...,"Speech by Christine Lagarde, President of the ...",As we approach the end of this legislative ter...,en,[approach end legislative term wish sincerely ...,1,"[5, 19, 11]"
1,2024-02-16,Isabel Schnabel,From laggard to leader? Closing the euro area’...,Inaugural lecture of the EMU Lab by Isabel Sch...,"More than 30 years after its inception, Econom...",en,[30 year inception economic monetary union wid...,2,"[22, 8, 100]"
2,2024-02-15,Christine Lagarde,Hearing of the Committee on Economic and Monet...,"Speech by Christine Lagarde, President of the ...",Today’s hearing is our last before the end o...,en,"[today hearing last end legislative term, let ...",1,"[5, 16, 0]"
3,2024-02-14,Piero Cipollone,Preserving people’s freedom to use a public me...,"Introductory statement by Piero Cipollone, Mem...",on the digital euro (CON/2023/34)”. See Ar...,en,"[digital euro con202334, see article 52 applic...",-1,"[0, 0, 0]"
4,2024-02-14,Luis de Guindos,"Monetary policy, financial stability and mediu...","Speech by Luis de Guindos, Vice-President of t...",Over the past few years the euro area economy ...,en,[past year euro area economy ha experienced un...,1,"[16, 20, 17]"
...,...,...,...,...,...,...,...,...,...
2519,1997-05-13,Alexandre Lamfalussy,The European Central Bank: independent and acc...,Keynote speech delivered by Alexandre Lamfalus...,Against a background of both historical experi...,en,[background historical experience evolution po...,0,"[12, 9, 6]"
2520,1997-04-30,Alexandre Lamfalussy,The operation of monetary policy in stage thre...,"Address by Alexandre Lamfalussy, President of ...",Introduction I am delighted to be here today...,en,[introduction delighted today new york context...,0,"[26, 10, 2]"
2521,1997-04-22,Alexandre Lamfalussy,Convergence and the role of the European Centr...,"Remarks by Alexandre Lamfalussy, President of ...",These remarks will touch on the following topi...,en,[remark touch following topic first progress t...,0,"[17, 6, 0]"
2522,1997-03-10,Alexandre Lamfalussy,Securing the benefits of EMU,"Address by Alexandre Lamfalussy, President of ...",It is a great pleasure to be with you today he...,en,[great pleasure today london august institutio...,0,"[26, 14, 6]"
