In [2]:
! pip install langdetect


Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/981.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m614.4/981.5 kB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=66a207a925a9e0b4ac6ed8751ab3fc50c9b015c23e9fc8b56e1c8d649cba1fa9
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b54

In [3]:
import pandas as pd
import re
import string
import nltk
from gensim import corpora
from gensim.models import LsiModel
from gensim.models import LdaModel
from langdetect import detect
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [5]:
data = pd.read_csv('all_ECB_speeches.csv',sep = '|')
# Remove non-english speeches
data_drop_nan = data.dropna(subset=data.columns)
data.dropna(subset=data.columns, inplace=True)

# Function to detect language of a given text
def detect_language(text):
    try:
        lang = detect(text)
    except:
        lang = "Unknown"
    return lang

# Apply the language detection function to each text in your DataFrame
data['language'] = data['contents'].apply(detect_language)

# Filter out the texts that are not in English
data = data[data['language'] == 'en']
# Drop null values
data = data.dropna(subset=['speakers','contents'])
# Reset index
data = data.reset_index(drop=True)
# Remove introduction
for row in range (data.shape[0]):
  try:
    speech = re.split(
        " \d+ (January|February|March|April|May|June|July|August|September|October|November|December) \d{4} ",
        data.loc[row, "contents"])
    data.loc[row, "contents"] = speech[-1]
  except:
    pass

In [6]:
corpus = data['contents']

# remove stopwords, punctuation, and normalize the corpus
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

clean_corpus = [clean(doc).split() for doc in corpus]

In [7]:
# Creating document-term matrix
dictionary = corpora.Dictionary(clean_corpus)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_corpus]

In [23]:
# LSA model
lsa_model = LsiModel(doc_term_matrix, num_topics=4, id2word = dictionary)

# LSA model
print(lsa_model.print_topics(num_topics=4, num_words=3))

[(0, '0.324*"policy" + 0.281*"inflation" + 0.225*"monetary"'), (1, '-0.430*"inflation" + 0.363*"digital" + 0.285*"payment"'), (2, '-0.357*"digital" + 0.336*"bank" + -0.332*"inflation"'), (3, '0.337*"rate" + -0.310*"risk" + -0.310*"climate"')]


In [24]:
# LDA model
lda_model = LdaModel(doc_term_matrix, num_topics=4, id2word = dictionary)

# Results
print(lda_model.print_topics(num_topics=4, num_words=3))



[(0, '0.011*"policy" + 0.008*"euro" + 0.008*"monetary"'), (1, '0.010*"inflation" + 0.010*"policy" + 0.008*"euro"'), (2, '0.012*"policy" + 0.009*"inflation" + 0.009*"euro"'), (3, '0.011*"bank" + 0.011*"policy" + 0.009*"inflation"')]


In [26]:
topic_categories_lsa = {
    0: "monetary policy",
    1: "inflation",
    2: "activity",
    3: "economic crisis"
}
topic_categories_lda = {
    0: "inflation",
    1: "monetary policy",
    2: "activity",
    3: "economic crisis"
}

In [30]:
# Função para pré-processar o texto
def preprocess(text):
    return [lemma.lemmatize(word) for word in text.lower().split() if word not in stop and word not in exclude]

# Convertendo texto para vetor usando o dicionário
data['bow'] = data['contents'].apply(lambda x: dictionary.doc2bow(preprocess(x)))

# Aplicando o modelo LSA para obter os tópicos
data['lsa_topics'] = data['bow'].apply(lambda x: lsa_model[x])
data['lda_topics'] = data['bow'].apply(lambda x: lda_model[x])

# Extraindo o tópico dominante para cada documento
data['dominant_topic_lsa'] = data['lsa_topics'].apply(lambda x: max(x, key=lambda item: abs(item[1]))[0] if x else None)
data['dominant_topic_lda'] = data['lda_topics'].apply(lambda x: max(x, key=lambda item: abs(item[1]))[0] if x else None)

# Substituindo números dos tópicos por descrições textuais
data['dominant_topic_lsa'] = data['dominant_topic_lsa'].map(topic_categories_lsa)
data['dominant_topic_lda'] = data['dominant_topic_lda'].map(topic_categories_lda)

# Removendo coluna intermediária 'bow' se desejado
data.drop('bow', axis=1, inplace=True)

# Exibindo o DataFrame atualizado
print(data[['contents', 'dominant_topic_lda']][0:5])
print(data[['contents', 'dominant_topic_lsa']][0:5])

data.to_csv("speeches_topics.csv", index=False)

                                            contents dominant_topic_lda
0  As we approach the end of this legislative ter...    monetary policy
1  More than 30 years after its inception, Econom...    monetary policy
2    Today’s hearing is our last before the end o...           activity
3  on the digital euro (CON/2023/34)”.     See Ar...           activity
4  Over the past few years the euro area economy ...           activity
                                            contents dominant_topic_lsa
0  As we approach the end of this legislative ter...    monetary policy
1  More than 30 years after its inception, Econom...    monetary policy
2    Today’s hearing is our last before the end o...    monetary policy
3  on the digital euro (CON/2023/34)”.     See Ar...          inflation
4  Over the past few years the euro area economy ...    monetary policy
