# LDA for UNGDC

In [7]:
import numpy as np
import pandas as pd
import os
import time
import spacy 
from spacy import displacy

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel, LsiModel, HdpModel

In [24]:
# Read the input DataFrame
df = pd.read_csv('../../../data/processed/cleaned.csv')

# Create the output folder if it doesn't exist
output_folder = "../../../output/LDA_python"
os.makedirs(output_folder, exist_ok=True)

start_time = time.time()

In [10]:
timestamps = df.year.to_list()
texts = df.text.to_list()

In [8]:
nlp = spacy.load('en_core_web_lg')

In [9]:
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['will', 'must'])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jihyeonbae/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download('wordnet')

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize text into words
    words = gensim.utils.simple_preprocess(text, deacc=True)
    
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize words
    words = [lemmatizer.lemmatize(word) for word in words]
    
    return words

# Apply preprocessing to each sentence
data_words = [preprocess_text(sentence) for sentence in data]

# Print the first 30 words of the preprocessed text
print(data_words[:1][0][:30])


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jihyeonbae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['consider', 'great', 'honour', 'privilege', 'share', 'opportunity', 'united', 'nation', 'momentous', 'occasion', 'world', 'organization', 'embodies', 'hope', 'aspiration', 'people', 'world', 'peace', 'prosperity', 'prospect', 'better', 'fruitful', 'life', 'task', 'reaffirm', 'help', 'realize', 'aim', 'purpose', 'expressed']


In [15]:
import gensim.corpora as corpora
# Create Dictionary
id2word = corpora.Dictionary(data_words)
# Create Corpus
texts = data_words
# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]


In [27]:
from gensim.models.phrases import Phrases, Phraser
from gensim import corpora, models

# Define TF-IDF model
tfidf_model = models.TfidfModel(dictionary=id2word)

window_size = 5  # Define the size of the time window (e.g., 5 years)
step_size = 4    # Define the step size for moving the window (e.g., 1 year)

for start_year in range(min(timestamps), max(timestamps) - window_size + 1, step_size):
    end_year = start_year + window_size
    
    # Filter texts within the current time window
    window_texts = df[df['year'].between(start_year, end_year)]['text'].tolist()
    
    # Tokenize the texts into words and generate bigrams
    window_data_words = list(sent_to_words(window_texts))
    bigram = Phrases(window_data_words, min_count=5, threshold=100)
    bigram_phraser = Phraser(bigram)
    window_data_words_bigrams = [bigram_phraser[doc] for doc in window_data_words]
    
    # Remove stopwords
    window_data_words_filtered = remove_stopwords(window_data_words_bigrams)
    
    # Generate TF-IDF weighted representations
    window_corpus = [id2word.doc2bow(text) for text in window_data_words_filtered]
    window_tfidf_corpus = tfidf_model[window_corpus]
    
    # Train LDA model using TF-IDF weighted representations
    window_lda_model = gensim.models.LdaMulticore(corpus=window_tfidf_corpus,
                                                   id2word=id2word,
                                                   num_topics=num_topics)
    
    # Print or store the topics for the current time window
    print(f"Topics for {start_year}-{end_year} time window:")
    pprint(window_lda_model.print_topics())
    print("\n")


Topics for 1946-1951 time window:
[(0,
  '0.001*"would" + 0.001*"could" + 0.000*"cannot" + 0.000*"veto" + '
  '0.000*"greece" + 0.000*"powers" + 0.000*"korea" + 0.000*"ussr" + '
  '0.000*"chinese" + 0.000*"delegation"'),
 (1,
  '0.000*"would" + 0.000*"could" + 0.000*"cannot" + 0.000*"siam" + '
  '0.000*"veto" + 0.000*"polish" + 0.000*"powers" + 0.000*"ass" + '
  '0.000*"arabs" + 0.000*"lebanese"'),
 (2,
  '0.001*"would" + 0.001*"could" + 0.001*"cannot" + 0.000*"veto" + '
  '0.000*"india" + 0.000*"powers" + 0.000*"aggression" + 0.000*"might" + '
  '0.000*"jews" + 0.000*"charter"'),
 (3,
  '0.001*"would" + 0.000*"could" + 0.000*"cannot" + 0.000*"netherlands" + '
  '0.000*"korea" + 0.000*"ussr" + 0.000*"veto" + 0.000*"delegation" + '
  '0.000*"speech" + 0.000*"aggression"'),
 (4,
  '0.001*"would" + 0.001*"could" + 0.000*"cannot" + 0.000*"greece" + '
  '0.000*"greek" + 0.000*"veto" + 0.000*"ussr" + 0.000*"aggression" + '
  '0.000*"czechoslovakia" + 0.000*"american"'),
 (5,
  '0.002*"would"

Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/jihyeonbae/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/jihyeonbae/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# Function to generate the heatmap for a pair of models

generate_heatmap <- function(model1, model2, correlation_threshold = 0.9) {
  phi1 <- model1$phi
  phi2 <- model2$phi

  phi1_df <- as.data.frame(phi1)
  phi2_df <- as.data.frame(phi2)

  order_phi1 <- order(colMeans(phi1_df), decreasing = TRUE)
  order_phi2 <- order(colMeans(phi2_df), decreasing = TRUE)

  phi1_df <- phi1_df[, order_phi1]
  phi2_df <- phi2_df[, order_phi2]

  all_terms <- union(colnames(phi1_df), colnames(phi2_df))

  phi1_union <- dplyr::bind_cols(phi1_df, setNames(data.frame(matrix(0, nrow = nrow(phi1_df), ncol = length(setdiff(all_terms, colnames(phi1_df))))), setdiff(all_terms, colnames(phi1_df))))
  
  phi2_union <- dplyr::bind_cols(phi2_df, setNames(data.frame(matrix(0, nrow = nrow(phi2_df), ncol = length(setdiff(all_terms, colnames(phi2_df))))), setdiff(all_terms, colnames(phi2_df))))

  phi1_union <- phi1_union[, order(colnames(phi1_union))]
  phi2_union <- phi2_union[, order(colnames(phi2_union))]

  dim(phi1_union)
  dim(phi2_union)

  cor_matrix <- cor(t(phi1_union), t(phi2_union))


  # Heatmap for correlation matrix
  heatmap.2(cor_matrix,
            Rowv = FALSE, Colv = FALSE,
            col = heat.colors(16),
            trace = "none", # no row/column names
            key = TRUE, keysize = 1.5,
            density.info = "none", margins = c(5, 5),
            cexCol = 1, cexRow = 1, # adjust text size
            notecol = "black", notecex = 0.7,
            xlab = "Time 2",
            ylab = "Time 1",
            symkey = FALSE)

  return(list(phi1_union = phi1_union, phi2_union = phi2_union, cor_matrix = cor_matrix))
}
```

## Rows with high correlation

```{r}
# Function to print the ordered rows for each topic with high correlation
print_ordered_rows <- function(phi1_union, phi2_union, cor_matrix, high_corr_indices, correlation_threshold = 0.9) {
  # Find indices where correlation is higher than the threshold
  high_corr_indices <- which(cor_matrix > correlation_threshold & !is.na(cor_matrix), arr.ind = TRUE)

  # Create an empty list to store results
  result_list <- list()

  # Print the ordered rows for each topic with high correlation
  for (i in seq_len(nrow(high_corr_indices))) {
    model1_topic <- high_corr_indices[i, 1]
    model2_topic <- high_corr_indices[i, 2]

    # Print the ordered rows for each model's topic
    cat(paste("Model 1 - Topic", model1_topic), "\n")
    phi1_result_row <- orderBasedOnRow(phi1_union, model1_topic)

    cat(paste("Model 2 - Topic", model2_topic), "\n")
    phi2_result_row <- orderBasedOnRow(phi2_union, model2_topic)

    # Convert result rows to long format
    phi1_result_long <- phi1_result_row %>%
      tidyr::pivot_longer(everything(), names_to = "term_1", values_to = "probability_1")

    phi2_result_long <- phi2_result_row %>%
      tidyr::pivot_longer(everything(), names_to = "term_2", values_to = "probability_2")

    # Combine phi1 and phi2 results
    pair <- knitr::kable(bind_cols(phi1_result_long, phi2_result_long))

    # Append the result to the list
    result_list[[i]] <- pair
  }

  # Combine all results into a single dataframe
  final_result <- do.call(bind_rows, result_list)

  return(final_result)
}
```

## Execute functions over pairs

```{r}
# Loop through pairs of models to generate heatmaps and print results
for (i in 1:(length(lda_models) - 1)) {
  model1 <- lda_models[[i]]
  model2 <- lda_models[[i + 1]]

  result <- generate_heatmap(model1, model2, correlation_threshold = 0.6)
  
  phi1_union <- result$phi1_union
  phi2_union <- result$phi2_union
  cor_matrix <- result$cor_matrix

  # Print ordered rows only if there are high correlations
  if (any(cor_matrix > 0.6, na.rm = TRUE)) {
    phi1_result <- phi1_union[, order(colMeans(phi1_union), decreasing = TRUE)]
    phi2_result <- phi2_union[, order(colMeans(phi2_union), decreasing = TRUE)]

    # Call the modified function and pass high_corr_indices as an argument
    final_result <- print_ordered_rows(phi1_result, phi2_result, cor_matrix, high_corr_indices)
    print(final_result)
  }
}

