In [None]:
library(quanteda)
library(quanteda.textstats)
library(writexl)
library(seededlda)
library(dplyr)
library(tidyr)
library(pheatmap)
library(tibble)

In [None]:
# Load the turn-level conversation data and list the column names.
convos <- read.csv('data/all-conversations.csv')

# Focus our analysis on only the turns purely in the MAIN section - note that there are a small number
# of turns that overlap between main and pre/post, as different conversation groups were overlapping
# at the time of data collection.
# which() is used so that rows with a missing section are dropped instead of becoming all-NA rows.
convos <- convos[which(convos$section == 'MAIN'), ]

# Generate a compact conversation ID from the file_id for compact labels on plots.
# regmatches() silently drops elements that do not match, which would misalign the new column
# (or fail the assignment outright), so matches are filled into an NA vector of the right length.
file_ids <- as.character(convos$X_file_id)
id_match <- regexpr("AmAus[0-9]+", file_ids)
has_id <- !is.na(id_match) & id_match != -1
conversation_id <- rep(NA_character_, nrow(convos))
conversation_id[has_id] <- regmatches(file_ids, id_match)
convos$conversation_id <- conversation_id

if (any(!has_id)) {
    warning(sum(!has_id), " turn(s) have no AmAus conversation id in X_file_id", call. = FALSE)
}

# Generate a doc_id for quanteda - this would ideally be a primary key instead.
# It is simply the row sequence of the filtered data.
convos$doc_id <- seq_len(nrow(convos))

nrow(convos)
names(convos)

In [None]:
# Create a corpus from the dataframe we loaded: the text column is the document text and
# the doc_id column provides the document names.
convo_corpus <- corpus(convos, text_field="text", docid_field = "doc_id")

# Tokenise the full dataset - use standard tokeniser, but remove punctuation and lowercase all tokens
convo_full_tokens <- tokens_tolower(
    tokens(convo_corpus, remove_punct=TRUE)
)

# Report the size of the unfiltered data. ntoken() works directly on a tokens object,
# so there is no need to build a throwaway dfm just to count tokens.
print(paste("Total tokens", sum(ntoken(convo_full_tokens))))
print(paste("Total types", length(types(convo_full_tokens))))

# Spoken-language stopwords.
# Derived from the top 100 most frequent tokens left after removing the standard English
# stop list. Some common nouns like "Brisbane", "Australia", and "friends" were deliberately
# kept out of this list so that they are preserved in the analysis.
frequent_tokens <- c(
    "yeah", "like", "hh", "um", "know", "just", "oh", "mm", "really", "haha",
    "think", "well", "uh", "cause", "go", "ok", "get", "people", "one", "right",
    "hahaha", "tsk", "ah", "got", "good", "hmm", "kind", "mean", "can", "kinda",
    "lot", "actually", "okay", "pretty", "different", "cool", "little", "much", "thing", "now",
    "year", "stuff", "something", "yep", "went", "back", "years", "two", "bit", "sort",
    "time", "see", "even", "say", "going", "nice", "things", "come", "gonna", "wanna",
    "us", "interesting", "guess", "work", "states", "still", "probably", "big", "new", "yes",
    "way", "first", "never", "three", "sure", "hhh", "around", "always", "everything", "i-",
    "said", "wow", "came", "ha", "mhm", "alright", "hahahaha", "day", "take", "want",
    "used", "long", "y-"
)

# Marks that come from the transcription rather than from what was said.
transcription_marks <- c("°", "sniffs", "hh", "hhh")

# Everything we remove: our two custom lists plus the standard English stop list.
custom_stopwords <- c(frequent_tokens, transcription_marks, stopwords("english"))

# Filter the already tokenised corpus down to the vocabulary we want to analyse.
# Note that the standard English stop list assumes written not spoken material, which is why
# it is combined with our own list of frequent spoken tokens in custom_stopwords.
# Two removal passes are applied, innermost first:
#   1. our custom stopwords (min_nchar=2 is passed through to the token selection),
#   2. partial tokens, matched by a glob on a trailing hyphen.
convo_tokens <- tokens_remove(
    tokens_remove(convo_full_tokens, custom_stopwords, min_nchar=2),
    "*-",
    valuetype="glob"
)

# Build the document-feature matrix (also known as a document-term or term-document matrix);
# dfm is the quanteda nomenclature so it is used throughout.
# Because the 'documents' are single turns by a speaker, this is really a turn-token matrix.
# For computational reasons, tokens that occur only once are trimmed to reduce the vocabulary size.
convo_turn_dfm <- convo_tokens |>
    dfm() |>
    dfm_trim(min_termfreq = 2)


# Granularity

We can count things at different levels of granularity to start to address topic. A word that is used once in every conversation is different from a word used 30 times in a single conversation.

Some examples:

- we can count tokens, regardless of where they occur
- we can count turns including a token
- we can count conversations including a token
- we can count by speaker in a conversation



In [None]:
# Let's start with tokens and turns:
frequencies <- textstat_frequency(convo_turn_dfm)
names(frequencies)

# This is the first cell that writes output, so make sure the output folder exists;
# write_xlsx() does not create missing directories.
dir.create("results", showWarnings = FALSE, recursive = TRUE)

# the feature is the token, the frequency is the token count, and docfreq is the turn count
write_xlsx(frequencies, "results/token_turn_counts.xlsx")

In [None]:
# We'll group the dfm together by the file (conversation) to count tokens and conversations.
# The grouping variable is read from the dfm being grouped, not from the original corpus,
# so it always lines up with the dfm's own documents.
convo_dfm <- dfm_group(convo_turn_dfm, groups=docvars(convo_turn_dfm, 'conversation_id'))

# Here the feature is the token, frequency is the token count and docfreq is the
# number of conversations containing the token.
conversation_frequencies <- textstat_frequency(convo_dfm)

write_xlsx(conversation_frequencies, "results/token_conversation_counts.xlsx")

# Topic modelling and what's in a document

Topic modelling algorithms like LDA work with "documents" containing "bags of words" (counts of how often each word occurs, regardless of their order). For analysing
conversations this means we have to make an analytical choice about what is the unit of a document.

We'll explore two extreme ends of this granularity question:

1. Each turn in the conversation is a document.
2. Each conversation is a document, including all the turns concatenated together.

Treating each turn as a document should hopefully yield focused and precise topics, but turns are generally short and LDA-like algorithms are thought to perform better with longer documents, so the results may not be great. Treating each conversation as a document might give the algorithm more traction, as there is more opportunity to observe word co-occurrence, at the cost of potentially mixing things too much: we would expect the topic of conversation to shift over time within a single conversation, so it may not be appropriate to have the first and last turns of a conversation mixed together.


In [None]:
# Before we do the actual calculation, let's make a helper function to visualise the results a bit
# more nicely and with more information than the default functionality of the textmodels package.
create_topic_table <- function(lda_model, n_terms=20) {
    # Take an LDA model and create a more nicely formatted table.
    #
    # Arguments:
    #   lda_model: a fitted model exposing $theta (document x topic weights) and
    #              $phi (topic x term weights), with topic names as dimnames.
    #   n_terms:   number of top weighted terms to list for each topic
    #              (terms tied at the cut-off are all kept).
    #
    # Returns one row per topic, ordered from most to least prevalent, with columns:
    #   topic, prevalence, topic_features
    # where topic_features looks like "word (1.2), other (0.9), ...".
    # All of the prevalence measures are converted to percentages and rounded for display.

    # Topic prevalence is the mean weight of the topic across all documents.
    topic_prevalence <- colMeans(lda_model$theta) |>
        as.data.frame() |>
        rownames_to_column(var="topic")

    colnames(topic_prevalence) <- c("topic", "prevalence")

    # Now extract the top words and weights for each topic->term probability.
    # Unfortunately this duplicates the terms function, which doesn't return
    # the weights.
    topic_terms <- lda_model$phi |>
        as.data.frame() |>
        # Need to convert the matrix rownames on the dataframe to a column for tidyverse.
        # A temporary name is used and renamed after the pivot, so that a vocabulary item
        # called "topic" cannot clash with this column.
        rownames_to_column(var="topic_id") |>
        # Pivot so we have a long table to make grouping easy
        pivot_longer(!topic_id, names_to="feature", values_to="weight") |>
        rename(topic=topic_id) |>
        # Create the form <token> (<weight>) for representing the table
        mutate(weight_rep = paste0(feature, " (", round(weight * 100, 1), ")")) |>
        # Group by topic and keep the top weighted terms; slice_max() also returns
        # them sorted by descending weight within each topic.
        group_by(topic) |>
        slice_max(weight, n=n_terms) |>
        # Finally collapse all the features into a single row per topic.
        # collapse (not sep) is what joins the elements of a single vector.
        summarise(topic_features = paste(weight_rep, collapse=", "))

    topic_terms |>
        inner_join(topic_prevalence, by=join_by(topic)) |>
        arrange(desc(prevalence)) |>
        select(topic, prevalence, topic_features) |>
        mutate(prevalence = round(prevalence * 100, 2))
}

# Set core topic model and display parameters

The following are the core parameters for this analysis: the fixed number of topics to learn (sometimes labelled k), which has to be set in most topic model algorithms, and the number of top terms from each topic that we will show in our output tables.

In [None]:
# Number of topics to learn; passed as k to textmodel_lda() for both models below.
number_of_topics <- 20
# Number of top terms per topic shown in the exported topic tables.
print_top_terms <- 10

In [None]:
# Topics by turn: every single turn is treated as one document.
# The seed is fixed immediately before fitting.
set.seed(192038102)
lda_turns <- textmodel_lda(convo_turn_dfm, k=number_of_topics)

# Two views of the same model: the plain top-terms matrix from terms(), and our
# formatted table that also carries term weights and topic prevalence.
turn_results <- as.data.frame(terms(lda_turns, n=print_top_terms))
turn_lda_table <- create_topic_table(lda_turns, n_terms = print_top_terms)

write_xlsx(turn_results, "results/turn_topic_top_words.xlsx")
write_xlsx(turn_lda_table, "results/turn_topic_table.xlsx")

# Display the top terms in the notebook
turn_results

In [None]:
# Topics on conversations as documents: all turns of a conversation are pooled into one document.
# The seed is fixed immediately before fitting.
set.seed(50193853)
lda_convo <- textmodel_lda(convo_dfm, k=number_of_topics)

# Two views of the same model: the plain top-terms matrix from terms(), and our
# formatted table that also carries term weights and topic prevalence.
convo_results <- as.data.frame(terms(lda_convo, n=print_top_terms))
convo_lda_table <- create_topic_table(lda_convo, n_terms = print_top_terms)

write_xlsx(convo_results, "results/convo_topic_top_words.xlsx")
write_xlsx(convo_lda_table, "results/convo_topic_table.xlsx")

# Display the top terms in the notebook
convo_results

In [None]:
# Heatmap of topics x conversations

# theta holds one row per document (here: conversation) and one column per topic;
# transpose it so that topics are the rows of the heatmap and conversations the columns.
topic_convo_weights <- t(lda_convo$theta)

# First of all - let's rename the topics to use the first few words of each topic as an indicator.
# terms() gives one column per topic, so pivot to long form and paste the top three terms
# of each topic back together into a single label.
topic_labels <- terms(lda_convo, n=3) |>
    as.data.frame() |>
    pivot_longer(everything(), names_to="topic") |>
    group_by(topic) |>
    summarise(features=paste0(value, collapse=" "))

topic_labels$display <- paste(topic_labels$topic, topic_labels$features)

# summarise() returns the topics sorted by name, so look each row's label up by topic name
# rather than relying on position.
rownames(topic_convo_weights) <- topic_labels$display[match(rownames(topic_convo_weights), topic_labels$topic)]

# Clustering is switched off so topics and conversations keep their original order.
# The output argument is spelled out in full (filename) instead of relying on
# partial matching of "file".
pheatmap(
    topic_convo_weights,
    cluster_rows = FALSE,
    cluster_cols = FALSE,
    color = colorRampPalette(c("white", "red"))(20),
    cellwidth = 16,
    cellheight = 16,
    angle_col="90",
    width = 10,
    height = 6,
    filename="results/convo_topic_heatmap.pdf"
)


In [None]:
# Allow long tables to be displayed in full in the notebook
options(repr.matrix.max.rows=500)

# Keyword in (conversational) context - to do this we need to retrieve the turns containing the keyword,
# and show it in context of surrounding turns.

# Show this number of turns either side of each match
turn_window_size <- 3
pattern <- 'rock'

# Grab the matching turns and recover the original doc_id/sequence in our corpus df.
# The document names of convo_tokens are the doc_id values, so they convert back to integers.
# A turn can contain the keyword more than once, hence unique().
matched_doc_ids <- as.integer(unique(index(convo_tokens, pattern = pattern)$docname))

# Pre-compute the window of doc_ids to display around each match, and record which
# conversation each match belongs to.
matching_turns <- data.frame(matchid = matched_doc_ids) |>
    mutate(
        match_conversation = convos$conversation_id[match(matchid, convos$doc_id)],
        window_start = matchid - turn_window_size,
        window_end = matchid + turn_window_size
    )

# doc_id runs across the whole dataset, so a window near the start or end of a conversation
# would otherwise pull in turns from the neighbouring conversation. Requiring the same
# conversation keeps the context to turns that were actually part of the exchange.
turns_with_context <- convos |>
    inner_join(
        matching_turns,
        join_by(conversation_id == match_conversation, between(doc_id, window_start, window_end))
    ) |>
    arrange(matchid, doc_id) |>
    select(name, text, timeCode, conversation_id, matchid, doc_id)

filename <- paste0("results/snippets_", pattern, ".xlsx")

# Write first, then report, so the message is only shown once the file actually exists.
write_xlsx(turns_with_context, filename)

print(paste0("wrote out ", nrow(matching_turns), " matches to: ", filename))

turns_with_context
