In [None]:
library(quanteda)
library(quanteda.textstats)
library(writexl)
library(seededlda)
library(dplyr)

In [None]:
# Load the conversation data (one row per turn) and list the column names.
convos <- read.csv("data/all-conversations.csv")

# Focus our analysis on only the turns purely in the MAIN section - note that there are a small number of turns that overlap
# between main and pre/post as different conversation groups were overlapping at the time of data collection.
# which() drops any row whose section is NA; a bare logical subset would keep such rows as all-NA rows.
convos <- convos[which(convos$section == "MAIN"), ]

# Generate a doc_id for quanteda - this would ideally be a primary key instead.
# The ids are simply the row positions of the filtered data, so neighbouring ids are neighbouring rows.
convos$doc_id <- seq_len(nrow(convos))

nrow(convos)
names(convos)

In [None]:
# Create a corpus from the dataframe we loaded: the text column supplies the document text
# and the doc_id column supplies the document names.
convo_corpus <- corpus(convos, text_field = "text", docid_field = "doc_id")

# Tokenise the full dataset - use standard tokeniser, but remove punctuation and lowercase all tokens
convo_full_tokens <- tokens_tolower(tokens(convo_corpus, remove_punct = TRUE))

# Report the size of the corpus before any stopword removal.
# ntoken() is applied to the tokens object directly, which avoids building a
# throwaway document-feature matrix of the whole corpus just to count tokens.
print(paste("Total tokens", sum(ntoken(convo_full_tokens))))
print(paste("Total types", length(types(convo_full_tokens))))

frequent_tokens <- c(
    # This is based on the top 100 most frequent tokens after
    # removing the standard English stop list, preserving some common
    # nouns like "Brisbane", "Australia", and "friends".
    # The original ordering of the list is kept as-is.
    "yeah", "like", "hh", "um", "know", "just", "oh", "mm",
    "really", "haha", "think", "well", "uh", "cause", "go", "ok",
    "get", "people", "one", "right", "hahaha", "tsk", "ah", "got",
    "good", "hmm", "kind", "mean", "can", "kinda", "lot", "actually",
    "okay", "pretty", "different", "cool", "little", "much", "thing", "now",
    "year", "stuff", "something", "yep", "went", "back", "years", "two",
    "bit", "sort", "time", "see", "even", "say", "going", "nice",
    "things", "come", "gonna", "wanna", "us", "interesting", "guess", "work",
    "states", "still", "probably", "big", "new", "yes", "way", "first",
    "never", "three", "sure", "hhh", "around", "always", "everything", "i-",
    "said", "wow", "came", "ha", "mhm", "alright", "hahahaha", "day",
    "take", "want", "used", "long", "y-"
)

# Marks that come from the transcription conventions rather than from what was said.
# "hh" and "hhh" also appear in frequent_tokens above; the overlap is harmless.
transcription_marks <- c("°", "sniffs", "hh", "hhh")
# The complete stopword list: our corpus-specific frequent tokens, the transcription marks,
# and the standard English list.
# Note that the standard English list assumes written not spoken material, which is why
# it is supplemented here - this still deserves a closer look.
custom_stopwords <- c(frequent_tokens, transcription_marks, stopwords("english"))

# Filter the lowercased, default-tokenised corpus in two passes.
# Pass 1: remove our custom stopwords.
# NOTE(review): min_nchar = 2 is forwarded to the underlying token selection - presumably so that
# single-character tokens are dropped as well; confirm against the tokens_select documentation.
convo_tokens <- tokens_remove(convo_full_tokens, custom_stopwords, min_nchar = 2)
# Pass 2: remove partial tokens using a glob match on a trailing hyphen.
convo_tokens <- tokens_remove(convo_tokens, "*-", valuetype = "glob")

# Create a document-feature matrix (also known as document term, or term-document matrix).
# dfm is the quanteda standard nomenclature so it is used throughout.
# More specifically this is a Turn-Token matrix, as the 'documents' are single turns by a speaker.
# For computational reasons any token that occurs only once is trimmed to reduce the vocabulary size.
convo_turn_dfm <- convo_tokens |>
    dfm() |>
    dfm_trim(min_termfreq = 2)


# Granularity

We can count things at different levels of granularity to start exploring what the conversations are about. A word that is used once in every conversation tells us something different from a word that is used 30 times in a single conversation.

Some examples:

- we can count tokens, regardless of where they occur
- we can count turns including a token
- we can count conversations including a token
- we can count by speaker in a conversation

In [None]:
# Let's start with tokens and turns:
frequencies <- textstat_frequency(convo_turn_dfm)
names(frequencies)

# This is the first output written to results/, so make sure the folder exists.
# dir.create() leaves an existing folder untouched (the warning for that case is silenced).
dir.create("results", showWarnings = FALSE, recursive = TRUE)

# the feature is the token, the frequency is the token count, and docfreq is the turn count
write_xlsx(frequencies, "results/token_turn_counts.xlsx")

In [None]:
# Collapse the turn-level dfm into one row per file (conversation), using the
# X_file_id document variable as the grouping key.
convo_dfm <- convo_turn_dfm |>
    dfm_group(groups = docvars(convo_corpus, "X_file_id"))

# With conversations as documents, the same frequency table now counts
# tokens and the number of conversations each token appears in.
conversation_frequencies <- convo_dfm |>
    textstat_frequency()

write_xlsx(conversation_frequencies, "results/token_conversation_counts.xlsx")

In [None]:
# Topic model with each turn treated as a document.
# The seed is set immediately before fitting; it is intended to make the run repeatable.
set.seed(192038102)
lda_turns <- textmodel_lda(convo_turn_dfm, k = 20)

# Pull out the 20 top terms for each of the 20 topics and save them as a spreadsheet.
turn_results <- as.data.frame(terms(lda_turns, n = 20))
write_xlsx(turn_results, "results/turn_topic_top_words.xlsx")

turn_results

In [None]:
# Topic model with each whole conversation treated as a document.
# The seed is set immediately before fitting; it is intended to make the run repeatable.
set.seed(50193853)
lda_convo <- textmodel_lda(convo_dfm, k = 20)

# Pull out the 20 top terms for each of the 20 topics and save them as a spreadsheet.
convo_results <- as.data.frame(terms(lda_convo, n = 20))
write_xlsx(convo_results, "results/convo_topic_top_words.xlsx")

convo_results

In [None]:
options(repr.matrix.max.rows = 500)

# Keyword in (conversational) context - to do this we need to retrieve the turns containing the keyword,
# and show it in context of surrounding turns.

# Show this number of turns either side of each match
turn_window_size <- 3
pattern <- "rock"

# doc_ids of the turns whose (stopword-filtered) tokens match the pattern.
# index() reports one row per hit, so a turn with several hits is de-duplicated here.
# The document names are the doc_id values we assigned to convos, hence the integer conversion.
matched_doc_ids <- as.integer(unique(index(convo_tokens, pattern = pattern)$docname))

# One row per matching turn, with the conversation it belongs to and the
# pre-computed doc_id window used to display the surrounding context.
matching_turns <- convos |>
    filter(doc_id %in% matched_doc_ids) |>
    transmute(
        matchid = doc_id,
        match_file_id = X_file_id,
        window_start = matchid - turn_window_size,
        window_end = matchid + turn_window_size
    )

# Attach every turn that falls inside a match's window.
# doc_id runs continuously across all conversations, so the window alone could reach into a
# neighbouring conversation; requiring the same X_file_id keeps the context within the
# conversation that contains the match.
turns_with_context <- convos |>
    inner_join(
        matching_turns,
        join_by(X_file_id == match_file_id, between(doc_id, window_start, window_end))
    ) |>
    arrange(matchid, doc_id) |>
    select(name, text, timeCode, X_file_id, matchid, doc_id)

filename <- paste0("results/snippets_", pattern, ".xlsx")

# Write first, then report, so the message is only shown once the file actually exists.
write_xlsx(turns_with_context, filename)

print(paste0("wrote out ", nrow(matching_turns), " matches to: ", filename))

turns_with_context
