# Bertopic using 1-gram. 
* Clean up langauge (remove 'redacted', 'yeah', etc.)

## Libraries and configurations

In [208]:
import pandas as pd
import numpy as np

In [2]:
import openai
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file

openai.api_key  = os.getenv('OPENAI_API_KEY')

In [3]:
#  Get figures to actually render in notebook [issue](https://github.com/MaartenGr/BERTopic/issues/764)
import plotly.io as pio
pio.renderers.default='iframe'

In [4]:
# LOAD BERT LIBRARIES
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

In [5]:
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech

## Load and clean interview data

In [6]:
interviews_csv = 'all_public_interviews.csv'
interviews = pd.read_csv(interviews_csv, dtype=str, keep_default_na=False)

In [7]:
# Add class columns to use in differentiating
interviews['class'] = interviews.apply(lambda x: x['Set'] + '_' + x['Interview'], axis=1)
interviews['Set_speaker'] = interviews.apply(lambda x: x['Set'] + '_Interviewer' if x['speaker_type'] == 'ER' else  x['Set'] + '_Interviewee', axis=1)

### Clean up: remove words from strings

In [8]:
# String clean up functions
def recursively_remove_spaces(string):
    if '  ' in string:
        string = string.replace('  ',' ')
        return(recursively_remove_spaces(string))
    else:
        return string
    
def clean_up_punctuation(string):
    for p in punctuation_list:
        space_p = ' ' + p        
        string = string.replace(space_p, p)
    return string

def remove_words_from_string(string, remove_words_list, punctuation_list=[]):   
    # remove remove words
    for remove_word in remove_words_list:
        # TODO: use regex to match to items next to punctuation but not sub-words
        string = string.replace(remove_word, '')
        
    # Condense all spaces into single space
    string = recursively_remove_spaces(string)
                
    # strip spaces from abandonded punctuation
    for p in punctuation_list:
        space_p = ' ' + p        
        string = string.replace(space_p, p)
    return string

In [9]:
remove_words_list = ['[REDACTED]','Okay','okay','No','uh','Yeah', 'yes', 'yeah','yep',  'inaudible', '[crosstalk]','[]']
punctuation_list = ['.',',']

In [10]:
interviews['cleaned_text'] = interviews.apply(lambda x: remove_words_from_string(x['Text'], remove_words_list, punctuation_list), axis=1)

In [11]:
# Drop rows with empyt text field
interviews_cleaned = interviews[(interviews['cleaned_text']!='') & (interviews['cleaned_text']!=' ')].copy()

In [12]:
mask = (interviews_cleaned['cleaned_text'].str.len() < 5 )

In [13]:
interviews_cleaned.loc[mask]['cleaned_text'].unique()

array(['. ', ',. ', '.. ', 'Me. ', 'Yea ', 'Uh. ', '48. ', 'pe. ', 'in. ',
       'do. ', ' I. ', ' - ', 'on? ', 'Oh. ', '... ', 'go. ', 'it. ',
       'In', 'it ', ',.. ', '? ', 'or? ', ' ? ', 'Yes ', ', ', 'So. ',
       'Oh! ', 'to. ', 'me. ', 'on ', ',,. ', 'Or? ', '.'], dtype=object)

In [14]:
# Text length of 5 seems to be dividing line for meaningful text, so drop text length < 5
interviews_cleaned = interviews_cleaned.loc[~mask].copy()

In [15]:
docs = list(interviews_cleaned['cleaned_text'])
origin = list(interviews_cleaned['Set'])
speaker_type = list(interviews_cleaned['speaker_type'])
set_speaker = list(interviews_cleaned['Set_speaker'])

## Precalculate Embeddings

In [16]:
# Pre-calculate embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedding_model.encode(docs)

## Define Base BERTopic with defaults + specified random_state
Take-away: need future work on clustering.  Cleaning looks good

In [17]:
# BASE MODEL CONFIGURATIONS (these are package defaults unless commented)

# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine', random_state=42) # Use consistent random state for repeatability

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english" )

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer()

In [18]:
base_model = BERTopic(umap_model=umap_model) # All default settings + random_state

In [19]:
topics_base, probs_base = base_model.fit_transform(docs, embeddings)

In [20]:
# Save base model topic info for Name / Representation in base mode"
base_df = base_model.get_topic_info()

In [21]:
base_df.head(5)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,3979,-1_to_the_and_water,"[to, the, and, water, that, of, like, in, know...","[Oh,, there's definitely operational challenge..."
1,0,223,0_pipes_pipe_piped_piping,"[pipes, pipe, piped, piping, system, old, new,...","[The pipes,. , in the pipes. , The pipes? ]"
2,1,172,1_questions_anything_asked_should,"[questions, anything, asked, should, ask, any,...","[That's good. Well, I don't think that I have ..."
3,2,129,2_community_communities_people_been,"[community, communities, people, been, members...",[There's a system in place and then the commun...
4,3,100,3_tank_tanks_inside_plastic,"[tank, tanks, inside, plastic, new, holding, c...","[tanks? , that tank to.. , One house I went to..."


In [22]:
base_model.visualize_topics()

## Update Topics with KeyBERT and OpenAi-4

In [32]:
# KeyBERT
keybert_model = KeyBERTInspired()

# OpenAI Prompt
prompt = """
I have a topic that contains the following documents: 
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
topic: <topic label>
"""

# GPT-3.5
openai_35_model = OpenAI(model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# GPT-4
openai_4_model = OpenAI(model="gpt-4", exponential_backoff=True, chat=True, prompt=prompt)

In [34]:
rep_model = BERTopic(

  # Pipeline models
  embedding_model=embedding_model,
  umap_model=umap_model,
  hdbscan_model=hdbscan_model,
  vectorizer_model=vectorizer_model,
  ctfidf_model=ctfidf_model,
  representation_model={
        "KeyBERT": keybert_model,
        "OpenAI-4": openai_4_model,    
    },

  # Hyperparameters
  top_n_words=10,
  verbose=True
)

In [35]:
# Do 3.5 separately as it has a tendency to hit timeout errors
representation_model_35 = {
    "OpenAI": openai_35_model,    
}

In [36]:
topics_rep, probs_rep = rep_model.fit_transform(docs)

Batches:   0%|          | 0/291 [00:00<?, ?it/s]

2023-10-20 14:57:15,337 - BERTopic - Transformed documents to Embeddings
2023-10-20 14:57:33,149 - BERTopic - Reduced dimensionality
2023-10-20 14:57:33,580 - BERTopic - Clustered reduced embeddings


In [45]:
rep_model_df = rep_model.get_topic_info()

In [79]:
rep_model_df[1:20]

Unnamed: 0,Topic,Count,Name,Representation,KeyBERT,OpenAI-4,Representative_Docs
1,0,226,0_pipes_pipe_piped_piping,"[pipes, pipe, piped, piping, old, new, hdpe, r...","[pipes, pipe, piping, pipeline, piped, hoses, ...",[Pipe Installation and Replacement],"[pipes, and everything else. , The pipes? , pi..."
2,1,174,1_questions_asked_ask_didn,"[questions, asked, ask, didn, question, missin...","[questions, ask, asking, asked, question, thin...",[Inquiring About Unasked Questions],[.. I think those are all the questions I have...
3,2,163,2_plant_operator_operators_level,"[plant, operator, operators, level, plants, ce...","[plant, plants, water, wastewater, operate, op...",[Water Plant Operator Training],"[water plant operator or about? , plant. , How..."
4,3,125,3_great_awesome_thank_perfect,"[great, awesome, thank, perfect, cool, thanks,...","[great, awesome, fantastic, amazing, good, hap...",[Expressions of Appreciation and Approval],"[,. That's awesome. Well, great. , Great. Awes..."
5,4,124,4_filter_filters_brita_filtration,"[filter, filters, brita, filtration, filtered,...","[filter, filtered, filters, filtering, filtrat...",[Water Filtration and Filters],"[Well here I filter the water. , have a filter..."
6,5,119,5_community_communities_members_meetings,"[community, communities, members, meetings, ro...","[community, communities, people, folks, gather...",[Role and Sense in Communities],"[the community. Some people at least. , commun..."
7,6,104,6_lived_moved_live_grew,"[lived, moved, live, grew, long, living, broug...","[years, relocating, lived, area, long, forever...",[Duration of Residence in Area],"[here, so you moved to, but how long were you ..."
8,7,95,7_tank_tanks_inside_plastic,"[tank, tanks, inside, plastic, holding, new, p...","[tanks, tank, gallon, fuel, inlet, volume, sip...",[Cleaning and Priming New Tanks],"[,. And how big is your tank? , the tank was, ..."
9,8,95,8_piped_water_pipe_enjoy,"[piped, water, pipe, enjoy, pipes, clear, swit...","[pipes, pipe, piped, water, watering, flow, to...",[Enjoyment and Preference of Piped Water],"[, the piped water? , you had piped water? , ..."
10,9,90,9_village_council_villages_tribal,"[village, council, villages, tribal, local, ci...","[village, villages, tribe, tribes, municipalit...",[Tribal Villages Local Government Interaction],"[village , Well, they ruined they ruined the l..."


## Visualize

In [77]:
fig = rep_model.visualize_topics(top_n_topics=20, custom_labels='KeyBERT')

In [78]:
fig.show()

In [75]:
rep_model.visualize_heatmap(top_n_topics=30,  custom_labels='KeyBERT')

### Create topics per group
* Interview set    
* Speaker Type
* Combination

#### Dictionaries for updating default graph labeling

In [163]:
# Update Fig Dictionary
topic_label_dict = {}
for i in range(1,fig_topics_number + 1):
    init_label = 'Topic ' + str(rep_model_df.iloc[i]['Topic'])
    new_label = rep_model_df.iloc[i]['OpenAI-4'][0]
    topic_label[init_label] = new_label
    
topic_labels_list = list(topic_label_dict.keys())    



In [164]:
topic_rep_dict = {}

for i in range(1,fig_topics_number + 1):
    topic_name = rep_model_df.iloc[i]['Name']
    new_label = 'Topic ' + str(rep_model_df.iloc[i]['Topic']) + ': ' + rep_model_df.iloc[i]['OpenAI-4'][0]
    topic_rep_dict[topic_name] = new_label
    
topic_rep_list = list(topic_rep_dict.keys())    


### Interview Set

In [62]:
topics_per_origin = rep_model.topics_per_class(docs, classes=origin)

2it [00:00,  5.20it/s]


In [162]:
fig_topics_number = 20
fig_topics_origin = rep_model.visualize_topics_per_class(topics_per_origin, top_n_topics=fig_topics_number)

In [166]:
fig_topics_origin

In [190]:
new_fig = rep_model.visualize_topics_per_class(topics_per_origin, top_n_topics=fig_topics_number)

In [191]:
# Commands to update labeling in figure
for bar in new_fig['data']:
    bar['name'] = topic_rep_dict[bar['name']]
new_fig['layout']['legend']['title']['text'] = '<b>Topic</b>'
new_fig['layout']['title']['text'] = '<b>Topics per Interview Set</b>'
new_fig['layout']['yaxis']['title']['text'] = 'Interview Set'

In [192]:
new_fig

## Topics per speaker type

In [None]:
topics_per_speaker_type = rep_model.topics_per_class(docs, classes=speaker_type)

2it [00:00,  5.25it/s]


In [194]:
fig_speaker_type = rep_model.visualize_topics_per_class(topics_per_speaker_type, top_n_topics=fig_topics_number)

In [215]:
# Commands to update labeling in figure
# for bar in fig_speaker_type['data']:
#     bar['name'] = topic_rep_dict[bar['name']]
# fig_speaker_type['layout']['legend']['title']['text'] = '<b>Topic</b>'
# fig_speaker_type['layout']['title']['text'] = '<b>Topics per Speaker Class</b>'
# fig_speaker_type['layout']['yaxis']['title']['text'] = 'Speaker Class'

In [None]:
# Commands to update labeling in figure
# speaker_types_array = np.array(['Interviewee', 'Interviewer'])
# for bar in fig_speaker_type['data']:
#     bar['y'] = speaker_types_array

In [216]:
fig_speaker_type

### 4 way split

In [217]:
topics_per_set_and_speaker = rep_model.topics_per_class(docs, classes=set_speaker)

4it [00:00,  6.27it/s]


In [218]:
fig_both = rep_model.visualize_topics_per_class(topics_per_set_and_speaker, top_n_topics=fig_topics_number)

In [220]:
# Commands to update labeling in figure
for bar in fig_both['data']:
    bar['name'] = topic_rep_dict[bar['name']]
fig_both['layout']['legend']['title']['text'] = '<b>Topic</b>'
fig_both['layout']['title']['text'] = '<b>Topics per Interview Group and Speaker Class</b>'
fig_both['layout']['yaxis']['title']['text'] = 'interview Group / Speaker Class'

In [221]:
fig_both

# Modify MOdel Parameters (Future)

## Modify parameters / configurations for clustering
HDBSCAN
* min_cluster_size 15 --> 30
* min_topic_size  default (10?) --> 50

HYPER PARAMETERS
* add top_n_words=10
* min_topic_size (default is 10)

In [50]:
# Step 3 - Cluster reduced embeddings
hdbscan_model_1 = HDBSCAN(min_cluster_size=30, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

In [51]:
model_1 = BERTopic(
  # Pipeline models
  umap_model=umap_model_will,
  hdbscan_model = hdbscan_model_1,

  # Hyperparameters
  top_n_words=10,
  min_topic_size = 50,
  verbose=True
)