# Topic Modeling on New York City Participatory Budget Proposals

### Overview
This analysis started by scraping approximately two thousand participatory budgeting proposals from New York City's [idea collection platform](https://www.participate.nyc.gov/processes/Citywidepb2023/f/321/) submitted during the last quarter of 2023, aiming to perform topic modeling to identify key ideas for potentially funded programs and services. A total of 1,947 proposals were collected, with 1,897 being analyzable; those excluded contained insufficient information. From these 1,897 proposals, 20 topics were generated and assigned back to each of the proposals. 

This topic modeling approach utilized [BERTopic](https://arxiv.org/abs/2203.05794), leveraging [OpenAI's embeddings](https://platform.openai.com/docs/guides/embeddings/) to transform each proposal into a high-dimensional vector, which was subsequently reduced via [UMAP](https://arxiv.org/abs/1802.03426) and clustered using [HDBSCAN](https://arxiv.org/abs/1911.02282). Each cluster was treated as one "big document", from which important keywords and n-grams (ranging from 1 to 3 words) were extracted using class-based TF-IDF (c-TF-IDF) to define topics. [OpenAI's GPT-4o mini](https://platform.openai.com/docs/models/gpt-4o-mini) further refined these topics into coherent descriptions based on the clusters' prominent proposals and keywords. This analysis illustrates a scalable approach to understanding civic engagement through data-driven insights.

### Packages, Constants, and Functions

In [None]:
# Standard libraries
import re
import os
import warnings

# Data manipulation
import pandas as pd
import numpy as np

# Data visualization
import plotly.express as px

# NLP
import openai 
from bertopic.backend import OpenAIBackend 
from sklearn.metrics.pairwise import cosine_similarity
from bertopic.representation import OpenAI, KeyBERTInspired, MaximalMarginalRelevance
from bertopic import BERTopic 
import spacy

# Dimensionality reduction
from sklearn.decomposition import PCA 
from umap import UMAP 

# Clustering
from sklearn.cluster import KMeans
from hdbscan import HDBSCAN

# Tokenization
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from gensim.corpora import Dictionary
from nltk.tokenize import word_tokenize
import nltk

# Weighting
from bertopic.vectorizers import ClassTfidfTransformer

# Evaluation 
from gensim.models import CoherenceModel

# Silence every warning category for the rest of the session
# (remove this line when debugging, since it also hides deprecation warnings)
warnings.filterwarnings('ignore')

# Set the seed: seeds NumPy's global RNG here; the same value is passed
# as random_state to the UMAP model in the topic modeling cell
seed = 42
np.random.seed(seed)

# Get large Spacy model for lemmatization (used by preprocess_document).
# The parser and NER components are disabled; preprocess_document only reads
# token-level attributes (is_alpha, is_stop, lemma_)
nlp = spacy.load("en_core_web_lg", disable=["parser", "ner"])

# URL pattern for removal: matches "http://..." / "https://..." and "www...."
# up to the next whitespace character
url_pattern = re.compile(r'https?://\S+|www\.\S+')

# Experimental seed words for steering the topic model (not used in the final model;
# the seed_words argument of the weighting model is commented out below).
# Each row appears to group words around one theme; some words (e.g. "safety",
# "justice", "services") are repeated across rows.
seed_words = [
    "art", "gallery", "theater", "exhibition", "festival", "cultural", "heritage", "performance", "music", "dance",
    "community", "assistance", "services", "financial", "digital", "esl", "english", "ged", "support", "resources",
    "jobs", "training", "skills", "career", "entrepreneurship", "business", "workforce", "economic", "empowerment", "workshops",
    "education", "youth", "families", "school", "learning", "children", "seniors", "adult", "parenting", "engagement",
    "environment", "sustainability", "eco-friendly", "green", "food", "organic", "recycling", "conservation", "justice", "life",
    "parks", "recreation", "facilities", "space", "playgrounds", "sports", "outdoor", "garden", "safety", "accessibility",
    "health", "wellness", "mental", "physical", "fitness", "well-being", "healthcare", "prevention", "living", "therapy",
    "safety", "justice", "prevention", "policing", "reduction", "recovery", "abuse", "secure", "services", "emergency",
    "streets", "sidewalks", "infrastructure", "transportation", "accessibility", "pedestrian", "safety", "mobility", "biking", "planning",
    "miscellaneous", "various", "diverse", "general", "flexible", "undefined", "exploratory", "innovative", "creative", "unspecified"
]


def get_embedding(text: str, client: openai.OpenAI, model: str) -> list[float]:
    '''Get the embedding for the text using the specified model

    Args:
    - text: str, the text to get the embedding for
    - client: openai.OpenAI, the OpenAI client to use
    - model: str, the name of the embedding model to use

    Returns:
    - list of floats, the embedding vector for the text as returned by the API
      (wrap it in np.array(...) where an array is needed)
    '''
    # A single input is sent per request, so the only embedding is at index 0
    response = client.embeddings.create(
        input=[text],
        model=model
    )
    return response.data[0].embedding

def get_coherence_score(topic_words: list, dictionary: Dictionary, tokenized_docs: list) -> float:
    '''Compute the Cv coherence of a set of topics with gensim's CoherenceModel

    Args:
    - topic_words: list of lists of strings, the words describing each topic
    - dictionary: Dictionary, the gensim dictionary built from the tokenized documents
    - tokenized_docs: list of lists of strings, the reference corpus as tokens

    Returns:
    - float, the Cv coherence score for the topics
    '''
    return CoherenceModel(
        coherence='c_v',
        dictionary=dictionary,
        texts=tokenized_docs,
        topics=topic_words,
    ).get_coherence()

def get_diversity(topics: list, top_n: int = 10) -> float:
    '''Calculate the diversity score for the topics

    The score is the number of distinct words among the top words of all
    topics divided by the total number of top words considered, so 1.0 means
    no word is shared between topics.

    Args:
    - topics: list of lists of strings, the topics to calculate the diversity score for
    - top_n: int, the number of top words to consider for each topic

    Returns:
    - diversity_score: float, the diversity score for the topics
      (0.0 when there are no words to score)
    '''
    unique_words = set()
    total_words = 0
    for topic in topics:
        top_words = topic[:top_n]
        unique_words.update(top_words)
        # Count the words actually present: a topic may hold fewer than top_n
        # words, and counting top_n regardless would understate the diversity
        total_words += len(top_words)

    # No topics (or only empty ones): avoid dividing by zero
    if total_words == 0:
        return 0.0

    return len(unique_words) / total_words

def tokenize_docs_and_create_dictionary(docs: list) -> tuple[list, Dictionary]:
    '''Split each document into lowercase word tokens and index them in a gensim Dictionary

    Args:
    - docs: list of strings, the documents to tokenize

    Returns:
    - tokenized_docs: list of lists of strings, one token list per document
    - dictionary: Dictionary, built from those token lists
    '''
    # Fetch the punkt resource before tokenizing with word_tokenize
    nltk.download('punkt')

    # Lowercase first, then tokenize, one document at a time
    tokenized_docs = []
    for text in docs:
        tokenized_docs.append(word_tokenize(text.lower()))

    return tokenized_docs, Dictionary(tokenized_docs)

def get_stopwords() -> list:
    '''Get list of stop words to remove from the documents

    Combines NLTK's English stop words with custom words that carry no topical
    signal for this corpus (form labels, boroughs, months, filler verbs, names).

    Returns:
    - stop_words: list of strings, the de-duplicated stop words in sorted order
      (sorted so the result is identical from run to run)
    '''
    # Define base stop words in the English language to remove.
    # Download the corpus on first use instead of failing with LookupError,
    # mirroring how the tokenizer helper fetches its own NLTK resource.
    try:
        base_stop_words = stopwords.words('english')
    except LookupError:
        nltk.download('stopwords')
        base_stop_words = stopwords.words('english')
    stop_words = set(base_stop_words)

    # Add others custom stop words 
    # Any others words that are not useful for the analysis?
    # ('soultion' is kept on purpose: it matches a misspelled label used in an
    # earlier version of the combined proposal text)
    stop_words.update([
        'said', 'participatory', 'budget', 'well', 'make', 
        'plan', '2025', 'see', 'best', 'think', 'could', 
        'came', 'consider', 'please', 'take', 'many', 
        'taken', 'would', 'go', 'coming', 'might', 'good', 
        'say', 'get', 'says', 'want', 'asking', 'thinking', 
        'know', 'people', 'thanks', 'ask', 'planning', 
        'thank', 'much', 'thoughts', 'great', 'made', 
        'come', 'thought', 'going', 'really', 'pb', 'also', 
        'may', 'need', 'should', 'taking', 'saying', 'way', 
        'considering', '25', 'goes', 'lot', 'community', 
        'city', 'better', 'public', 'asked', 'like', 
        'nyc', 'fy', 'considered', 'plans', 'one',
        'year', 'years', 'next', 'last', 'first','second', 
        'third', 'fourth', 'fifth','choice', 'choices', 
        'option', 'options','something','anything', 
        'nothing', 'everything', 'september', 'october', 
        'november', 'december', 'january', 'february',
        'march', 'april', 'may', 'june', 'july', 'august',
        'upgrade', 'provide', 'ballot', 'project', 'projects',
        'proposal', 'proposals', 'idea', 'ideas', 'initiative',
        'initiatives', 'vote', 'voting', 'voted', 'votes',
        'new', 'york','title:', 'tags:', 'challenge:', 'soultion:',
        'title', 'tags', 'challenge', 'soultion','solution','tag',
        'empower', 'empowerment', 'empowered', 'empowerment','vibrant',
        'bronx', 'brooklyn', 'manhattan', 'queens', 'staten', 'island',
        'dimitri', 'farah', 'maria','try', 'anh', 'ada', 'jean','nice',
        'daughter', 'en','mas','haruto','zuri'])

    # A set has no stable iteration order for strings across interpreter runs;
    # sorting makes the returned list deterministic
    return sorted(stop_words)

def preprocess_document(doc) -> str:
    """
    Normalize a document into a space-separated string of lemmas.

    The text is lowercased, URLs are stripped, and the result is run through
    the spaCy pipeline; only alphabetic, non-stop-word tokens are kept.

    Args:
    - doc: str, the document to preprocess.

    Returns:
    - preprocessed_doc: str, the lemmas of the kept tokens joined by single spaces.
    """
    # Lowercase before URL removal, then hand the cleaned text to spaCy
    text_without_urls = url_pattern.sub('', doc.lower())

    # is_alpha drops punctuation and numbers; is_stop drops spaCy stop words
    lemmas = [
        token.lemma_
        for token in nlp(text_without_urls)
        if token.is_alpha and not token.is_stop
    ]

    return " ".join(lemmas)

def extract_topic_model_words(topic_model: BERTopic) -> list:
    '''Collect the words of every non-outlier, non-empty topic of a BERTopic model

    Args:
    - topic_model: BERTopic, the trained BERTopic model

    Returns:
    - bertopic_topic_words: list of lists of strings, one word list per kept topic
    '''
    bertopic_topic_words = []
    for topic_id, word_scores in topic_model.get_topics().items():
        # Skip empty topics and the -1 outlier topic
        if len(word_scores) == 0 or topic_id == -1:
            continue
        # Each entry is a (word, score) pair; keep only the word
        bertopic_topic_words.append([pair[0] for pair in word_scores])

    return bertopic_topic_words

def evaluate_topic_model(topic_model: BERTopic, raw_documents: list, topics: list) -> dict:
    '''Score a BERTopic model with Cv coherence and topic diversity

    Args:
    - topic_model: BERTopic, the trained BERTopic model
    - raw_documents: list of strings, the raw documents used to train the model
    - topics: list of integers, the topic id assigned to each document

    Returns:
    - evaluation: dict with keys "coherence_score" and "diversity_score"
    '''
    # Documents assigned to the -1 outlier topic are left out of the reference corpus
    assigned_docs = []
    for raw_doc, topic_id in zip(raw_documents, topics):
        if topic_id != -1:
            assigned_docs.append(preprocess_document(raw_doc))

    # Word lists per topic, taken from the model
    topic_word_lists = extract_topic_model_words(topic_model)

    # Token lists and gensim dictionary for the coherence computation
    tokens_per_doc, vocab = tokenize_docs_and_create_dictionary(assigned_docs)

    return {
        "coherence_score": get_coherence_score(topic_word_lists, vocab, tokens_per_doc),
        "diversity_score": get_diversity(topic_word_lists),
    }

def openai_representation(
        openai_api_key: str,
        model: str='gpt-4-turbo',
        num_docs: int=5,
        diversity: float=0.2
    ) -> OpenAI:
    '''Build the BERTopic OpenAI representation model used to label topics

    Args:
    - openai_api_key: str, OpenAI API key
    - model: str, the OpenAI chat model that writes the labels (default: 'gpt-4-turbo')
    - num_docs: int, forwarded as nr_docs, the number of documents shown per topic (default: 5)
    - diversity: float, forwarded as diversity (default: 0.2)

    Returns:
    - representation_model: OpenAI, the configured representation model

    Notes:
        -  The cost is directly proportional to the
            - type of model
            - number of tokens in documents
            - number of documents
            - number of topics
    '''
    # [DOCUMENTS] and [KEYWORDS] are placeholders left in the prompt for the
    # representation model to fill in
    label_prompt = """
    Consider the following set of documents:
    [DOCUMENTS]
    These documents are associated with the keywords: [KEYWORDS]

    Analyze the documents and keywords provided to generate a concise, descriptive label for the topic. The label should encapsulate the primary theme of the keywords and documents in no more than 10 words and the label should represent program ideas. Please provide the topic label in the format specified below:
    topic: <topic label>
    """

    # Shorter alternative prompt, kept for reference:
    # prompt = """
    # I have a topic that contains the following documents:
    # [DOCUMENTS]
    # The topic is described by the following keywords: [KEYWORDS]

    # Based on the information above, extract a short but highly descriptive topic label of at most 5 words. Make sure it is in the following format:
    # topic: <topic label>
    # """

    api_client = openai.OpenAI(api_key=openai_api_key)

    return OpenAI(
        api_client,
        model=model,
        exponential_backoff=True,
        chat=True,
        prompt=label_prompt,
        nr_docs=num_docs,
        diversity=diversity,
    )


### Data Cleaning

In [None]:
# Load the data
df = pd.read_excel('../data/nyc_pb_proposals.xlsx')

# Reformat the datetime column
df['datetime'] = pd.to_datetime(df['datetime'])

# Remove rows with missing values in any column that feeds the proposal text.
# dropna returns a new frame, so the column assignments below do not write
# into a slice of the raw data.
required_columns = [
    'title',
    'tags',
    'Describe the challenge you want to address:',
    'Describe your idea and approach to address the challenge',
]
df = df.dropna(subset=required_columns).reset_index(drop=True)


def _clean_free_text(column: pd.Series) -> pd.Series:
    """Collapse blank-line runs, drop double quotes and URLs, and trim whitespace."""
    # Runs of 2+ newlines become a single newline.
    # NOTE(review): double quotes are also replaced by a newline - confirm intended
    collapsed = column.str.replace(r'(\n){2,}|"', '\n', regex=True).str.strip()
    # Removing a URL can leave whitespace at the edges, hence the second strip
    return collapsed.apply(lambda text: url_pattern.sub('', text)).str.strip()


# Clean the proposal and challenge columns (including URL removal)
df['proposal'] = _clean_free_text(df['Describe your idea and approach to address the challenge'])
df['challenge'] = _clean_free_text(df['Describe the challenge you want to address:'])

# Earlier approach, kept for reference: combine the title, tags, challenge,
# and solution into a single labelled column for the proposal
# df['proposal'] = ('Title: ' + df['title'] 
#                   + '\n\nTags: ' + df['tags'].str.replace("\[|\]|'",'',regex=True)
#                   + '\n\nChallenge: ' + df['Describe the challenge you want to address:'].str.replace(r'(\n){2,}|"', '\n', regex=True).str.strip()
#                   + '\n\nSoultion: ' + df['Describe your idea and approach to address the challenge'].str.replace(r'(\n){2,}|"', '\n', regex=True).str.strip()).str.strip()

# Rename the long survey-question columns to short names
df = df.rename(
    columns={
        'In which borough should your idea take place?':'borough',
        'Do you have a specific neighborhood(s) in mind?':'neighborhood',
        'Write the zipcode that best represents your New York City community':'zipcode',
        'Which audience(s) does your idea help? Select as many as apply.':'audience',
    }
)

# Keep only the columns we need; copy so later column assignments work on an
# independent frame rather than a slice
df = df[['proposal_id', 'datetime', 'title', 'tags', 
         'borough', 'neighborhood', 'zipcode',
         'audience', 'challenge', 'proposal']].copy()

# Standardize the audience column
def standardize_audience(audience):
    """Convert a raw audience cell into a list of audience labels.

    Args:
    - audience: a missing value, a single label string, or the string form of
      a Python list of labels (e.g. "['Youth', 'Seniors']")

    Returns:
    - list: [] for a missing value; the parsed list when the string is a list
      literal; otherwise a one-element list holding the original string
    """
    # Local import keeps this fix self-contained within the function
    import ast

    if pd.isna(audience):
        return []
    if '[' not in audience:
        return [audience]

    # SECURITY: the cell text comes from scraped public submissions, so it must
    # never be passed to eval(). literal_eval only accepts Python literals.
    try:
        parsed = ast.literal_eval(audience)
    except (ValueError, SyntaxError):
        # Contains '[' but is not a valid literal: keep the raw text as one label
        return [audience]

    if isinstance(parsed, list):
        return parsed
    return [audience]

df['audience'] = df['audience'].apply(standardize_audience)

# Fill missing values with empty strings
df = df.fillna('')

# Ambiguous phrases that indicate that part of the proposal is not provided 
idk_phrases = [
    'no', 'not sure', 'not given', 'unknown', 
    'none', 'not provided', 'not applicable', 
    'n/a', 'no idea', 'no response', 'no answer', 
    'no information', 'no data', 'no clue', 
    'no details', 'no comment', 'no opinion',
    'nothing entered', 'nothing provided',
    'nothing given', 'nothing specified',
    '[illegible]','left blank', 'left empty',
    'left unanswered', 'left unspecified',
    "i don't know", "non given", "non provided",
    "non applicable", "see above?", "see above",
    "nothing written","--","non","none listed",
    "nothing written here","none given","na","to be determined"
]

# Set for constant-time membership tests inside the per-row lambda
_idk_lookup = frozenset(idk_phrases)

# Blank out title, challenge, and proposal values that are nothing but a
# placeholder phrase. Whitespace is stripped before comparing so that a
# padded placeholder (the title column is never stripped earlier) still matches.
for col in ['title', 'challenge', 'proposal']:
    df[col] = df[col].apply(lambda x: '' if x.strip().lower() in _idk_lookup else x)

# Concatenate the title, challenge, and proposal columns into a single column
df['proposal'] = (df.title.str.strip() + '\n' 
                  + df.challenge.str.strip() + '\n' 
                  + df.proposal.str.strip()
                  ).str.strip()

# Drop the challenge column (its text now lives inside proposal)
df = df.drop(columns=['challenge'])

# Remove rows with less than 5 words in the proposal column 
df = df[df['proposal'].str.split().apply(len)>4]

# Export the data
df.to_excel('../data/nyc_pb_proposals_cleaned.xlsx', index=False)

In [None]:
# Histogram of proposal lengths, measured in whitespace-separated words
fig = (
    px.histogram(
        df,
        x=df['proposal'].str.split().str.len(),
        nbins=100,
    )
    # Thin dark outline so neighbouring bars stay distinguishable
    .update_traces(marker={'line': {'width': 1, 'color': 'DarkSlateGrey'}})
    .update_layout(title='Number of Words in the Proposals')
    .update_xaxes(title='Number of Words')
    .update_yaxes(title='Number of Proposals')
)
fig.show()

### Generate embeddings from OpenAI's `text-embedding-3-large` model

In [None]:
# Load the cleaned NYC Participatory Budgeting proposals
data = pd.read_excel('../data/nyc_pb_proposals_cleaned.xlsx')

# Embedding model name and the OpenAI client (API key read from the environment)
model = 'text-embedding-3-large'
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Request an embedding for every proposal, one API call per row.
# NOTE: this runs each time the cell is executed; skip the cell when the JSON
# file written below already holds the embeddings.
data['submission_text_embedding_3_large'] = data.proposal.apply(
    lambda proposal_text: get_embedding(text=proposal_text, client=client, model=model)
)

# Persist proposals together with their embeddings as JSON lines
data.to_json(
    '../data/nyc_pb_proposals_text-embedding-3-large.json',
    orient='records',
    lines=True,
)

### Topic Modeling

In [None]:
# 0. Prepare data for BERTopic
# Read in NYC PB data with embeddings (the JSON-lines file written by the embedding cell)
data = pd.read_json('../data/nyc_pb_proposals_text-embedding-3-large.json', orient='records', lines=True)

# Get documents and pregenerated embeddings
# (embeddings becomes a 2-D array with one row per document)
documents = data['proposal'].to_list()
embeddings = np.array(data['submission_text_embedding_3_large'].to_list())

print(f"Number of documents: {len(documents)}")
print(f"Sized of embeddings: {embeddings.shape}")

# 1. Embedding model
# Initialize OpenAI client and embedding model
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
model = 'text-embedding-3-large'

# Initialize the OpenAI backend for BERTopic.
# The document embeddings themselves are precomputed and passed to fit_transform below.
# NOTE(review): presumably this backend is still used by KeyBERTInspired to embed
# candidate words - confirm, since that would mean extra API calls
embedding_model = OpenAIBackend(client, model)

# 2. Dimensionality reduction 
# UMAP down to 28 components with cosine distance; random_state=seed fixes the projection.
# The commented PCA line is an alternative that was tried.
#dim_model = PCA(n_components=31, random_state=seed)
dim_model = UMAP(n_neighbors=15, n_components=28, min_dist=0.0, metric='cosine', random_state=seed)

# 3. Clustering model
# HDBSCAN on the reduced vectors; clusters need at least 12 proposals.
# The commented KMeans block is an alternative that was tried.
# cluster_model = KMeans(
#     n_clusters=15, 
#     init='k-means++', 
#     random_state=seed, 
#     max_iter=1000
# )
cluster_model = HDBSCAN(min_cluster_size=12, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# 4. Vectorizer model
# Get stop words to remove from the documents
stop_words = get_stopwords()

# Initialize the CountVectorizer model with the custom stop words and preprocessor.
# preprocess_document lowercases, strips URLs and lemmatizes each document before
# counting; terms must occur in at least 2 documents (min_df=2) and may be 1 to 3 words long
vectorizer_model = CountVectorizer(
    stop_words=stop_words, 
    preprocessor=preprocess_document, 
    min_df=2, 
    ngram_range=(1, 3)
)

# 5. Weighting model
# Class-based TF-IDF; the seed_words steering experiment is left commented out
weighting_model = ClassTfidfTransformer(
    reduce_frequent_words=True, 
    bm25_weighting=False,
    # seed_words=seed_words,
    # seed_multiplier=2
)

# 6. Representation models
# Key BERT inspired model for first pass refinement of topics
keybert_model = KeyBERTInspired(top_n_words=15)

# Maximal Marginal Relevance model for diversity in topics
mmr_model = MaximalMarginalRelevance(diversity=.3, top_n_words=15)

# OpenAI model for final refinement of topics
# (gpt-4o-mini, shown 10 documents per topic)
openai_model = openai_representation(
    openai_api_key=os.getenv('OPENAI_API_KEY'), 
    model='gpt-4o-mini', 
    num_docs=10,
    diversity=.3
)

# Chain the representation models together.
# Each key names one representation; a list value chains the models in order.
# This dict is not passed to the BERTopic constructor below - it is supplied
# later through update_topics, after outliers have been reassigned
representation_models = {'KeyBERT':keybert_model,
                         'KeyBERT+MMR':[keybert_model, mmr_model],
                         'KeyBERT+MMR+OpenAI':[keybert_model, mmr_model, openai_model]}

# Initialize BERTopic model with the pipeline models
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=dim_model,
    hdbscan_model=cluster_model,
    vectorizer_model=vectorizer_model,
    ctfidf_model=weighting_model,
    # representation_model=representation_model,

    # Hyperparameters
    top_n_words=15,
    verbose=True,
    calculate_probabilities=True,
    # nr_topics='auto',
    # seed_topic_list=seed_topic_list
)

# Fit the BERTopic model to the data and get the topics and probabilities.
# The precomputed embeddings are passed in, so documents are not re-embedded here.
topics, probs = topic_model.fit_transform(
    documents,      # List of documents
    embeddings # Array of embeddings
)

# Reduce outliers based on probabilities of topics
# (threshold=0.04 was tuned with the outlier probability plots further down)
new_topics = topic_model.reduce_outliers(
    documents, 
    topics, 
    probabilities=probs, 
    strategy="probabilities",
    threshold=0.04
)

# Update the topics based on the new topics.
# This is also where the representation models (KeyBERT, MMR, OpenAI) are applied,
# so the OpenAI labelling calls happen here
topic_model.update_topics(
    docs=documents, 
    topics=new_topics,
    top_n_words=15,
    vectorizer_model=vectorizer_model,
    ctfidf_model=weighting_model,
    representation_model=representation_models
)

# Evaluate the BERTopic model
# (metrics holds "coherence_score" and "diversity_score"; it is not displayed in this cell)
metrics = evaluate_topic_model(topic_model, documents, new_topics)

# Create a dictionary of topic labels:
# for each topic, join the words of its 'KeyBERT+MMR+OpenAI' (word, score) pairs with " | "
chatgpt_topic_labels = {topic: " | ".join(list(zip(*values))[0]) for topic, values in topic_model.topic_aspects_["KeyBERT+MMR+OpenAI"].items()}

# Add outlier topic label
chatgpt_topic_labels[-1] = "Outlier Topic" 

topic_model.set_topic_labels(chatgpt_topic_labels)

# Get the topics and probabilities.
# 'probabilities' keeps only the largest value of each document's probability vector.
# NOTE(review): presumably probabilities_ still comes from the original fit while
# topics_ reflects the reassigned outliers - confirm the two stay aligned
data['topics'] = topic_model.topics_
data['probabilities'] = [max(prob) for prob in topic_model.probabilities_]

# Merge data w/ topics 
# (default inner join of each proposal's topic id against the topic info table)
topic_data = pd.merge(
    data,
    topic_model.get_topic_info(), 
    left_on='topics', 
    right_on='Topic'
)

# Filter columns
topic_data = topic_data[['proposal_id', 'datetime', 'title', 'tags', 'borough', 'neighborhood',
                         'zipcode', 'audience', 'proposal', 'submission_text_embedding_3_large',
                         'topics', 'probabilities', 'Count', 'CustomName','KeyBERT+MMR']]
# Rename columns
topic_data.rename({'topics':'topic_id',
                   'Count':'proposals_per_this_topic',
                   'CustomName':'openai_topic_name',
                   'KeyBERT+MMR':'keywords'}, axis=1, inplace=True)

# Export the data
# (disabled here; the export happens after similar topics are merged further down)
# topic_data.to_excel('../data/nyc_pb_proposals_topics_20240806.xlsx', index=False)

## Code below is for calculating cosine similarities to cluster centroids when using KMeans clustering
# reduced_embeddings = dim_model.fit_transform(embeddings)
# cluster_labels = cluster_model.fit_predict(reduced_embeddings)
# centroids = cluster_model.cluster_centers_

# # Calculate cosine similarities for each point to its cluster centroid
# cosine_similarities_to_centroid = []
# for i, embedding in enumerate(reduced_embeddings):
#     centroid = centroids[cluster_labels[i]]
#     # Reshape inputs for cosine_similarity function
#     similarity = cosine_similarity(
#         embedding.reshape(1, -1), 
#         centroid.reshape(1, -1)
#     )
#     cosine_similarities_to_centroid.append(similarity[0][0])

# Add to DataFrame
# topic_data['cosine_similarity_to_centroid'] = cosine_similarities_to_centroid



## Visualizations for Tuning

In [None]:
# Histogram of the highest topic probability for proposals currently labelled
# as outliers (topic -1)
fig = (
    px.histogram(
        data.loc[data['topics'] == -1],
        x='probabilities',
        nbins=100,
    )
    .update_traces(marker={'line': {'width': 1, 'color': 'DarkSlateGrey'}})
    .update_layout(title='Probabilities of Topics')
    .update_xaxes(title='Probability')
    .update_yaxes(title='Number of Proposals')
)
fig.show()

In [None]:
# Cumulative, probability-normalised histogram of the same outlier-only
# (topic -1) probabilities
fig = (
    px.histogram(
        data.loc[data['topics'] == -1],
        x='probabilities',
        nbins=100,
        histnorm='probability',
        cumulative=True,
    )
    .update_traces(marker={'line': {'width': 1, 'color': 'DarkSlateGrey'}})
    .update_layout(title='Cumulative Distribution of Probabilities')
    .update_xaxes(title='Probability')
    .update_yaxes(title='Cumulative Probability')
)
fig.show()

In [None]:
# Distribution of proposal topic probabilities by topic id:
# summary statistics of the 'probabilities' column for each value of 'topics'.
# Must stay the last expression so the notebook renders the table.
data.groupby('topics')['probabilities'].describe()

In [None]:
# Topic information: the model's per-topic summary table (the same table that is
# merged onto the proposals elsewhere in this notebook).
# Must stay the last expression so the notebook renders the table.
topic_model.get_topic_info()

In [None]:
# Bar charts of the top 15 keyword scores for up to 39 topics, titled with the custom labels
fig = topic_model.visualize_barchart(
    custom_labels=True,
    n_words=15,
    top_n_topics=39,
    width=700,
    height=400,
)

# Increase the y-axis tick font size
fig.update_yaxes(tickfont={'size': 13})

fig.write_html('../viz/word_score.html')

In [None]:
# Horizontal bar chart of how many proposals fall under each topic, outliers (-1) excluded
agg_topic_data = topic_model.get_topic_info().loc[lambda info: info['Topic'] != -1]

fig = px.bar(
    agg_topic_data.sort_values(by='Count', ascending=False),
    x='Count',
    y='CustomName',
    orientation='h',
    color='CustomName',
    title='Proposals by Topic',
    labels={'CustomName': '', 'Count': 'Number of Proposals'},
    width=1200,
    height=800,
)

# Bars are labelled on the axis already, so hide the legend; transparent plot background
fig.update_layout(plot_bgcolor='rgba(0, 0, 0, 0)', showlegend=False)
fig.update_xaxes(gridcolor='lightgray', showgrid=True)

fig.write_html('../viz/proposal_distribution.html')

In [None]:
# Topic similarity heatmap, labelled with the custom topic names
fig = (
    topic_model.visualize_heatmap(custom_labels=True, width=1600, height=1050)
    # Blank out the title and enlarge the tick labels on both axes
    .update_layout(title_text="")
    .update_xaxes(tickfont={'size': 14})
    .update_yaxes(tickfont={'size': 14})
)
fig.write_html('../viz/topic_similarity.html')

## Merging Similar Topics

In [None]:
# Reduce the number of topics to 21.
# reduce_topics updates topic_model in place and returns the model, not the
# per-document topic ids, so new_topics is read from topics_ afterwards.
# (Binding the return value to new_topics would leave a model object in a
# variable that the evaluation helper expects to be a list of topic ids.)
topic_model.reduce_topics(documents, nr_topics=21)
new_topics = topic_model.topics_

# Set custom topic names: join the words of each topic's
# 'KeyBERT+MMR+OpenAI' (word, score) pairs with " | "
chatgpt_topic_labels = {
    topic: " | ".join(list(zip(*values))[0]) 
    for topic, values in topic_model.topic_aspects_["KeyBERT+MMR+OpenAI"].items()
}
# Add outlier topic label
chatgpt_topic_labels[-1] = "Outlier Topic" 
topic_model.set_topic_labels(chatgpt_topic_labels)

# Add topic ids and probabilities to the data
# ('probabilities' keeps the largest value of each document's probability vector)
data['topics'] = topic_model.topics_
data['probabilities'] = [max(prob) for prob in topic_model.probabilities_]

# Merge data w/ topics 
topic_data = pd.merge(
    data,
    topic_model.get_topic_info(), 
    left_on='topics', 
    right_on='Topic'
)

# Filter columns, then rename them. rename() returns a new frame, which avoids
# an in-place rename on a column slice of the merge result
topic_data = topic_data[['proposal_id', 'datetime', 'title', 'tags', 'borough', 'neighborhood',
                         'zipcode', 'audience', 'proposal', 'submission_text_embedding_3_large',
                         'topics', 'probabilities', 'Count', 'CustomName','KeyBERT+MMR']]
topic_data = topic_data.rename(columns={'topics':'topic_id',
                                        'Count':'proposals_per_this_topic',
                                        'CustomName':'openai_topic_name',
                                        'KeyBERT+MMR':'keywords'})

# Export the data
topic_data.to_excel('../data/nyc_pb_proposals_topics_20240806.xlsx', index=False)

In [None]:
# Grouped bar chart of proposal counts per topic, split by borough.
# Outlier proposals (topic_id -1) are excluded before counting.
temp = (
    topic_data.loc[topic_data['topic_id'] != -1]
    .groupby(['openai_topic_name', 'borough'])
    .size()
    .reset_index(name='proposals')
)

fig = px.bar(
    temp,
    x='proposals',
    y='openai_topic_name',
    color='borough',
    title='Proposals by Topic and Borough',
    labels={'openai_topic_name': 'Topic Name', 'proposals': 'Count of Proposals'},
    width=1400,
    height=1200,
)

# Centre the title, label the axes and legend, and place borough bars side by side
fig.update_layout(
    title={'text': 'Proposals by Topic and Borough', 'x':0.5},
    xaxis_title='Number of Proposals',
    yaxis_title='Topic Name',
    legend_title='Borough',
    barmode='group',
)
fig.update_yaxes(tickfont={'size': 15})

fig.write_html('../viz/proposal_frequency_by_borough.html')