# Topic Modeling for Commits

In [None]:
import pygsheets
import pandas as pd
import stanza
from bs4 import BeautifulSoup
from IPython.display import clear_output
from gensim.parsing.preprocessing import STOPWORDS
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel, LdaModel, HdpModel
from markdown import markdown

### Pre-processing and helper methods

In [None]:
# Extra stop words layered on top of the set imported from gensim. The entries
# are listed alphabetically so additions and duplicates are easy to spot.
STOPWORDS = STOPWORDS | frozenset({
    "accurics", "add", "added", "apply",
    "boto", "botocore", "branch", "bump", "bump-galoy-pay-image", "bumped",
    "change", "code", "computed", "costa", "create",
    "diff", "draft",
    "feat", "fix", "fixes",
    "galoy-pay", "github",
    "hana", "hashicorp",
    "i'm", "iam", "instead", "issue",
    "kvo",
    "lambda", "latest", "link",
    "merge",
    "need", "needed", "new",
    "payload", "plan", "pr", "project", "pull",
    "run",
    "service", "source", "sourced", "step", "support",
    "tco", "test", "tf", "they're",
    "update", "updates", "use",
    "v1", "var", "variable", "version",
})

# Universal POS tags of the words that prepare_document() keeps.
UPOS = ('PROPN', 'NOUN', 'VERB', 'ADJ', 'ADV')

# English stanza pipeline shared by every prepare_document() call; built once
# here rather than per document.
nlp_pipeline = stanza.Pipeline(
    lang='en',
    processors='tokenize,mwt,pos,lemma',
)

def prepare_document(doc):
    """Turn one raw document into a list of lemmas for topic modeling.

    The text is rendered from Markdown to HTML, the markup is stripped, and
    the remaining plain text is run through the module-level ``nlp_pipeline``.
    A word's lemma is kept when its universal POS tag is listed in ``UPOS``
    and its surface text is not in ``STOPWORDS``.

    Args:
        doc: Raw document text; may contain Markdown and/or HTML.

    Returns:
        list: Lemmas of the retained words, in document order.
    """
    # Remove MarkDown and HTML.
    # The parser is named explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and warns about the guess.
    soup = BeautifulSoup(markdown(doc), "html.parser")
    # find_all(string=True) is the current spelling of findAll(text=True).
    clean_doc = ''.join(soup.find_all(string=True))

    tokens = []
    for token in nlp_pipeline(clean_doc).iter_tokens():
        # Walk the token's words instead of indexing token.to_dict()[0]: for a
        # multi-word token that first entry describes the surface span and
        # carries no "upos"/"lemma" keys, so indexing it raised KeyError.
        for word in token.words:
            if word.upos not in UPOS or word.lemma is None:
                continue
            # The custom STOPWORDS entries are lower-case, so the surface text
            # is lower-cased before the lookup; otherwise capitalised forms
            # such as "Add" or "Fix" would slip through the filter.
            if word.text.lower() in STOPWORDS:
                continue
            tokens.append(word.lemma)
    return tokens

def prepare_corpus(documents):
    """Pre-process every document and collect the resulting token lists.

    While running, the progress percentage and the document currently being
    parsed are printed, and the notebook output is cleared after each
    document.

    Args:
        documents: Sized iterable of raw document strings. Items are consumed
            by iteration, so any container with ``len()`` works, not only
            ones indexable by ``0..n-1``.

    Returns:
        list: One token list per document (see ``prepare_document``), in
        input order.
    """
    total_docs = len(documents)
    corpus = []

    # enumerate() replaces range(len(...)) + documents[i]; positional indexing
    # fails on containers whose integer keys are not 0..n-1 (for example a
    # filtered pandas Series).
    for position, document in enumerate(documents, start=1):
        print(f'{total_docs} documents: {position/total_docs*100:.2f}% parsed')
        print(document)
        corpus.append(prepare_document(document))
        clear_output(wait=False)

    return corpus

def build_tfidf_model(corpus):
    """Build the dictionary, bag-of-words corpus and TF-IDF model for a corpus.

    Args:
        corpus: Re-iterable collection of token lists (it is traversed once to
            build the dictionary and once more to build the bag-of-words
            vectors).

    Returns:
        tuple: ``(dictionary, bow_corpus, tfidf_model)`` where ``bow_corpus``
        is a tuple holding one ``doc2bow`` vector per input document and
        ``tfidf_model`` is a ``TfidfModel`` fitted on it with ``normalize=True``.
    """
    vocabulary = Dictionary(corpus)
    bow_vectors = tuple(map(vocabulary.doc2bow, corpus))
    weighting = TfidfModel(bow_vectors, normalize=True)
    return vocabulary, bow_vectors, weighting

def get_keywords(model, num_topics=-1, num_words=5):
    """Collect the distinct top words across a model's topics.

    Args:
        model: Object exposing ``show_topics(num_topics, num_words, formatted)``
            that returns ``(topic_id, [(word, weight), ...])`` entries when
            called with ``formatted=False``.
        num_topics: Forwarded to ``show_topics``.
        num_words: Number of words requested per topic.

    Returns:
        list: The unique words, sorted in ascending order.
    """
    shown = model.show_topics(
        num_topics=num_topics,
        num_words=num_words,
        formatted=False,
    )
    unique_terms = set()
    for topic in shown:
        # topic[1] is the list of (word, weight) pairs; only the word is kept.
        unique_terms.update(entry[0] for entry in topic[1])
    return sorted(unique_terms)

### Data recovery

We download the labeled data from the Google Sheet.
This data can also be obtained from a local spreadsheet file.

In [None]:
# Authenticate against the Google Sheets API and open the commit sheet.
gc = pygsheets.authorize(service_file='< JSON file with credentials for Google API >')
sheets = gc.open_by_url("< URL of your Google Sheet >")
commit_sheet = sheets[0]  # Specific sheet of your Google Sheet

# Fetch every record of the sheet and load it into a DataFrame.
data_dic = commit_sheet.get_all_records()
pandas_df = pd.DataFrame(data=data_dic)

# Keep only the rows whose "Labels" text contains "cost-".
#   regex=False -> match the text literally instead of compiling it as a regex
#   na=False    -> cells that are not strings count as "no match"; without it
#                  they yield NaN, which cannot be used as a boolean mask
is_cost_related = pandas_df["Labels"].str.contains("cost-", regex=False, na=False)
new_df = pandas_df[is_cost_related]
new_df  # Pandas dataframe with cost-related entries

### Preparing Commit corpus

All the commit messages are fetched and pre-processed for the topic modelers.

In [None]:
# Read the commit messages straight from the column instead of iterating the
# frame row by row. astype(str) guards against cells that the sheet reader
# converted to numbers, which markdown() inside prepare_document() cannot take.
commit_array = new_df['Commit Description'].astype(str).tolist()

# Tokenize/lemmatize every message, then build the dictionary, bag-of-words
# corpus and TF-IDF model used by the topic modelers.
commit_corpus = prepare_corpus(commit_array)
commit_corpus_dict, commit_corpus_bow, commit_tfidf_model = build_tfidf_model(commit_corpus)

### Topic Modeling

All three topic modelers are run independently, and only the keywords found by every one of them are kept (the intersection of their keyword sets).

In [None]:
# Number of topics requested from LSI and LDA, and the number of topics
# inspected in each model when collecting keywords.
NUM_TOPICS = 100
# Fixed seed so the LDA and HDP trainings produce the same topics on every run.
# NOTE(review): LSI is left unseeded here; seed it as well if the installed
# gensim version exposes a seed parameter for LsiModel.
RANDOM_SEED = 42

# LSI is trained on the TF-IDF-weighted corpus; LDA and HDP on raw term counts.
commit_lsi_model = LsiModel(
    commit_tfidf_model[commit_corpus_bow],
    id2word=commit_corpus_dict,
    num_topics=NUM_TOPICS,
    onepass=False,
)
commit_lda_model = LdaModel(
    commit_corpus_bow,
    id2word=commit_corpus_dict,
    num_topics=NUM_TOPICS,
    random_state=RANDOM_SEED,
)
commit_hdp_model = HdpModel(
    commit_corpus_bow,
    id2word=commit_corpus_dict,
    random_state=RANDOM_SEED,
)

# Keep only the keywords all three models agree on. get_keywords() is asked
# for NUM_TOPICS topics per model and uses its default of 5 words per topic.
commit_topics = (
    set(get_keywords(commit_lsi_model, num_topics=NUM_TOPICS))
    & set(get_keywords(commit_lda_model, num_topics=NUM_TOPICS))
    & set(get_keywords(commit_hdp_model, num_topics=NUM_TOPICS))
)

### Presenting results

In [None]:
# Display the keywords shared by the LSI, LDA and HDP models.
commit_topics

### Visualization of results

In [None]:
from collections import Counter
from matplotlib import pyplot as plt
# Imported under an alias: binding the bare name STOPWORDS here would overwrite
# the custom stop-word set that prepare_document() reads, and this cell does
# not use the wordcloud list.
from wordcloud import STOPWORDS as WORDCLOUD_STOPWORDS
import matplotlib.colors as mcolors

# (topic_id, [(word, weight), ...]) for every LSI topic, 10 words per topic.
topics = commit_lsi_model.show_topics(formatted=False, num_topics=-1, num_words=10)
# Corpus-wide frequency of every pre-processed token.
data_flat = [w for w_list in commit_corpus for w in w_list]
counter = Counter(data_flat)

# One row per (topic, word) pair: the word, its topic, its weight in that
# topic and how often it occurs in the whole corpus.
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic in topics
    for word, weight in topic
]

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords (first four topics, 2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(16,10), sharey=True, dpi=160)
cols = list(mcolors.TABLEAU_COLORS.values())
for i, ax in enumerate(axes.flatten()):
    # Select this topic's rows once and reuse them for both bar series.
    topic_rows = df.loc[df.topic_id == i, :]
    ax.bar(x='word', height="word_count", data=topic_rows, color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    # The weights live on a second y-axis because their scale differs from
    # the word counts.
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i], fontsize=16)
    # Limits are symmetric around zero so negative values stay visible.
    ax_twin.set_ylim(-1.25, 1.25)
    ax.set_ylim(-500, 500)
    ax.set_title('Topic ' + str(i+1), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    # Style the tick labels the categorical axis already provides rather than
    # overwriting them with set_xticklabels(), which matplotlib discourages
    # when the tick positions have not been fixed first.
    ax.tick_params(axis='x', labelrotation=30, labelsize=16)
    plt.setp(ax.get_xticklabels(), horizontalalignment='right')
    ax.legend(loc='upper left', fontsize=12)
    ax_twin.legend(loc='upper right', fontsize=12)

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords for LSI in commits', fontsize=22, y=1.05)
plt.show()

In [None]:
# Release any figures that are still open. plt.close('all') is used rather
# than plt.figure().clear(), which would open one more (empty) figure.
plt.close('all')

# (topic_id, [(word, weight), ...]) for every LDA topic, 10 words per topic.
topics = commit_lda_model.show_topics(formatted=False, num_topics=-1, num_words=10)
# Corpus-wide frequency of every pre-processed token.
data_flat = [w for w_list in commit_corpus for w in w_list]
counter = Counter(data_flat)

# One row per (topic, word) pair: the word, its topic, its weight in that
# topic and how often it occurs in the whole corpus.
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic in topics
    for word, weight in topic
]

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])
# sort_values() returns a new frame, so the result has to be assigned. A
# stable sort keeps each topic's words in the order the model reported them.
df = df.sort_values(by=['topic_id'], kind='mergesort')

# Plot Word Count and Weights of Topic Keywords (first four topics, 2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(16,10), sharey=True, dpi=160)
cols = list(mcolors.TABLEAU_COLORS.values())
for i, ax in enumerate(axes.flatten()):
    # Select this topic's rows once and reuse them for both bar series.
    topic_rows = df.loc[df.topic_id == i, :]
    ax.bar(x='word', height="word_count", data=topic_rows, color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    # The weights live on a second y-axis because their scale differs from
    # the word counts.
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i], fontsize=16)
    ax_twin.set_ylim(0, 0.085)
    ax.set_ylim(0, 500)
    ax.set_title('Topic ' + str(i+1), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    # Style the tick labels the categorical axis already provides rather than
    # overwriting them with set_xticklabels(), which matplotlib discourages
    # when the tick positions have not been fixed first.
    ax.tick_params(axis='x', labelrotation=30, labelsize=16)
    plt.setp(ax.get_xticklabels(), horizontalalignment='right')
    ax.legend(loc='upper left', fontsize=12)
    ax_twin.legend(loc='upper right', fontsize=12)

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords for LDA in commits', fontsize=22, y=1.05)
plt.show()

In [None]:
# Release any figures that are still open. plt.close('all') is used rather
# than plt.figure().clear(), which would open one more (empty) figure.
plt.close('all')

# (topic_id, [(word, weight), ...]) entries from the HDP model, 10 words per
# topic; the number of topics is left at show_topics()' default.
topics = commit_hdp_model.show_topics(num_words=10, formatted=False)
# Corpus-wide frequency of every pre-processed token.
data_flat = [w for w_list in commit_corpus for w in w_list]
counter = Counter(data_flat)

# One row per (topic, word) pair: the word, its topic, its weight in that
# topic and how often it occurs in the whole corpus.
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic in topics
    for word, weight in topic
]

df = pd.DataFrame(out, columns=['word', 'topic_id', 'importance', 'word_count'])

# Plot Word Count and Weights of Topic Keywords (first four topics, 2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(16,10), sharey=True, dpi=160)
cols = list(mcolors.TABLEAU_COLORS.values())
for i, ax in enumerate(axes.flatten()):
    # Select this topic's rows once and reuse them for both bar series.
    topic_rows = df.loc[df.topic_id == i, :]
    ax.bar(x='word', height="word_count", data=topic_rows, color=cols[i], width=0.5, alpha=0.3, label='Word Count')
    # The weights live on a second y-axis because their scale differs from
    # the word counts.
    ax_twin = ax.twinx()
    ax_twin.bar(x='word', height="importance", data=topic_rows, color=cols[i], width=0.2, label='Weights')
    ax.set_ylabel('Word Count', color=cols[i], fontsize=16)
    ax_twin.set_ylim(0, 0.005)
    ax.set_ylim(0, 150)
    ax.set_title('Topic ' + str(i+1), color=cols[i], fontsize=16)
    ax.tick_params(axis='y', left=False)
    # Style the tick labels the categorical axis already provides rather than
    # overwriting them with set_xticklabels(), which matplotlib discourages
    # when the tick positions have not been fixed first.
    ax.tick_params(axis='x', labelrotation=30, labelsize=16)
    plt.setp(ax.get_xticklabels(), horizontalalignment='right')
    ax.legend(loc='upper left', fontsize=12)
    ax_twin.legend(loc='upper right', fontsize=12)

fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and Importance of Topic Keywords for HDP in commits', fontsize=22, y=1.05)
plt.show()