# LDA Topic Modeling for Readme Go GitHub Repositories
Identify topics or categories in set of documents using a statistical model (Latent Dirichlet Allocation)

In [41]:
import pandas as pd
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" # see the value of multiple statements at once.

# Global Variables
Variables that are used throughout the notebook

In [42]:
# import spacy
# spacy.load('en_core_web_sm')
# from spacy.lang.en import English
# parser = English()

output_dir = 'assets'
num_topics = 10
num_words = 3
passes = 20

# Read in the data
The output.json file contains the data from the GitHub API. The data is in JSON format and contains the following fields:
- name: The name of the repository
- owner: The owner of the repository
- description: The description of the repository
- topics: The topics of the repository
- readme_base64: The base64 encoded content of the README file
- url: The URL of the repository

In [43]:
df = pd.read_json(f'{output_dir}/output.json')
df.shape
df.head()

(15862, 6)

Unnamed: 0,name,owner,description,topics,url,readme_base64
0,hub,mislav,A command-line tool that makes git easier to u...,"[go, homebrew, git, github-api, pull-request]",https://github.com/mislav/hub,aHViIGlzIGEgY29tbWFuZCBsaW5lIHRvb2wgdGhhdCB3cm...
1,lantern,getlantern,Lantern官方版本下载 蓝灯 翻墙 代理 科学上网 外网 加速器 梯子 路由 - Быс...,"[lantern, vpn, censorship, circumvention, gfw,...",https://github.com/getlantern/lantern,IyBMYW50ZXJuClshW2VuXShyZXNvdXJjZXMvRW5nbGlzaC...
2,direnv,direnv,unclutter your .profile,"[direnv, environment, shell-extension, bash, z...",https://github.com/direnv/direnv,ZGlyZW52IC0tIHVuY2x1dHRlciB5b3VyIC5wcm9maWxlCj...
3,go-cache,patrickmn,An in-memory key:value store/cache (similar to...,"[go, cache, library]",https://github.com/patrickmn/go-cache,IyBnby1jYWNoZQoKZ28tY2FjaGUgaXMgYW4gaW4tbWVtb3...
4,dns,miekg,DNS library in Go,"[dnssec, go, dns-library, dns]",https://github.com/miekg/dns,WyFbQnVpbGQgU3RhdHVzXShodHRwczovL3RyYXZpcy1jaS...


# Data Preprocessing
- Convert the base64 encoded content of the README file to a decoded string
- Combine the description, topics, and README content into a single column

In [44]:
from base64 import standard_b64decode
df['readme_base64'] = df['readme_base64'].apply(lambda x: standard_b64decode(x).decode('utf-8'))
df['topics'] = df['topics'].apply(lambda x: ' '.join(x))
df['merged'] = df['description'] + ' ' + df['topics'] + ' ' + df['readme_base64']


In [45]:
doc_set = df.values.T.tolist()[6]
print(doc_set[0:3])
# doc_set




In [46]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from math import isnan

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

tokenized_docs = []
for doc in doc_set:
    if type(doc) != str and isnan(doc):
        doc = ''
    tokens = tokenizer.tokenize(doc.lower())
    tokenized_docs.append(tokens)
    
print("Tokenized Docs:")
print(tokenized_docs[0:3])
# print(tokenized_docs)



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jbenitezg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

Tokenized Docs:
[['a', 'command', 'line', 'tool', 'that', 'makes', 'git', 'easier', 'to', 'use', 'with', 'github', 'go', 'homebrew', 'git', 'github', 'api', 'pull', 'request', 'hub', 'is', 'a', 'command', 'line', 'tool', 'that', 'wraps', 'git', 'in', 'order', 'to', 'extend', 'it', 'with', 'extra', 'features', 'and', 'commands', 'that', 'make', 'working', 'with', 'github', 'easier', 'for', 'an', 'official', 'potentially', 'more', 'user', 'friendly', 'command', 'line', 'interface', 'to', 'github', 'see', 'cli', 'github', 'com', 'https', 'cli', 'github', 'com', 'and', 'this', 'comparison', 'https', 'github', 'com', 'cli', 'cli', 'blob', 'trunk', 'docs', 'gh', 'vs', 'hub', 'md', 'this', 'repository', 'and', 'its', 'issue', 'tracker', 'is', 'not', 'for', 'reporting', 'problems', 'with', 'github', 'com', 'web', 'interface', 'if', 'you', 'have', 'a', 'problem', 'with', 'github', 'itself', 'please', 'contact', 'support', 'https', 'github', 'com', 'contact', 'usage', 'sh', 'hub', 'clone', 'rtom

In [47]:
lemmatized_tokens = []
for lst in tokenized_docs:
    tokens_lemma = [lemmatizer.lemmatize(i) for i in lst]
    lemmatized_tokens.append(tokens_lemma)
    
print(lemmatized_tokens[0:3])


[['a', 'command', 'line', 'tool', 'that', 'make', 'git', 'easier', 'to', 'use', 'with', 'github', 'go', 'homebrew', 'git', 'github', 'api', 'pull', 'request', 'hub', 'is', 'a', 'command', 'line', 'tool', 'that', 'wrap', 'git', 'in', 'order', 'to', 'extend', 'it', 'with', 'extra', 'feature', 'and', 'command', 'that', 'make', 'working', 'with', 'github', 'easier', 'for', 'an', 'official', 'potentially', 'more', 'user', 'friendly', 'command', 'line', 'interface', 'to', 'github', 'see', 'cli', 'github', 'com', 'http', 'cli', 'github', 'com', 'and', 'this', 'comparison', 'http', 'github', 'com', 'cli', 'cli', 'blob', 'trunk', 'doc', 'gh', 'v', 'hub', 'md', 'this', 'repository', 'and', 'it', 'issue', 'tracker', 'is', 'not', 'for', 'reporting', 'problem', 'with', 'github', 'com', 'web', 'interface', 'if', 'you', 'have', 'a', 'problem', 'with', 'github', 'itself', 'please', 'contact', 'support', 'http', 'github', 'com', 'contact', 'usage', 'sh', 'hub', 'clone', 'rtomayko', 'tilt', 'git', 'clon

In [48]:
from stop_words import get_stop_words
en_stop_words = get_stop_words('en')

In [49]:
n=2
tokens = []
for lst in lemmatized_tokens:
    tokens.append([i for i in lst if not i in en_stop_words if len(i) > n])

print(tokens[0:3])
# tokens

[['command', 'line', 'tool', 'make', 'git', 'easier', 'use', 'github', 'homebrew', 'git', 'github', 'api', 'pull', 'request', 'hub', 'command', 'line', 'tool', 'wrap', 'git', 'order', 'extend', 'extra', 'feature', 'command', 'make', 'working', 'github', 'easier', 'official', 'potentially', 'user', 'friendly', 'command', 'line', 'interface', 'github', 'see', 'cli', 'github', 'com', 'http', 'cli', 'github', 'com', 'comparison', 'http', 'github', 'com', 'cli', 'cli', 'blob', 'trunk', 'doc', 'hub', 'repository', 'issue', 'tracker', 'reporting', 'problem', 'github', 'com', 'web', 'interface', 'problem', 'github', 'please', 'contact', 'support', 'http', 'github', 'com', 'contact', 'usage', 'hub', 'clone', 'rtomayko', 'tilt', 'git', 'clone', 'http', 'github', 'com', 'rtomayko', 'tilt', 'git', 'prefer', 'ssh', 'protocol', 'git', 'config', 'global', 'hub', 'protocol', 'ssh', 'hub', 'clone', 'rtomayko', 'tilt', 'git', 'clone', 'git', 'github', 'com', 'rtomayko', 'tilt', 'git', 'see', 'usage', 'e

In [50]:
from gensim import corpora, models

dictionary = corpora.Dictionary(tokens)

In [51]:
corpus = [dictionary.doc2bow(text) for text in tokens]

import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save(f'{output_dir}/dictionary.gensim')

In [52]:
corpus[0]

[(0, 1),
 (1, 1),
 (2, 1),
 (3, 1),
 (4, 7),
 (5, 3),
 (6, 5),
 (7, 2),
 (8, 2),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 3),
 (13, 4),
 (14, 1),
 (15, 1),
 (16, 1),
 (17, 1),
 (18, 1),
 (19, 1),
 (20, 1),
 (21, 1),
 (22, 2),
 (23, 2),
 (24, 1),
 (25, 1),
 (26, 1),
 (27, 6),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 2),
 (33, 4),
 (34, 6),
 (35, 1),
 (36, 1),
 (37, 20),
 (38, 9),
 (39, 2),
 (40, 1),
 (41, 1),
 (42, 3),
 (43, 4),
 (44, 1),
 (45, 4),
 (46, 1),
 (47, 1),
 (48, 2),
 (49, 1),
 (50, 1),
 (51, 1),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 2),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 1),
 (63, 1),
 (64, 3),
 (65, 6),
 (66, 1),
 (67, 1),
 (68, 1),
 (69, 1),
 (70, 2),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 2),
 (75, 1),
 (76, 2),
 (77, 1),
 (78, 3),
 (79, 1),
 (80, 1),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 3),
 (85, 3),
 (86, 1),
 (87, 1),
 (88, 1),
 (89, 1),
 (90, 1),
 (91, 1),
 (92, 2),
 (93, 3),
 (94, 1),
 (95, 1),
 (96, 2),
 (97, 1),
 (98, 3),
 (99, 1),
 (100, 1)

In [53]:
import gensim
ldamodel_3 = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=passes)
ldamodel_3.save(f'{output_dir}/model3.gensim')
# ldamodel_4 = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word = dictionary, passes=20)
# ldamodel_4.save(f'{output_dir}/model4.gensim')

In [54]:
for el in ldamodel_3.print_topics(num_topics=num_topics, num_words=num_words):
    print(el,'\n')


(0, '0.122*"http" + 0.075*"com" + 0.053*"github"') 

(1, '0.036*"string" + 0.026*"err" + 0.021*"func"') 

(2, '0.037*"sponsor" + 0.020*"color" + 0.019*"app"') 

(3, '0.037*"http" + 0.029*"com" + 0.027*"github"') 

(4, '0.023*"http" + 0.022*"server" + 0.013*"com"') 

(5, '0.016*"http" + 0.013*"can" + 0.009*"com"') 

(6, '0.027*"http" + 0.026*"com" + 0.021*"tree"') 

(7, '0.022*"file" + 0.016*"github" + 0.014*"com"') 

(8, '0.041*"aws" + 0.036*"key" + 0.029*"secret"') 

(9, '0.021*"kubernetes" + 0.019*"http" + 0.014*"cluster"') 


In [55]:
# for el in ldamodel_4.print_topics(num_topics=3, num_words=3):
#     print(el,'\n')


In [56]:
dictionary = gensim.corpora.Dictionary.load(f'{output_dir}/dictionary.gensim')

In [57]:
corpus = pickle.load(open('corpus.pkl', 'rb'))

In [58]:
lda = gensim.models.ldamodel.LdaModel.load(f'{output_dir}/model3.gensim')

In [59]:
topic_distributions_3 = [ldamodel_3.get_document_topics(doc) for doc in corpus]
for i, (readme, topics_3) in enumerate(zip(doc_set, topic_distributions_3)):
    print(f"Readme {i+1} Topics (Model 3):")
    for topic, prob in topics_3:
        print(f"Topic {topic}: Probability {prob}")
    print("\n")
    if i >= 3:
        break
    

Readme 1 Topics (Model 3):
Topic 0: Probability 0.23124918341636658
Topic 5: Probability 0.04281098023056984
Topic 7: Probability 0.7145851850509644
Topic 8: Probability 0.010369670577347279


Readme 2 Topics (Model 3):
Topic 0: Probability 0.768635630607605
Topic 4: Probability 0.016907067969441414
Topic 5: Probability 0.06586843729019165
Topic 7: Probability 0.13979987800121307


Readme 3 Topics (Model 3):
Topic 0: Probability 0.047741394490003586
Topic 5: Probability 0.22595329582691193
Topic 7: Probability 0.7253847122192383


Readme 4 Topics (Model 3):
Topic 1: Probability 0.6454435586929321
Topic 5: Probability 0.3514516055583954


In [60]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)

  if isinstance(node, ast.Num):  # <number>
  if isinstance(node, ast.Num):  # <number>
  return node.n
  if isinstance(node, ast.Num):  # <number>
  return node.n
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()
  pid = os.fork()


In [61]:
pyLDAvis.display(lda_display)