In [2]:
import ast
import datetime
import os
from csv import DictWriter

import pandas as pd
import requests
from github import Github
from requests.adapters import HTTPAdapter
from urllib3 import Retry

# Step 1: Obtain a list of relevant repositories

Used GitHub Search (https://seart-ghs.si.usi.ch) to get Python repositories with no fewer than 5 commits over the last 3 years.

In [7]:
# Load the repository export and rank it by star count, most-starred first.
df = pd.read_csv('git_repos_20-23.csv').sort_values(['stargazers'], ascending=False)
df.head()

Unnamed: 0,name,isFork,commits,branches,defaultBranch,releases,contributors,license,watchers,stargazers,...,homepage,mainLanguage,totalIssues,openIssues,totalPullRequests,openPullRequests,lastCommit,lastCommitSHA,hasWiki,isArchived
15442,ultralytics/yolov5,False,2103,36,master,8,240.0,GNU General Public License v3.0,311,28567,...,https://ultralytics.com,Python,6002.0,321.0,1773.0,16.0,2022-07-08T12:32:40,526e650553819dbff67897b9c752c4072e989823,True,False
14249,z4nzu/hackingtool,False,218,1,master,0,27.0,MIT License,1003,26980,...,https://forms.gle/ntuAX8BGRR5yAb9ZA,Python,249.0,80.0,69.0,9.0,2022-12-03T04:47:23,8af26be04fd173113448347c778c7b0edde9a794,True,False
13606,mingrammer/diagrams,False,451,4,master,36,105.0,MIT License,351,26199,...,https://diagrams.mingrammer.com,Python,392.0,236.0,372.0,85.0,2022-09-13T09:11:45,834899659ae2e4f9f0d0dd9d01a4d7f31513d726,True,False
26476,babysor/mockingbird,False,164,10,main,1,33.0,Other,240,25499,...,,Python,713.0,346.0,79.0,8.0,2022-12-16T03:16:25,cd20d21f3d845b2e8b8d0e65b9de5d974ccc85c4,True,False
22515,tencentarc/gfpgan,False,106,1,master,8,11.0,Other,369,25034,...,,Python,259.0,166.0,36.0,10.0,2022-09-16T11:33:26,2eac2033893ca7f427f4035d80fe95b92649ac56,True,False


In [8]:
# GitHub topics that mark a repository as machine-learning related.
# ('machine-learning' was previously misspelled and therefore never matched.)
topics = ['machine-learning', 'deep-learning', 'natural-language-processing',
          'nlp', 'cnn', 'rnn', 'gnn', 'transformer', 'attention', 'generative-adversarial-network',
          'pytorch', 'tensorflow', 'keras', 'pytorch-lightning', 'neural-network']

# Substrings searched for inside a repository's .py files.
contents = ['import torch', 'import keras', 'import tensorflow']

# Substrings searched for in a repository's description.
descriptions = ['torch', 'keras', 'tensorflow']

In [7]:
def create_github_session() -> requests.Session:
    """Build a ``requests`` session that retries HTTPS calls answered with 403.

    A 403 is what this notebook treats as the GitHub API rate-limit signal;
    such responses are retried up to 50 times with a backoff factor of 10.

    Returns:
        A ``requests.Session`` whose ``https://`` adapter carries the retry policy.
    """
    rate_limit_retry = Retry(total=50, backoff_factor=10, status_forcelist=[403])
    github_session = requests.Session()
    github_session.mount('https://', HTTPAdapter(max_retries=rate_limit_retry))
    return github_session

In [10]:
class GetRepositories():
    """Decide whether a GitHub repository is machine-learning related.

    The three attributes are lists of strings supplied by the caller:
      * ``topics``       -- repository topics that mark a repo as relevant
      * ``contents``     -- substrings looked for inside ``.py`` files
      * ``descriptions`` -- substrings looked for in the repo description
    """

    def __init__(self, topics, contents, descriptions):
        self.topics = topics
        self.contents = contents
        self.descriptions = descriptions

    def check_repo(self, access_token, session, name):
        """Query one repository and return its info if it looks ML related.

        Args:
            access_token: GitHub token used for both the probe request and PyGithub.
            session: object with a ``get(url, headers=...)`` method (a
                ``requests.Session``) used for the initial probe.
            name: repository name in ``owner/repo`` form.

        Returns:
            A dict with keys ``name``, ``full_name``, ``topics`` and
            ``description`` when the topics or the description match,
            otherwise ``None`` (also on any failed request).

        Note:
            ``check_contents`` is not called from here; only topics and
            description are inspected.
        """
        url = 'https://api.github.com/'
        print(f'Query {name}...', end=' ')

        header = {'Authorization': f'Bearer {access_token}'}

        # Probe first: every 4xx/5xx answer (400 included) means the repo is unusable.
        if session.get(url + 'repos/' + name, headers=header).status_code >= 400:
            print('Failed!')
            return None

        retries = Retry(total=50, backoff_factor=10, status_forcelist=[403])
        g = Github(access_token, retry=retries)

        try:
            repo = g.get_repo(name)
            # get_topics() issues its own API request, so it is guarded as well.
            repo_info = {'name': name,
                         'full_name': repo.full_name,
                         'topics': repo.get_topics(),
                         'description': repo.description}
        except Exception as e:
            # Not every exception type has a ``.message`` attribute; format the
            # exception itself so the handler cannot raise in turn.
            print(f'Failed! {e}')
            return None

        if self.check_topics(repo_info['topics']):
            print(f'Success!\nFound required topics in: {repo_info["topics"]}!\n')
            return repo_info

        if self.check_description(repo_info['description']):
            print(f'Success!\nFound required description in: <{repo_info["description"]}>!\n')
            return repo_info

        print('Failed!')

        return None

    def check_topics(self, repo_topics):
        """Return True if any configured topic is among ``repo_topics``."""
        return any(topic in repo_topics for topic in self.topics)

    def check_description(self, repo_description):
        """Return True if the description mentions any configured keyword.

        The comparison is case-insensitive so that e.g. ``'torch'`` matches
        "PyTorch". A missing description (``None``) never matches.
        """
        if repo_description is None:
            return False

        lowered = repo_description.lower()
        return any(desc.lower() in lowered for desc in self.descriptions)

    def check_contents(self, repo):
        """Return True if any ``.py`` file in ``repo`` contains a configured substring.

        Walks the repository tree breadth-first through ``repo.get_contents``.
        Files whose content cannot be decoded are skipped rather than ending
        the scan.
        """
        repo_contents = repo.get_contents('')
        while repo_contents:
            file_content = repo_contents.pop(0)
            if file_content.type == "dir":
                repo_contents.extend(repo.get_contents(file_content.path))
                continue

            _, file_extension = os.path.splitext(file_content.path)
            if file_extension != ".py":
                continue

            try:
                file_data = file_content.decoded_content.decode()
            except (AssertionError, UnicodeDecodeError):
                # presumably PyGithub asserts on a non-base64 payload -- TODO confirm;
                # either way one unreadable file must not hide matches in the others.
                continue

            if any(content in file_data for content in self.contents):
                return True

        return False

In [None]:
# Read the token from the environment so no credential is stored in the notebook.
access_token = os.environ.get("GITHUB_TOKEN", "")
session = create_github_session()
names = df['name'].to_list()

field_names = ['name', 'full_name', 'topics', 'description']
get_ml_repos = GetRepositories(topics, contents, descriptions)

# Append mode lets an interrupted run be resumed; newline='' is what the csv
# module expects, and UTF-8 keeps non-ASCII descriptions portable.
with open('ml_repos_sorted.csv', 'a', newline='', encoding='utf-8') as f_object:
    dictwriter_object = DictWriter(f_object, fieldnames=field_names)
    # Only a brand-new (empty) file gets a header, so re-runs do not insert
    # extra header rows in the middle of the data.
    if f_object.tell() == 0:
        dictwriter_object.writeheader()

    for i, name in enumerate(names):
        print(i, end=' ')
        repo_info = get_ml_repos.check_repo(access_token, session, name)
        if repo_info is not None:
            dictwriter_object.writerow(repo_info)

# Step 2.0: Gather issues from repositories

In [None]:
# Reload the ML-related repositories written in Step 1
# (columns: name, full_name, topics, description). Note that this rebinds `df`.
df = pd.read_csv("ml_repos_sorted.csv")
df.head()

Unnamed: 0,name,full_name,topics,description
0,ultralytics/yolov5,ultralytics/yolov5.1,"['yolov3', 'yolov4', 'yolov5', 'object-detecti...",YOLOv5 🚀 in PyTorch > ONNX > CoreML > TFLite
1,babysor/mockingbird,babysor/MockingBird,"['ai', 'speech', 'pytorch', 'deep-learning']",🚀AI拟声: 5秒内克隆您的声音并生成任意语音内容 Clone a voice in 5 s...
2,tencentarc/gfpgan,TencentARC/GFPGAN,"['pytorch', 'gan', 'deep-learning', 'super-res...",GFPGAN aims at developing Practical Algorithms...
3,xinntao/real-esrgan,xinntao/Real-ESRGAN,"['esrgan', 'pytorch', 'real-esrgan', 'super-re...",Real-ESRGAN aims at developing Practical Algor...
4,jaidedai/easyocr,JaidedAI/EasyOCR,"['ocr', 'deep-learning', 'crnn', 'pytorch', 'l...",Ready-to-use OCR with 80+ supported languages ...


In [None]:
def check_issues(access_token, session, name):
    """Collect the issues of one repository via the GitHub API.

    Args:
        access_token: GitHub token used for both the probe request and PyGithub.
        session: object with a ``get(url, headers=...)`` method (a
            ``requests.Session``) used for the initial probe.
        name: repository name in ``owner/repo`` form.

    Returns:
        A list of dicts with keys ``repo``, ``url``, ``number``, ``title`` and
        ``labels`` (label names), or ``None`` when the repository could not
        be queried.
    """
    url = 'https://api.github.com/'
    print(f'Query {name}...', end=' ')

    header = {'Authorization': f'Bearer {access_token}'}

    # Probe first: every 4xx/5xx answer (400 included) means the repo is unusable.
    if session.get(url + 'repos/' + name, headers=header).status_code >= 400:
        print('Failed!')
        return None

    retries = Retry(total=50, backoff_factor=10, status_forcelist=[403])
    g = Github(access_token, retry=retries)

    try:
        repo = g.get_repo(name)
    except Exception as e:
        # Not every exception type has a ``.message`` attribute; format the
        # exception itself so the handler cannot raise in turn.
        print(f'Failed! {e}')
        return None

    issues_list = []
    for issue in repo.get_issues():
        # Drops only comment-less entries that are not pull requests.
        # NOTE(review): pull requests are therefore kept -- confirm this is intended.
        if issue.comments == 0 and issue.pull_request is None:
            continue

        # Turn the API URL (.../repos/<owner>/<repo>/issues/<n>) into a github.com URL.
        api_url = issue.url
        web_url = 'https://github.com/' + api_url[api_url.find('repos/') + 6:]

        issues_list.append({'repo': name,
                            'url': web_url,
                            'number': issue.number,
                            'title': issue.title,
                            'labels': [label.name for label in issue.labels]})

    print(f'Success! Found {len(issues_list)} issues!')
    return issues_list

#### Extract issues in the same DataFrame

In [None]:
# Read the token from the environment so no credential is stored in the notebook.
access_token = os.environ.get("GITHUB_TOKEN", "")
session = create_github_session()
field_names = ['name', 'full_name', 'topics', 'description', 'url', 'issues']

# Append mode lets an interrupted run be resumed; newline='' is what the csv
# module expects, and UTF-8 keeps non-ASCII titles/descriptions portable.
with open('ml_repos_with_issues.csv', 'a', newline='', encoding='utf-8') as f_object:
    dictwriter_object = DictWriter(f_object, fieldnames=field_names)
    # Only a brand-new (empty) file gets a header, so re-runs do not insert
    # extra header rows in the middle of the data.
    if f_object.tell() == 0:
        dictwriter_object.writeheader()

    for i, row in df.iterrows():
        print(i, end=' ')
        row = row.to_dict()
        issues_list = check_issues(access_token, session, row['name'])

        row['url'] = 'https://github.com/' + row['name']
        # None (failed query) is written as an empty cell by the csv writer.
        row['issues'] = issues_list
        dictwriter_object.writerow(row)

0 Query ultralytics/yolov5... Success! Found 267 issues!
1 Query babysor/mockingbird... Success! Found 301 issues!
2 Query tencentarc/gfpgan... Success! Found 105 issues!
3 Query xinntao/real-esrgan... Success! Found 230 issues!
4 Query jaidedai/easyocr... Success! Found 98 issues!
5 Query jina-ai/jina... Success! Found 22 issues!
6 Query huggingface/datasets... Success! Found 419 issues!
7 Query lucidrains/vit-pytorch... Success! Found 66 issues!
8 Query microsoft/bringing-old-photos-back-to-life... Success! Found 53 issues!
9 Query ml-tooling/best-of-ml-python... Success! Found 3 issues!
10 Query microsoft/qlib... Success! Found 196 issues!
11 Query lyhue1991/eat_tensorflow2_in_30_days... Success! Found 11 issues!
12 Query lucidrains/dalle2-pytorch... Success! Found 24 issues!
13 Query megvii-basedetection/yolox... Success! Found 458 issues!
14 Query xmu-xiaoma666/external-attention-pytorch... Success! Found 28 issues!
15 Query microsoft/deepspeed... Success! Found 456 issues!
16 Que

#### Extract issues in separate DataFrame

In [None]:
# Read the token from the environment so no credential is stored in the notebook.
access_token = os.environ.get("GITHUB_TOKEN", "")
session = create_github_session()
# Must match the keys of the dicts produced by check_issues ('repo', not 'name');
# DictWriter raises ValueError on keys missing from fieldnames.
field_names = ['repo', 'number', 'title', 'labels', 'url']

# Append mode lets an interrupted run be resumed; newline='' is what the csv
# module expects, and UTF-8 keeps non-ASCII titles portable.
with open('ml_issues.csv', 'a', newline='', encoding='utf-8') as f_object:
    dictwriter_object = DictWriter(f_object, fieldnames=field_names)
    # Only a brand-new (empty) file gets a header.
    if f_object.tell() == 0:
        dictwriter_object.writeheader()

    for i, row in df.iterrows():
        print(i, end=' ')
        issues_list = check_issues(access_token, session, row['name'])
        # check_issues returns None when the repository could not be queried.
        if issues_list:
            dictwriter_object.writerows(issues_list)

# Step 2.1: Make a separate DataFrame with issues

In [None]:
# The file is written with a header row, so let pandas use it for the column
# names (header=None would turn that row into data and name the columns 0..5).
# The 'issues' cell holds the text form of a list of dicts; parse it back into
# Python objects, treating an empty cell (failed query) as "no issues".
df = pd.read_csv(
    "ml_repos_with_issues.csv",
    converters={"issues": lambda cell: ast.literal_eval(cell) if cell else []},
)

In [28]:
field_names = ['repo', 'number', 'title', 'labels', 'url']

# Append mode lets an interrupted run be resumed; newline='' is what the csv
# module expects, and UTF-8 keeps non-ASCII titles portable.
with open('ml_issues.csv', 'a', newline='', encoding='utf-8') as f_object:
    dictwriter_object = DictWriter(f_object, fieldnames=field_names)
    # Only a brand-new (empty) file gets a header.
    if f_object.tell() == 0:
        dictwriter_object.writeheader()

    for i, row in df.iterrows():
        issues = row["issues"]
        # A CSV round trip turns the list of issue dicts into its text form;
        # parse it back instead of iterating over the characters of a string.
        if isinstance(issues, str):
            issues = ast.literal_eval(issues) if issues else []
        elif not isinstance(issues, list):
            # Missing value (e.g. NaN for a repository whose query failed).
            continue

        for issue in issues:
            issue["repo"] = row["name"]
            dictwriter_object.writerow(issue)

# Step 2.2: Get issues' body

In [3]:
# Load the repository table from its JSON Lines file (one JSON record per line).
repos_df = pd.read_json("json/ml_repos_sorted.jsonl", orient="records", lines=True)
repos_df.head()

Unnamed: 0,name,full_name,topics,description
0,ultralytics/yolov5,ultralytics/yolov5.1,"[yolov3, yolov4, yolov5, object-detection, pyt...",YOLOv5 🚀 in PyTorch > ONNX > CoreML > TFLite
1,babysor/mockingbird,babysor/MockingBird,"[ai, speech, pytorch, deep-learning]",🚀AI拟声: 5秒内克隆您的声音并生成任意语音内容 Clone a voice in 5 s...
2,tencentarc/gfpgan,TencentARC/GFPGAN,"[pytorch, gan, deep-learning, super-resolution...",GFPGAN aims at developing Practical Algorithms...
3,xinntao/real-esrgan,xinntao/Real-ESRGAN,"[esrgan, pytorch, real-esrgan, super-resolutio...",Real-ESRGAN aims at developing Practical Algor...
4,jaidedai/easyocr,JaidedAI/EasyOCR,"[ocr, deep-learning, crnn, pytorch, lstm, mach...",Ready-to-use OCR with 80+ supported languages ...


In [4]:
# Load the issue table from its JSON Lines file (one JSON record per line).
issues_df = pd.read_json("json/ml_issues.jsonl", orient="records", lines=True)
issues_df.head()

Unnamed: 0,repo,number,title,labels,url
0,ultralytics/yolov5,10910,Custom Models using PyTorch Hub,[question],https://github.com/ultralytics/yolov5/issues/1...
1,ultralytics/yolov5,10909,Training my custom dataset with unfrozen the l...,[question],https://github.com/ultralytics/yolov5/issues/1...
2,ultralytics/yolov5,10905,Cannot reproduce the 64.1 mAP on COCO dataset ...,[question],https://github.com/ultralytics/yolov5/issues/1...
3,ultralytics/yolov5,10904,urllib.error.URLError: <urlopen error [WinErro...,[question],https://github.com/ultralytics/yolov5/issues/1...
4,ultralytics/yolov5,10900,detect -> COM,[question],https://github.com/ultralytics/yolov5/issues/1...


In [78]:
def check_issues_body(access_token, session, name, number):
    """Fetch the body text of a single issue.

    Args:
        access_token: GitHub token used for both the probe request and PyGithub.
        session: object with a ``get(url, headers=...)`` method (a
            ``requests.Session``) used for the initial probe.
        name: repository name in ``owner/repo`` form.
        number: issue number inside that repository.

    Returns:
        The issue body as returned by PyGithub, or ``None`` when the
        repository or the issue could not be fetched.
    """
    url = 'https://api.github.com/'
    print(f'Query {url}repos/{name}/issues/{number} ...', end=' ')

    header = {'Authorization': f'Bearer {access_token}'}

    # Probe first: every 4xx/5xx answer (400 included) means the repo is unusable.
    if session.get(url + 'repos/' + name, headers=header).status_code >= 400:
        print('Failed!')
        return None

    retries = Retry(total=50, backoff_factor=10, status_forcelist=[403])
    g = Github(access_token, retry=retries)

    try:
        repo = g.get_repo(name)
    except Exception as e:
        # Not every exception type has a ``.message`` attribute; format the
        # exception itself so the handler cannot raise in turn.
        print(f'Failed! {e}')
        return None

    try:
        # int() normalises numpy integers coming out of a DataFrame column.
        issue = repo.get_issue(number=int(number))
    except Exception:
        print('Failed!')
        return None

    print('Success!')
    return issue.body

In [80]:
pd.options.mode.chained_assignment = None  # default='warn'

# Never store a token in the notebook: read it from the environment instead.
# Any token that was ever committed here must be treated as leaked and revoked.
access_token = os.environ["GITHUB_TOKEN"]
session = create_github_session()

with open('json/transformer_issues_with_body.jsonl', 'a', encoding='utf-8') as f_object:
    for i, row in repos_df.iterrows():
        if 'transformer' not in row.topics \
            and 'transformers' not in row.topics:
            continue

        print(i)

        # Work on a copy so the new column is not written into a slice of issues_df.
        repo_issues = issues_df.loc[issues_df.repo == row['name']].copy()

        # One body per issue row. Assigning a scalar to the column inside the
        # loop would overwrite every row with the most recent body.
        repo_issues['body'] = [
            check_issues_body(access_token, session, row['name'], int(number))
            for number in repo_issues['number']
        ]

        chunk = repo_issues.to_json(orient="records", lines=True)
        # Keep successive appends on separate lines even if the serialised
        # chunk does not end with a newline.
        if chunk and not chunk.endswith('\n'):
            chunk += '\n'
        f_object.write(chunk)

7
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/233 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/232 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/228 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/225 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/222 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/217 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/216 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/215 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/213 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/204 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/201 ... Success!
Query https://api.github.com/repos/lucidrains/vit-pytorch/issues/199 ... Success!
Query https://