In [10]:
import pandas as pd
import datetime
from github import Github
import os
import requests
from requests.adapters import HTTPAdapter

from urllib3 import Retry
import json

In [3]:
# Load the repository metadata and order it by star count, most-starred first.
df = (
    pd.read_csv('csv/git_repos_20-23.csv')
    .sort_values(by='stargazers', ascending=False)
)
df.head()

Unnamed: 0,name,isFork,commits,branches,defaultBranch,releases,contributors,license,watchers,stargazers,...,homepage,mainLanguage,totalIssues,openIssues,totalPullRequests,openPullRequests,lastCommit,lastCommitSHA,hasWiki,isArchived
15442,ultralytics/yolov5,False,2103,36,master,8,240.0,GNU General Public License v3.0,311,28567,...,https://ultralytics.com,Python,6002.0,321.0,1773.0,16.0,2022-07-08T12:32:40,526e650553819dbff67897b9c752c4072e989823,True,False
14249,z4nzu/hackingtool,False,218,1,master,0,27.0,MIT License,1003,26980,...,https://forms.gle/ntuAX8BGRR5yAb9ZA,Python,249.0,80.0,69.0,9.0,2022-12-03T04:47:23,8af26be04fd173113448347c778c7b0edde9a794,True,False
13606,mingrammer/diagrams,False,451,4,master,36,105.0,MIT License,351,26199,...,https://diagrams.mingrammer.com,Python,392.0,236.0,372.0,85.0,2022-09-13T09:11:45,834899659ae2e4f9f0d0dd9d01a4d7f31513d726,True,False
26476,babysor/mockingbird,False,164,10,main,1,33.0,Other,240,25499,...,,Python,713.0,346.0,79.0,8.0,2022-12-16T03:16:25,cd20d21f3d845b2e8b8d0e65b9de5d974ccc85c4,True,False
22515,tencentarc/gfpgan,False,106,1,master,8,11.0,Other,369,25034,...,,Python,259.0,166.0,36.0,10.0,2022-09-16T11:33:26,2eac2033893ca7f427f4035d80fe95b92649ac56,True,False


In [4]:
# GitHub topic tags that mark a repository as ML-related (matched exactly
# against the repository's topic list).
topics = ['machine-learning', 'deep-learning', 'natural-language-processing',
          'nlp', 'cnn', 'rnn', 'gnn', 'transformer', 'attention', 'generative-adversarial-network',
          'pytorch', 'tensorflow', 'keras', 'pytorch-lightning', 'neural-network']

# Source-code snippets associated with ML frameworks.
contents = ['import torch', 'import keras', 'import tensorflow']

# Substrings searched for in the repository description.
descriptions = ['torch', 'keras', 'tensorflow']

In [5]:
# Retry on HTTP 403, the status this notebook treats as the GitHub API rate limit.
def create_github_session() -> requests.Session:
    """Return a ``requests.Session`` whose HTTPS adapter retries 403 responses.

    The retry policy allows up to 50 attempts with a backoff factor of 10 and
    is mounted for every ``https://`` URL.
    """
    retry_policy = Retry(total=50, backoff_factor=10, status_forcelist=[403])
    https_adapter = HTTPAdapter(max_retries=retry_policy)

    github_session = requests.Session()
    github_session.mount('https://', https_adapter)
    return github_session

In [21]:
class GetRepositories():
    """Select GitHub repositories by topic or description and dump their issues.

    Matching repositories are appended to ``f_repos`` and their issues to
    ``f_issues``, one JSON object per line.
    """

    def __init__(self, topics, contents, descriptions, f_repos, f_issues):
        """Store the filter lists and the two writable output streams.

        Args:
            topics: topic tags; a repository matches if it carries any of them.
            contents: stored on the instance but not consulted by any method
                of this class.
            descriptions: substrings; a repository matches if its description
                contains any of them.
            f_repos: writable text stream receiving one JSON line per
                matching repository.
            f_issues: writable text stream receiving one JSON line per
                retained issue.
        """
        self.topics = topics
        self.contents = contents
        self.descriptions = descriptions

        self.f_repos = f_repos
        self.f_issues = f_issues

    def check_repo(self, access_token, session, name):
        """Query one repository and record it if it matches the filters.

        Topics are checked first; the description is only checked when no
        topic matched.

        Args:
            access_token: token passed to the ``Github`` client.
            session: not used by this method; kept so existing callers keep
                working.
            name: repository name in ``owner/repo`` form.

        Returns:
            ``(repo_info, issues_list)`` for a matching repository, where
            ``issues_list`` is whatever ``check_issues`` returned (possibly
            ``None``); ``None`` when the repository does not match or any
            exception is raised while querying it.
        """
        try:
            print(f'Query {name}...', end=' ')

            # The client retries on 403 responses (treated as rate limiting).
            retries = Retry(total=50, backoff_factor=10, status_forcelist=[403])
            g = Github(access_token, retry=retries)

            repo = g.get_repo(name)

            repo_info = {'name': name,
                         'full_name': repo.full_name,
                         'topics': repo.get_topics(),
                         'description': repo.description}

            if self.check_topics(repo_info['topics']):
                print(f'Success!\nFound required topics in: {repo_info["topics"]}!\n')
            elif self.check_description(repo_info['description']):
                print(f'Success!\nFound required description in: <{repo_info["description"]}>!\n')
            else:
                print('Failed!')
                return None

            # Issues are collected before the repository line is written.
            issues_list = self.check_issues(repo, name)

            json.dump(repo_info, self.f_repos)
            self.f_repos.write('\n')

            return repo_info, issues_list
        except Exception as e:
            # Best effort: report the failure and let the caller continue with
            # the next repository.
            print('Failed!')
            print(f'In check_repo : {e}')
            return None

    def check_topics(self, repo_topics):
        """Return True if any configured topic appears in ``repo_topics``."""
        return any(topic in repo_topics for topic in self.topics)

    def check_description(self, repo_description):
        """Return True if the description contains any configured substring.

        A ``None`` description never matches.
        """
        if repo_description is None:
            return False

        return any(desc in repo_description for desc in self.descriptions)

    def check_issues(self, repo, name):
        """Write the repository's issues to ``f_issues`` and return them.

        Issues that have no comments and are not pull requests are skipped.

        Args:
            repo: repository object exposing ``get_issues()``.
            name: repository name stored in each issue record.

        Returns:
            The list of issue dicts that were written, or ``None`` if an
            exception was raised while iterating (records written before the
            failure remain in ``f_issues``).
        """
        try:
            issues_list = []
            for issue in repo.get_issues():
                if issue.comments == 0 and issue.pull_request is None:
                    continue

                # Turn the API URL into a web URL by keeping what follows 'repos/'.
                issue_url = issue.url
                url = 'https://github.com/' + issue_url[issue_url.find('repos/') + 6:]

                issue_info = {'repo': name,
                              'url': url,
                              'number': issue.number,
                              'title': issue.title,
                              'body': issue.body,
                              'labels': [label.name for label in issue.labels]}

                json.dump(issue_info, self.f_issues)
                self.f_issues.write('\n')

                issues_list.append(issue_info)

            print(f'Success! Found {len(issues_list)} issues!')
            return issues_list

        except Exception as e:
            print('Failed!')
            print(f'In check_issues : {e}')
            return None

In [23]:
# Read the GitHub token from the environment so it is never stored in the
# notebook; falls back to an empty string when GITHUB_TOKEN is not set.
access_token = os.environ.get('GITHUB_TOKEN', '')
session = create_github_session()
names = df['name'].to_list()

# Opening a file in append mode does not create missing parent directories.
os.makedirs('json', exist_ok=True)

# Append mode: results from an interrupted run are kept and extended.
with open('json/ml_repos_sorted_v1.jsonl', 'a') as f_repos, open('json/ml_issues_v1.jsonl', 'a') as f_issues:
    get_ml_repos = GetRepositories(topics, contents, descriptions, f_repos, f_issues)

    for i, name in enumerate(names):
        print(i, end=' ')
        result = get_ml_repos.check_repo(access_token, session, name)

0 Query ultralytics/yolov5... Success!
Found required topics in: ['yolov3', 'yolov4', 'yolov5', 'object-detection', 'pytorch', 'onnx', 'coreml', 'ios', 'tflite', 'yolo', 'deep-learning', 'machine-learning', 'ml']!

Success! Found 252 issues!
1 Query z4nzu/hackingtool... Failed!
2 Query mingrammer/diagrams... Failed!
3 Query babysor/mockingbird... Success!
Found required topics in: ['ai', 'speech', 'pytorch', 'deep-learning']!

Success! Found 308 issues!
4 Query tencentarc/gfpgan... Success!
Found required topics in: ['pytorch', 'gan', 'deep-learning', 'super-resolution', 'face-restoration', 'image-restoration', 'gfpgan']!



KeyboardInterrupt: 