In [1]:
import pandas as pd
import datetime
from github import Github
import os
import requests

# Step 1: Obtain a list of relevant repositories

Used GitHub Search (https://seart-ghs.si.usi.ch) to get Python repositories with no fewer than 5 commits over the last 3 years.

In [4]:
# Load the repository list (per the note above, exported from GitHub Search).
df = pd.read_csv('git_repos_20-23.csv')
# A bare `df.shape[0]` in the middle of a cell is evaluated and thrown away,
# because only the last expression is displayed; print the row count instead.
print(f'{df.shape[0]} repositories loaded')
df.head()

Unnamed: 0,name,isFork,commits,branches,defaultBranch,releases,contributors,license,watchers,stargazers,...,homepage,mainLanguage,totalIssues,openIssues,totalPullRequests,openPullRequests,lastCommit,lastCommitSHA,hasWiki,isArchived
0,familysearch/gedcom,False,346,5,main,12,9.0,Apache License 2.0,19,85,...,,Python,113.0,46.0,102.0,2.0,2023-01-10T09:52:20,c2c51d4b5db95cea90d56f27b5c08626f30f790f,True,False
1,er10yi/magicude,False,98,2,main,12,1.0,,12,423,...,,Python,9.0,2.0,2.0,0.0,2022-01-10T03:28:29,b20f98939e928847f323d903f358221380a1ea74,True,False
2,returntocorp/semgrep-rules,False,2234,171,develop,0,140.0,Other,24,413,...,https://semgrep.dev/registry,Python,480.0,22.0,1870.0,12.0,2022-09-02T05:45:18,7ae34f1ef07022d3b318f43aa3cafc646af944be,False,False
3,keguoyu/minijvm,False,11,3,main,0,0.0,,1,62,...,,Python,,,,,2021-02-20T07:13:03,4da519cadead7164de37fb9b8913360dd45f9193,True,False
4,wzhe06/sparrowrecsys,False,104,4,master,0,11.0,Apache License 2.0,51,1673,...,http://wzhe.me/SparrowRecSys/,Python,28.0,14.0,36.0,9.0,2021-09-09T07:18:59,579577c9a25f75fc6bbdfeb2f258f7c8a35833dd,True,False


In [5]:
# Break every "owner/repo" full name into its components.
split_names = list(map(lambda full_name: full_name.split('/'), df.name.to_list()))
# Parallel lists: position i holds the owner and the repository of row i.
user_names = [owner for owner, *_ in split_names]
repo_names = [parts[1] for parts in split_names]

In [6]:
# GitHub topics that mark a repository as machine-learning related.
# 'machine-learning' was previously misspelt as 'maching-learning', so
# repositories carrying only that topic were never matched.
topics = ['machine-learning', 'deep-learning', 'natural-language-processing',
          'nlp', 'cnn', 'rnn', 'gnn', 'transformer', 'attention', 'generative-adversarial-network',
          'pytorch', 'tensorflow', 'keras', 'pytorch-lightning', 'neural-network']

# Substrings searched for inside the repository's .py files.
contents = ['import torch', 'import keras', 'import tensorflow']

# Substrings searched for inside the repository description.
descriptions = ['torch', 'keras', 'tensorflow']

In [None]:
class GetRepositories():
    """Decide whether a GitHub repository looks like a machine-learning project.

    A repository qualifies when any of the following holds (checked in this
    order, cheapest first): one of its topics is listed in ``topics``, its
    description contains one of ``descriptions``, or one of its ``.py`` files
    contains one of ``contents``.
    """

    API_URL = 'https://api.github.com/'

    def __init__(self, topics, contents, descriptions):
        """Store the keyword lists used by the three checks.

        Args:
            topics: topic names compared for equality with the repo topics.
            contents: substrings searched for in the text of ``.py`` files.
            descriptions: substrings searched for in the repo description.
        """
        self.topics = topics
        self.contents = contents
        self.descriptions = descriptions

    def check_repo(self, access_token, user_name, repo_name):
        """Return True when ``user_name/repo_name`` matches any criterion.

        Progress and the outcome are printed. The method returns False when
        the user or the repository cannot be fetched through the REST API or
        when no criterion matches. Exceptions raised by the PyGithub calls
        (for example when the rate limit is exhausted) are not caught here.

        Args:
            access_token: GitHub token; an empty value means anonymous access.
            user_name: owner part of the repository full name.
            repo_name: repository part of the full name.
        """
        print(f'Query user = {user_name} repo = {repo_name}...', end=' ')

        # An empty token would be sent as "Bearer " and rejected by the API,
        # which used to make every repository look missing.
        header = {'Authorization': f'Bearer {access_token}'} if access_token else {}

        if not self._resource_exists('users/' + user_name, header, 'User'):
            return False

        if not self._resource_exists('repos/' + user_name + '/' + repo_name, header, 'Repo'):
            return False

        # Passing None instead of "" makes PyGithub work unauthenticated.
        g = Github(access_token or None)
        user = g.get_user(user_name)
        repo = user.get_repo(repo_name)

        if self.check_topics(repo):
            print(f'Success!\nFound required topics in: {repo.get_topics()}!\n')
            return True

        if self.check_description(repo):
            print(f'Success!\nFound required description in: <{repo.description}>!\n')
            return True

        if self.check_contents(repo):
            print('Success!\nFound required content!\n')
            return True

        print('Failed!')

        return False

    def _resource_exists(self, path, header, label):
        """Return True when the endpoint ``path`` answers without an HTTP error.

        Only 404 is reported as "does not exist". Any other error status
        (400, 401, rate-limit 403, 5xx, ...) says nothing about existence and
        is reported with its status code. Both cases return False.
        """
        status = requests.get(self.API_URL + path, headers=header).status_code
        if status < 400:
            return True
        if status == 404:
            print(f'Failed!\n{label} does not exist!\n')
        else:
            print(f'Failed!\nGitHub API returned HTTP {status} for {label.lower()}!\n')
        return False

    def check_topics(self, repo):
        """Return True when the repository has at least one wanted topic."""
        repo_topics = repo.get_topics()
        return any(topic in repo_topics for topic in self.topics)

    def check_description(self, repo):
        """Return True when the description mentions a wanted keyword.

        Matching is case-insensitive so that spellings such as "PyTorch" or
        "TensorFlow" match the lowercase keywords. A missing or empty
        description never matches.
        """
        repo_description = repo.description
        if not repo_description:
            return False

        lowered = repo_description.lower()
        return any(desc.lower() in lowered for desc in self.descriptions)

    def check_contents(self, repo):
        """Return True when any ``.py`` file contains a wanted substring.

        The tree is walked breadth-first from the repository root. Files whose
        content cannot be decoded are skipped, so one unreadable file no
        longer ends the whole scan.
        """
        pending = repo.get_contents('')
        while pending:
            entry = pending.pop(0)
            if entry.type == "dir":
                pending.extend(repo.get_contents(entry.path))
                continue

            if os.path.splitext(entry.path)[1] != ".py":
                continue

            try:
                file_data = entry.decoded_content.decode()
            except (AssertionError, UnicodeDecodeError):
                # AssertionError: raised while reading decoded_content
                # (presumably PyGithub rejecting a non-base64 payload -
                # confirm against the installed version).
                # UnicodeDecodeError: the bytes are not valid UTF-8.
                continue

            if any(content in file_data for content in self.contents):
                return True

        return False

In [57]:
# Read the token from the environment so it is never saved with the notebook.
# When GITHUB_TOKEN is unset this falls back to the previous empty-string value.
access_token = os.environ.get("GITHUB_TOKEN", "")

get_ml_repos = GetRepositories(topics, contents, descriptions)
# Collected [user_name, repo_name] pairs for every repository that matched.
ml_repos = []

for i, (user_name, repo_name) in enumerate(zip(user_names, repo_names)):
    # Row index printed in front of the per-repository log line.
    print(i, end=' ')
    is_ml_repo = get_ml_repos.check_repo(access_token, user_name, repo_name)

    if is_ml_repo:
        ml_repos.append([user_name, repo_name])

0 Query user = familysearch repo = gedcom... Failed!
1 Query user = er10yi repo = magicude... Failed!
2 Query user = returntocorp repo = semgrep-rules... Failed!
3 Query user = keguoyu repo = minijvm... Failed!
4 Query user = wzhe06 repo = sparrowrecsys... Success!
Found required topics in: ['recommender-system', 'deep-learning', 'machine-learning']!

5 Query user = cesena repo = ghidra2dwarf... Failed!
6 Query user = notro repo = gud... Failed!
7 Query user = nvlabs repo = poserbpf... Success!
Found required content!

8 Query user = kharacternyk repo = barva... Failed!
9 Query user = microsoft repo = microsoft-rocketbox... Failed!
10 Query user = strangerealintel repo = cerberus... Failed!
11 Query user = lightstep repo = hipster-shop... Failed!
12 Query user = dataignitelab repo = whaleshark_iiot... Failed!
13 Query user = yaourdt repo = mgos-to-tasmota... Failed!
14 Query user = aws repo = aws-graviton-getting-started... Failed!
15 Query user = joshwardell repo = canserver... Failed

RateLimitExceededException: 403 {"message": "API rate limit exceeded for user ID 33371372.", "documentation_url": "https://docs.github.com/rest/overview/resources-in-the-rest-api#rate-limiting"}