In [31]:
import requests
import json
import os

# GitHub credentials. Set the GITHUB_USERNAME / GITHUB_TOKEN environment
# variables so a real personal access token is never stored in the notebook;
# the placeholders below are only used when those variables are unset.
username = os.environ.get('GITHUB_USERNAME', 'USER_NAME')
token = os.environ.get('GITHUB_TOKEN', 'YOUR_TOKEN')

# Create a session that sends the credentials (HTTP basic auth) with every request
session = requests.Session()
session.auth = (username, token)

# Directory where the crawl results are written
save_to = "github_crawl"


In [49]:
# Read the field definitions from the JSON file into `fields`.
field_path = "field.json"
with open(field_path) as field_file:
    fields = json.load(field_file)

In [52]:
# Create the output folder. exist_ok=True keeps this cell re-runnable:
# os.mkdir raised FileExistsError as soon as the folder already existed.
os.makedirs(save_to, exist_ok=True)

## Crawl Repositories by Search Keyword (sorted by best match)

In [54]:

# Endpoint of GitHub's repository search API
base_url = 'https://api.github.com/search/repositories'

# Page size sent as `per_page` with every search request.
# NOTE: GitHub's REST API caps `per_page` at 100.
results_to_fetch = 50

# Shared list that accumulates the raw search items
repository_data: list = []

# Function to fetch search results
def fetch_search_results(keyword, page=1):
    """Return the repository items of one search-result page for `keyword`.

    Sends a token-authenticated GET request to `base_url`, asking for
    `results_to_fetch` entries on page number `page`.

    Returns:
        The `items` list of the JSON reply, or an empty list (after printing
        the status code) when the response status is not 200.
    """
    response = requests.get(
        base_url,
        params={'q': keyword, 'per_page': results_to_fetch, 'page': page},
        headers={'Authorization': f'token {token}'},
    )

    # Bail out early on any non-success status
    if response.status_code != 200:
        print(f"Error fetching results. Status code: {response.status_code}")
        return []

    return response.json()['items']



def search_from_keyword(key, save_json = True):
    """Search GitHub for `key` and return the owner/name of each hit.

    Args:
        key: Search keyword passed on to `fetch_search_results`.
        save_json: When True, also write the result to `{save_to}/{key}.json`.

    Returns:
        A list of `{'owner': ..., 'repo': ...}` dicts for this keyword only,
        in the order the search returned them.
    """
    # Collect this keyword's hits in a local list. Building the result from
    # the shared `repository_data` list made every call also return the hits
    # of all keywords searched before it.
    keyword_results = []

    # One request per block of 50 in `results_to_fetch`
    # (a single page while results_to_fetch == 50).
    for page in range(1, (results_to_fetch // 50) + 1):
        keyword_results.extend(fetch_search_results(key, page))

    # Still append to the module-level list so anything reading the running
    # total of raw search items keeps working.
    repository_data.extend(keyword_results)

    # Extract owner and repo name from the search results
    result_data = [
        {'owner': item['owner']['login'], 'repo': item['name']}
        for item in keyword_results
    ]

    # Save the collected data to a JSON file
    if save_json:
        output_path = f'{save_to}/{key}.json'
        with open(output_path, 'w') as json_file:
            json.dump(result_data, json_file, indent=4)
        # Report the file that was actually written
        print(f'Data saved to {output_path} for keyword: {key}')

    return result_data

    


In [None]:

# Flatten the subfields of every field into one keyword list (`sub`) and
# remember which field each subfield came from (`sub_dict`).
sub = []
sub_dict = {}
for field_name in fields:
    for subfield_name in fields[field_name]:
        sub_dict[subfield_name] = field_name
        sub.append(subfield_name)


In [None]:

# Index of the first subfield to crawl; raise it to resume an interrupted run.
start_idx = 0

for sub_idx, subfield in enumerate(sub):
    # Use `<` rather than `<=`: with `<=` the subfield at start_idx itself
    # (index 0 on a fresh run) was silently skipped.
    if sub_idx < start_idx:
        continue

    print(f'searching {subfield}...')
    search_result = search_from_keyword(subfield, False)
    # Nothing to record for this subfield (None or an empty result list)
    if not search_result:
        continue

    # One record per hit: tagged with its parent field and the subfield, and
    # ranked by its position in the search results.
    subfield_list = [
        {"owner": s["owner"],
         "repo": s["repo"],
         "tags": [sub_dict[subfield], subfield],
         "rank": i
         }
        for i, s in enumerate(search_result)
    ]

    with open(f'{save_to}/owner_repo_tags_until(sub_idx({sub_idx})).json', 'w') as json_file:
        json.dump(subfield_list, json_file, indent=4)
    
    

## Crawl Repository Details

In [None]:
import base64
import tqdm

In [None]:
# Authorization header carrying the personal access token
headers = {'Authorization': f'token {token}'}

def crawl_info(repository_owner, repository_name, save_ph="", save_result= True):
    """Fetch the metadata and README of one GitHub repository.

    Args:
        repository_owner: Login of the repository owner.
        repository_name: Name of the repository.
        save_ph: Unused; kept so existing calls stay valid.
        save_result: Unused; kept so existing calls stay valid.

    Returns:
        On success, a dict with the keys "author", "title", "Start Date",
        "sub_tags", "about", "url" and "ReadMe". If the repository request
        does not answer with status 200, the integer HTTP status code instead.
    """
    # Fetch the repository details
    repo_url = f'https://api.github.com/repos/{repository_owner}/{repository_name}'
    response = session.get(repo_url)

    if response.status_code != 200:
        print(f"Failed to retrieve repository data (status code {response.status_code})")
        return response.status_code

    repo_data = response.json()

    # Fetch the README through the same authenticated session as the request
    # above, instead of a separate unauthenticated-session call.
    readme_response = session.get(f"{repo_url}/readme")

    # An empty README is recorded when none can be retrieved
    readme_content = ""
    if readme_response.status_code == 200:
        encoded_readme = readme_response.json().get("content") or ""
        # errors='replace': a README that is not valid UTF-8 must not abort
        # the whole crawl with a UnicodeDecodeError.
        readme_content = base64.b64decode(encoded_readme).decode('utf-8', errors='replace')

    # Create a dictionary with the collected data
    return {
        "author": repo_data["owner"]["login"],
        "title": repo_data["name"],
        "Start Date": repo_data["created_at"],
        # Fall back to an empty list so callers can always concatenate tags
        "sub_tags": repo_data.get("topics") or [],
        # A missing description is stored as an empty string
        "about": repo_data.get("description") or "",
        "url": repo_data["html_url"],
        "ReadMe": readme_content,
    }



In [None]:
# Load the merged list of crawl targets.
targets_path = 'github_crawl/merge_owner_repo_tags.json'
with open(targets_path) as targets_file:
    all_targets = json.load(targets_file)

In [None]:
# Index of the first target to crawl; raise it to resume an interrupted run.
start_idx = 0

# Folder for the per-repository JSON files; created up front so the open()
# below cannot fail on a missing directory.
output_dir = 'github_crawl/crawl_info_no_tags'
os.makedirs(output_dir, exist_ok=True)

# `import tqdm` binds the module, so the progress bar is tqdm.tqdm, and it
# must wrap an iterable (a range) rather than the bare length.
for loss_id in tqdm.tqdm(range(start_idx, len(all_targets))):
    target = all_targets[loss_id]
    result = crawl_info(target['owner'], target['repo'], save_result=False)
    if result == 403: #do not have the permission
        print(f"end at {loss_id}, {target}")
        break
    if not isinstance(result, dict):
        # crawl_info returns the HTTP status code on failure; skip this target
        # instead of crashing when subscripting an int below.
        print(f"skip {loss_id}, {target} (status code {result})")
        continue

    result['main_tag'] = target['main_tag']
    # Combine the tags assigned during the search with the repository's own tags
    result["sub_tag"] = target["sub_tag"]+result["sub_tags"]
    with open(f'{output_dir}/{target["owner"]}_{target["repo"]}.json', 'w') as json_file:
        json.dump(result, json_file, indent=4)