In [None]:
import pandas as pd
from github import Github
import base64
import os
import json
import requests
from requests.structures import CaseInsensitiveDict
import sys
import os
import re
import time
from random import shuffle, randint, choice
from functools import cache 
from local_secrets import API_KEYS


In [None]:
def get_topic_list(main_topic):
    """Look up the related GitHub topics recorded for one NAICS definition.

    Reads "NAICS Topics.csv" from the current working directory on every call.

    Args:
        main_topic: Value to match exactly against the "Definition" column.

    Returns:
        The "Related Github Topics" cell of the first matching row.

    Raises:
        IndexError: If no row has the requested definition.
    """
    naics_table = pd.read_csv("NAICS Topics.csv")
    is_requested = naics_table["Definition"] == main_topic
    matching_topics = naics_table.loc[is_requested, "Related Github Topics"].tolist()
    # Only the first match is used; later duplicates are ignored.
    return matching_topics[0]

In [None]:
# Credentials must not live in the notebook source: read the token from the
# GITHUB_TOKEN environment variable and fall back to the first key kept in
# local_secrets.API_KEYS (None means unauthenticated access).
# NOTE(review): the token that was previously hard-coded on this line has been
# exposed to anyone with access to this file and should be revoked.
g = Github(os.environ.get("GITHUB_TOKEN") or (API_KEYS[0] if API_KEYS else None))


def get_name_repo_txt(repo_topic, base_folder):
    """Extract the owner and repository name from a repository's string form.

    Args:
        repo_topic: Text containing ``full_name="owner/repo"`` (this file
            passes ``str(repo)`` for each search result).
        base_folder: Unused; kept so existing callers keep working.

    Returns:
        A new two-element list ``[owner, repo]``.

    Raises:
        ValueError: If no ``full_name="owner/repo"`` fragment is present.
            Previously this case surfaced as an accidental UnboundLocalError.
    """
    # One pattern captures both halves: owner is everything before the first
    # slash, repo is everything after it up to the closing quote.
    match = re.search(r'full_name="([^"/]+)/([^"]+)"', repo_topic)
    if match is None:
        raise ValueError(f"no owner/repo full_name found in {repo_topic!r}")
    owner, repo = match.groups()
    return [owner, repo]

def put_txt_in_folder(topic_input_list, base_folder_path, Main_Topic, limit=1000):
    """Search GitHub for each topic and write the matching repositories to a CSV.

    The output file is ``data/repo_by_topic/{Main_Topic}/topics_{Main_Topic}.csv``.
    Its first line is the main topic, the second a ``User, Repo, Topic`` header,
    followed by one ``owner, repo, topic`` line per search hit. If the file
    already exists nothing is fetched or written.

    Args:
        topic_input_list: Topics separated by ", " in a single string.
        base_folder_path: Forwarded to get_name_repo_txt; not used for output.
        Main_Topic: Name used for the output directory and file.
        limit: Currently unused; kept for interface compatibility.

    Returns:
        None.
    """
    output_dir = f"data/repo_by_topic/{Main_Topic}"
    new_file_path = f"{output_dir}/topics_{Main_Topic}.csv"

    if os.path.exists(new_file_path):
        print(f"{new_file_path} already exists. Skipping...")
        return

    topics = topic_input_list.split(", ")

    data = []
    for topic in topics:
        print(f"Fetching repositories for topic {topic}...")
        found = []
        try:
            # PyGithub returns a lazily paginated list, so request errors
            # (e.g. rate limits) surface while iterating; the loop therefore
            # has to live inside the try block, not after it.
            for repo in g.search_repositories(query=f'topic:{topic}'):
                found.append(str(repo))
        except Exception as e:
            print(f"Error fetching repositories for topic {topic}: {e}")
            time.sleep(5)
            # Fall through: results collected before the failure are kept.

        for repo_string in found:
            try:
                new_entry = get_name_repo_txt(repo_string, base_folder_path)
            except Exception as e:
                # Report what could not be parsed instead of hiding the cause.
                print(f"Failure: could not parse {repo_string!r}: {e}")
                continue
            new_entry.append(topic)
            data.append(new_entry)

    # exist_ok avoids a check-then-create race when topics run in parallel.
    os.makedirs(output_dir, exist_ok=True)

    with open(new_file_path, "w") as file:
        file.write(f"{Main_Topic}\n")
        file.write("User, Repo, Topic\n")
        for entry in data:
            file.write(f"{entry[0]}, {entry[1]}, {entry[2]}\n")

In [None]:
def get_url(user, repo, path=""):
    """Build a GitHub REST API URL of the form ``.../repos/{user}/{repo}/{path}``.

    Args:
        user: First path segment after ``repos/``.
        repo: Second path segment after ``repos/``.
        path: Optional sub-resource path appended after the repository.

    Returns:
        The URL string. With an empty ``path`` the result ends in a trailing
        slash; that is preserved on purpose because get_and_save_data strips
        it with ``[:-1]``.
    """
    # The original condition was inverted and silently dropped a non-empty
    # path; appending unconditionally fixes that and keeps the empty-path
    # result byte-identical.
    return "https://api.github.com/repos/" + user + "/" + repo + "/" + path


def file_to_list_dict(filepath):
    """Parse every top-level JSON object found in a text file.

    Text between objects (newlines, commas, brackets, ...) is skipped.

    Args:
        filepath: Path of the file to read.

    Returns:
        A list with one decoded value per ``{...}`` object, in file order.

    Raises:
        json.JSONDecodeError: If text starting at a ``{`` is not valid JSON.
    """
    with open(filepath) as file:
        content = file.read()

    # raw_decode finds the true end of each object, so braces inside string
    # values no longer confuse the scan and truncated input raises a
    # JSONDecodeError instead of indexing past the end of the text.
    decoder = json.JSONDecoder()
    result = []
    start = content.find('{')
    while start != -1:
        obj, end = decoder.raw_decode(content, start)
        result.append(obj)
        start = content.find('{', end)
    return result


def get(user, repo, category, nested=False):
    """Fetch a repository's metadata and return one field from it.

    Every call performs a fresh get_content request.

    Args:
        user: Forwarded to get_content as ``user``.
        repo: Forwarded to get_content as ``repo``.
        category: Key to look up in the fetched metadata.
        nested: Unused; kept for interface compatibility.

    Returns:
        The value stored under ``category``, or the string "N/A" when the key
        is absent or the metadata could not be fetched.
    """
    rdict = get_content(user, repo)
    # get_content returns None when it gives up; treat that like a missing
    # key instead of failing with "argument of type 'NoneType' is not iterable".
    if not rdict:
        return "N/A"
    if category in rdict:
        return rdict[category]
    return "N/A"


def get_readme(user, repo):
    """Fetch ``contents/README.md`` through get_content and decode it.

    Args:
        user: Forwarded to get_content as ``user``.
        repo: Forwarded to get_content as ``repo``.

    Returns:
        The README text decoded from base64 as UTF-8 when the response has a
        'content' entry; otherwise whatever get_content returned, unchanged.
    """
    response = get_content(user, repo, path="contents/README.md")
    # Nothing to decode: hand the raw response back unchanged.
    if not response or 'content' not in response:
        return response
    raw_bytes = base64.b64decode(response['content'])
    return raw_bytes.decode('utf-8')

# Module-level pool of API keys still considered usable. get_content picks a
# key from it at random, removes a key when the response reports an exceeded
# rate limit, and refills the pool from API_KEYS once it is empty.
working_keys = API_KEYS.copy()

def get_content(user, repo, path="", max_retries=3, backoff_time=60):
    """Fetch a GitHub REST API resource as parsed JSON, rotating API keys.

    The URL requested is ``https://api.github.com/repos/{repo}/{user}[/{path}]``.
    NOTE(review): ``repo`` is placed before ``user``; get_and_save_data calls
    this function with its arguments arranged to match, so the order is kept.

    Args:
        user: Second path segment after ``repos/``.
        repo: First path segment after ``repos/``.
        path: Optional sub-resource path appended to the URL.
        max_retries: Number of failed attempts (unexpected status codes or
            raised exceptions) tolerated after the first one.
        backoff_time: Initial upper bound, in seconds, for the sleep between
            attempts; tripled after every non-raising unsuccessful attempt.

    Returns:
        The decoded JSON body on HTTP 200, or None when the resource is
        reported as "Not Found" or the retries are exhausted.
    """
    global working_keys

    url = f"https://api.github.com/repos/{repo}/{user}"

    if path != "":
        url += f"/{path}"

    headers = CaseInsensitiveDict()
    headers["Accept"] = "application/vnd.github+json"
    headers["X-Github-Api-Version"] = "2022-11-28"

    # Without a timeout a stalled connection would block this worker forever.
    request_timeout = 30

    retries = 0
    while retries <= max_retries:
        try:
            if not working_keys:
                print("Resetting working keys...")
                working_keys = API_KEYS.copy()
            current_key = choice(working_keys)
            headers["Authorization"] = f"Bearer {current_key}"

            resp = requests.get(url, headers=headers, timeout=request_timeout)
            resp_json = json.loads(resp.text)

            if resp.status_code == 200:
                return resp_json
            elif "message" in resp_json and "rate limit exceeded for" in resp_json["message"].lower():
                print(f"Rate limit reached for a key. Removing key...")
                # Another worker thread may already have dropped this key.
                if current_key in working_keys:
                    working_keys.remove(current_key)
                # Rate-limit hits deliberately do not count against max_retries.
                time.sleep(randint(0, backoff_time))
            elif resp_json.get("message") == "Not Found":
                print(f"Repository {url} not found. Giving up.")
                return None
            else:
                print(f"Request failed with status code {resp.status_code}. Retrying in {backoff_time} seconds...")
                print(f"Response: {resp_json}")
                time.sleep(randint(0, backoff_time))
                retries += 1
            backoff_time *= 3
        except Exception as e:
            print(f"SOMETHING CRASHED IT: {e}. Retrying in {backoff_time} seconds...")
            time.sleep(backoff_time)
            # Count exceptions as attempts too; otherwise a persistent error
            # (network down, non-JSON body, no keys) would loop forever.
            retries += 1

    print(f"Max retries reached. Could not fetch data for {url}.")
    return None


def get_repos(filepath):
    """Read the repository rows from a topics CSV.

    The first two lines (title and column header) are skipped. Reading stops
    at the first line that contains no comma, such as a blank line.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        A list of ``(first, second, third)`` tuples holding the first three
        comma-separated fields of each row, stripped of surrounding whitespace.

    Raises:
        IndexError: If a row has exactly two fields.
    """
    lst = []
    # The context manager closes the file even if a row fails to parse.
    with open(filepath, 'r') as f:
        f.readline()
        f.readline()
        for line in f:
            curr = line.split(",")
            if len(curr) <= 1:
                break
            # strip() already removes the line ending; slicing off the last
            # character first would eat a real character on a final line that
            # has no trailing newline.
            lst.append((curr[0].strip(), curr[1].strip(), curr[2].strip()))
    return lst

def get_and_save_data(topic, repos):
    """Fetch metadata for each repository and append it to the topic's files.

    Rows are appended to ``./data/repo_by_topic/{topic}/data_{topic}.csv`` and
    the raw API response of each repository is appended, one per line, to
    ``repos_{topic}.txt`` in the same directory. Repositories already present
    in the CSV when the function starts are skipped, so an interrupted run can
    be resumed. Results are flushed after every repository.

    Args:
        topic: Main topic name; selects the output directory and file names.
        repos: Iterable of 3-tuples unpacked as ``(rname, ruser, rtopic)``.
            NOTE(review): get_repos yields rows in the CSV's "User, Repo,
            Topic" order, so as called from process_main_topic ``rname`` holds
            the owner and ``ruser`` the repository name. The "repo"/"user"
            column labels are left unchanged so existing CSVs still match.

    Returns:
        None.
    """
    data_path = "./data/repo_by_topic/" + topic + "/"
    csv_file_path = data_path + "data_" + topic + ".csv"
    txt_file_path = data_path + "repos_" + topic + ".txt"

    # exist_ok avoids a check-then-create race between worker threads.
    os.makedirs(data_path, exist_ok=True)

    # (repo, user) pairs saved by earlier runs, compared as strings so names
    # that read_csv parsed as numbers are still recognised.
    already_saved = set()
    if os.path.exists(csv_file_path):
        existing_data = pd.read_csv(csv_file_path)
        if not existing_data.empty:
            already_saved = set(zip(existing_data['repo'].astype(str),
                                    existing_data['user'].astype(str)))

    def field(content, name):
        # Same contract as get(): the value when present, otherwise "N/A".
        return content[name] if name in content else "N/A"

    # Shuffle a copy so the caller's list is not reordered as a side effect.
    pending = list(repos)
    shuffle(pending)

    count = 0
    for rname, ruser, rtopic in pending:
        if (str(rname), str(ruser)) in already_saved:
            continue

        url_content = get_content(ruser, rname)
        if not url_content:
            print('skipping', ruser, rname)
            continue

        # Every field below comes from the response fetched above; asking the
        # API again for each column cost one request per field and crashed
        # whenever one of those repeat requests failed.
        organization = field(url_content, "organization")
        org_login = organization.get('login', "N/A") if isinstance(organization, dict) else "N/A"

        topics = field(url_content, "topics")
        # Only join real topic lists; joining the "N/A" placeholder string
        # would yield "[N, /, A]".
        topics_str = '[' + ', '.join(topics) + ']' if isinstance(topics, list) and topics else "N/A"

        row_dict = {
            "repo": rname,
            "user": ruser,
            "organization": org_login,
            "url (HTML)": field(url_content, "html_url"),
            # Same segment order as the URL get_content requested above;
            # rstrip drops the trailing slash get_url adds for an empty path.
            "url (API)": get_url(rname, ruser).rstrip("/"),
            "description": field(url_content, "description"),
            "readme": get_readme(ruser, rname),
            "stargazer count": field(url_content, "stargazers_count"),
            "watcher count": field(url_content, "watchers_count"),
            "subscriber count": field(url_content, "subscribers_count"),
            "open issue count": field(url_content, "open_issues_count"),
            "topic (search)": rtopic,
            "topics": topics_str,
            "NAICS Code": "00000"
        }
        count += 1

        # Persist immediately so progress survives an interruption.
        print(f'savign data for {topic}, it has {count} repos')
        pd.DataFrame([row_dict]).to_csv(
            csv_file_path, mode='a', header=not os.path.exists(csv_file_path), index=False)

        with open(txt_file_path, "a") as f:
            f.write(f"{url_content}\n")




In [None]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd

def process_main_topic(main_topic):
    """Run the whole pipeline for a single main topic.

    Looks up the related GitHub topics, writes the topic/repository CSV via
    put_txt_in_folder, reads it back with get_repos and hands the rows to
    get_and_save_data.

    Args:
        main_topic: A value from the "Definition" column of the NAICS table.

    Returns:
        None.
    """
    print(f'Working on {main_topic}')

    drive_folder = '/content/drive/My Drive/Finance and Insurance Repo (Data)'
    related_topics = get_topic_list(main_topic)
    put_txt_in_folder(related_topics, drive_folder, main_topic)
    print(f'Finished putting txt in folder {main_topic}')

    # Read back the CSV that put_txt_in_folder wrote (or found already there).
    topics_csv = f"data/repo_by_topic/{main_topic}/topics_{main_topic}.csv"
    get_and_save_data(main_topic, get_repos(topics_csv))

# NAICS table loaded once at module level; its 'Definition' column supplies
# the main topics that the thread pool passes to process_main_topic.
naics = pd.read_csv("NAICS Topics.csv")


In [None]:
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor(max_workers=10) as executor:
    # executor.map() only re-raises a worker's exception when its results are
    # consumed, which never happened here, so failed topics vanished silently.
    # Submit each topic explicitly and report every failure.
    futures = {
        executor.submit(process_main_topic, main_topic): main_topic
        for main_topic in naics['Definition']
    }
    for future, main_topic in futures.items():
        try:
            future.result()
        except Exception as e:
            print(f"Processing {main_topic} failed: {e}")