In [5]:
import pandas as pd
import numpy as np

In [6]:
import os
import csv
import shutil

In [56]:
def read_repo_urls(input_csv):
    """
    Read the repo URLs from a CSV file and return a dictionary mapping repo names to URLs.
    """
    repo_urls = {}
    with open(input_csv, mode='r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader) 
        for row in reader:
            parts = row[0].split('/')
            repo_name = parts[-2] + "_" + parts[-1]  # Extract repo folder name from URL
            repo_urls[repo_name] = row[0]
    return repo_urls


def cvs_header(output_csv):
    """
    Create header for the csv file where repo urls will be stored
    """
    with open(output_csv, mode='w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["Repo Name", "GitHub URL"])
    

def write_repo_url(repo_name, repo_url, output_csv):
    """
    Write the filtered repository names and their URLs to a new CSV file.
    """
    with open(output_csv, mode='a', newline='') as outfile:
        writer = csv.writer(outfile) 
        writer.writerow([repo_name, repo_url])


def has_test_files(repo_dir):
    """
    Check for test files
    """
    for root, dirs, files in os.walk(repo_dir):
        for file in files:
            if "test" in file.lower():
                return True
            
    return False


def ignore_env_dir(directory, contents):
    """
    Directories to avoid
    """
    ignored_dirs = ['env', 'venv', 'envs']  # Add more virtual environment folder names if needed
    return [d for d in contents if d in ignored_dirs]


def copy_repo(src_dir, dst_dir, repo_name):
    """
    Copy the repository to a destination directory.
    """
    dst_path = os.path.join(dst_dir, repo_name)

    try:
        # Attempt to copy the repository
        shutil.copytree(src_dir, dst_path, ignore=ignore_env_dir)
        return 1
    except FileNotFoundError as fnf_error:
        print(f"FileNotFoundError: {fnf_error} for {repo_name}")
        return 0
    except PermissionError as perm_error:
        print(f"PermissionError: {perm_error} for {repo_name}")
        return 0
    except shutil.Error as sh_error:
        print(f"shutil.Error: {sh_error} while copying {repo_name}")
        return 0
    except Exception as e:
        print(f"Unexpected error: {e} while copying {repo_name}")
        return 0


def get_repos_with_test_files(src_dir, dst_dir, input_csv, output_csv):
    """
    Get the repositories that contain test files and copy them to a new directory.
    Write the filtered repository names and their URLs to a new CSV file.
    """
    repo_urls = read_repo_urls(input_csv)
    repo_dir = os.listdir(src_dir)
    repo_dir.sort(key=lambda x:x.lower())

    count = 0
    i = 0
    cvs_header(output_csv)
    for repo_name in repo_dir:
        if repo_name == "OrchidTechnologies_orchid": continue
        
        print(f"\033[1m{repo_name}\033[0m")
        repo_path = os.path.join(src_dir, repo_name)
        
        if os.path.isdir(repo_path) and has_test_files(repo_path):
            print(f"Test files found in {repo_name}")
            count += copy_repo(repo_path, dst_dir, repo_name)
            if repo_name in repo_urls:
                write_repo_url(repo_name, repo_urls[repo_name], output_csv)
        
        if i % 50 == 0:
            print(f"\n\033[1m{i} repos searched\033[0m\n")
        i += 1

    print(f"Number of repos with tests: {count}")

In [58]:
src_dir = "/home/shrikara/Downloads/pyLoad/wonderless/repos/repositories/repositories/AWS"
# src_dir = "/home/shrikara/sample"
dst_dir = "/home/shrikara/SERC/LMM+Serverless/serverless_repos_with_test"
# dst_dir = "/home/shrikara/SERC/LMM+Serverless/sample"
input_csv = "dataset.csv"
output_csv = "filtered_dataset.csv"

get_repos_with_test_files(src_dir, dst_dir, input_csv, output_csv)

[1m._.DS_Store[0m

[1m0 repos searched[0m

[1m.DS_Store[0m
[1m0x4D31_honeyLambda[0m
[1m20minutes_serverless-github-check[0m
[1m3B00D_DevX-sls[0m
[1m3box_3box-address-server[0m
Test files found in 3box_3box-address-server
[1m3box_3box-graphql[0m
[1m3box_3box-verifications[0m
Test files found in 3box_3box-verifications
[1m3boysdad_ActionFurnitureRepair.com[0m
[1m3PillarGlobal_engineering-playbook[0m
Test files found in 3PillarGlobal_engineering-playbook
[1m3scale_awsThreeScale_Authorizer[0m
[1m4art_insider_s3l3ct_kinesis[0m
Test files found in 4art_insider_s3l3ct_kinesis
[1m65_aws-slackops-serverless[0m
Test files found in 65_aws-slackops-serverless
[1m70-10_sandbox[0m
Test files found in 70-10_sandbox
[1m70-10_serverless-boilerplate[0m
[1m70-10_serverless-typescript[0m
[1m99xt-incubator_interns-portal[0m
Test files found in 99xt-incubator_interns-portal
[1m99xt_serverless-delivery-framework[0m
[1m99xt_serverless-react-boilerplate[0m
Test files fou

---

In [1]:
import subprocess
import json

In [2]:
def get_repo_info_from_url(repo_url):
    """
    Extract the owner and repo name from a GitHub URL.
    """
    parts = repo_url.strip().split('/')
    if len(parts) >= 2:
        owner = parts[-2]
        repo_name = parts[-1]
        return owner, repo_name
    return None, None


def get_repo_rating(owner, repo_name):
    """Fetch repository stars and forks count using GitHub GraphQL API via gh CLI."""
    query = '''
    query($name: String!, $owner: String!) {
      repository(owner: $owner, name: $name) {
        forkCount,
        stargazerCount
      }
    }
    '''
    try:
        result = subprocess.run(
            [
                "gh", "api", "graphql",
                "-F", f"owner={owner}",
                "-F", f"name={repo_name}",
                "-f", f"query={query}"
            ],
            capture_output=True,
            text=True
        )

        if result.returncode == 0:
            data = json.loads(result.stdout)
            repo_data = data.get('data', {}).get('repository', {})
            stars = repo_data.get('stargazerCount', 0)
            forks = repo_data.get('forkCount', 0)
            return stars, forks
        else:
            print(f"Error fetching data for {owner}/{repo_name}: {result.stderr}")
            return None, None
    except Exception as e:
        print(f"Exception occurred: {e}")
        return None, None

def get_repo_info_from_url(repo_url):
    """Extract the owner and repo name from a GitHub URL."""
    parts = repo_url.strip().split('/')
    if len(parts) >= 2:
        owner = parts[-2]
        repo_name = parts[-1]
        return owner, repo_name
    return None, None

In [3]:

def cvs_header(output_csv):
    """
    Create header for the csv file where repo urls will be stored
    """
    with open(output_csv, mode='w', newline='') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(["Repo Name", "GitHub URL", "Rating (Stars)", "Forks"])
    

def repos_with_ratings(repo_name, repo_url, rating, forks, output_csv):
    """
    Write the filtered repository names and their URLs to a new CSV file.
    """
    with open(output_csv, mode='a', newline='') as outfile:
        writer = csv.writer(outfile) 
        writer.writerow([repo_name, repo_url, rating, forks])


def sort_repo_by_rating(input_csv, output_csv):
    """
    Sort the repositories in the input CSV file by their rating and number of forks.
    """
    data = []

    with open(input_csv, mode='r', newline='') as csvfile:
        reader = csv.reader(csvfile)
        next(reader) 

        for row in reader:
            repo_name = row[0]
            repo_url = row[1]

            owner, repo_name_from_url = get_repo_info_from_url(repo_url)

            if owner and repo_name_from_url:
                rating, forks = get_repo_rating(owner, repo_name_from_url)

                if rating is not None:
                    # data.append({
                    #     "Repo Name": repo_name,
                    #     "Repo URL": repo_url,
                    #     "Rating (Stars)": rating,
                    #     "Forks": forks
                    # })
                    repos_with_ratings(repo_name, repo_url, rating, forks, output_csv)
                else:
                    print(f"Could not fetch rating for {repo_url}")

    # df = pd.DataFrame(data, columns=["Repo Name", "Repo URL", "Rating (Stars)", "Forks"])
    # df.sort_values(by=["Rating (Stars)", "Forks"], ascending=False, inplace=True)
    
    # df.to_csv(output_csv, index=False)
    return

In [7]:
in_csv = "filtered_dataset.csv"
final_csv = "dataset_with_ratings.csv"
sort_repo_by_rating(in_csv, final_csv)

Error fetching data for 3box/3box-address-server: [0;1;39mWelcome to GitHub CLI![0m

To authenticate, please run `gh auth login`.

Could not fetch rating for https://github.com/3box/3box-address-server
Error fetching data for 3box/3box-verifications: [0;1;39mWelcome to GitHub CLI![0m

To authenticate, please run `gh auth login`.

Could not fetch rating for https://github.com/3box/3box-verifications
Error fetching data for 3PillarGlobal/engineering-playbook: [0;1;39mWelcome to GitHub CLI![0m

To authenticate, please run `gh auth login`.

Could not fetch rating for https://github.com/3PillarGlobal/engineering-playbook
Error fetching data for 4art/insider_s3l3ct_kinesis: [0;1;39mWelcome to GitHub CLI![0m

To authenticate, please run `gh auth login`.

Could not fetch rating for https://github.com/4art/insider_s3l3ct_kinesis
Error fetching data for 65/aws-slackops-serverless: [0;1;39mWelcome to GitHub CLI![0m

To authenticate, please run `gh auth login`.

Could not fetch rating fo

KeyboardInterrupt: 