In [21]:
import requests
import base64
import os
import subprocess

def get_readme_content(repo_full_name):
    """Fetch the README of a GitHub repository and return it as text.

    Args:
        repo_full_name: Repository identifier that is interpolated into the
            ``/repos/{repo_full_name}/readme`` API path.

    Returns:
        The decoded README text. The string
        "README content could not be retrieved." is returned instead when the
        request fails, the API does not answer with HTTP 200, or the response
        carries no ``content`` field.
    """
    fallback = "README content could not be retrieved."
    readme_endpoint = f'https://api.github.com/repos/{repo_full_name}/readme'

    try:
        # A timeout keeps one stalled connection from hanging the whole notebook.
        readme_response = requests.get(readme_endpoint, timeout=30)
    except requests.RequestException:
        # Transport problems are treated like any other failed retrieval.
        return fallback

    if readme_response.status_code != 200:
        return fallback

    encoded_content = readme_response.json().get('content')
    if encoded_content is None:
        return fallback

    # errors='replace' keeps a README that is not valid UTF-8 from raising
    # UnicodeDecodeError; undecodable bytes become U+FFFD instead.
    return base64.b64decode(encoded_content).decode('utf-8', errors='replace')

def save_content_to_file(folder_path, file_name, content):
    """Write ``content`` as UTF-8 text to ``file_name`` inside ``folder_path``.

    An existing file with the same name is overwritten.
    """
    with open(os.path.join(folder_path, file_name), mode='w', encoding='utf-8') as out_handle:
        out_handle.write(content)

        
def clone_repository(repo_url, clone_path):
    """Clone ``repo_url`` into ``clone_path`` with ``git clone``.

    Args:
        repo_url: URL handed to ``git clone`` as the repository.
        clone_path: Destination directory for the clone.

    Returns:
        None. A failed or skipped clone is reported with ``print`` instead of
        raising, so a single bad repository does not stop the caller's loop.
    """
    # git refuses to clone into a non-empty directory, so a re-run of the
    # notebook would fail here anyway; skip the subprocess call instead.
    if os.path.isdir(clone_path) and os.listdir(clone_path):
        print(f"Skipping clone of {repo_url}: '{clone_path}' already exists and is not empty.")
        return

    # "--" ends option parsing so a URL or path beginning with "-" cannot be
    # interpreted as a git option.
    result = subprocess.run(["git", "clone", "--", repo_url, clone_path])
    if result.returncode != 0:
        print(f"git clone failed for {repo_url} (exit code {result.returncode}).")





In [22]:
# Search queries sent to the GitHub repository search, one request per entry
search_queries = [
    'assembly transcript RNA-seq',
    'protein structure prediction',
    'gene expression quantification',
    'biological sequence analysis',
    'metagenomics data analysis',
    'phylogenetic analysis',
    'mass spectrometry data analysis',
    'microarray data processing',
    'molecular docking simulations',
    'single-cell sequencing analysis'
]

# Define paths for questions, answers, and repositories.
# The base directory can be overridden through the GITHUB_QA_BASE_PATH
# environment variable so the notebook also runs on other machines; the
# original location is kept as the default.
base_path = os.environ.get(
    "GITHUB_QA_BASE_PATH",
    "D:\\Halima's Data\\NLP\\project\\usingGithub\\",
)
# The trailing "" component makes os.path.join end each path with the
# platform's separator instead of a hard-coded backslash.
questions_path = os.path.join(base_path, "question", "")
answers_path = os.path.join(base_path, "questions_answers", "")
repos_path = os.path.join(base_path, "repositories", "")

# Ensure the directories exist
for output_dir in (questions_path, answers_path, repos_path):
    os.makedirs(output_dir, exist_ok=True)


In [None]:
# Number of repositories processed per search query.
# NOTE(review): the original loop stopped as soon as the 1-based index reached
# 10, so only the first 9 results were handled. That count is preserved here;
# raise it to 10 if a top-10 per query was intended.
max_repos_per_query = 9

api_endpoint = 'https://api.github.com/search/repositories'

# Loop through each search query
for query_number, search_query in enumerate(search_queries, start=1):
    try:
        # Passing the query through `params` lets requests URL-encode it, and
        # the timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(
            api_endpoint,
            params={'q': search_query, 'per_page': max_repos_per_query},
            timeout=30,
        )
    except requests.RequestException as request_error:
        print(f"Failed to retrieve data for query '{search_query}': {request_error}")
        continue

    if response.status_code != 200:
        print(f"Failed to retrieve data for query '{search_query}': {response.status_code}")
        continue

    search_results = response.json()
    # .get() guards against a response body without an 'items' list.
    top_repos = search_results.get('items', [])[:max_repos_per_query]

    for i, repo in enumerate(top_repos, start=1):
        # Construct file names
        question_file_name = f"question_{query_number}_{i}.txt"
        answer_file_name = f"question_ans_{query_number}_{i}.txt"
        repo_dir_name = f"question_ans_{query_number}_{i}_{repo['name']}"

        # Save the question
        question_content = f"Search Query: {search_query}\nRepository name: {repo['full_name']}\nRepository URL: {repo['html_url']}\nDescription: {repo['description']}\n"
        save_content_to_file(questions_path, question_file_name, question_content)

        # Fetch the README content and save the answer
        readme_content = get_readme_content(repo['full_name'])
        save_content_to_file(answers_path, answer_file_name, readme_content)

        # Clone the repository
        clone_repository(repo['clone_url'], os.path.join(repos_path, repo_dir_name))