In [1]:
import requests
import pandas as pd
import io
from urllib.parse import urlparse

# Global counter for inaccessible files and valid csv files 
inaccessible_files_count = 0
valid_csv_count = 0

In [2]:


def get_repo_contents(owner, repo, path="", token=None):
    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}" #using github api to access public repos
    
    headers = {}  #they are needed to request from api
    if token:
        headers['Authorization'] = f'token {token}'   #if we have token then the authorization header is set 

    response = requests.get(api_url, headers=headers) #sending request to github api to access the contents of the repo
    
    
    print("For URL- ",api_url,"->")
    print(f"Status Code: {response.status_code}\n")
    
    
    if response.status_code == 200: #means it can be accessed 
        return response.json()
    else:
        global inaccessible_files_count
        inaccessible_files_count += 1
        print(f"Failed to retrieve contents. Error: {response.json()}")
        return None


In [3]:

def validate_csv(file_url):
    response = requests.get(file_url) #response is the requests object that later we convert to string is .text
    if response.status_code == 200:
        df = pd.read_csv(io.StringIO(response.text)) #io.StringIO converts string into an in-memory file object that we reading as csv
        
        # h1n1 and xyz both are valid so -> 
        required_columns_1 = ['respondent_id', 'h1n1_vaccine', 'seasonal_vaccine']
        required_columns_2 = ['respondent_id', 'xyz_vaccine', 'seasonal_vaccine']
        
        if (all(col in df.columns for col in required_columns_1) and len(df.columns) == len(required_columns_1) or all(col in df.columns for col in required_columns_2) and len(df.columns) == len(required_columns_2)):
            
            return df
        else:
            return None
    else:
        return None


In [4]:


def process_repository(url, criteria, token):
    if not is_github_repo(url):
        return "Invalid GitHub URL"

    # Remove .git extension if present because it was casusing it to not being able to access the URL
    if url.endswith(".git"):
        url = url[:-4]

    # Extract owner and repo from URL
    parts = urlparse(url).path.split('/')
    owner, repo = parts[1], parts[2]

    contents = get_repo_contents(owner, repo, "", token)
    if contents is None:
        return "Unable to access repository contents"

    csv_url = find_any_csv(contents, owner, repo, token)
    if not csv_url:
        return "No CSV file found in the repository"
    
    global valid_csv_count
    valid_csv_count_repo=0
    valid_csv = validate_csv(csv_url)
    if valid_csv is not None:
        valid_csv_count += 1
        valid_csv_count_repo+=1
        valid_csv_df=valid_csv
        repo_name = urlparse(url).path.replace('/', '_')
        valid_csv_df.to_csv(f"{repo_name}.csv", index=False)

    if valid_csv_count_repo > 0:
       
        return f"{valid_csv_count_repo} valid CSV files found in the repository", valid_csv_df
    else:
        return "No valid CSV file found in the repository"
    
    
    valid_csv = validate_csv(csv_url) #checking if the csv file is valid according to the format
    if valid_csv is not None:
        return "Valid CSV file found", valid_csv #prints the valid csv file
    
    else:
        return "No valid CSV file found in the repository", None

In [5]:

def read_github_links(csv_path):
    df = pd.read_csv(csv_path)
    links = df['GitHub Links'].tolist()  
    return links #list of the all links 


In [6]:

def read_submission_criteria(csv_path):
    criteria_df = pd.read_csv(csv_path)
    # Convert criteria to a dictionary cuz its better to check conditions on
    criteria = criteria_df.to_dict(orient='list')
    return criteria #dictionary that has keys as the columns titles 


In [7]:
def is_github_repo(url):
    parsed_url = urlparse(url)
    return parsed_url.netloc == "github.com" and len(parsed_url.path.split('/')) >= 3

In [8]:
def get_file_metadata(owner, repo, path, token):
    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    
    headers = {}
    if token:
        headers['Authorization'] = f'token {token}'
    
    response = requests.get(api_url, headers=headers)
    
    if response.status_code == 200:
        return response.json() #retruns the json response
    else:
        return None


In [9]:
def get_last_commit_date(owner, repo, path, token):
    api_url = f"https://api.github.com/repos/{owner}/{repo}/commits?path={path}&per_page=1" 
    
    ###
    #&per_page=1 ensures we get the last commit -> usage of the github API
    ###
    
    headers = {}
    if token:
        headers['Authorization'] = f'token {token}'

    response = requests.get(api_url, headers=headers)
    
    if response.status_code == 200:
        commit_data = response.json() #to parse the JSON response to return the data of last commit 
        if commit_data:
            return commit_data[0]['commit']['committer']['date']
    return None

def find_any_csv(contents, owner, repo, token):
    csv_files = []
    for item in contents:
        if item['type'] == 'file' and item['name'].endswith('.csv') and 'training' not in item['name']:
            last_commit_date = get_last_commit_date(owner, repo, item['path'], token)
            if last_commit_date:
                item['last_commit_date'] = last_commit_date
                csv_files.append(item)
        elif item['type'] == 'dir':
            dir_contents = get_repo_contents(owner, repo, item['path'], token)
            if dir_contents is not None:
                result = find_any_csv(dir_contents, owner, repo, token)
                if result is not None:
                    csv_files.extend(result)
    
    if csv_files:
        csv_files = [file for file in csv_files if 'last_commit_date' in file]
        if not csv_files:
            return None
        # Sort by last_commit_date and return the most recent CSV file url
        
        csv_files.sort(key=lambda x: x['last_commit_date'], reverse=True)
        return csv_files[0]['download_url']
    
    return None


In [10]:
# commit_data:
# This variable holds the JSON data returned from the GitHub API endpoint after requesting commit information. 
#It represents an array (list) of commit objects.

# commit_data[0]:
# Accesses the first (most recent) commit object in the commit_data array. 
#GitHub API returns commit objects sorted by the commit date in descending order by default when using per_page=1.

# commit_data[0]['commit']:
# Within the first commit object (commit_data[0]), accesses the nested commit object. 
#This object contains detailed information about the commit itself, such as the commit message, author details, and committer details.

# commit_data[0]['commit']['committer']:
# Inside the commit object, accesses the committer field. 
#The committer field contains information about the person who applied the commit to the repository 
#This includes details such as the name, email, and date of the committer.

# commit_data[0]['commit']['committer']['date']:
# Finally, accesses the date field within the committer object. 
#This field specifically contains the date and time when the commit was made.

In [11]:

def main(github_links_path, criteria_path, token):
    global inaccessible_files_count,valid_csv_count

    inaccessible_files_count = 0

    valid_csv_count=0
    # Read GitHub links and submission criteria from CSV
    github_links = read_github_links(github_links_path) #list
    criteria = read_submission_criteria(criteria_path) #dictionary

    results = []
    for url in github_links:
        result = process_repository(url, criteria, token)
        results.append((url, result))

    for url, result in results:
        if isinstance(result, pd.DataFrame):
            print(f"Valid CSV file found for {url}:")
            print(result)
        else:
            print(f"{url}: {result}")

    # Print the count of inaccessible files
    print(f"Number of repositories that couldn't be accessed: {inaccessible_files_count}")
    print(f"Number of valid csv files : {valid_csv_count}")


if __name__ == "__main__":
    github_links_path = "links_of_submission.csv" 
    criteria_path = "submission_format.csv" 
    token = #hidden#  # GitHub personal access token 

    main(github_links_path, criteria_path, token)


For URL-  https://api.github.com/repos/ASNR1010/Hack-a-thon-DataHack-by-IIT-Guwahati/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/gursharankaur13/Summer-Analytics-2024/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/gursharankaur13/Summer-Analytics-2024/contents/Week 3 Hackathon ->
Status Code: 200

For URL-  https://api.github.com/repos/gursharankaur13/Summer-Analytics-2024/contents/Week 3 Hackathon/ Solution ->
Status Code: 200

For URL-  https://api.github.com/repos/gursharankaur13/Summer-Analytics-2024/contents/Week 3 Hackathon/Datasets ->
Status Code: 200

For URL-  https://api.github.com/repos/gursharankaur13/Summer-Analytics-2024/contents/Week 3 Hackathon/Output ->
Status Code: 200

For URL-  https://api.github.com/repos/jigs-bot/GFG_DataHack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/IISHITA-BANERJEE/-IISHITABANERJEE-_Datahack-/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/iam-venk

For URL-  https://api.github.com/repos/rayaankhan/DataHack-by-IIT-Guwahati-GFG-24/contents/data ->
Status Code: 200

For URL-  https://api.github.com/repos/chanda-yeswanthreddy/Multilabel_Classification/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/DiyaPrasad2002/Vaccine-Prediction/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/sharvilkothari/Sharvil_Kothari_DataHack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/kanishka73/Hackhathon-iit-guhwati-gfg/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/FlyingDragon112/ArnavAgarwal_DataHack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/bhaskar101010/Probability-Prediction-of-two-vaccines-labels/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/Saksh8/Aastha_Ojha_DataHack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/Soumyadeep8016/Soumya_datahack/contents/ ->
Status Code: 200

For URL

For URL-  https://api.github.com/repos/Keshav-1729/DataHack_SA2024/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/Keshav-1729/DataHack_SA2024/contents/Seasonal ->
Status Code: 200

For URL-  https://api.github.com/repos/Keshav-1729/DataHack_SA2024/contents/xyz ->
Status Code: 200

For URL-  https://api.github.com/repos/therahulgithub/RahulKumar_Datahack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/swaralipi04/SwaralipiDatta_Datahack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/T-arn21/ArnavTiku_Datahack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/Reboot2004/DataHack-Submission_2/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/SketchyCarrot/DataHack_by_IITG/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/himani2853/project_datahack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/himani2853/project_datahack/contents/hackathon

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/lab/schemas/@jupyter-notebook/help-extension ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/lab/schemas/@jupyter-notebook/notebook-extension ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/lab/schemas/@jupyter-notebook/tree-extension ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/lab/schemas/@jupyterlab ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/lab/schemas/@jupyterlab/application-extension ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/lab/schemas/@jupyterlab/apputils-extension ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/c

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/labextensions/@jupyter-widgets ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/labextensions/@jupyter-widgets/jupyterlab-manager ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/labextensions/@jupyter-widgets/jupyterlab-manager/schemas ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/labextensions/@jupyter-widgets/jupyterlab-manager/schemas/@jupyter-widgets ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/labextensions/@jupyter-widgets/jupyterlab-manager/schemas/@jupyter-widgets/jupyterlab-manager ->
Status Code: 200

For URL-  https://api.github.com/repos/sha345trip/Vaccine-Prediction/contents/share/jupyter/labextensions/@jupyter-widgets/jupy

For URL-  https://api.github.com/repos/Keerthan-Shenoy/KEERTHANSHENOY_Datahack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/Sourajjal/Summer-Analytics-DataHack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/ArranabManhas/Data_hack_project/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/HARSHITH-MANDLI/M.V.S.Harshith_Datahack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/Anushika0601/Anushika_Srivastava_DataHack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/madhu04075/Datahack_1/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/Manas8953/Project/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/swatiswapna/swati_swapna_Datahack/contents/ ->
Status Code: 200

For URL-  https://api.github.com/repos/swatiswapna/swati_swapna_Datahack/contents/dataset ->
Status Code: 200

For URL-  https://api.github.com/repos/suchiagarwal23/Suchi_Agarw

https://github.com/tarun-009/Tarun_Datahack: ('1 valid CSV files found in the repository',        respondent_id  xyz_vaccine  seasonal_vaccine
0              26707     0.158510          0.346319
1              26708     0.050767          0.060106
2              26709     0.485164          0.786012
3              26710     0.539803          0.843318
4              26711     0.224811          0.444695
...              ...          ...               ...
26703          53410     0.285122          0.457058
26704          53411     0.257305          0.304490
26705          53412     0.082599          0.255930
26706          53413     0.102180          0.372132
26707          53414     0.439546          0.615224

[26708 rows x 3 columns])
https://github.com/pranjal3060/PranjalSingh_Datahack.git: ('1 valid CSV files found in the repository',        respondent_id  xyz_vaccine  seasonal_vaccine
0              26707         0.27              0.30
1              26708         0.00              0.0

https://github.com/sha345trip/Vaccine-Prediction: ('1 valid CSV files found in the repository',        respondent_id  xyz_vaccine  seasonal_vaccine
0              26707         0.19              0.37
1              26708         0.09              0.10
2              26709         0.33              0.73
3              26710         0.64              0.68
4              26711         0.34              0.52
...              ...          ...               ...
26703          53410         0.26              0.47
26704          53411         0.26              0.32
26705          53412         0.12              0.44
26706          53413         0.05              0.46
26707          53414         0.46              0.66

[26708 rows x 3 columns])
https://github.com/rushi-k12/-Rushikesh_Kusuma-_Datahack: ('1 valid CSV files found in the repository',        respondent_id  xyz_vaccine  seasonal_vaccine
0              26707         0.22              0.30
1              26708         0.04            