# Github API Extraction with Python on Google Colab

## Importing Libraries

In [20]:
import requests
import time
import os
from getpass import getpass
import pandas as pd
# from dotenv import load_dotenv
# from google.colab import files

## Configuring Connection

In [31]:
# Getting personal token
TOKEN = getpass("Insert your GitHub token: ")
os.environ['GITHUB_TOKEN'] = TOKEN
GITHUB_API_URL = "https://api.github.com"
HEADERS = {
    "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
    "Accept": "application/vnd.github+json",
    "X-GitHub-Api-Version": "2022-11-28"
}

Insert your GitHub token: ··········


In [23]:
|# # I chose not to use .env as I didn't find it necessary. However, you can feel free to modify it as you must

# !pip install python-dotenv

# uploaded = files.upload()
# load_dotenv()

# TOKEN = os.getenv('GITHUB_TOKEN')
# USER = os.getenv('USER')

SyntaxError: invalid syntax (ipython-input-23-703246657.py, line 1)

## Handling API Rate Limits

In [32]:
# If the rate limit exceds API limit, code wait until API allow new rates
def handle_rate_limit(response):
  if response.status_code == 403 and 'X-RateLimit-Remaining' in response.headers:
      if int(response.headers['X-RateLimit-Remaining']) == 0:
          reset_time = int(response.headers['X-RateLimit-Reset'])
          wait_time = reset_time - int(time.time()) + 1
          print(f"Rate limit reached. Waiting {wait_time} seconds...")
          time.sleep(wait_time)

## Handling Pages

In [33]:
# Create a list of results from pages
def get_paginated_results(url, params={}):
  results = []
  while url:
      response = requests.get(url, headers=HEADERS, params=params)
      handle_rate_limit(response)
      if response.status_code == 200:
        data = response.json()
        results.extend(data if isinstance(data, list) else [data])
        # Next page detection
        if 'next' in response.links:
          url = response.links['next']['url']
          params = {}
        else:
          break
      # Handling error
      else:
        print(f"Error: {response.status_code}, Message: {response.text}")
        break
  return results

## List Repositories (By User)

In [34]:
# Getting username
user = input("Enter the GitHub username (identical to GitHub): ")
# Getting user repos
repos_url = f"{GITHUB_API_URL}/users/{user}/repos"
repos = get_paginated_results(repos_url)
# Listing user repos
print("Repos:", [repo['name'] for repo in repos])

Enter the GitHub username (identical to GitHub): alura-cursos
Repos: ['-IA-Aumentada-Prevendo-atrasos-de-voos.', '-robotron-2000', '01-fullstack-nextjs', '01-nextjs-course', '02-nextjs-course', '04-nextjs-course', '06-github-actions-front-end', '07-storybook-com-react', '1236-springapi', '13-clojure-tests', '14-EN-clojure-schemas', '1426-pep8-linters-python', '1474-delphi', '15-EN-clojure-generators-tests-properties', '1501-angular-unit-tests', '1526-FetchAPI', '1527-cluster_r', '1530-springmvc-parte1', '1540-laravel-parte3', '1563-treinando-pytorch', '1576-mlops-machine-learning', '1598-jpa', '16-EN-flutter-first-app', '1616vpns', '1622-web-scraping-php', '1659-postgresql-primeiros-passos', '1660-openshift', '1699-data-science-previsao-arima', '1734-cnnpytorch', '1736-java-primeros-pasos', '1743-jpa-pesquisas', '1750-jakartaee', '1755-aula1-novidades-java', '1755-aula3-novidades-java-api', '1755-aula3-novidades-java-http2', '1755-aula4-novidades-java-reactive-streams', '1755-aula5-nov

## Handling Repo Contents

In [35]:
# Getting contents of a Repo Folder
repo_name = input("Enter the GitHub repo name (identical to GitHub): ")
contents_url = f"{GITHUB_API_URL}/repos/{user}/{repo_name}/contents/dados"
contents = requests.get(contents_url, headers=HEADERS)
# Showing files
if contents.status_code == 200:
  files_info = contents.json()
  for item in files_info:
    print("File:", item['name'])
# Handling error
else:
  print("Failed to fetch repo contents")

Enter the GitHub repo name (identical to GitHub): pandasspark
File: cursos-prouni.csv
File: reajuste.csv


## Handling Commits

In [36]:
# Getting and printing (count) commits
commits_url = f"{GITHUB_API_URL}/repos/{user}/{repo_name}/commits"
commits = get_paginated_results(commits_url)
print("Total commits fetched:", len(commits))

Total commits fetched: 9


## Handling "Unauthorized" Error (401)

In [37]:
def simulate_unauthorized():
  # Creating wrong headers (as simulation)
  wrong_headers = {
    "Authorization": "Bearer WRONG_TOKEN",
    "Accept": "application/vnd.github+json"
  }
  # Applying wrong headers (as simulation)
  r = requests.get(f"{GITHUB_API_URL}/user", headers=wrong_headers)
  # Handling wrong headers (as simulation)
  if r.status_code == 401:
    print("401 Unauthorized. Cause: Invalid token.")

simulate_unauthorized()

401 Unauthorized. Cause: Invalid token.


# Extra

In [None]:
# Optional: loading the files from repo (colab only)
dataframes = {}
file_extension = input("Enter the file extension (.csv, .parquet, etc): ")
for file in files_info:
    if file['name'].endswith(file_extension):
        print(f"Loading {file['name']}...")
        df = pd.read_csv(file['download_url'])
        dataframes[file['name']] = df

Enter the file extension (.csv, .parquet, etc): .csv
Loading cursos-prouni.csv...
Loading reajuste.csv...
