# Repository Mining

In [1]:
import os
import time

from github import RateLimitExceededException, Github

### Providing GitHub credentials

A personal access token is required to benefit from the less restrictive API rate limits granted to authenticated requests.

In [None]:
# Providing access token
# Prefer the GITHUB_TOKEN environment variable so the secret never has to be
# stored inside the notebook (and cannot be committed by accident). The
# placeholder is kept as a fallback for users who paste the token in manually.
access_token = os.environ.get("GITHUB_TOKEN", "< YOUR PRIVATE ACCESS TOKEN >")
g = Github(login_or_token=access_token)

# Confirm your login is successful
user = g.get_user()
print(f"Authenticated as: {user.login}")

### Repository recovery using GitHub search parameters

This script queries the GitHub search API for repositories whose language is HCL, issuing one query for each individual day from 2014 until the end of 2022.
Some of the queried dates do not exist (e.g. February 31st); the resulting exception is caught so that the run is not interrupted.
Every repository URL is appended to ``hclURLs.txt`` as soon as it is retrieved, so no progress is lost in case of interruptions.

In [None]:
# Collect the clone URL of every HCL repository, querying one creation date at
# a time. Each URL is appended to hclURLs.txt immediately and also kept in
# script_urls.

# How long to wait before retrying a date after the rate limit was hit, and
# how many times to retry before giving up on that date.
RATE_LIMIT_PAUSE_SECONDS = 60
MAX_RATE_LIMIT_RETRIES = 5

script_urls = []
for year in range(2014, 2023):
    for month in range(1, 13):
        print(f"Scraping month {month} of year {year}")
        for day in range(1, 32):
            # Formatting compatible with search parameters
            date = f"{year}-{month:02d}-{day:02d}"
            # URLs already persisted for this date; lets a retry resume
            # without appending the same repository twice.
            saved_urls = set()
            for attempt in range(MAX_RATE_LIMIT_RETRIES + 1):
                try:
                    time.sleep(2)  # sleep to reset API search limit
                    repos = g.search_repositories(query=f"created:{date} language:HCL")
                    for repo in repos:
                        time.sleep(0.2)  # sleep to reset API core limit
                        clone_url = repo.clone_url
                        if clone_url in saved_urls:
                            continue
                        # URLs are added to a txt file to avoid data loss
                        with open("hclURLs.txt", "a") as file:
                            file.write(f"{clone_url}\n")
                        script_urls.append(clone_url)
                        saved_urls.add(clone_url)
                    break  # date fully processed
                except RateLimitExceededException:
                    print("Rate Limit Exception reached!")
                    if attempt == MAX_RATE_LIMIT_RETRIES:
                        # Give up on this date instead of looping forever
                        print(f"Skipping: {date}")
                    else:
                        # Wait for the limit to recover, then retry the same
                        # date so its repositories are not silently dropped.
                        time.sleep(RATE_LIMIT_PAUSE_SECONDS)
                except Exception as e:
                    print(e)
                    # These are typically impossible dates (31-2-2022)
                    print(f"Skipping: {date}")
                    break

In [None]:
# Number of HCL repository clone URLs collected in script_urls by the search
# loop of the current run. This may differ from the number of lines in
# hclURLs.txt, because that file is opened in append mode and keeps the
# entries written by earlier runs.
print(len(script_urls))

### Mining for Terraform Repositories

We start by reading the repositories saved in ``hclURLs.txt`` from the previous script.

In [None]:
# Read the URLs saved by the search step, one per line.
# The context manager closes the file handle as soon as reading is done.
with open('hclURLs.txt', 'r') as gitUrls_file:
    repo_links = gitUrls_file.readlines()
# Strip the trailing '\n' and drop blank lines, which would otherwise be
# treated as (unparseable) repository URLs by the mining step.
repo_links = [repo.strip() for repo in repo_links if repo.strip()]

The following script scans the contents of each repository, looking for files with the extensions ``.tf`` and ``.tf.json``, which are Terraform artifact files.
If any of the repositories is not reachable for any reason, its URL is added to ``404_list.txt`` for later re-analysis.

In [None]:
def _repo_full_name(clone_url):
    """Return the ``owner/name`` identifier contained in a GitHub clone URL.

    Only a trailing ``.git`` is removed. Splitting on ``.git`` would also cut
    repository names that merely contain it, e.g. ``user/user.github.io``.
    Raises IndexError when the URL has fewer than five ``/``-separated parts.
    """
    parts = clone_url.split("/")
    full_name = parts[3] + '/' + parts[4]
    if full_name.endswith('.git'):
        full_name = full_name[:-len('.git')]
    return full_name


def _is_terraform_file(file_name, suffixes):
    """Return True when ``file_name`` ends with one of ``suffixes`` (case-insensitive).

    A suffix test is used instead of a substring test so that names such as
    ``terraform.tfvars`` or ``terraform.tfstate`` are not counted as ``.tf`` files.
    """
    return file_name is not None and file_name.lower().endswith(tuple(suffixes))


# Walk every repository breadth-first until a Terraform file is found.
# Matching repositories go to terraform_repos.txt; repositories that could not
# be analysed (including rate-limited ones) go to 404_list.txt for later
# re-analysis.
counter = 0
terraform_keywords = ['.tf', '.tf.json']
terraform_relevant_repos = []
for repo_url in repo_links:
    if counter % 100 == 0:
        print(f'Got to {counter}')
    try:
        time.sleep(2)  # throttle repository lookups
        repo = g.get_repo(_repo_full_name(repo_url))
        contents = repo.get_contents('')
        while contents:
            time.sleep(0.2)  # sleep for API core limit
            file_content = contents.pop(0)
            if file_content.type == "dir":
                contents.extend(repo.get_contents(file_content.path))
            elif _is_terraform_file(file_content.name, terraform_keywords):
                terraform_relevant_repos.append(repo_url)
                with open("terraform_repos.txt", "a") as file:
                    file.write(f"{repo_url}\n")
                # One Terraform file is enough to classify the repository
                break
    except RateLimitExceededException:
        print("Rate Limit Exception reached!")
        # Record the repository so it can be re-analysed later instead of
        # being silently dropped from the study.
        with open("404_list.txt", "a") as file:
            file.write(f"{repo_url}\n")
    except Exception as e:
        print(f"{e}\n{repo_url}")
        with open("404_list.txt", "a") as file:
            file.write(f"{repo_url}\n")
    finally:
        # Count every processed repository, successful or not, so the
        # progress message keeps advancing.
        counter += 1