# Commits Mining

In [1]:
import json
from pydriller import Repository

### Preparing our data

We begin by reading all the previously extracted Terraform repositories.
We then define a list of keywords to be used in the commit message filtering phase.

In [None]:
# Read the repository URLs from the file, one per line, stripping the '\n'.
# The context manager closes the file handle as soon as the list is built.
# Blank lines are skipped so they are not treated as (empty) repository URLs.
with open('terraform_repos.txt', 'r') as all_repos:
    repo_links = [line.strip() for line in all_repos if line.strip()]
# Keyword stems looked up as substrings of the lowercased commit message,
# so e.g. 'expens' covers both "expensive" and "expenses".
cost_keywords = ['cheap', 'expens', 'cost', 'efficient', 'bill', 'pay']

### Extracting relevant commits

The following script makes use of PyDriller. For every commit in each repository, if a commit message exists and any of the keywords appears in it, the commit is taken into consideration.
The information saved is: commit id, message, author, date and list of modified files.
If none of the commits turns out to be relevant, the whole repository is discarded.
Any repository that is not accessible at the time is recorded in ``commit_not_retrievable.txt`` for later re-analysis.

In [None]:
relevant_repos = []
count = 0
for repo in repo_links:
    # Progress indicator: one line every 100 repositories
    if count % 100 == 0:
        print("Got to {}".format(count))

    commits = []
    try:
        # Walk through every commit of the repository
        for commit in Repository(repo).traverse_commits():
            message = commit.msg
            # Commits without a message cannot match any keyword
            if message is None:
                continue
            lowered = message.lower()
            # Skip commits whose message mentions none of the keywords
            if not any(key in lowered for key in cost_keywords):
                continue
            # Record the commit together with the names of the files it modified
            commits.append({
                "id": commit.hash,
                "msg": message,
                "author": commit.author.name,
                "date": str(commit.author_date),
                "modified_files": [changed.filename for changed in commit.modified_files],
            })

        # Keep the repository only if at least one commit was relevant
        if commits:
            relevant_repos.append({"name": repo, "commits": commits})
    except Exception as e:
        # Report the error and remember the repository so it can be re-analysed later
        print(f"{e}\n{repo}")
        with open("commit_not_retrievable.txt", "a") as file:
            file.write(f"{repo}\n")
    count += 1

### Saving to JSON

Our results are saved in a JSON file for safekeeping and future analysis.

In [None]:
# Bundle the relevant repositories with their count and serialise them to disk
output = {
    "no_of_repos": len(relevant_repos),
    "repositories": relevant_repos,
}
with open("terraform_keyworded.json", "w") as outfile:
    outfile.write(json.dumps(output))

### Repository selection

This script refines the previous JSON file so that only commits that modify ``.tf`` and ``.tf.json`` files are taken into consideration.

In [None]:
# Load the keyword-filtered repositories written by the previous step.
# The context manager closes the file handle once the JSON has been parsed.
with open('terraform_keyworded.json') as terraform_output:
    selected_repos = json.load(terraform_output)

filtered_repos = []
# File-name suffixes that identify Terraform configuration files
terraform_keywords = ['.tf', '.tf.json']
# str.endswith needs a tuple. Matching on the suffix instead of a substring
# keeps out look-alikes such as 'x.tfvars', 'x.tfstate' or 'main.tf.bak'.
terraform_suffixes = tuple(terraform_keywords)
print(len(selected_repos["repositories"]))

for repo in selected_repos["repositories"]:
    relevant_commits = []
    for commit in repo["commits"]:
        # A commit is relevant as soon as one of its modified files is a Terraform file
        touches_terraform = any(
            mod_file is not None and mod_file.endswith(terraform_suffixes)
            for mod_file in commit["modified_files"]
        )
        if touches_terraform:
            relevant_commits.append(commit)

    # Keep the repository only if it still has relevant commits, and
    # narrow its commit list down to exactly those commits.
    if relevant_commits:
        repo["commits"] = relevant_commits
        filtered_repos.append(repo)

print(f"Identified {len(filtered_repos)}")

output = {"no_of_repos": len(filtered_repos), "repositories": filtered_repos}
# NOTE(review): the output file name has no '.json' extension although the
# content is JSON; left unchanged in case later steps read this exact path.
with open("terraform_tf_keywords", "w") as outfile:
    json.dump(output, outfile)