# Issue Mining

In [None]:
import time
import json
import calendar
from perceval.backends.core.github import GitHub as pGithub
from github import Github
from pathlib import Path

### Logging into GitHub

The access token required by Perceval is declared here.
The PyGithub client is also needed to calculate how long to wait when the API rate limit is reached.

In [None]:
# Personal access token; it is reused later when constructing the Perceval
# backend, so replace the placeholder before running the notebook.
access_token = "< YOUR PRIVATE ACCESS TOKEN >"
# PyGithub client authenticated with the same token (used later to query the rate limit).
g = Github(login_or_token=access_token)

# Confirm the login is successful by fetching the authenticated user
# and printing its login name.
user = g.get_user()
print(f"Authenticated as: {user.login}")

### Preparing Data

We take the repository URLs from all the commits that contain one of our keywords; ``terraform_keyworded.json`` was obtained during commit mining.

In [None]:
# Load the commit-mining output. The context manager guarantees the file
# handle is closed even if parsing fails.
with open('terraform_keyworded.json', encoding='utf-8') as f:
    data = json.load(f)

# The 'name' field of every entry under 'repositories' is the repository URL
# that the issue-mining loop iterates over.
terraform_keyworded_urls = [entry['name'] for entry in data['repositories']]

### Extracting relevant issues

The following script uses Perceval to extract any issue that contains a cost-related keyword in its title, body or comments. Pull requests are NOT filtered out and are still part of the end result.
This process is highly time-consuming; therefore, on the off chance that we hit the GitHub API rate limit, a waiting period is implemented that calculates exactly how long it takes for our limits to reset.

In [None]:
# Substrings that mark a piece of text as cost-related.
# NOTE(review): matching is case-sensitive, so a title such as "Cost ..." does
# not match 'cost' -- confirm whether that is intended before changing it.
cost_keywords = ['cheap', 'expens', 'cost', 'efficient', 'pay', 'bill']
relevant_repos = []  # one {"name", "issues"} entry per repository with at least one match
count = 0  # number of repositories visited so far


def _has_cost_keyword(text):
    """Return True if ``text`` is not None and contains any of ``cost_keywords``."""
    return text is not None and any(key in text for key in cost_keywords)


def _is_cost_related(item_data):
    """Return True if the title, body or any comment body contains a cost keyword.

    Missing or ``None`` fields (including a ``None`` ``comments_data``) are
    treated as "no match" instead of raising.
    """
    if _has_cost_keyword(item_data.get('title')):
        return True
    if _has_cost_keyword(item_data.get('body')):
        return True
    comments = item_data.get('comments_data') or []
    return any(_has_cost_keyword(comment.get('body')) for comment in comments)


def _split_repo_url(repo):
    """Return ``(owner, repository)`` taken from path segments 3 and 4 of ``repo``.

    A trailing ``.git`` is removed from the repository segment; a name without
    that suffix is returned unchanged rather than being truncated at its last dot.
    """
    parts = repo.split('/')
    owner = parts[3]
    repository = parts[4]
    if repository.endswith('.git'):
        repository = repository[:-len('.git')]
    return owner, repository


def _build_issue_record(item_data):
    """Return the subset of ``item_data`` that is stored for a relevant issue.

    Absent keys are stored as ``None``; a missing or null ``user`` yields
    ``None`` for the user URL.
    """
    user = item_data.get('user')
    return {
        'title': item_data.get('title'),
        'html_url': item_data.get('html_url'),
        'user': user.get('url') if user else None,
        # Items carrying a 'pull_request' key are labelled as pull requests.
        'category': 'pull_request' if 'pull_request' in item_data else 'issue',
        'labels': item_data.get('labels'),
        'closed_at': item_data.get('closed_at'),
        'assignee': item_data.get('assignee'),
        'assignees': item_data.get('assignees'),
        'body': item_data.get('body'),
        'comments': item_data.get('comments'),
        'comments_data': item_data.get('comments_data'),
    }


def _seconds_until_rate_limit_reset():
    """Return the non-negative number of seconds until the core rate limit resets.

    Five seconds are added to be sure the limit has been reset. The result is
    clamped at zero because ``time.sleep`` raises ``ValueError`` on a negative
    argument (possible when the local clock runs ahead of the reported reset).
    """
    core_rate_limit = g.get_rate_limit().core
    reset_timestamp = calendar.timegm(core_rate_limit.reset.timetuple())
    return max(0, reset_timestamp - calendar.timegm(time.gmtime()) + 5)


for repo in terraform_keyworded_urls:
    print(repo)
    time.sleep(2)
    count += 1
    try:
        # Longer pause after every 50th repository.
        if count % 50 == 0:
            time.sleep(120)
            print(f"At: {count}")

        # Extracting owner username and repository name from the URL
        owner, repository = _split_repo_url(repo)
        fetched = pGithub(owner=owner, repository=repository, api_token=[access_token])
        issue_list = []

        for item in fetched.fetch():
            time.sleep(1)
            item_data = item['data']
            relevant = _is_cost_related(item_data)

            time.sleep(1)

            if relevant:
                print("-adding elements to dictionary")
                issue_list.append(_build_issue_record(item_data))

        repo_dic = {"name": repo, "issues": issue_list}
        if issue_list:
            relevant_repos.append(repo_dic)

    except Exception as e:
        print(f"Repo: {repo} failed")
        print(e)
        # Record the failure before sleeping so it is not lost if the run is
        # interrupted during the wait.
        with open("terraform_issues_error_list.txt", "a") as file:
            file.write(f"{repo}\n")
        # Every failure, whatever its cause, waits until the core rate limit resets.
        time.sleep(_seconds_until_rate_limit_reset())

### Writing our results

Our results are saved in a JSON file for later analysis.

In [None]:
# Make sure the results file exists before it is written to.
myfile = Path('terraform_issues_updated.json')
myfile.touch(exist_ok=True)

# Bundle the repository count together with the collected per-repository issues.
output = {
    "no_of_repos": len(relevant_repos),
    "repositories": relevant_repos,
}

# Serialise everything as JSON, overwriting any previous content.
with myfile.open("w") as outfile:
    outfile.write(json.dumps(output))