# Issues2dataframe

Read the bodies of the open GitHub issues that request the creation of a new repository.  
Extract the relevant fields, collect them in a pandas DataFrame and save it to a CSV file.

In [11]:
import os
from dotenv import load_dotenv

import requests
import pandas as pd 

In [12]:
# Load the variables defined in a local .env file into the process environment.
# NOTE(review): presumably this is where ICSC_GITHUB_ORG_PAT (read in the
# configuration cell) comes from -- confirm.
load_dotenv()

True

In [13]:
def get_issue_content(owner, repo, issue_number, token, timeout=30):
    """Fetch a single issue from the GitHub REST API.

    Parameters
    ----------
    owner : str
        Account or organization that owns the repository.
    repo : str
        Name of the repository.
    issue_number : int
        Number of the issue to retrieve.
    token : str
        GitHub token sent in the ``Authorization`` header.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without it a stalled connection would block the notebook forever.

    Returns
    -------
    dict or None
        The decoded JSON payload when the API answers with status 200;
        otherwise a message is printed and ``None`` is returned.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json"
    }

    response = requests.get(url, headers=headers, timeout=timeout)

    if response.status_code == 200:
        return response.json()

    print(f"Failed to fetch issue {issue_number}: {response.status_code}")
    return None



def fetch_issues_numbers_id(owner, repo, token, label, timeout=30):
    """List the open issues of a repository that carry a given label.

    The GitHub "list repository issues" endpoint is paginated; pages are
    requested one after the other until the last one has been read.
    Entries containing a ``pull_request`` key are skipped.

    Parameters
    ----------
    owner : str
        Account or organization that owns the repository.
    repo : str
        Name of the repository.
    token : str
        GitHub token sent in the ``Authorization`` header.
    label : str
        Label used to filter the issues (``labels`` query parameter).
    timeout : float or tuple, optional
        Seconds to wait for the server on each request, forwarded to
        ``requests.get`` so that a stalled connection cannot hang forever.

    Returns
    -------
    list of dict
        One dict per issue with the keys ``'Issue Number'``, ``'Title'`` and
        ``'Node ID'``. If a request does not return status 200 a message is
        printed and the issues collected so far are returned.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/issues"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json"
    }

    per_page = 100
    issues = []
    page = 1

    while True:
        params = {'state': 'open', 'labels': label, 'page': page, 'per_page': per_page}
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to retrieve issues: {response.status_code}")
            break

        page_issues = response.json()
        if not page_issues:
            break

        issues.extend(
            {
                'Issue Number': issue['number'],
                'Title' : issue['title'],
                'Node ID' : issue['node_id']
            }
            for issue in page_issues
            if 'pull_request' not in issue
        )

        # A page shorter than per_page is the last one: stop here instead of
        # spending another request just to receive an empty list.
        if len(page_issues) < per_page:
            break

        page += 1

    return issues



def extract_fields(description, fields):
    """Pick ``<field><sep><value>`` lines out of an issue description.

    Every line of ``description`` that starts with one of ``fields`` yields an
    entry whose value is the rest of the line after the field name and the
    single separator character that follows it, with leading whitespace
    removed. When a field appears on several lines the last one wins.

    Parameters
    ----------
    description : str or None
        Text to scan. ``None`` or an empty string yields an empty dict
        instead of raising an AttributeError.
    fields : iterable of str
        Field names to look for at the start of each line.

    Returns
    -------
    dict
        Mapping of each field found to its extracted value.
    """
    extracted_data = {}
    if not description:
        return extracted_data

    for line in description.splitlines():
        for field in fields:
            if line.startswith(field):
                # +1 skips the separator character right after the field name
                extracted_data[field] = line[len(field)+1:].lstrip()

    return extracted_data



def issue_description2dict(description,fields):
    """Convert the body of a repository-request issue into a dict.

    The body is scanned line by line. A line starting with one of ``fields``
    yields an entry whose value is the rest of the line after the field name
    and the single separator character that follows it. The ``"Visibility"``
    field is special: it is not stored as such. Instead the text between the
    "Visibility" line and the first occurrence of ``'Purpose'`` is searched
    for a ticked "Public" checkbox, and the result is stored under the key
    ``'Private'``.

    Parameters
    ----------
    description : str or None
        Issue body. Any line-ending convention is accepted; ``None`` or an
        empty string yields an empty dict.
    fields : iterable of str
        Field names to look for at the start of each (stripped) line.

    Returns
    -------
    dict
        Field name -> extracted string, plus ``'Private'`` -> bool when a
        "Visibility" line is present and ``"Visibility"`` is in ``fields``.
    """
    RES = {}
    if not description:
        return RES

    # splitlines() copes with both CRLF and LF bodies and, unlike an in-band
    # sentinel string, cannot clash with characters that occur in the text.
    text_lines = description.splitlines()

    for line_num, raw_line in enumerate(text_lines):
        line = raw_line.strip()

        for field in fields:
            if not line.startswith(field):
                continue

            if field == "Visibility":
                # Everything after the "Visibility" line, up to 'Purpose'
                visibility_section = ''.join(text_lines[line_num+1:]).split('Purpose')[0]
                # A ticked markdown checkbox may be written "[x]" or "[X]"
                is_public = ('- [x] Public' in visibility_section
                             or '- [X] Public' in visibility_section)
                RES['Private'] = not is_public
                continue

            # +1 skips the separator character right after the field name
            RES[field] = line[len(field)+1:].lstrip()

    return RES

## Configuration

In [8]:
# Organization and repository whose issues are scanned.
owner = "ICSC-CN-HPC-Spoke-4-Earth-Climate"
repo = "administration"

# Token taken from the environment; it is None when the variable is not set.
token = os.environ.get('ICSC_GITHUB_ORG_PAT')

## Fetch issue numbers

In [9]:
# Collect number, title and node ID of every open issue labelled "new_repo"
# (pull requests are skipped by fetch_issues_numbers_id).
a = fetch_issues_numbers_id(owner,repo,token,label="new_repo")
a

[{'Issue Number': 14,
  'Title': '[Repository Creation] - test',
  'Node ID': 'I_kwDOMO4l_86Oge3k'},
 {'Issue Number': 13,
  'Title': '[Repository Creation] - test_repo_3',
  'Node ID': 'I_kwDOMO4l_86OP2ml'},
 {'Issue Number': 11,
  'Title': '[Repository Creation] - test_repo_2',
  'Node ID': 'I_kwDOMO4l_86OPkVa'},
 {'Issue Number': 10,
  'Title': '[Repository Creation] - test_repo',
  'Node ID': 'I_kwDOMO4l_86OPOUC'}]

In [14]:
# Name the columns explicitly so that an empty result (no open issue carries
# the label) still gives a well-formed table instead of a KeyError in
# sort_values.
df = pd.DataFrame(a, columns=['Issue Number', 'Title', 'Node ID']).sort_values(by='Issue Number',ascending=True)
df

Unnamed: 0,Issue Number,Title,Node ID
3,10,[Repository Creation] - test_repo,I_kwDOMO4l_86OPOUC
2,11,[Repository Creation] - test_repo_2,I_kwDOMO4l_86OPkVa
1,13,[Repository Creation] - test_repo_3,I_kwDOMO4l_86OP2ml
0,14,[Repository Creation] - test,I_kwDOMO4l_86Oge3k


## Get issue contents

In [15]:
# Labels searched for at the start of each line of an issue body.
# "Visibility" is not stored as text: issue_description2dict turns it into
# the boolean "Private" entry.
fields = ["Repository name","WP","Task","Repository manager","Collaborators","Visibility"]

In [16]:
# Columns to add to the table: every parsed field, with "Visibility"
# replaced by the "Private" flag.
new_columns = list(fields)
new_columns.remove("Visibility")
new_columns += ["Private"]

# Create the columns as None placeholders; they are filled in issue by issue.
df[new_columns] = None
df

Unnamed: 0,Issue Number,Title,Node ID,Repository name,WP,Task,Repository manager,Collaborators,Private
3,10,[Repository Creation] - test_repo,I_kwDOMO4l_86OPOUC,,,,,,
2,11,[Repository Creation] - test_repo_2,I_kwDOMO4l_86OPkVa,,,,,,
1,13,[Repository Creation] - test_repo_3,I_kwDOMO4l_86OP2ml,,,,,,
0,14,[Repository Creation] - test,I_kwDOMO4l_86Oge3k,,,,,,


In [18]:
# Download each issue, parse its body and copy the parsed values into the
# matching row of the table.
for n in df['Issue Number']:
    print(f'Retrieving the description of issue #{n}')

    issue_content = get_issue_content(owner,repo,n,token)
    if issue_content is None:
        # The request failed (get_issue_content already printed why):
        # keep the None placeholders for this row and move on.
        continue

    # The API may return a null body for an issue without description.
    descr = issue_description2dict(issue_content.get("body") or "",fields)

    # Update dataframe
    row = df['Issue Number'] == n
    for c in new_columns:
        # A field missing from the issue body keeps its None placeholder
        # instead of aborting the whole loop with a KeyError.
        if c in descr:
            df.loc[row, c] = descr[c]

df

Retrieving the description of issue #10
Retrieving the description of issue #11
Retrieving the description of issue #13
Retrieving the description of issue #14


Unnamed: 0,Issue Number,Title,Node ID,Repository name,WP,Task,Repository manager,Collaborators,Private
3,10,[Repository Creation] - test_repo,I_kwDOMO4l_86OPOUC,test_repo,1,1.1,MclTTI,"[MclTTI, agalizia]",True
2,11,[Repository Creation] - test_repo_2,I_kwDOMO4l_86OPkVa,test_repo_2,1,1.3,MclTTI,"MclTTI, agalizia",True
1,13,[Repository Creation] - test_repo_3,I_kwDOMO4l_86OP2ml,test_repo_3,1,1.2,MclTTI,MclTTI,True
0,14,[Repository Creation] - test,I_kwDOMO4l_86Oge3k,test,2,2.3,<GitHub username of the repository manager>,[comma separated list of collaborators' GitHub...,False


## Save to file

In [19]:
# Write the table to a CSV file in the current working directory,
# leaving out the DataFrame index.
output_name = "Repository_list.csv"
save_file = os.path.join(os.getcwd(), output_name)
df.to_csv(save_file, index=False)