# Dataset extraction 
This notebook will:
* Find GitHub issues whose labels match a chosen priority pattern (e.g. "high priority")
* Produce a single CSV dataset containing all the matching issues

In [None]:
import pandas as pd
import re
import os

In [None]:
# Folder that holds the extracted issue CSV files; point this at the
# extracted folder containing all the issues csv files.
dir_path = r'../../../csv/all_issues'

# Names of the regular files found directly inside dir_path
# (sub-directories are skipped), in the order os.listdir returns them.
res = [
    entry
    for entry in os.listdir(dir_path)
    if os.path.isfile(os.path.join(dir_path, entry))
]
print(res)

In [None]:
# Regular expressions matching label spellings of each priority level.
# "p", "pri", "prio" and "priority" are all accepted, in either word order
# and with any non-word separator (e.g. "high priority", "priority: high",
# "high-pri").
high_priority = r"\bhigh\W*p(?:ri(?:o(?:rity)?)?)?\b|\bp(?:ri(?:o(?:rity)?)?)?\W*high\b"

# NOTE(review): despite its name, this pattern matches high-urgency wording
# (critical, severe, urgent, ... and also "high") next to a priority word.
# Presumably it means "high priority expressed in other words" - confirm the
# intent before selecting it below.
not_high_priority = r"\b(?:high\W*|critical\W*|severe\W*|important\W*|urgent\W*|essential\W*|imperative\W*|paramount\W*|pressing\W*|crucial\W*|vital\W*|mandatory\W*|top\W*priority\W*|compulsory\W*|expedient\W*)(?:p(?:ri(?:o(?:rity)?)?)?|\burgent\b|\bsevere\b)\b|\b(?:p(?:ri(?:o(?:rity)?)?)?|\burgent\b|\bsevere\b)\W*(?:high|critical|severe|important|urgent|essential|imperative|paramount|pressing|crucial|vital|mandatory|top\W*priority|compulsory|expedient)\b"
medium_priority = r"\b(?:medium|mid)\W*p(?:ri(?:o(?:rity)?)?)?\b|\bp(?:ri(?:o(?:rity)?)?)?\W*(?:medium|mid)\b"

low_priority = r"\blow\W*p(?:ri(?:o(?:rity)?)?)?\b|\bp(?:ri(?:o(?:rity)?)?)?\W*low\b"


# Change to the pattern you want to extract
pattern = high_priority
file_name = "high_priority"

# Matching rows of each file are collected here and concatenated once at the
# end, so the list and the final DataFrame do not share a name.
matched_frames = []

length_res = len(res)
# Count from 1 so the progress output runs 1/N ... N/N.
for i, r in enumerate(res, start=1):
    try:
        file_path = os.path.join(dir_path, r)
        df = pd.read_csv(file_path, index_col=0)

        # Make sure the labels column is not empty (drops NaN and
        # whitespace-only values).
        df = df[df['labels'].notnull() & df['labels'].str.strip().astype(bool)]

        # Keep only the rows whose labels match the selected pattern,
        # ignoring case.
        df = df[
            df["labels"].str.contains(pattern, case=False, na=False, regex=True)
        ]

        if not df.empty:  # Keep only files that produced at least one match
            matched_frames.append(df)

        print(f"{i}/{length_res} Processed: {r} ")
    except Exception as e:
        # Best effort: one unreadable or malformed file must not abort the
        # whole extraction, so report it and move on to the next file.
        print(f"{i}/{length_res} Error processing {r}: {e}")

if matched_frames:
    appended_data = pd.concat(matched_frames, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to an empty
    # DataFrame and say so instead of failing with an obscure error.
    print(f"No issues matched the '{file_name}' pattern in {length_res} file(s)")
    appended_data = pd.DataFrame()


In [None]:
# Wrap the collected data in a DataFrame and show it as the cell output.
appended_data = pd.DataFrame(data=appended_data)
appended_data

In [None]:
# Frequency of each distinct value in the labels column, most common first;
# display the first 50 rows.
label_counts = appended_data.labels.value_counts()
label_counts.to_frame()[:50]

In [None]:
# Number of extracted issues per repo, most frequent first; display the
# first 50 rows.
repo_counts = appended_data.repo.value_counts()
repo_counts.to_frame()[:50]

In [None]:
# Number of unique repos in the combined dataset.
# Count on appended_data: `df` is only the leftover loop variable holding the
# filtered rows of the last processed file, not the whole dataset.
appended_data["repo"].nunique()

In [None]:
# Create the csv output dir if it does not exist.
# exist_ok=True makes this a single race-free call, and it raises if "csv"
# exists but is not a directory instead of silently skipping the creation.
path_dir = "csv"
os.makedirs(path_dir, exist_ok=True)

In [None]:
# Write the combined dataset to csv/<file_name>_full_dataset.csv
# (the DataFrame index is written as the first column).
priority_file_name = f"csv/{file_name}_full_dataset.csv"
appended_data.to_csv(path_or_buf=priority_file_name)

In [None]:
# Load the file that was just written, using its first column as the index,
# and display it to check that it is not corrupted.
pri = pd.read_csv(filepath_or_buffer=priority_file_name, index_col=0)
pri