# Dataset creation
This notebook will:
* Find github issues by label

In [1]:
import pandas as pd
import os

# Read the CSV file into a DataFrame
repo_names_df = pd.read_csv('csv/repo_names.csv')

# Convert the 'Repository Name' column to a Python list
repo_names = repo_names_df['Repository Name'].tolist()
#Should be 60238
len(repo_names)


60238

In [2]:
# folder path
dir_path = r'../../../csv/all_issues' ## Point to the extracted folder containing all the issues csv files

# list to store files
res = []

# Iterate directory
for path in os.listdir(dir_path):
    # check if current path is a file
    if os.path.isfile(os.path.join(dir_path, path)):
        res.append(path)
print(res)

['issues_all_2020-02-03.csv', 'issues_all_2016-09-19.csv', 'issues_all_2023-09-11.csv', 'issues_all_2020-09-24.csv', 'issues_all_2023-05-07.csv', 'issues_all_2022-09-18.csv', 'issues_all_2018-02-13.csv', 'issues_all_2019-12-22.csv', 'issues_all_2016-04-20.csv', 'issues_all_2017-01-01.csv', 'issues_all_2017-08-27.csv', 'issues_all_2022-08-20.csv', 'issues_all_2016-11-23.csv', 'issues_all_2018-12-12.csv', 'issues_all_2019-06-06.csv', 'issues_all_2023-01-20.csv', 'issues_all_2022-04-13.csv', 'issues_all_2022-06-10.csv', 'issues_all_2021-06-12.csv', 'issues_all_2019-07-13.csv', 'issues_all_2019-01-24.csv', 'issues_all_2017-03-06.csv', 'issues_all_2015-01-01.csv', 'issues_all_2022-05-03.csv', 'issues_all_2020-02-16.csv', 'issues_all_2015-06-09.csv', 'issues_all_2022-11-23.csv', 'issues_all_2017-02-08.csv', 'issues_all_2020-11-26.csv', 'issues_all_2016-08-11.csv', 'issues_all_2023-04-24.csv', 'issues_all_2019-01-14.csv', 'issues_all_2017-04-30.csv', 'issues_all_2018-12-06.csv', 'issues_all_2

In [3]:
repo_names_set = set(repo_names)  # Convert list to set for faster lookup
# total number of unique repos in set
len(repo_names_set)

60238

In [4]:
out_file_name= 'csv/issues_in_repos_with_priority'  # File to store the processed data



# Calculate the total number of files to be processed
total_files = len(res)  # Assuming 'res' is a list of your CSV filenames
files_per_csv = total_files // 5 + (1 if total_files % 5 > 0 else 0)

# Process and save each file, distributing across five different CSV files
for i, filename in enumerate(res):
    csv_number = i // files_per_csv + 1
    output_file_path = f'{out_file_name}_part_{csv_number}.csv'
    
    # Ensure the output directory exists
    os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

    print(f"Processing file {i + 1}/{total_files} into {output_file_path}: {filename}")
    full_file_path = os.path.join(dir_path, filename)

    try:
        for chunk in pd.read_csv(full_file_path, chunksize=10000):
            try:
                filtered_chunk = chunk[chunk['repo'].isin(repo_names_set)]
                if not filtered_chunk.empty:
                    # If the file doesn't exist, write the header. Otherwise, append without the header.
                    with open(output_file_path, 'a') as f:
                        filtered_chunk.to_csv(f, header=f.tell() == 0, index=False)
            except Exception as e_chunk:
                print(f"Error processing chunk in {filename}: {e_chunk}")
    except Exception as e_file:
        print(f"Error processing file {filename}: {e_file}")


Processing file 1/3137 into csv/issues_in_repos_with_priority_part_1.csv: issues_all_2020-02-03.csv
Processing file 2/3137 into csv/issues_in_repos_with_priority_part_1.csv: issues_all_2016-09-19.csv
Processing file 3/3137 into csv/issues_in_repos_with_priority_part_1.csv: issues_all_2023-09-11.csv
Error processing file issues_all_2023-09-11.csv: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.

Processing file 4/3137 into csv/issues_in_repos_with_priority_part_1.csv: issues_all_2020-09-24.csv
Processing file 5/3137 into csv/issues_in_repos_with_priority_part_1.csv: issues_all_2023-05-07.csv
Processing file 6/3137 into csv/issues_in_repos_with_priority_part_1.csv: issues_all_2022-09-18.csv
Processing file 7/3137 into csv/issues_in_repos_with_priority_part_1.csv: issues_all_2018-02-13.csv
Processing file 8/3137 into csv/issues_in_repos_with_priority_part_1.csv: issues_all_2019-12-22.csv
Processing file 9/3137 into csv/issues_in_repos_with_priority_