# Extract C/C++ source codes from GitHub repositories

This Python notebook gathers C/C++ source code from GitHub repositories, extracts functions together with their descriptions, and saves them to a file.

In [None]:
import time
from github import Github
import os
import pandas as pd
import subprocess
from lxml import etree
import pandas as pd
from typing import List, Dict
import regex as re
from dotenv import load_dotenv
load_dotenv()

## Download the repositories according to criteria

We chose the repositories according to the following criteria:
- Topic: we chose topics ["microcontroller", "firmware", "embedded-systems"]
- Keywords: ["Cortex-M0", "Cortex-M3", "Cortex-M4", "Cortex-A", "STM32", "ESP32", "ESP8266", "ATmega328", "PIC16F877A"]
- Language: we chose the language "C++" or "C"
- Number of stars: we chose repositories in descending order of stars

In [None]:
# Repositories that are never downloaded, matched by repository name
skip_repos = ["msdk", 'renode', 'u8g2', 'GuiLite']

# GitHub API client, authenticated with the token from the environment
g = Github(os.environ['GITHUB_TOKEN'])

# Make sure the download folder exists, together with its 'index' subfolder
# that holds one .json file per downloaded repository
for required_folder in ('finalRepos', 'finalRepos/index'):
    if not os.path.exists(required_folder):
        os.makedirs(required_folder)


def clone_repository(repo, parent_folder: str):
    '''
    Clone the repository, remove the .git folder, remove all files except C/C++ files and remove empty directories

    The working directory of the process is never changed, so the function
    works for any depth of parent_folder and cannot leave the caller in a
    different directory when something fails half-way.

    Args:
        repo: Repository object (its clone_url and name attributes are used)
        parent_folder: Folder to save the repository

    Raises:
        subprocess.CalledProcessError: if 'git clone' exits with an error
    '''
    import shutil  # local import: only needed for removing the .git folder

    repo_path = os.path.join(parent_folder, repo.name)

    # Shallow clone. The arguments are passed as a list (no shell), and '--'
    # stops option parsing, so a repository name or URL containing spaces or
    # shell metacharacters cannot alter the command.
    subprocess.run(
        ['git', 'clone', '--depth=1', '--filter=blob:none', '--', repo.clone_url, repo_path],
        check=True,
    )

    # Remove the .git folder
    shutil.rmtree(os.path.join(repo_path, '.git'), ignore_errors=True)

    # Walk bottom-up so that a directory is visited after its subdirectories
    # and can be removed once everything below it is gone
    for current_dir, _subdirs, file_names in os.walk(repo_path, topdown=False):
        for file_name in file_names:
            file_path = os.path.join(current_dir, file_name)
            # Only regular files are deleted; symbolic links are left alone
            if os.path.islink(file_path) or not os.path.isfile(file_path):
                continue
            if not file_name.endswith(('.c', '.cpp')):
                os.remove(file_path)

        # Remove directories that ended up empty, but keep the repository
        # folder itself so that it still marks the repository as downloaded
        if current_dir != repo_path and not os.listdir(current_dir):
            os.rmdir(current_dir)

def process_query(query, n: int, folder_to_save: str, keywords: List[str]):
    '''
    Process the query and download the repositories

    For every keyword the search results are walked in descending order of
    stars until n new repositories have been cloned. Repositories listed in
    skip_repos and repositories whose folder already exists are skipped and
    do not count towards n.

    Args:
        query: Callable that receives a keyword and returns the search query string
        n: maximum number of repositories to download for each keyword
        folder_to_save: Folder to save the repositories
        keywords: List of keywords to search
    '''
    import json  # local import: only needed for writing the index files

    for keyword in keywords:
        downloaded = 0  # Number of repositories downloaded for this keyword
        search_results = g.search_repositories(query=query(keyword), sort="stars", order="desc")

        for repo in search_results:
            if downloaded == n:  # Check if we have downloaded n repositories
                break
            if repo.name in skip_repos:
                continue
            if os.path.exists(f'{folder_to_save}/{repo.name}'):
                print(f"Skipping {repo.name} as it is already downloaded")
                continue

            clone_repository(repo, folder_to_save)
            downloaded += 1

            # Add file with information about the repository. json.dump
            # escapes quotes and backslashes, so the index file stays valid
            # JSON whatever the repository metadata contains.
            repo_info = {
                "name": repo.name,
                "url": repo.html_url,
                "license": repo.license.name if repo.license else "None",
                "key": keyword,
                "stars": repo.stargazers_count,
                "user": repo.owner.login,
                "time_downloaded": time.strftime("%Y-%m-%d %H:%M:%S"),
            }
            with open(f'{folder_to_save}/index/{repo.name}.json', 'w', encoding='utf-8') as f:
                json.dump(repo_info, f, ensure_ascii=False)

        # Reported once per keyword (and never when there are no keywords)
        print(f"Downloaded {downloaded} repositories for {keyword}")

# Search by GitHub topic: at most 50 new repositories per topic
process_query(
    query=lambda topic: f"topic:{topic} NOT graphics NOT opengl NOT directx NOT WebGL NOT GUI language:C language:C++",
    n=50,
    folder_to_save='finalRepos',
    keywords=["microcontroller", "firmware", "embedded-systems"],
)

# Search by free-text keyword: at most 30 new repositories per keyword
process_query(
    query=lambda term: f"{term} NOT graphics NOT opengl NOT directx NOT WebGL NOT GUI language:C language:C++",
    n=30,
    folder_to_save='finalRepos',
    keywords=["Cortex-M0", "Cortex-M3", "Cortex-M4", "Cortex-A", "STM32", "ESP32", "ESP8266", "ATmega328", "PIC16F877A"],
)



## Filter folders

Delete folders and files that contain the keyword 'test' in their names.

In [None]:
# Anything with 'test' in its name may be less relevant, so it is deleted:
# first the matching folders, then the matching files that are left
cleanup_commands = (
    "find finalRepos -type d -name '*test*' -exec rm -rf {} +",
    "find finalRepos -type f -name '*test*' -exec rm -rf {} +",
)
for cleanup_command in cleanup_commands:
    os.system(cleanup_command)

## Edit non doxygen comments so that doxygen can process them

Used to rewrite comments from the format /* ... */ to /** ... */ so that Doxygen can process them.

In [None]:
wrong_decoded_files = 0
not_found_files = 0

# Rewrite the opening of multiline comments from '/*' to '/**' wherever the
# opener is immediately followed by a line break. The pattern is a fixed
# string, so a plain replace is used instead of a regular expression.
for root, dirs, files in os.walk('finalRepos'):
    for file in files:
        if not file.endswith(('.c', '.cpp')):
            continue

        file_path = os.path.join(root, file)

        # Read first and close the handle before the file is opened again
        # for writing
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError as e:
            print(f"Error decoding file '{file_path}': {e}")
            wrong_decoded_files += 1
            # remove the file
            os.remove(file_path)
            continue
        except FileNotFoundError as e:
            print(f"File '{file_path}' not found: {e}")
            not_found_files += 1
            # remove the entry if something is still there (e.g. a dangling
            # symbolic link); removing a path that is really gone would raise
            if os.path.lexists(file_path):
                os.remove(file_path)
            continue

        updated_content = content.replace('/*\n', '/**\n')

        # Only touch files that actually changed
        if updated_content != content:
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(updated_content)

print(f"Wrong decoded files: {wrong_decoded_files}")
print(f"Not found files: {not_found_files}")

## Extract functions from source files

This part of the code extracts functions from the source files. It uses Doxygen to parse each source file and generate XML output, then reads the XML and extracts the functions together with their descriptions. The extracted functions are periodically saved in .parquet format to prevent data loss in case of a crash and to save memory.

In [None]:
# Count the C/C++ source files with the same suffix test that the extraction
# loop uses, so that the progress total matches the files actually processed
# (directories are not counted and unusual file names cannot skew the count).
total_files = sum(
    1
    for _root, _dirs, names in os.walk('finalRepos')
    for name in names
    if name.endswith(('.c', '.cpp'))
)
print(f"Total files: {total_files}")

In [None]:
def create_doxyfile(filename: str):
    '''
    Creates a Doxyfile for the specified file.

    The Doxyfile is written to the current working directory and configures
    Doxygen to produce XML output only, in DoxygenOutput/xml_output.

    Parameters:
        filename (str): Path to the file for which the Doxyfile will be created.
    '''
    # Doxygen splits unquoted values at whitespace, so the path is quoted to
    # keep file names containing spaces in one piece. A double quote inside
    # the path is escaped with a backslash.
    quoted_input = '"' + filename.replace('"', '\\"') + '"'

    # Written as UTF-8 to agree with the DOXYFILE_ENCODING declared below
    with open('Doxyfile', 'w', encoding='utf-8') as f:
        f.write(f'''
            DOXYFILE_ENCODING      = UTF-8
            JAVADOC_BLOCK          = YES
            PROJECT_NAME           = "AutoGeneratedDoxyfile"
            OUTPUT_DIRECTORY       = DoxygenOutput
            GENERATE_XML           = YES
            GENERATE_HTML          = NO
            GENERATE_LATEX         = NO
            XML_OUTPUT             = xml_output
            INPUT                  = {quoted_input}
        ''')

def run_doxygen():
    '''
    Runs Doxygen with autogenerated Doxyfile

    Standard output is discarded. If Doxygen exits with an error, the error
    and whatever Doxygen wrote to standard error are printed.

    Returns:
        bool: True if Doxygen exited successfully, False if it exited with an error.
    '''
    try:
        subprocess.run(
            ['doxygen', 'Doxyfile'],
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        # Include Doxygen's own message; the exception text alone only
        # carries the command and the exit status
        details = e.stderr.decode('utf-8', errors='replace').strip() if e.stderr else ''
        print(f"Error running Doxygen: {e}" + (f"\n{details}" if details else ''))
        return False
    return True

def _node_text(node):
    '''Returns the leading text of an XML node, or None if the node is missing or has no text.'''
    return node.text if node is not None else None


def _extract_description(member) -> Dict:
    '''
    Extracts the description of the function from the XML element.

    Only the leading text of each paragraph (the text before its first child
    element) is used. Paragraphs of the same description are concatenated,
    each followed by a line break.

    Parameters:
        member: XML element (memberdef) containing the function.

    Returns:
        Dict: Dictionary with the keys 'brief', 'detailed', 'return' and 'parameters'.
    '''
    description = {'brief': None, 'detailed': None, 'return': None, 'parameters': []}

    for para in member.findall('.//briefdescription/para'):
        if para.text is not None:
            description['brief'] = (description['brief'] or '') + para.text + '\n'

    for para in member.findall('.//detaileddescription/para'):
        if para.text is not None:
            description['detailed'] = (description['detailed'] or '') + para.text + '\n'

        # The parameter list and the return section can be children of a
        # paragraph that also starts with text, so the children are always
        # inspected, not only when the paragraph has no text of its own
        for item in para:
            if item.tag == 'simplesect' and item.get('kind') == 'return':
                return_text = _node_text(item.find('para'))
                description['return'] = return_text + '\n' if return_text is not None else None
            elif item.tag == 'parameterlist':
                kind = item.get('kind')
                for parameter_item in item.iter('parameteritem'):
                    description['parameters'].append({
                        'kind': kind,
                        'name': _node_text(parameter_item.find('parameternamelist/parametername')),
                        'description': _node_text(parameter_item.find('parameterdescription/para')),
                    })
    return description


def _build_function_record(member, language, source_cache: Dict):
    '''
    Builds the record for one function, or returns None if the function is skipped.

    Parameters:
        member: XML element (memberdef) containing the function.
        language: Value stored under the 'language' key of the record.
        source_cache (Dict): Maps a source file path to its list of lines, so
            that a source file is read once and not once per function.

    Returns:
        Dict or None: The record, or None for functions without a description
        or without a usable body range.
    '''
    # Functions without any description are not interesting
    if member.find('.//briefdescription/para') is None and member.find('.//detaileddescription/para') is None:
        return None

    location = member.find('.//location')
    record = {'language': language}

    # Add the file name to the record, remove finalRepos/ from the path
    record['file'] = '/'.join(location.get('file').split('/')[1:])
    record['description'] = _extract_description(member)
    record['name'] = member.find('.//name').text
    record['signature'] = (
        (_node_text(member.find('.//definition')) or '')
        + (_node_text(member.find('.//argsstring')) or '')
    )

    if location.get('bodystart') is None or location.get('bodyend') is None:
        # Probably 1-liner function - skip
        return None

    body_start = int(location.get('bodystart'))
    body_end = int(location.get('bodyend'))
    if body_start < 1 or body_end < body_start:
        # No usable body range (e.g. a negative end line)
        return None

    # The body is read from the file that holds it; 'file' is the fallback
    source_path = location.get('bodyfile') or location.get('file')
    if source_path not in source_cache:
        with open(source_path, 'r', encoding='utf-8') as f:
            source_cache[source_path] = f.readlines()

    # bodystart and bodyend are 1-based, inclusive line numbers, whereas the
    # list of lines is 0-based: line k is lines[k - 1]
    record['code'] = ''.join(source_cache[source_path][body_start - 1:body_end])
    return record


def parse_xml_and_extract_functions(xml_file: str) -> List[Dict]:
    '''
    Parses the XML index file and extracts functions from it.

    The XML file of every compound of kind "file" listed in the index is
    expected in the same directory as the index file itself.

    Parameters:
        xml_file (str): Path to the XML index file.

    Returns:
        List[Dict]: One record per extracted function with the keys
        'language', 'file', 'description', 'name', 'signature' and 'code'.
        The list is empty if the index file cannot be read or parsed.
    '''
    # Parse the XML index file
    try:
        root = etree.parse(xml_file).getroot()
    except OSError as e:
        print(f"Error reading XML file: {e}")
        return []
    except etree.XMLSyntaxError as e:
        print(f"Error parsing XML file: {e}")
        return []

    xml_dir = os.path.dirname(xml_file)

    # Function records for the current source file
    functions_data = []
    # Lines of the source files read so far, keyed by path
    source_cache = {}

    for compound in root.iterfind('.//compound[@kind="file"]'):
        compound_xml = os.path.join(xml_dir, f"{compound.get('refid')}.xml")
        try:
            c_file_root = etree.parse(compound_xml).getroot()
        except OSError as e:
            print(f"Error reading XML file: {e}")
            continue
        except etree.XMLSyntaxError as e:
            print(f"Error parsing XML file: {e}")
            continue

        compounddef = c_file_root.find('.//compounddef')
        language = compounddef.get('language') if compounddef is not None else None

        for member in c_file_root.iterfind('.//memberdef[@kind="function"]'):
            # Best effort: a function that cannot be processed is reported
            # and skipped, the remaining functions are still extracted
            try:
                record = _build_function_record(member, language, source_cache)
            except Exception as e:
                print(f"Error processing function in file {compound.get('refid')}.xml: {e}")
                continue
            if record is not None:
                functions_data.append(record)
    return functions_data


def run_function_extraction(folder_paths: List[str], save_path: str, checkpoint_every: int = 20000) -> None:
    '''
    Extracts functions from all C/C++ files in the specified folder and saves them to a parquet file.

    The function creates .parquet files for every checkpoint_every files processed to save memory.
    The records that remain at the end are written to one last .parquet file.

    Parameters:
        folder_paths (List[str]): List of paths to the folders containing C/C++ files.
        save_path (str): Prefix of the parquet files; each file is named
            '<save_path>_<number of processed files>.parquet'.
        checkpoint_every (int): Number of processed files between two checkpoints.

    Raises:
        ValueError: if checkpoint_every is smaller than 1.
    '''
    import shutil  # local import: only needed for clearing the Doxygen output

    if checkpoint_every < 1:
        raise ValueError(f"checkpoint_every must be at least 1, got {checkpoint_every}")

    xml_output_dir = './DoxygenOutput/xml_output'
    index_file = './DoxygenOutput/xml_output/index.xml'

    extracted_functions_dataset = []    # List of dictionary records containing information about a function
    n_file = 0  # Processed files counter

    # Iterate over all specified folders
    for folder_path in folder_paths:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                if not file.endswith(('.c', '.cpp')):
                    continue

                # Checkpoint - Save the extracted functions to a parquet file
                if n_file % checkpoint_every == 0 and n_file != 0:
                    print(f"File number {n_file} out of {total_files}")

                    # Save the dataframe to a file and clear list to save memory
                    pd.DataFrame(extracted_functions_dataset).to_parquet(f'{save_path}_{n_file}.parquet', index=False)
                    extracted_functions_dataset = []

                file_path = os.path.join(root, file)

                # Start from an empty output folder: if Doxygen fails for
                # this file, the XML of the previous file must not be parsed
                # again, and the folder must not grow with every file
                shutil.rmtree(xml_output_dir, ignore_errors=True)

                # Create a Doxyfile for the file
                create_doxyfile(file_path)

                # Run Doxygen with the generated Doxyfile
                run_doxygen()

                # Parse the XML index file, if Doxygen produced one
                if os.path.exists(index_file):
                    extracted_functions_dataset += parse_xml_and_extract_functions(index_file)
                else:
                    print(f"No Doxygen output for '{file_path}', skipping")
                n_file += 1

    pd.DataFrame(extracted_functions_dataset).to_parquet(f'{save_path}_{n_file}.parquet', index=False)

# Extract the functions of every downloaded repository
run_function_extraction(
    folder_paths=['finalRepos'],
    save_path='big_dataset_extraction',
)