# Build the dataset for fine-tuning models

This dataset is built for fine-tuning models to generate code from function descriptions in the field of embedded systems.

In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import regex as re
import os
import datasets

## Concatenate checkpoint .parquet files from previous step

In [None]:
# Load the dataset by concatenating all the checkpoint parquet files.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop copies every previously loaded row again on each iteration.
# The directory listing is sorted so the resulting row order does not depend
# on the order in which the file system happens to return the entries.
frames = []
for file in sorted(os.listdir('.')):
    if file.endswith('.parquet') and 'big_dataset_extraction_' in file:
        print(f'Loading {file}')
        frames.append(pd.read_parquet(file))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# checkpoint file was found (same result as the previous implementation).
df = pd.concat(frames, axis=0) if frames else pd.DataFrame()

In [None]:
def extract_function_info(row):
    """
    Extracts parameters, docstring, and return information from a function's description.

    Args:
        row (Series): A row from a DataFrame containing the function description
            under the key 'description' (a dict-like object).

    Returns:
        Series: Three values in this order:
            - parameters: one '@kind name description' line per parameter whose
              'kind', 'name' and 'description' are all set, or None if there is none,
            - docstring: the 'brief' text directly followed by the 'detailed' text,
              or None if both are missing/empty,
            - return: the value stored under 'return', or None.
    """
    description = row['description']

    # 'parameters' may be absent or None. An explicit None check is used instead
    # of `or []` because the value can be an array, whose truth value is ambiguous.
    params = description.get('parameters')
    if params is None:
        params = []

    parameters = '\n'.join(f'@{param["kind"]} {param["name"]} {param["description"]}'
                           for param in params
                           if all(param.get(key) is not None for key in ['kind', 'name', 'description']))

    # NOTE(review): brief and detailed are joined without a separator; confirm
    # that the extraction step leaves trailing whitespace on the brief text.
    docstring = ''.join(filter(None, [description.get('brief'), description.get('detailed')]))
    return_val = description.get('return', None)

    return pd.Series([parameters if parameters else None, docstring if docstring else None, return_val])

# Expand every row's description into the three derived columns
df[['parameters', 'docstring', 'return']] = df.apply(extract_function_info, axis=1)

# The raw 'description' column is not needed once its content is extracted
del df['description']


In [None]:
# Add column repository: the first component of the file path
df['repository'] = df['file'].apply(lambda x: x.split('/')[0])

# Add column language. The extension is checked on the whole path: looking only
# at the second path component mislabels files in nested directories
# ('repo/src/main.c' would be inspected as 'src') and fails with an IndexError
# on paths that contain no '/'.
df['language'] = df['file'].apply(lambda x: 'C' if x.endswith('.c') else 'C++')

# Drop rows where the value in column 'code' or 'docstring' is missing
df.dropna(inplace=True, subset=['code', 'docstring'])

# Remove duplicates
df = df.drop_duplicates(subset=['signature', 'code'], keep='first')

# Sort by repository (stable, so rows of one repository keep their relative
# order between runs) and rebuild a consecutive index
df = df.sort_values(by='repository', ascending=True, kind='stable').reset_index(drop=True)

In [None]:
# Save the combined dataset (checkpoint) so the clean-up steps can be re-run
# from this file without repeating the concatenation and extraction above
df.to_parquet('combined_dataset.parquet')

## Clean up dataset for further processing

In [None]:
# Reload the combined checkpoint from disk as the starting point of the clean-up
df = pd.read_parquet('combined_dataset.parquet')

In [None]:
# Find the number of occurrences of each keyword in docstring and code
keywords = ['TODO', 'FIXME', 'HACK', 'BUG', 'ISSUE', 'REVIEW', 'REFACTOR', 'DEPRECATED', 'OBSOLETE', 'JOKE', 'TEST', 'COPYRIGHT', 'TOKEN', 'HTTP', '#IFDEF', '#IFNDEF', '#ENDIF', '#UNDEF', '#IF', '#ELSE', 'LICENSE']

# Keywords are escaped so they are always matched literally
keywords_pattern = '|'.join(re.escape(keyword) for keyword in keywords)
pattern = re.compile(keywords_pattern, re.IGNORECASE)

# Count the occurrences of each keyword in docstring and code. The counts are
# case-insensitive so that they describe exactly what the filter below removes
# (a case-sensitive count reports 0 for e.g. '#IFDEF' although '#ifdef' rows
# are filtered out).
for keyword in keywords:
    keyword_pattern = re.compile(re.escape(keyword), re.IGNORECASE)
    in_code = df['code'].apply(lambda text: len(keyword_pattern.findall(text))).sum()
    in_docstring = df['docstring'].apply(lambda text: len(keyword_pattern.findall(text))).sum()
    print(f"{keyword} in code: {in_code}, in docstring: {in_docstring}")

# Filter rows based on regex pattern: a row is dropped when any keyword occurs
# in its code or in its docstring. Column-wise masks are used because a
# row-wise df.apply does not yield a boolean Series on an empty DataFrame.
mask = (
    df['code'].apply(lambda text: pattern.search(text) is not None).astype(bool)
    | df['docstring'].apply(lambda text: pattern.search(text) is not None).astype(bool)
)
df = df[~mask].reset_index(drop=True)

In [None]:
class Stack:
    """Minimal last-in, first-out container backed by a Python list."""

    def __init__(self):
        # The end of the list is the top of the stack.
        self.items = list()

    def push(self, item):
        """Put ``item`` on top of the stack."""
        self.items += [item]

    def pop(self):
        """Remove and return the top item (IndexError if the stack is empty)."""
        return self.items.pop(-1)

    def is_empty(self):
        """Return True when the stack holds no items."""
        return not self.items

def check_braces(file_data) -> bool:
    """
    Checks whether the curly braces in the given text are balanced.

    Every '}' must be preceded by a still-open '{' and every '{' must be closed.
    All characters are scanned as-is: braces inside string literals or comments
    are counted like any other brace.

    Args:
        file_data (str): The text (source code) to check.

    Returns:
        bool: True if the braces are balanced (also for text without braces),
            False otherwise.
    """
    # Only one kind of bracket is tracked, so the nesting depth is sufficient.
    depth = 0
    for char in file_data:
        if char == '{':
            depth += 1
        elif char == '}':
            if depth == 0:
                # Too many closing braces. This must be a real False: a
                # non-empty string is truthy and would pass `not check_braces(...)`.
                return False
            depth -= 1

    return depth == 0
    
def clean_code(text: str) -> "str | None":
    """
    Cleans the code by removing trailing and leading content outside the main code block,
    comments, unnecessary newlines, and empty lines.

    Args:
        text (str): A string containing the code.

    Returns:
        str | None: Cleaned code, or None when the braces are not balanced or
            the text contains no '{ ... }' block.
    """

    # Sometimes the extraction fails and wrong code blocks are extracted, for example
    # when the code block is not enclosed in braces or there is some text before it.
    # Blocks with unbalanced braces are rejected.
    if not check_braces(text):
        return None

    # Keep only the span from the first '{' to the last '}', which drops any text
    # before and after the code block. Stop if there is no such span.
    start, end = text.find('{'), text.rfind('}')
    if start == -1 or end < start:
        return None
    text = text[start:end + 1]

    # Remove /* ... */ and // comments in a single left-to-right pass.
    # String and character literals are matched first and kept unchanged, so a
    # '//' or '/*' inside a literal is not mistaken for a comment, and a '//'
    # inside a block comment can no longer swallow the closing '*/'.
    # Whitespace in front of a comment is removed together with the comment.
    comment_or_literal = (
        r'("(?:\\[\s\S]|[^"\\\n])*"'    # group 1: string literal ...
        r"|'(?:\\[\s\S]|[^'\\\n])*')"   # ... or character literal (kept)
        r'|\s*/\*[\s\S]*?\*/'           # block comment
        r'|\s*//[^\n]*'                 # line comment
    )
    text = re.sub(comment_or_literal, lambda match: match.group(1) or '', text)

    # Remove double newlines and similar patterns.
    # Trailing tabs are replaced by a plain newline (not by an empty string),
    # otherwise a line ending in tabs would be glued to the following line.
    text = re.sub(r'\t+\n', '\n', text)
    text = re.sub(r'\n *\n', '\n', text)
    text = re.sub(r'^\s*\n', '', text, flags=re.MULTILINE) # Remove empty lines

    return text

In [None]:
# Apply the cleaning function for code
df['code'] = df['code'].apply(clean_code)

# Remove rows with empty code (clean_code returns None for rejected blocks)
df.replace({'code': ''}, np.nan, inplace=True)
df.dropna(subset=['code'], inplace=True)
df = df.reset_index(drop=True)

# Re-validate the cleaned code, because removing comments can change the brace
# balance. Only an explicit True counts as valid, so any other result of
# check_braces marks the row as invalid.
is_balanced = df['code'].apply(lambda code: check_braces(code) is True).astype(bool)
invalid_counter = int((~is_balanced).sum())
df = df[is_balanced].reset_index(drop=True)

print(f"Number of invalid code blocks removed: {invalid_counter}")

In [None]:
def clean_description(text: str) -> str:
    """
    Strips content that carries no descriptive value from a description.

    Removed are date-like number groups separated by '-', '/' or '.', and any
    text enclosed in square brackets (on a single line).

    Args:
        text (str): A string containing the description.

    Returns:
        str: The description with the matched spans deleted; the surrounding
            whitespace is left untouched.
    """

    # The patterns are applied one after another in exactly this order.
    date_patterns = (
        r'\d{4}-\d{1,2}-\d{1,2}',
        r'\d{1,2}-\d{1,2}-\d{4}',
        r'\d{1,2}-\d{1,2}-\d{2,4}',
        r'\d{1,2}/\d{1,2}/\d{4}',
        r'\d{4}/\d{1,2}/\d{1,2}',
        r'\d{1,2}/\d{1,2}/\d{2,4}',
        r'\d{1,2}\.\d{1,2}\.\d{4}',
        r'\d{4}\.\d{1,2}\.\d{1,2}',
        r'\d{1,2}\.\d{1,2}\.\d{2,4}',
    )
    for date_pattern in date_patterns:
        text = re.sub(date_pattern, '', text)

    # Bracketed text tends to be present in descriptions, but is not useful
    return re.sub(r'\[.*?\]', '', text)

    
# Clean every docstring
df['docstring'] = df['docstring'].apply(clean_description)

# Drop the rows whose docstring became an empty string
df = df.loc[df['docstring'].ne('')].reset_index(drop=True)

In [None]:
# Load the tokenizer of the base model referenced below (bigcode/starcoderbase-1b).
# The Hugging Face access token is read from the HF_TOKEN environment variable.
# Credentials must never be stored in the notebook: the token that used to be
# hard-coded here has to be treated as leaked and revoked.
base_model_name = "bigcode/starcoderbase-1b"
tokenizer = AutoTokenizer.from_pretrained(base_model_name, add_bos_token=False, add_eos_token=False, token=os.environ.get('HF_TOKEN'))
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Keep only the records whose docstring has at least 8 whitespace-separated words
df = df.loc[df['docstring'].apply(lambda doc: len(doc.split()) >= 8)].reset_index(drop=True)

In [None]:
# Code: token list and token count
df['code_tokens'] = df['code'].apply(tokenizer.tokenize)
df['code_tokens_len'] = df['code_tokens'].apply(len)

# Docstring: token list and token count
df['docstring_tokens'] = df['docstring'].apply(tokenizer.tokenize)
df['docstring_tokens_len'] = df['docstring_tokens'].apply(len)

# Number of lines of code (one more than the number of newline characters)
df['n_of_lines'] = df['code'].apply(lambda source: source.count('\n') + 1)

# Number of distinct tokens in code
df['code_unique_tokens'] = df['code_tokens'].apply(lambda tokens: len(np.unique(tokens)))

In [None]:
# Keep samples with 15-256 code tokens, at most 100 docstring tokens
# and 3-30 lines of code (all bounds inclusive)
rows = df[
    df['code_tokens_len'].between(15, 256)
    & (df['docstring_tokens_len'] <= 100)
    & df['n_of_lines'].between(3, 30)
].index
new_df = df.loc[rows].reset_index(drop=True)
len(new_df)

In [None]:
# Save the filtered dataset (checkpoint) for the export steps below
new_df.to_parquet('filtered_dataset.parquet')

## Prepare dataset and export it

In [None]:
# Load the filtered checkpoint as the basis of the training dataset
training_dataset = pd.read_parquet('filtered_dataset.parquet')

In [None]:
def format_docstring(docstring: str) -> str:
    """
    Wraps a docstring in a C block comment for use in model training.

    The resulting format is:
    /* docstring */
    The same format is used in the THUDM/humaneval-x dataset for C++ code.

    Args:
        docstring (str): The docstring to be formatted.

    Returns:
        str: The docstring without its trailing newlines, enclosed in '/* ' and ' */'.
    """
    # Only newline characters at the very end are removed; other whitespace stays
    trimmed = docstring.rstrip('\n')

    return "/* " + trimmed + " */"

In [None]:
# Wrap every docstring in a block comment
training_dataset['docstring'] = training_dataset['docstring'].apply(format_docstring)

# The prompt is the formatted docstring followed by the signature on the next line
training_dataset['prompt'] = training_dataset['docstring'] + ('\n' + training_dataset['signature'])

training_dataset.reset_index(drop=True, inplace=True)

In [None]:
# Drop rows that repeat an earlier (signature, code) pair
training_dataset = (
    training_dataset
    .drop_duplicates(subset=['signature', 'code'], keep='first')
    .reset_index(drop=True)
)

# Keep only examples from repositories that contribute at least 100 examples
training_dataset = training_dataset.loc[
    training_dataset['repository'].map(training_dataset['repository'].value_counts()).ge(100)
].reset_index(drop=True)

In [None]:
# Save the whole dataframe, including the helper columns that are not exported below
training_dataset.to_parquet('dataset-all-features.parquet')

In [None]:
# Columns that are part of the published dataset
export_dataset = training_dataset[['docstring', 'signature', 'prompt', 'code', 'repository', 'language']]

# Shuffle with a fixed seed so that the split can be reproduced
SPLIT_SEED = 42
shuffled = export_dataset.sample(frac=1, random_state=SPLIT_SEED)

# Split the dataset into training (80 %), validation (10 %) and test (10 %) set.
# Positional slicing is used instead of np.split, which on a DataFrame relies
# on the deprecated DataFrame.swapaxes.
train_end = int(.80 * len(shuffled))
validate_end = int(.90 * len(shuffled))
train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# Create a dataset from each dataframe (the shuffled pandas index is not exported)
train_dataset = datasets.Dataset.from_pandas(train, preserve_index=False)
val_dataset = datasets.Dataset.from_pandas(validate, preserve_index=False)
test_dataset = datasets.Dataset.from_pandas(test, preserve_index=False)


datasetDict = datasets.DatasetDict({"train":train_dataset, "validation":val_dataset, "test":test_dataset})

datasetDict.save_to_disk('dataset')

## Publish dataset to Hugging Face Datasets Hub

In [None]:
# Upload all three splits to the 'xvadov01/test-dataset' repository on the Hugging Face Hub
datasetDict.push_to_hub('xvadov01/test-dataset')