# Imports

In [1]:
from datasets import load_dataset

from data_utils import (
    is_valid_python,
    insert_instruction_as_docstring,
    break_into_definition_and_solution
)

# Code Alpaca 20k

https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k

## Data Load

In [2]:
code_alpaca_dataset = load_dataset("sahil2801/CodeAlpaca-20k")
code_alpaca_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 20022
    })
})

## Filtering

### Valid Python code

In [4]:
# Filter the dataset to only include `output` that is valid python code
# We might find some code snippets that are not valid python code like:
# 'Height of triangle = opposite side length * sin (angle) / side length'
code_alpaca_dataset = code_alpaca_dataset.filter(lambda x: is_valid_python(x['output']))

Filter:   0%|          | 0/8975 [00:00<?, ? examples/s]



### Only function definitions

In [6]:
# Finally get only the examples that start with 'def', as at HumanEval we are only interested in functions
# and we need to learn to use the function signature with the parameters to generate the function body
code_alpaca_dataset = code_alpaca_dataset.filter(lambda x: x['output'].startswith('def'))

Filter:   0%|          | 0/3897 [00:00<?, ? examples/s]

------

In [7]:
code_alpaca_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 3897
    })
})

## Preprocessing

We need to transform Code Alpaca's data into the format tha HumanEval uses. To do so we prepare some utility functions at `data_utils.py`. We can summarize the steps as going from:

1. Code Alpaca's data format:
```python
output = '''
def foo():
    return 1
'''
instructions = "Write a function that returns 1"
```

2. To HumanEval's data format:
```python
instructions = '''
def foo():
    """ Write a function that returns 1
    """
'''
output = '''
    return 1
'''
```

In [12]:
case = 1
instruction = code_alpaca_dataset['train'][case]["instruction"]
code_def = code_alpaca_dataset['train'][case]["output"]
full = insert_instruction_as_docstring(instruction, code_def)
inputs, outputs = break_into_definition_and_solution(full)
print(f"\n## Inputs:\n{inputs}")
print(f"\n## Outputs:\n{outputs}")


## Inputs:
def find_num_distinct_states(matrix):
    """ Write a function to find the number of distinct states in a given matrix.
    """

## Outputs:
    states = set()
    for row in matrix:
        state = "".join([str(x) for x in row])
        states.add(state)
    return len(states)


In [20]:
def preprare_prompt_completion(example):
    instruction = example["instruction"]
    code_def = example["output"]
    full = insert_instruction_as_docstring(instruction, code_def)
    inputs, outputs = break_into_definition_and_solution(full)
    example["prompt"] = inputs
    example["completion"] = outputs
    return example

In [21]:
updated_dataset = code_alpaca_dataset.map(update_example)

Map:   0%|          | 0/3897 [00:00<?, ? examples/s]

In [18]:
# Now process the dataset to insert the inputs and outputs
# Iterate over the dataset and insert the inputs and outputs
for split in code_alpaca_dataset.keys():
    for i, example in enumerate(code_alpaca_dataset[split]):
        instruction = example["instruction"]
        code_def = example["output"]
        full = insert_instruction_as_docstring(instruction, code_def)
        inputs, outputs = break_into_definition_and_solution(full)
        code_alpaca_dataset[split][i]["prompt"] = inputs
        code_alpaca_dataset[split][i]["completion"] = outputs

In [19]:
example

{'output': 'def countVowels(str): \n\tvowel = 0\n\tfor i in str: \n\t\tif i.lower() in [\'a\', \'e\', \'i\', \'o\', \'u\']: \n\t\t\tvowel += 1\n\treturn vowel \n\n# Driver code \nstr = "Hello World"\nprint("Number of vowels are:", countVowels(str))',
 'input': 'Hello World',
 'instruction': 'Create a Python program to accept a string from the user and print out the number of vowels in the string.'}

In [14]:
code_alpaca_dataset

DatasetDict({
    train: Dataset({
        features: ['output', 'input', 'instruction'],
        num_rows: 3897
    })
})

In [95]:
print(f"## Definition:\n{d}")
print(f"\n## Solution:\n{s}")

## Definition:
def find_num_distinct_states(matrix):
    """ Write a function to find the number of
    distinct states in a given matrix.
    """

## Solution:
    states = set()
    for row in matrix:
        state = "".join([str(x) for x in row])
        states.add(state)
    return len(states)
