In [None]:
!pip install transformers datasets



In [None]:
from datasets import load_dataset

# Fetch the four source datasets from the Hugging Face Hub.
dataset1 = load_dataset("ysr/rust-self-instruct") # author's note: only code; its 'instruction' and 'code' columns are used further below
dataset2 = load_dataset("ysr/rust_instruction_dataset") # author's note: code and assistant response; its 'instruction' and 'code' columns are used further below
magicoder_rust = load_dataset("ise-uiuc/Magicoder-OSS-Instruct-75K") # filtered to lang == 'rust' and reduced to Rust code only further below
neloy_dataset = load_dataset("Neloy262/rust_instruction_dataset") # 'output' column is reduced to Rust code only further below


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
import re

def extract_rust_code(text):
  """Keep only the fenced Rust code of a markdown-formatted answer.

  Every ```rust ... ``` block in `text` is collected. The block bodies are
  joined with a newline and wrapped in one single ```rust fence, which drops
  all prose around the code. Spaces/tabs and a Windows line ending after the
  opening ```rust tag are tolerated, so such answers are cleaned as well
  instead of being passed through with their explanation intact.

  Args:
    text: The answer text, possibly mixing prose and fenced code.

  Returns:
    A single fenced Rust block with all extracted code, or `text` unchanged
    when it contains no ```rust fence.
  """
  # Opening fence, optional trailing blanks, LF or CRLF, then the shortest
  # body up to the next fence (DOTALL lets the body span several lines).
  rust_pattern = r'```rust[ \t]*\r?\n(.*?)```'
  code_snippets = re.findall(rust_pattern, text, flags=re.DOTALL)
  if code_snippets:
    return "```rust\n{}```".format("\n".join(code_snippets))
  return text

def remove_explanation(column_name):
  """Build a per-example mapper that reduces one column to its Rust code.

  Args:
    column_name: Name of the text column to clean.

  Returns:
    A function that takes an example, overwrites example[column_name] with
    the result of extract_rust_code on it, and returns that same example.
  """
  def strip_to_code(example):
    example[column_name] = extract_rust_code(example[column_name])
    return example

  return strip_to_code

## Keep only the Rust samples of the Magicoder dataset

In [None]:

# Keep rows tagged as Rust whose solution has a ```rust fence, then strip
# everything but the fenced code from the solution.
magicoder_rust = (
  magicoder_rust
  .filter(lambda row: row['lang'] == 'rust' and '```rust' in row['solution'])
  .map(remove_explanation('solution'))
)

In [None]:
# Show the splits, columns and row count left after the Rust filter.
magicoder_rust

DatasetDict({
    train: Dataset({
        features: ['lang', 'raw_index', 'index', 'seed', 'openai_fingerprint', 'problem', 'solution'],
        num_rows: 4069
    })
})

In [None]:
# Spot-check one cleaned sample; enable the commented line to print the
# matching 'problem' text as well.
idx = 2003
# print(magicoder_rust['train'][idx]['problem'])
print(magicoder_rust['train'][idx]['solution'])

```rust
use std::io;
use std::io::Write;

pub fn borrow_stdio<F, T>(f: F) -> Result<T, io::Error>
where
    F: FnOnce() -> T,
{
    let mut input = String::new();

    // Read from standard input
    io::stdin().read_line(&mut input)?;

    // Execute the provided closure or function
    let result = f();

    // Write to standard output
    io::stdout().write_all(result.to_string().as_bytes())?;

    Ok(result)
}
```


## Remove all occurrences of the word 'Rust' from the neloy_dataset instructions and strip explanations from the outputs

In [None]:
# Loads the dataset a second time (same call as in the initial loading cell) —
# presumably so this section can be re-run from raw data; TODO confirm it is intentional.
neloy_dataset = load_dataset("Neloy262/rust_instruction_dataset") # 'output' column is reduced to Rust code only further below

def remove_rust_from_instruction(example):
    """Drop the word 'rust' from an example's instruction.

    The instruction is lower-cased, every stand-alone 'rust' (optionally
    preceded by 'in') is removed, and the runs of spaces left behind are
    collapsed to a single space. Matching is done on whole words only, so
    words that merely contain the letters ('trust', 'frustum', ...) or end in
    'in' before it ('contain rust') are no longer mangled.

    Relies on the `re` module imported in an earlier cell of this notebook.

    Args:
        example: A row with a string 'instruction' field.

    Returns:
        The same example object with its 'instruction' field rewritten.
    """
    text = example['instruction'].lower()
    # \b keeps the match on word boundaries; 'in' is swallowed together with
    # 'rust' so that "... in rust ..." does not leave a dangling preposition.
    text = re.sub(r'\b(?:in[ \t]+)?rust\b', '', text)
    # Removing a word leaves two (or more) adjacent spaces; collapse any run.
    text = re.sub(r' {2,}', ' ', text)
    example['instruction'] = text
    return example

def fix_rust_annotation(example):
    """Normalise the code-fence tag in an example's output.

    Rewrites every '``` rust' (fence, one space, tag) in example['output'] to
    '```rust' and returns the same example object.
    """
    example['output'] = example['output'].replace('``` rust', '```rust')
    return example

# Show the freshly loaded dataset (columns and row count) before any cleaning.
neloy_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 10000
    })
})

In [None]:
# Cleaning pipeline, in order: drop the word 'rust' from the instructions,
# normalise '``` rust' fences, keep only rows that contain a ```rust block,
# and finally reduce each output to its fenced code.
neloy_dataset = (
    neloy_dataset
    .map(remove_rust_from_instruction)
    .map(fix_rust_annotation)
    .filter(lambda row: '```rust' in row['output'])
    .map(remove_explanation('output'))
)


In [None]:
# Show the dataset again to see how many rows survived the cleaning steps.
neloy_dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 9445
    })
})

In [None]:
# Spot-check one instruction after cleaning.
neloy_dataset['train'][3403]['instruction']

'construct a program to prompt the user for their age and display the appropriate greeting.'

In [None]:
# Spot-check one output after cleaning.
print(neloy_dataset['train'][1002]['output'])

```rust
fn is_prime(n: u32) -> bool {
    if n <= 1 {
        return false;
    }
    let m = (n as f64).sqrt() as u32;
    for i in 2..=m {
        if n % i == 0 {
            return false;
        }
    }
    true
}

for n in 1..=1000 {
    if is_prime(n) {
        println!("{}", n);
    }
}
```


In [None]:
def create_messages(instruction_column, code_column):
  """Build a per-example mapper that adds a chat-style 'messages' field.

  Args:
    instruction_column: Column holding the user prompt.
    code_column: Column holding the assistant answer.

  Returns:
    A function that takes an example, stores a two-item list under
    example['messages'] (a 'user' message with the instruction followed by an
    'assistant' message with the code) and returns that same example.
  """
  def to_chat(example):
    example['messages'] = [
      {'role': 'user', 'content': example[instruction_column]},
      {'role': 'assistant', 'content': example[code_column]},
    ]
    return example

  return to_chat


In [None]:
def _to_chat_format(dataset_dict, instruction_column, code_column):
  """Add the 'messages' column and drop every column listed for the 'train' split."""
  return dataset_dict.map(
    create_messages(instruction_column, code_column),
    remove_columns=dataset_dict['train'].column_names,
  )

# Bring all four sources to the same single-column ('messages') layout.
magicoder_rust = _to_chat_format(magicoder_rust, 'problem', 'solution')
neloy_dataset = _to_chat_format(neloy_dataset, 'instruction', 'output')
dataset1 = _to_chat_format(dataset1, 'instruction', 'code')
dataset2 = _to_chat_format(dataset2, 'instruction', 'code')

Map:   0%|          | 0/4069 [00:00<?, ? examples/s]

Map:   0%|          | 0/9445 [00:00<?, ? examples/s]

Map:   0%|          | 0/647 [00:00<?, ? examples/s]

Map:   0%|          | 0/524 [00:00<?, ? examples/s]

In [None]:
# Spot-check one converted row; it should hold only the 'messages' list.
dataset2['train'][4]

{'messages': [{'content': 'area of the circle that has a square and a circle inscribed in it\narguments: a',
   'role': 'user'},
  {'content': 'fn circle_area(a: f64) -> f64 {\n    let square_area = a * a;\n    let radius_circle = a / 2.0;\n    let inner_circle_area = std::f64::consts::PI * radius_circle * radius_circle;\n    return square_area - inner_circle_area;\n}\n',
   'role': 'assistant'}]}

In [None]:
from datasets import concatenate_datasets
# Merge the 'train' split of all four sources into one dataset.
train_splits = [source['train'] for source in (magicoder_rust, neloy_dataset, dataset1, dataset2)]
final_dataset = concatenate_datasets(train_splits)

In [None]:
# Hold out 10% of the rows as the test split. The fixed seed makes the shuffle,
# and therefore the published train/test partition, reproducible across runs.
final_dataset = final_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
from huggingface_hub import notebook_login

# Opens the interactive Hugging Face login widget; presumably required so the
# upload in the next cell is authorised — confirm for your environment.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Publish the train/test splits to the Hub dataset repo 'ysr/rust-sft-training'.
final_dataset.push_to_hub('ysr/rust-sft-training')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ysr/rust-sft-training/commit/a897f7843c5f8ef8f24ad56e7f964b1de6f94b9d', commit_message='Upload dataset', commit_description='', oid='a897f7843c5f8ef8f24ad56e7f964b1de6f94b9d', pr_url=None, pr_revision=None, pr_num=None)