In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"

In [None]:
!unzip drugsCom_raw.zip

In [None]:
from datasets import load_dataset

In [None]:
# Map each split name to its tab-separated source file.
data_files = {
    'train': 'data/drugsComTrain_raw.tsv',
    'test': 'data/drugsComTest_raw.tsv'
}

In [None]:
# The files are tab-separated, so load them through the 'csv' builder with
# a tab delimiter.
drug_dataset = load_dataset('csv',data_files=data_files,delimiter='\t')

In [None]:
# Show the loaded dataset object.
drug_dataset

In [None]:
# Take a 1000-row sample of the train split: shuffle with a fixed seed,
# then keep the first 1000 rows.
drug_sample = drug_dataset['train'].shuffle(seed=42).select(range(1000))

In [None]:
drug_sample

In [None]:
# Peek at the first three rows of the sample.
drug_sample[:3]

In [None]:
# List the split names.
drug_dataset.keys()

In [None]:
# Check that "Unnamed: 0" has no repeated values: in every split the number
# of distinct values must equal the number of rows.
for split in drug_dataset.keys():
    assert len(drug_dataset[split]) == len(drug_dataset[split].unique("Unnamed: 0"))

In [None]:
# Give the column a descriptive name; the renamed result is reassigned to
# drug_dataset.
drug_dataset = drug_dataset.rename_column(
    original_column_name="Unnamed: 0", new_column_name="patient_id"
)
drug_dataset

In [None]:
# NOTE(review): drug_sample was built before the rename and is not rebuilt
# here, so it presumably still carries the old column name -- confirm
# whether the sample should be regenerated first.
drug_sample[:3]

In [None]:
def lower_condition(example):
    """Return the example's ``condition`` value lower-cased.

    Args:
        example: Mapping with a ``'condition'`` entry holding a string or
            ``None``.

    Returns:
        A dict ``{'condition': <value>}`` where the value is the lower-cased
        string, or ``None`` when the input condition is ``None``.
    """
    condition = example['condition']
    # A missing condition has no .lower(); pass it through unchanged rather
    # than raising AttributeError on the first such row.
    if condition is None:
        return {'condition': None}
    return {'condition': condition.lower()}

In [None]:
# The result of this map is not assigned, so drug_dataset itself is left
# as it was.
drug_dataset.map(lower_condition)

In [None]:
def filter_nones(x):
    """Predicate: True when the row's ``condition`` is set, False when it is None."""
    condition = x['condition']
    return condition is not None

In [None]:
# Keep only the rows for which filter_nones returns True.
drug_dataset = drug_dataset.filter(filter_nones)

In [None]:
# Apply lower_condition to every row and keep the result this time.
drug_dataset = drug_dataset.map(lower_condition)

In [None]:
# Spot-check the first three condition values of the train split.
drug_dataset['train']['condition'][:3]

In [None]:
drug_dataset['train'][:3]

In [None]:
def compute_review_length(example):
    """Count the whitespace-separated words in the example's ``review``.

    Returns a dict with a single ``'review_length'`` entry.
    """
    words = example['review'].split()
    return {'review_length': len(words)}

In [None]:
# Add the review_length column produced by compute_review_length.
drug_dataset = drug_dataset.map(compute_review_length)

In [None]:
drug_dataset['train'][0]

In [None]:
# Sort by review_length and take the last row.
# NOTE(review): with the sort's default (ascending) order this is the
# longest review -- confirm against the installed datasets version.
drug_dataset['train'].sort('review_length')[-1]

In [None]:
# Drop reviews of 50 words or fewer.
drug_dataset = drug_dataset.filter(lambda x: x['review_length'] > 50)

In [None]:
drug_dataset

In [None]:
# Row counts after filtering.
drug_dataset.num_rows

In [None]:
import html

In [None]:

# Replace HTML character references in each review with the characters
# they stand for; %time reports how long the map takes.
%time drug_dataset = drug_dataset.map(lambda x: {'review': html.unescape(x['review'])})

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer for the 'bert-base-cased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [None]:
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` over ``examples['review']`` with truncation on."""
    reviews = examples['review']
    return tokenizer(reviews, truncation=True)

In [None]:
# Time tokenization with map's default (one example per call) ...
%time tokenized_dataset = drug_dataset.map(tokenize_function)

In [None]:
# ... and again with batched=True for comparison.
%time tokenized_dataset = drug_dataset.map(tokenize_function,batched=True)

In [None]:
# Load the same checkpoint with use_fast=False to compare against the
# tokenizer loaded above.
slow_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased',use_fast=False)

In [None]:
def slow_tokenize_function(examples):
    """Run the module-level ``slow_tokenizer`` over ``examples['review']`` with truncation on."""
    reviews = examples['review']
    return slow_tokenizer(reviews, truncation=True)

In [None]:
# Time the use_fast=False tokenizer in batched mode ...
%time tokenized_dataset2 = drug_dataset.map(slow_tokenize_function,batched=True)

In [None]:
# ... and again with num_proc=8 to spread the work over several processes.
%time tokenized_dataset2 = drug_dataset.map(slow_tokenize_function,batched=True,num_proc=8)

In [None]:
def tokenize_and_split(examples):
    """Tokenize ``examples['review']`` with a 128-token limit.

    Truncation is enabled and the tokenizer is asked to return the
    overflowing tokens as well, so one review may produce several entries
    in the output.
    """
    options = {
        'truncation': True,
        'max_length': 128,
        'return_overflowing_tokens': True,
    }
    return tokenizer(examples['review'], **options)

In [None]:
# Try tokenize_and_split on the first train example.
result = tokenize_and_split(drug_dataset['train'][0])

In [None]:
# Length of each input_ids entry returned for that example.
[len(nip) for nip in result['input_ids']]

In [None]:
# NOTE(review): this keeps the original columns next to the tokenizer
# output. If any review yields more than one chunk, the new columns will be
# longer than the original ones and map is expected to raise -- confirm.
# The next cell avoids this by dropping the original columns.
tokenized_dataset = drug_dataset.map(tokenize_and_split,batched=True)

In [None]:
# Drop every original column so only the tokenizer output remains.
tokenized_dataset = drug_dataset.map(tokenize_and_split,batched=True,remove_columns=drug_dataset['train'].column_names)

In [None]:
# Compare the row counts of the tokenized and source train splits.
len(tokenized_dataset['train']),len(drug_dataset['train'])

In [None]:
def tokenize_and_split(examples):
    """Tokenize reviews into 128-token chunks and carry the input columns along.

    The tokenizer output includes ``overflow_to_sample_mapping``, which is
    removed from the result and used to pick, for every output row, the
    value of each input column at the mapped index. All input columns are
    therefore returned with the same length as the tokenizer output.
    """
    encoded = tokenizer(
        examples['review'],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    source_rows = encoded.pop('overflow_to_sample_mapping')
    for column, column_values in examples.items():
        # Repeat each input value once per chunk produced from its row.
        expanded = [column_values[row] for row in source_rows]
        encoded[column] = expanded
    return encoded

In [None]:
# Re-run the mapping with the redefined tokenize_and_split; no
# remove_columns argument is passed this time.
tokenized_dataset = drug_dataset.map(tokenize_and_split,batched=True)

In [None]:
tokenized_dataset

In [None]:
# Switch the output format of drug_dataset to pandas.
drug_dataset.set_format('pandas')

In [None]:
drug_dataset

In [None]:
drug_dataset['train'][:3]

In [None]:
# Slice the whole train split into train_df, which is used below as a
# pandas DataFrame.
train_df = drug_dataset['train'][:]

In [None]:
train_df

In [None]:
# Count how many train rows mention each condition and return a two-column
# frame: 'condition' and 'frequency'.
# value_counts() labels its result differently across pandas major versions,
# so name the index and the counts explicitly instead of renaming whatever
# reset_index() happens to produce.
frequencies = (
    train_df['condition']
    .value_counts()
    .rename_axis('condition')
    .reset_index(name='frequency')
)

In [None]:
# Preview the first rows of the frequency table.
frequencies.head()

In [None]:
from datasets import Dataset

In [None]:
# Build a Dataset from the frequency DataFrame.
freq_dataset = Dataset.from_pandas(frequencies)

In [None]:
freq_dataset[:10]

In [None]:
drug_dataset

In [None]:
# Undo the earlier set_format('pandas') call.
drug_dataset.reset_format()

In [None]:
type(drug_dataset)

In [None]:
# Split the train data 80/20 with a fixed seed.
drug_data_clean = drug_dataset['train'].train_test_split(train_size=0.8,seed=42)

In [None]:
# Move the held-out part from the 'test' key to 'validation'.
drug_data_clean['validation'] = drug_data_clean.pop('test')

In [None]:
# Attach the original test split under the 'test' key.
drug_data_clean['test'] = drug_dataset['test']

In [None]:
drug_data_clean

In [None]:
# Save all three splits under data/drug-reviews.
drug_data_clean.save_to_disk('data/drug-reviews')

In [None]:
from datasets import load_from_disk

In [None]:
# Load the saved copy back from the same path.
drug_data_reloaded = load_from_disk('data/drug-reviews')

In [None]:
drug_data_reloaded

In [None]:
# Export each split to its own file, named after the split.
# NOTE(review): assumes data/json/ exists or is created by to_json --
# confirm for the installed datasets version.
for split,dataset in drug_data_clean.items():
    dataset.to_json(f'data/json/drug-reviews-{split}.jsonl')

In [None]:
# Print the first line of the train export.
!head -n 1 data/json/drug-reviews-train.jsonl

In [None]:
# Point each split name at the file exported for it above, then load the
# three files with the 'json' builder.
data_files = {
    "train": "data/json/drug-reviews-train.jsonl",
    "validation": "data/json/drug-reviews-validation.jsonl",
    "test": "data/json/drug-reviews-test.jsonl",
}
drug_dataset_reloaded = load_dataset("json", data_files=data_files)

In [None]:
drug_dataset_reloaded