In [None]:
!wget "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"

--2026-01-05 21:00:58--  https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘drugsCom_raw.zip.1’

drugsCom_raw.zip.1      [    <=>             ] 339.93K  8.13KB/s               

In [None]:
!unzip drugsCom_raw.zip

In [1]:
from datasets import load_dataset

In [2]:
# Map each split name to the TSV file that holds it.
data_files = dict(
    train='data/drugsComTrain_raw.tsv',
    test='data/drugsComTest_raw.tsv',
)

In [3]:
# The files are tab-separated, so read them with the CSV loader and a tab
# delimiter; each key of data_files becomes one split.
drug_dataset = load_dataset(
    'csv',
    data_files=data_files,
    delimiter='\t',
)

In [4]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [5]:
# Take a reproducible 1000-row sample of the training split for quick
# inspection: shuffle with a fixed seed, then keep the first 1000 rows.
shuffled_train = drug_dataset['train'].shuffle(seed=42)
drug_sample = shuffled_train.select(range(1000))

In [6]:
drug_sample

Dataset({
    features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
    num_rows: 1000
})

In [7]:
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [8]:
drug_dataset.keys()

dict_keys(['train', 'test'])

In [9]:
# Sanity check: within every split, the "Unnamed: 0" column must hold as many
# distinct values as there are rows, i.e. no value is repeated.
for split_name, split_ds in drug_dataset.items():
    distinct_ids = split_ds.unique("Unnamed: 0")
    assert len(split_ds) == len(distinct_ids)

In [10]:
# Give the anonymous "Unnamed: 0" column a meaningful name in every split,
# then display the resulting DatasetDict.
drug_dataset = drug_dataset.rename_column("Unnamed: 0", "patient_id")
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [11]:
drug_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [12]:
def lower_condition(example):
    """Return a column update holding the lower-cased 'condition' of one row.

    Args:
        example: Mapping with a 'condition' entry that is a string.

    Returns:
        A one-key dict, {'condition': <lower-cased value>}.

    Raises:
        AttributeError: If the 'condition' value is None (or any other object
            without a ``lower`` method).
    """
    lowered = example['condition'].lower()
    return {'condition': lowered}

In [13]:
# Demonstration of a failure: some rows have no condition (None), so calling
# .lower() on them raises AttributeError. The error is caught and printed so
# that the notebook still runs top to bottom; the rows with a missing
# condition are filtered out in the following cells before mapping again.
try:
    drug_dataset.map(lower_condition)
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

Map:   0%|          | 0/161297 [00:00<?, ? examples/s]

AttributeError: 'NoneType' object has no attribute 'lower'

In [14]:
def filter_nones(x):
    """Tell whether a row has a 'condition' value.

    Args:
        x: Mapping with a 'condition' entry.

    Returns:
        True when the 'condition' value is anything other than None
        (an empty string still counts as present), False when it is None.
    """
    condition = x['condition']
    return condition is not None

In [15]:
drug_dataset = drug_dataset.filter(filter_nones)

In [16]:
drug_dataset = drug_dataset.map(lower_condition)

In [17]:
drug_dataset['train']['condition'][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

In [18]:
drug_dataset['train'][:3]

{'patient_id': [206461, 95260, 92703],
 'drugName': ['Valsartan', 'Guanfacine', 'Lybrel'],
 'condition': ['left ventricular dysfunction', 'adhd', 'birth control'],
 'review': ['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
  '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effect

In [19]:
def compute_review_length(example):
    """Count the whitespace-separated words in a row's review.

    Args:
        example: Mapping with a 'review' entry that is a string.

    Returns:
        A one-key dict, {'review_length': <number of words>}.
    """
    words = example['review'].split()
    return {'review_length': len(words)}

In [20]:
drug_dataset = drug_dataset.map(compute_review_length)

In [21]:
drug_dataset['train'][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [22]:
drug_dataset['train'].sort('review_length')[-1]

{'patient_id': 121004,
 'drugName': 'Venlafaxine',
 'condition': 'migraine',
 'review': '"Two and a half months ago I was prescribed Venlafaxine to help prevent chronic migraines.\r\nIt did help the migraines (reduced them by almost half), but with it came a host of side effects that were far worse than the problem I was trying to get rid of.\r\nHaving now come off of the stuff, I would not recommend anyone ever use Venlafaxine unless they suffer from extreme / suicidal depression. I mean extreme in the most emphatic sense of the word. \r\nBefore trying Venlafaxine, I was a writer. While on Venlafaxine, I could barely write or speak or communicate at all. More than that, I just didn&#039;t want to. Not normal for a usually outgoing extrovert.\r\nNow, I&#039;m beginning to write again - but my ability to speak and converse with others has deteriorated by about 95%. Writing these words is taking forever; keeping up in conversation with even one person is impossible, and I barely see the 

In [23]:
# Keep only reviews longer than 50 words, using the review_length column
# computed above.
drug_dataset = drug_dataset.filter(
    lambda example: example['review_length'] > 50
)

In [24]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 117788
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 39253
    })
})

In [25]:
drug_dataset.num_rows

{'train': 117788, 'test': 39253}

In [27]:
import html

In [28]:

%time drug_dataset = drug_dataset.map(lambda x: {'review': html.unescape(x['review'])})

CPU times: user 19.8 ms, sys: 46.3 ms, total: 66.1 ms
Wall time: 131 ms


In [29]:
from transformers import AutoTokenizer

In [30]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [31]:
def tokenize_function(examples):
    """Tokenize the 'review' field with the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a 'review' entry; it is passed to the
            tokenizer unchanged, so it can be whatever the tokenizer accepts.

    Returns:
        The tokenizer's output, produced with truncation enabled.
    """
    reviews = examples['review']
    return tokenizer(reviews, truncation=True)

In [33]:
%time tokenized_dataset = drug_dataset.map(tokenize_function)

Map:   0%|          | 0/117788 [00:00<?, ? examples/s]

Map:   0%|          | 0/39253 [00:00<?, ? examples/s]

CPU times: user 33.1 s, sys: 235 ms, total: 33.3 s
Wall time: 33.7 s


In [34]:
%time tokenized_dataset = drug_dataset.map(tokenize_function,batched=True)

Map:   0%|          | 0/117788 [00:00<?, ? examples/s]

CPU times: user 20 s, sys: 496 ms, total: 20.5 s
Wall time: 6.35 s


In [36]:
slow_tokenizer = AutoTokenizer.from_pretrained('bert-base-cased',use_fast=False)

In [37]:
def slow_tokenize_function(examples):
    """Tokenize the 'review' field with the notebook-level ``slow_tokenizer``.

    Same call as ``tokenize_function`` but routed through the tokenizer that
    was loaded with ``use_fast=False``, so the two can be timed side by side.

    Args:
        examples: Mapping with a 'review' entry; it is passed to the
            tokenizer unchanged.

    Returns:
        The tokenizer's output, produced with truncation enabled.
    """
    reviews = examples['review']
    return slow_tokenizer(reviews, truncation=True)

In [44]:
%time tokenized_dataset2 = drug_dataset.map(slow_tokenize_function,batched=True)

CPU times: user 165 ms, sys: 4 ms, total: 169 ms
Wall time: 168 ms


In [43]:
%time tokenized_dataset2 = drug_dataset.map(slow_tokenize_function,batched=True,num_proc=8)

CPU times: user 165 ms, sys: 34 μs, total: 165 ms
Wall time: 167 ms


In [45]:
def tokenize_and_split(examples):
    """Tokenize reviews into chunks of at most 128 tokens, keeping the overflow.

    With ``return_overflowing_tokens=True`` the part of a review that does not
    fit into ``max_length`` is returned as additional entries rather than
    dropped, so the output can hold more entries than reviews went in.

    Args:
        examples: Mapping with a 'review' entry; it is passed to the
            notebook-level ``tokenizer`` unchanged.

    Returns:
        The tokenizer's output for the reviews.
    """
    chunking_options = dict(
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    return tokenizer(examples['review'], **chunking_options)

In [46]:
result = tokenize_and_split(drug_dataset['train'][0])

In [48]:
[len(nip) for nip in result['input_ids']]

[128, 49]

In [50]:
# NOTE: this call fails in the recorded run with pyarrow's ArrowInvalid
# ("Column 8 named input_ids expected length 1000 but got length 1542"):
# tokenize_and_split returns more entries than the batch it was given, while
# the existing columns keep their original length. The next cell runs the same
# map with remove_columns=... so that only the tokenizer's columns remain.
tokenized_dataset = drug_dataset.map(tokenize_and_split,batched=True)

Map:   0%|          | 0/117788 [00:00<?, ? examples/s]

ArrowInvalid: Column 8 named input_ids expected length 1000 but got length 1542

In [52]:
tokenized_dataset = drug_dataset.map(tokenize_and_split,batched=True,remove_columns=drug_dataset['train'].column_names)

Map:   0%|          | 0/117788 [00:00<?, ? examples/s]

Map:   0%|          | 0/39253 [00:00<?, ? examples/s]

In [53]:
len(tokenized_dataset['train']),len(drug_dataset['train'])

(186044, 117788)

In [54]:
def tokenize_and_split(examples):
    """Tokenize reviews into chunks of at most 128 tokens and keep all columns.

    Overflowing tokens are returned as extra chunks, so the output can have
    more entries than the input batch. To keep every column the same length,
    each input column is re-expanded so that every chunk carries the values of
    the row it was produced from.

    Args:
        examples: Batch mapping of column name -> list of values; must
            contain a 'review' column, which is passed to the notebook-level
            ``tokenizer``.

    Returns:
        The tokenizer's output with 'overflow_to_sample_mapping' removed and
        one extra entry per input column, aligned with the produced chunks.
    """
    encoded = tokenizer(
        examples['review'],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )
    # For every produced chunk, the position (within this batch) of the row it
    # came from; used below as an index into each original column.
    source_rows = encoded.pop('overflow_to_sample_mapping')
    for column, column_values in examples.items():
        repeated = []
        for row in source_rows:
            repeated.append(column_values[row])
        encoded[column] = repeated
    return encoded

In [55]:
tokenized_dataset = drug_dataset.map(tokenize_and_split,batched=True)

Map:   0%|          | 0/117788 [00:00<?, ? examples/s]

Map:   0%|          | 0/39253 [00:00<?, ? examples/s]

In [56]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 186044
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 62020
    })
})

In [57]:
drug_dataset.set_format('pandas')

In [58]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 117788
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 39253
    })
})

In [59]:
drug_dataset['train'][:3]

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89


In [60]:
train_df = drug_dataset['train'][:]

In [61]:
train_df

Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,95260,Guanfacine,adhd,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192,141
1,92703,Lybrel,birth control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17,134
2,138000,Ortho Evra,birth control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10,89
3,35696,Buprenorphine / naloxone,opiate dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37,124
4,155963,Cialis,benign prostatic hyperplasia,"""2nd day on 5mg started to work with rock hard...",2.0,"November 28, 2015",43,68
...,...,...,...,...,...,...,...,...
117783,103458,Tekturna,high blood pressure,"""I have only been on Tekturna for 9 days. The ...",7.0,"February 7, 2010",18,72
117784,164345,Junel 1.5 / 30,birth control,"""This would be my second month on Junel. I've ...",6.0,"May 27, 2015",0,71
117785,191035,Campral,alcohol dependence,"""I wrote my first report in Mid-October of 201...",10.0,"May 31, 2015",125,127
117786,47128,Thyroid desiccated,underactive thyroid,"""I've been on thyroid medication 49 years, I s...",10.0,"September 19, 2015",79,136


In [62]:
# Count how many training reviews mention each condition, most frequent first.
# The index and the counts are named explicitly rather than renamed afterwards:
# the column names produced by value_counts().reset_index() differ between
# pandas versions ('index'/'condition' before 2.0, 'condition'/'count' from
# 2.0 on), and a rename map written for one layout yields duplicate or
# unrenamed columns on the other. The result always has exactly the columns
# 'condition' and 'frequency'.
frequencies = (
    train_df['condition']
    .value_counts()
    .rename_axis('condition')
    .reset_index(name='frequency')
)

In [63]:
frequencies.head()

Unnamed: 0,condition,frequency
0,birth control,25288
1,depression,6929
2,acne,4646
3,anxiety,4298
4,pain,3734


In [64]:
from datasets import Dataset

In [65]:
freq_dataset = Dataset.from_pandas(frequencies)

In [67]:
freq_dataset[:10]

{'condition': ['birth control',
  'depression',
  'acne',
  'anxiety',
  'pain',
  'bipolar disorde',
  'weight loss',
  'obesity',
  'adhd',
  'insomnia'],
 'frequency': [25288, 6929, 4646, 4298, 3734, 2958, 2954, 2841, 2632, 2393]}

In [68]:
drug_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 117788
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 39253
    })
})

In [69]:
drug_dataset.reset_format()

In [70]:
type(drug_dataset)

datasets.dataset_dict.DatasetDict

In [71]:
# Split the original training set 80/20 with a fixed seed; the smaller part
# (returned under the key 'test') is renamed to 'validation' in the next cell.
drug_data_clean = drug_dataset['train'].train_test_split(
    train_size=0.8,
    seed=42,
)

In [72]:
drug_data_clean['validation'] = drug_data_clean.pop('test')

In [73]:
drug_data_clean['test'] = drug_dataset['test']

In [74]:
drug_data_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 94230
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 23558
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 39253
    })
})

In [75]:
drug_data_clean.save_to_disk('data/drug-reviews')

Saving the dataset (0/1 shards):   0%|          | 0/94230 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/23558 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/39253 [00:00<?, ? examples/s]

In [76]:
from datasets import load_from_disk

In [77]:
drug_data_reloaded = load_from_disk('data/drug-reviews')

In [78]:
drug_data_reloaded

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 94230
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 23558
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 39253
    })
})

In [79]:
# Write every split to its own JSON Lines file under data/json/.
for split_name, split_ds in drug_data_clean.items():
    out_path = f'data/json/drug-reviews-{split_name}.jsonl'
    split_ds.to_json(out_path)

Creating json from Arrow format:   0%|          | 0/95 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/40 [00:00<?, ?ba/s]

In [81]:
!head -n 1 data/json/drug-reviews-train.jsonl

{"patient_id":172429,"drugName":"Clonazepam","condition":"anxiety","review":"\"I am on a maintenance dose of 1 mg 3 times a day along with a different Xanax .5 if I am having a panic attack. I'm agoraphobic and suffer from anxiety because of multiple reasons. I find klonopin is a amazing medication that helps tremendously. I wouldn't have the life I have today.\"","rating":10.0,"date":"August 13, 2016","usefulCount":97,"review_length":54}


In [82]:
# Point each split at the JSON Lines file exported above and load them back
# with the JSON loader.
data_files = dict(
    train="data/json/drug-reviews-train.jsonl",
    validation="data/json/drug-reviews-validation.jsonl",
    test="data/json/drug-reviews-test.jsonl",
)
drug_dataset_reloaded = load_dataset("json", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [83]:
drug_dataset_reloaded

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 94230
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 23558
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 39253
    })
})