In [1]:
from transformers import BertConfig, BertModel

config = BertConfig()

model = BertModel(config)

config

  from .autonotebook import tqdm as notebook_tqdm


BertConfig {
  "_attn_implementation_autoset": true,
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.47.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [2]:
MODEL_ID = "bert-base-cased"
model = BertModel.from_pretrained(MODEL_ID)

In [3]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_ID)

In [4]:
from pprint import pprint
pprint(tokenizer("Using a Transformer network is simple"))

{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1],
 'input_ids': [101, 7993, 170, 13809, 23763, 2443, 1110, 3014, 102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [5]:
tokenizer.save_pretrained("./bert/")

('./bert/tokenizer_config.json',
 './bert/special_tokens_map.json',
 './bert/vocab.txt',
 './bert/added_tokens.json')

In [6]:
sequence = "Using a Transformer network is simple"
tokens = tokenizer.tokenize(sequence)
pprint(tokens)

['Using', 'a', 'Trans', '##former', 'network', 'is', 'simple']


In [7]:
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[7993, 170, 13809, 23763, 2443, 1110, 3014]


In [8]:
decoded_string = tokenizer.decode(ids)
print(decoded_string)

Using a Transformer network is simple


In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = "I've been waiting for a HuggingFace course my whole life."

tokens = tokenizer.tokenize(sequence)
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
input_ids = torch.tensor([ids, ids])
pprint(input_ids)
output = model(input_ids)
pprint(output.logits)

['using', 'a', 'transform', '##er', 'network', 'is', 'simple']
tensor([[ 2478,  1037, 10938,  2121,  2897,  2003,  3722],
        [ 2478,  1037, 10938,  2121,  2897,  2003,  3722]])
tensor([[ 2.5189, -2.1906],
        [ 2.5189, -2.1906]], grad_fn=<AddmmBackward0>)


In [10]:
tokenized_inputs = tokenizer(sequence, return_tensors="pt")
pprint(tokenized_inputs)

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[  101,  2478,  1037, 10938,  2121,  2897,  2003,  3722,   102]])}


In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much."
]

# Pad the sequences up to max seq length
model_inputs = tokenizer(sequences, padding="longest")
print("Tokens with padding='longest': ", model_inputs)
# Pad the sequences up to max length (512 for BERT or distilBERT)
model_inputs = tokenizer(sequences, padding="max_length")
print("Tokens with padding='max_length': ", model_inputs)
# Pad the seqences up to SPECIFIED max length
model_inputs = tokenizer(sequences, padding="max_length", max_length=8)
print("Tokens with padding='max_length' & max_length='8': ", model_inputs)

Tokens with padding='longest':  {'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102], [101, 1045, 5223, 2023, 2061, 2172, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]}
Tokens with padding='max_length':  {'input_ids': [[101, 1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [26]:
# Putting it all together
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "Learning transformers is pretty easy.",
    "I totally agree."
]

tokens = tokenizer(sequences, 
                         padding=True,
                         truncation=True,
                         return_tensors="pt"
                        )

output = model(**tokens)

probs = torch.nn.functional.softmax(output.logits, dim=1)

print(output.logits)
print(probs)


tensor([[-1.8175,  1.7472],
        [-4.2654,  4.5955]], grad_fn=<AddmmBackward0>)
tensor([[2.7525e-02, 9.7247e-01],
        [1.4181e-04, 9.9986e-01]], grad_fn=<SoftmaxBackward0>)


In [1]:
from datasets import load_dataset

raw_dataset = load_dataset("glue", "mrpc")
raw_dataset

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 779209.23 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 197652.58 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 852400.38 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [16]:
raw_train_dataset = raw_dataset["train"]
raw_train_dataset[0]

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [21]:
raw_train_dataset.features

{'sentence1': Value(dtype='string', id=None),
 'sentence2': Value(dtype='string', id=None),
 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None),
 'idx': Value(dtype='int32', id=None)}

In [40]:
from datasets import Dataset
from pprint import pprint

def get_label(idx: int, dataset: Dataset):
    labels = dataset.features["label"].names
    sample = dataset[idx]
    sample_label = sample["label"]
    return sample, labels[sample_label]

pprint(get_label(87, raw_train_dataset))

({'idx': 100,
  'label': 1,
  'sentence1': 'Tuition at four-year private colleges averaged $ 19,710 this '
               'year , up 6 percent from 2002 .',
  'sentence2': 'For the current academic year , tuition at public colleges '
               'averaged $ 4,694 , up almost $ 600 from the year before .'},
 'equivalent')


In [53]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_sentence_1 = tokenizer(raw_dataset["train"]["sentence1"])
tokenized_sentence_2 = tokenizer(raw_dataset["train"]["sentence2"])

In [58]:
tokenized_dataset = tokenizer(
    raw_dataset["train"]["sentence1"],
    raw_dataset["train"]["sentence2"],
    padding=True,
    truncation=True,
)

In [55]:
def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

In [66]:
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map: 100%|██████████| 408/408 [00:00<00:00, 2883.26 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [67]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [69]:
samples = tokenized_dataset["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["idx", "sentence1", "sentence2"]}
[len(x) for x in samples["input_ids"]]

[50, 59, 47, 67, 59, 50, 62, 32]

In [70]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 67]),
 'token_type_ids': torch.Size([8, 67]),
 'attention_mask': torch.Size([8, 67]),
 'labels': torch.Size([8])}

In [71]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_dataset = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(example["sentence1"],
                     example["sentence2"],
                     truncation=True)
    
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map: 100%|██████████| 3668/3668 [00:00<00:00, 30643.41 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 17901.69 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 29195.63 examples/s]


In [72]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [73]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [75]:
trainer.train()

 36%|███▋      | 500/1377 [00:48<01:24, 10.42it/s]

{'loss': 0.6352, 'grad_norm': 2.9263598918914795, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


 73%|███████▎  | 1000/1377 [01:38<00:35, 10.56it/s]

{'loss': 0.6032, 'grad_norm': 3.2335994243621826, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


100%|██████████| 1377/1377 [02:17<00:00, 10.02it/s]

{'train_runtime': 137.3896, 'train_samples_per_second': 80.093, 'train_steps_per_second': 10.023, 'train_loss': 0.5990198893051868, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.5990198893051868, metrics={'train_runtime': 137.3896, 'train_samples_per_second': 80.093, 'train_steps_per_second': 10.023, 'total_flos': 419446300011600.0, 'train_loss': 0.5990198893051868, 'epoch': 3.0})

In [76]:
predictions = trainer.predict(tokenized_dataset["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)

100%|██████████| 51/51 [00:00<00:00, 53.70it/s]

(408, 2) (408,)





In [79]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)

In [81]:
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.7426470588235294, 'f1': 0.8361934477379095}

In [82]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [83]:
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [84]:
trainer.train()

                                                  
 33%|███▎      | 460/1377 [00:48<09:16,  1.65it/s]

{'eval_loss': 0.59553062915802, 'eval_accuracy': 0.7205882352941176, 'eval_f1': 0.8235294117647058, 'eval_runtime': 2.9578, 'eval_samples_per_second': 137.938, 'eval_steps_per_second': 17.242, 'epoch': 1.0}


 36%|███▋      | 500/1377 [00:52<01:28,  9.89it/s]

{'loss': 0.636, 'grad_norm': 2.600222587585449, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


                                                  
 67%|██████▋   | 918/1377 [01:37<00:43, 10.64it/s]

{'eval_loss': 0.5403149724006653, 'eval_accuracy': 0.7622549019607843, 'eval_f1': 0.8462757527733756, 'eval_runtime': 2.9369, 'eval_samples_per_second': 138.922, 'eval_steps_per_second': 17.365, 'epoch': 2.0}


 73%|███████▎  | 1000/1377 [01:46<00:37, 10.05it/s]

{'loss': 0.5335, 'grad_norm': 3.487730026245117, 'learning_rate': 1.3689179375453886e-05, 'epoch': 2.18}


                                                   
100%|██████████| 1377/1377 [02:28<00:00,  9.29it/s]

{'eval_loss': 0.5452971458435059, 'eval_accuracy': 0.7941176470588235, 'eval_f1': 0.86, 'eval_runtime': 2.9242, 'eval_samples_per_second': 139.527, 'eval_steps_per_second': 17.441, 'epoch': 3.0}
{'train_runtime': 148.1602, 'train_samples_per_second': 74.271, 'train_steps_per_second': 9.294, 'train_loss': 0.5337902060780909, 'epoch': 3.0}





TrainOutput(global_step=1377, training_loss=0.5337902060780909, metrics={'train_runtime': 148.1602, 'train_samples_per_second': 74.271, 'train_steps_per_second': 9.294, 'total_flos': 419446300011600.0, 'train_loss': 0.5337902060780909, 'epoch': 3.0})

In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_dataset = load_dataset("glue", "mrpc")
checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    return tokenizer(
        example["sentence1"],
        example["sentence2"],
        truncation=True,
    )
    
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 408/408 [00:00<00:00, 3466.77 examples/s]


In [6]:
tokenized_dataset = tokenized_dataset.remove_columns(["sentence1", "sentence2", "idx"])
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch")
tokenized_dataset["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [9]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset["train"], 
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset["validation"],
    batch_size=8,
    collate_fn=data_collator
)

In [10]:
for batch in train_dataloader:
    break

{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 73]),
 'token_type_ids': torch.Size([8, 73]),
 'attention_mask': torch.Size([8, 73])}

In [13]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.5619, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [15]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5)



In [16]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

print(num_training_steps)

1377


In [18]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [19]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████| 1377/1377 [01:52<00:00, 12.37it/s]

In [20]:
import evaluate
metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8504901960784313, 'f1': 0.8946459412780656}

In [None]:
# from accelerate import Accelerator
# from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler
# 
# accelerator = Accelerator()
# 
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
# optimizer = AdamW(model.parameters(), lr=3e-5)
# 
# train_dl, eval_dl, model, optimizer = accelerator.prepare(
#     train_dataloader, eval_dataloader, model, optimizer
# )
# 
# num_epochs = 3
# num_training_steps = num_epochs * len(train_dl)
# lr_scheduler = get_scheduler(
#     "linear",
#     optimizer=optimizer,
#     num_warmup_steps=0,
#     num_training_steps=num_training_steps,
# )
# 
# progress_bar = tqdm(range(num_training_steps))
# 
# model.train()
# for epoch in range(num_epochs):
#     for batch in train_dl:
#         outputs = model(**batch)
#         loss = outputs.loss
#         accelerator.backward(loss)
# 
#         optimizer.step()
#         lr_scheduler.step()
#         optimizer.zero_grad()
#         progress_bar.update(1)

In [7]:
!curl -O "https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 16309    0 16309    0     0  14728      0 --:--:--  0:00:01 --:--:-- 14745
100  883k    0  883k    0     0   411k      0 --:--:--  0:00:02 --:--:--  411k
100 4143k    0 4143k    0     0  1152k      0 --:--:--  0:00:03 --:--:-- 1153k
100 5253k    0 5253k    0     0  1247k      0 --:--:--  0:00:04 --:--:-- 1248k
100 11.5M    0 11.5M    0     0  2267k      0 --:--:--  0:00:05 --:--:-- 2363k
100 17.1M    0 17.1M    0     0  2897k      0 --:--:--  0:00:06 --:--:-- 3541k
100 20.3M    0 20.3M    0     0  2950k      0 --:--:--  0:00:07 --:--:-- 4062k
100 23.8M    0 23.8M    0     0  3026k      0 --:--:--  0:00:08 --:--:-- 4535k
100 27.5M    0 27.5M    0     0  3118k      0 --:--

In [53]:
from datasets import load_dataset

data_files = {
    "train": "drugsComTrain_raw.tsv",
    "test": "drugsComTest_raw.tsv"
}

drugs_dataset = load_dataset("csv", 
                            data_files=data_files, 
                            delimiter="\t")

drugs_dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['Unnamed: 0', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [54]:
drugs_sample = drugs_dataset["train"].shuffle(seed=42).select(range(1000))
drugs_sample[:3]

{'Unnamed: 0': [87571, 178045, 80482],
 'drugName': ['Naproxen', 'Duloxetine', 'Mobic'],
 'condition': ['Gout, Acute', 'ibromyalgia', 'Inflammatory Conditions'],
 'review': ['"like the previous person mention, I&#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!"',
  '"I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects."',
  '"I have been taking Mobic for over a year with no side effects other than 

In [55]:
for split in drugs_dataset.keys():
    assert len(drugs_dataset[split]) == len(drugs_dataset[split].unique("Unnamed: 0"))

In [56]:
try:
    drugs_dataset = drugs_dataset.rename_column("Unnamed: 0", "patient_id")
except:
    pass
drugs_dataset

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 161297
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount'],
        num_rows: 53766
    })
})

In [57]:
print(len(drugs_dataset.unique("drugName")["train"]))
print(len(drugs_dataset.unique("condition")["train"]))

3436
885


In [59]:
drugs_dataset = drugs_dataset.filter(lambda x: x["condition"] is not None)

In [61]:
def lowercase_condition(example):
    return {"condition": example["condition"].lower()}

drugs_dataset = drugs_dataset.map(lowercase_condition)

In [62]:
drugs_dataset["train"]["condition"][:3]

['left ventricular dysfunction', 'adhd', 'birth control']

In [68]:
drugs_dataset = drugs_dataset.map(lambda x: {"review_length": [len(o.split()) for o in x["review"]]},
                                  batched=True)
drugs_dataset["train"][:3]

Map: 100%|██████████| 160398/160398 [00:00<00:00, 184671.16 examples/s]
Map: 100%|██████████| 53471/53471 [00:00<00:00, 182384.33 examples/s]


{'patient_id': [206461, 95260, 92703],
 'drugName': ['Valsartan', 'Guanfacine', 'Lybrel'],
 'condition': ['left ventricular dysfunction', 'adhd', 'birth control'],
 'review': ['"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
  '"My son is halfway through his fourth week of Intuniv. We became concerned when he began this last week, when he started taking the highest dose he will be on. For two days, he could hardly get out of bed, was very cranky, and slept for nearly 8 hours on a drive home from school vacation (very unusual for him.) I called his doctor on Monday morning and she said to stick it out a few days. See how he did at school, and with getting up in the morning. The last two days have been problem free. He is MUCH more agreeable than ever. He is less emotional (a good thing), less cranky. He is remembering all the things he should. Overall his behavior is better. \r\nWe have tried many different medications and so far this is the most effect

In [49]:
drugs_dataset["train"][0]

{'patient_id': 206461,
 'drugName': 'Valsartan',
 'condition': 'left ventricular dysfunction',
 'review': '"It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil"',
 'rating': 9.0,
 'date': 'May 20, 2012',
 'usefulCount': 27,
 'review_length': 17}

In [None]:
drugs_dataset["train"].sort("review_length")[:3]

{'patient_id': [111469, 13653, 53602],
 'drugName': ['Ledipasvir / sofosbuvir',
  'Amphetamine / dextroamphetamine',
  'Alesse'],
 'condition': ['hepatitis c', 'adhd', 'birth control'],
 'review': ['"Headache"', '"Great"', '"Awesome"'],
 'rating': [10.0, 10.0, 10.0],
 'date': ['February 3, 2015', 'October 20, 2009', 'November 23, 2015'],
 'usefulCount': [41, 3, 0],
 'review_length': [1, 1, 1]}

In [70]:
print(drugs_dataset.num_rows)
drugs_dataset = drugs_dataset.filter(lambda x: x["review_length"] > 30)
print(drugs_dataset.num_rows)

{'train': 160398, 'test': 53471}


Filter: 100%|██████████| 160398/160398 [00:01<00:00, 140613.65 examples/s]
Filter: 100%|██████████| 53471/53471 [00:00<00:00, 132928.57 examples/s]

{'train': 138514, 'test': 46108}





In [71]:
import html

drugs_dataset = drugs_dataset.map(lambda x: {"review": [html.unescape(o) for o in x["review"]]},
                                  batched=True)

Map: 100%|██████████| 138514/138514 [00:05<00:00, 24821.28 examples/s]
Map: 100%|██████████| 46108/46108 [00:01<00:00, 24599.61 examples/s]


In [80]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(example):
    return tokenizer(example["review"], truncation=True)

In [84]:
%time tokenized_dataset = drugs_dataset.map(tokenize_function, batched=True)

CPU times: total: 46.9 ms
Wall time: 49.2 ms


In [94]:
def slow_tokenize_function(examples):
    from transformers import AutoTokenizer
    slow_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
    return slow_tokenizer(examples["review"], truncation=True)

%time tokenized_dataset = drugs_dataset.map(slow_tokenize_function, batched=True, num_proc=16)

Map (num_proc=16): 100%|██████████| 138514/138514 [00:15<00:00, 8772.71 examples/s] 
Map (num_proc=16): 100%|██████████| 46108/46108 [00:11<00:00, 4053.27 examples/s] 

CPU times: total: 203 ms
Wall time: 27.6 s





In [100]:
def tokenize_and_split(examples):
    result = tokenizer(
        examples["review"],
        truncation=True,
        max_length=128,
        return_overflowing_tokens=True,
    )

    sample_map = result.pop("overflow_to_sample_mapping")
    for k, v in examples.items():
        result[k] = [v[i] for i in sample_map]
    return result

In [103]:
tokenized_dataset = drugs_dataset.map(tokenize_and_split, 
                                      batched=True)

Map: 100%|██████████| 138514/138514 [00:39<00:00, 3544.61 examples/s]
Map: 100%|██████████| 46108/46108 [00:12<00:00, 3569.65 examples/s]


In [104]:
len(tokenized_dataset["train"]), len(drugs_dataset["train"])

(206772, 138514)

In [107]:
drugs_dataset_clean = drugs_dataset["train"].train_test_split(test_size=0.2, seed=42)
drugs_dataset_clean["validation"] = drugs_dataset_clean.pop("test")
drugs_dataset_clean["test"] = drugs_dataset["test"]
drugs_dataset_clean

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [108]:
drugs_dataset_clean.save_to_disk("drugs-reviews")

Saving the dataset (1/1 shards): 100%|██████████| 110811/110811 [00:00<00:00, 126626.44 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 27703/27703 [00:00<00:00, 103622.03 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 46108/46108 [00:00<00:00, 1352299.29 examples/s]


In [109]:
from datasets import load_from_disk

drugs_dataset_reloaded = load_from_disk("drugs-reviews")
drugs_dataset_reloaded

DatasetDict({
    train: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 110811
    })
    validation: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 27703
    })
    test: Dataset({
        features: ['patient_id', 'drugName', 'condition', 'review', 'rating', 'date', 'usefulCount', 'review_length'],
        num_rows: 46108
    })
})

In [1]:
from datasets import load_dataset
import os

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

issues_dataset = load_dataset("lewtun/github-issues", split="train")
issues_dataset

  from .autonotebook import tqdm as notebook_tqdm
Repo card metadata block was not found. Setting CardData to empty.


Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 3019
})

In [2]:
issues_dataset = issues_dataset.filter(
    lambda x: (x["is_pull_request"] == False and len(x["comments"]) >0)
)
issues_dataset

Dataset({
    features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
    num_rows: 808
})

In [3]:
columns = issues_dataset.column_names
columns_to_keep = ["title", "body", "html_url", "comments"]
columns_to_remove = set(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 808
})

In [4]:
issues_dataset.set_format("pandas")
df = issues_dataset[:]
df.head()

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"[Cool, I think we can do both :), @lhoestq now...",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,[Hi ! I guess the caching mechanism should hav...,## Describe the bug\r\nAfter upgrading to data...
2,https://github.com/huggingface/datasets/issues...,OSCAR unshuffled_original_ko: NonMatchingSplit...,[I tried `unshuffled_original_da` and it is al...,## Describe the bug\r\n\r\nCannot download OSC...
3,https://github.com/huggingface/datasets/issues...,load_dataset using default cache on Windows ca...,"[Hi @daqieq, thanks for reporting.\r\n\r\nUnfo...",## Describe the bug\r\nStandard process to dow...
4,https://github.com/huggingface/datasets/issues...,to_tf_dataset keeps a reference to the open da...,"[I did some investigation and, as it seems, th...",To reproduce:\r\n```python\r\nimport datasets ...


In [5]:
df["comments"][0].tolist()

['Cool, I think we can do both :)',
 '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).']

In [6]:
comments_df = df.explode("comments", ignore_index=True)
comments_df.head(4)

Unnamed: 0,html_url,title,comments,body
0,https://github.com/huggingface/datasets/issues...,Protect master branch,"Cool, I think we can do both :)",After accidental merge commit (91c55355b634d0d...
1,https://github.com/huggingface/datasets/issues...,Protect master branch,@lhoestq now the 2 are implemented.\r\n\r\nPle...,After accidental merge commit (91c55355b634d0d...
2,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,Hi ! I guess the caching mechanism should have...,## Describe the bug\r\nAfter upgrading to data...
3,https://github.com/huggingface/datasets/issues...,Backwards compatibility broken for cached data...,"If it's easy enough to implement, then yes ple...",## Describe the bug\r\nAfter upgrading to data...


In [7]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset

Dataset({
    features: ['html_url', 'title', 'comments', 'body'],
    num_rows: 2964
})

In [8]:
comments_dataset = comments_dataset.map(
    lambda x: {"comment_length": len(x["comments"].split())},
)
comments_dataset

Map: 100%|██████████| 2964/2964 [00:00<00:00, 24206.72 examples/s]


Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length'],
    num_rows: 2964
})

In [9]:
comments_dataset = comments_dataset.filter(
    lambda x: x["comment_length"] > 15
)

Filter: 100%|██████████| 2964/2964 [00:00<00:00, 142266.03 examples/s]


In [10]:
def concatenate_text(examples):
    return {
        "text": examples["title"]
        + " \n "
        + examples["body"]
        + " \n "
        + examples["comments"],
    }

comments_dataset = comments_dataset.map(concatenate_text)

Map: 100%|██████████| 2175/2175 [00:00<00:00, 12379.19 examples/s]


In [11]:
comments_dataset[0]

{'html_url': 'https://github.com/huggingface/datasets/issues/2945',
 'title': 'Protect master branch',
 'comments': '@lhoestq now the 2 are implemented.\r\n\r\nPlease note that for the the second protection, finally I have chosen to protect the master branch only from **merge commits** (see update comment above), so no need to disable/re-enable the protection on each release (direct commits, different from merge commits, can be pushed to the remote master branch; and eventually reverted without messing up the repo history).',
 'body': 'After accidental merge commit (91c55355b634d0dc73350a7ddee1a6776dbbdd69) into `datasets` master branch, all commits present in the feature branch were permanently added to `datasets` master branch history, as e.g.:\r\n- 00cc036fea7c7745cfe722360036ed306796a3f2\r\n- 13ae8c98602bbad8197de3b9b425f4c78f582af1\r\n- ...\r\n\r\nI propose to protect our master branch, so that we avoid we can accidentally make this kind of mistakes in the future:\r\n- [x] For Pul

In [12]:
from transformers import AutoTokenizer, AutoModel
import torch

model_checkpoint = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModel.from_pretrained(model_checkpoint)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [13]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [14]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [15]:
embedding = get_embeddings(comments_dataset["text"][0])
embedding.shape

torch.Size([1, 768])

In [16]:
embeddings_dataset = comments_dataset.map(
    lambda x: 
    {"embedding": get_embeddings(x["text"])
    .detach()
    .cpu()
    .numpy()[0]},
)

Map: 100%|██████████| 2175/2175 [01:09<00:00, 31.25 examples/s]


In [17]:
import faiss
embeddings_dataset.add_faiss_index(column="embedding")

  0%|          | 0/3 [00:00<?, ?it/s]

100%|██████████| 3/3 [00:00<00:00, 260.63it/s]


Dataset({
    features: ['html_url', 'title', 'comments', 'body', 'comment_length', 'text', 'embedding'],
    num_rows: 2175
})

In [18]:
query = "How can I load a dataset offline?"
query_embedding = get_embeddings(query).detach().cpu().numpy()
query_embedding.shape

(1, 768)

In [19]:
score, samples = embeddings_dataset.get_nearest_examples(
    "embedding", query_embedding, k=5
)

In [20]:
import pandas as pd

samples_df = pd.DataFrame(samples)
samples_df["score"] = score
samples_df.sort_values("score", ascending=False, inplace=True)

In [22]:
from pprint import pprint

for _, row in samples_df.iterrows():
    pprint({
        "comment": row.comments,
        "score": row.score,
        "title": row.title,
        "url": row.html_url,
    })

{'comment': 'Requiring online connection is a deal breaker in some cases '
            "unfortunately so it'd be great if offline mode is added similar "
            'to how `transformers` loads models offline fine.\r\n'
            '\r\n'
            "@mandubian's second bullet point suggests that there's a "
            'workaround allowing you to use your offline (custom?) dataset '
            'with `datasets`. Could you please elaborate on how that should '
            'look like?',
 'score': 25.505014419555664,
 'title': 'Discussion using datasets in offline mode',
 'url': 'https://github.com/huggingface/datasets/issues/824'}
{'comment': 'The local dataset builders (csv, text , json and pandas) are now '
            'part of the `datasets` package since #1726 :)\r\n'
            'You can now use them offline\r\n'
            '```python\r\n'
            "datasets = load_dataset('text', data_files=data_files)\r\n"
            '```\r\n'
            '\r\n'
            "We'll do a new