In [105]:
import torch
torch.cuda.empty_cache()

In [106]:
import pandas as pd

# Load the datasets
ds_train = pd.read_pickle('pickles/ds_train.pkl')
ds_test = pd.read_pickle('pickles/ds_test.pkl')


# Rename the columns
ds_train = ds_train.drop(columns=['text'])
ds_test = ds_test.drop(columns=['text'])
ds_train = ds_train.rename(columns={'label':'og_label', 'simple_topic':'input_ids', 'no_stopword':'text'})
ds_test = ds_test.rename(columns={'label':'og_label', 'simple_topic':'input_ids', 'no_stopword':'text'})

# Create new datasets
from datasets import Dataset, DatasetDict
new_train = Dataset.from_pandas(ds_train[['input_ids','text']])
new_test = Dataset.from_pandas(ds_test[['input_ids','text']])

# Create a DatasetDict
new_ds = DatasetDict({
    'train': new_train,
    'test': new_test
})

# Save the new datasets to disk
new_ds.save_to_disk('data')



[A
Saving the dataset (1/1 shards): 100%|██████████| 11314/11314 [00:00<00:00, 1027639.90 examples/s]

[A
Saving the dataset (1/1 shards): 100%|██████████| 7532/7532 [00:00<00:00, 836107.82 examples/s]


In [107]:
new_train[0]

{'input_ids': 3,
 'text': 'wondering anyone could enlighten car saw day 2door sports car looked late 60s early 70s called bricklin doors really small addition front bumper separate rest body know anyone tellme model name engine specs years production car made history whatever info funky looking car please email'}

In [108]:

# # Load model and tokenizer

import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
# model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased") 

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_train = new_train.map(preprocess_function, batched=True)
tokenized_test = new_test.map(preprocess_function, batched=True)


[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
Map: 100%|██████████| 11314/11314 [00:01<00:00, 6454.62 examples/s]

[A
[A
[A
[A
[A
[A
[A
Map: 100%|██████████| 7532/7532 [00:01<00:00, 7461.38 examples/s]


In [109]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [110]:
new_test[0]

{'input_ids': 3,
 'text': 'little confused models 8889 bonnevilles heard le se lse sse ssei could someone tell differences far features performance also curious know book value prefereably 89 model much less book value usually get words much demand time year heard midspring early summer best time buy'}

In [111]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# label1 = 0, etc
id2label = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}
label2id = {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6}
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased", num_labels=7, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [112]:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [114]:
model(**tokenized_train[0])

TypeError: DistilBertForSequenceClassification.forward() got an unexpected keyword argument 'text'

In [113]:
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/1416 [03:41<?, ?it/s]
  0%|          | 0/1416 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,attention_mask.

In [None]:

# Make predictions
no_stopwords = ds_test['no_stopword'].tolist()
inputs = tokenizer(no_stopwords, return_tensors="pt", truncation=True, padding=True)
len(no_stopwords)

# data loader




7532

In [None]:
inputs = tokenizer(no_stopwords[0], return_tensors="pt", truncation=True, padding=True)

outputs = model(**inputs)

outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1200, -0.1036]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
from torch.utils.checkpoint import checkpoint

# Free up GPU memory
del no_stopwords
torch.cuda.empty_cache()

# Perform the operation with limited memory usage using gradient checkpointing
def forward_pass(inputs):
    return model(**inputs)

with torch.no_grad():
    outputs = checkpoint(forward_pass, inputs)

logits = outputs.logits

  return fn(*args, **kwargs)


RuntimeError: [enforce fail at alloc_cpu.cpp:114] data. DefaultCPUAllocator: not enough memory: you tried to allocate 11846811648 bytes.

In [None]:
predictions = torch.argmax(logits, dim=1)

torch.save(predictions, "pickles/predictions.pt")