In [48]:
from transformers import AutoModelForSequenceClassification, DistilBertTokenizer, Trainer, TrainingArguments
from transformers import pipeline
from datasets import DatasetDict
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from torch.nn.functional import cross_entropy
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.dummy import DummyClassifier

In [31]:
# Load the serialized dataset object from the working directory; later cells slice it
# with .select(...) and read its 'input_ids', 'attention_mask', 'label' and 'text' columns.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load dumps that come from a trusted source.
yelp_hidden_states = joblib.load('yelp_hidden_states.joblib')

In [32]:
# Checkpoint that provides both the pretrained encoder weights and the tokenizer vocabulary.
model_name = 'distilbert-base-uncased'
# Prefer the GPU, but fall back to the CPU so the notebook also runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Number of output classes of the classification head.
# NOTE(review): presumably the five Yelp star ratings -- confirm against the dataset labels.
num_labels = 5
# The classification head is not part of the checkpoint and is freshly initialised
# (see the warning printed below), so the model has to be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Positional hold-out split: rows 0-799 are used for training, rows 800-999 for evaluation.
train_ds = yelp_hidden_states.select(range(800))
eval_ds = yelp_hidden_states.select(range(800, 1000))

# Sanity check: the first row of each split and row 800 of the full dataset
# should all report the same input_ids shape (one shape per output line).
print(
    train_ds[0]['input_ids'].shape,
    eval_ds[0]['input_ids'].shape,
    yelp_hidden_states[800]['input_ids'].shape,
    sep='\n',
)

torch.Size([512])
torch.Size([512])
torch.Size([512])


In [34]:
# Bundle both splits under the keys the remaining cells look up ('train' / 'test').
yelp_ds_dict = DatasetDict(
    {
        'train': train_ds,
        'test': eval_ds,
    }
)

In [None]:
# Per-device batch size, shared by the training and the evaluation loop.
batch_size = 8

training_args = TrainingArguments(
    # --- output locations / persistence ---
    output_dir="./results",
    logging_dir='./logs',            # directory for storing logs
    save_strategy='epoch',
    push_to_hub=False,
    # --- optimisation schedule ---
    num_train_epochs=20,
    learning_rate=2e-5,
    warmup_steps=500,                # number of warmup steps for the learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    # --- batching ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- console output ---
    disable_tqdm=False,
    log_level='error',
)

In [None]:
# Wire the model, the training configuration and both dataset splits into a Trainer,
# then start fine-tuning; the value returned by train() is shown as the cell output.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=yelp_ds_dict['train'],
    eval_dataset=yelp_ds_dict['test'],
)
trainer.train()

In [None]:
# Evaluate the trainer's model; with no argument the Trainer falls back to the
# eval_dataset it was constructed with (the 'test' split). The returned metrics
# are displayed as the cell output.
trainer.evaluate()

In [None]:
# Run inference on the 'test' split; later cells read `preds.predictions`
# (argmax over axis 1 gives the predicted class) and `preds.metrics`.
preds = trainer.predict(yelp_ds_dict['test'])

In [None]:
# Display the metrics that trainer.predict() attached to the prediction result.
preds.metrics

In [None]:
# Preview the predicted class per test example: index of the largest score in each row.
preds.predictions.argmax(axis=1)

In [None]:
# Ground-truth and predicted class ids for the test split (both reused by the accuracy cell).
true_classes = yelp_ds_dict['test']['label']
preds_classes = np.argmax(preds.predictions, axis=1)

# Pin the label set to one id per model output column. Without `labels=` the matrix
# silently drops any class that occurs in neither the targets nor the predictions,
# so its rows/columns would no longer line up with the class ids.
class_ids = list(range(preds.predictions.shape[1]))
conf_mat = confusion_matrix(true_classes, preds_classes, labels=class_ids)

# fmt='d' prints the counts as plain integers; the default annotation format ('.2g')
# would render counts of 100 or more in scientific notation (e.g. 1e+02).
ax = sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels=class_ids, yticklabels=class_ids)
# confusion_matrix puts true classes on the rows and predicted classes on the columns.
ax.set(xlabel='predicted class', ylabel='true class', title='Confusion matrix (test split)');

In [None]:
# Fraction of test examples whose predicted class equals the true class
# (uses true_classes / preds_classes computed in the confusion-matrix cell).
accuracy_score(true_classes, preds_classes)

In [None]:
# Majority-class baseline: always predicts the most frequent label seen during fit.
dummy_clf = DummyClassifier(strategy="most_frequent")
train_labels = np.asarray(yelp_ds_dict['train']['label'])
# DummyClassifier ignores the feature values, so fit on an explicit placeholder matrix
# of the right length instead of passing the target vector itself as (1-D) features.
placeholder_features = np.zeros((len(train_labels), 1))
dummy_clf.fit(placeholder_features, train_labels)

In [None]:
# Baseline accuracy on the test split. DummyClassifier.score accepts X=None because its
# predictions do not depend on the features, so the labels are no longer passed as X.
dummy_clf.score(None, yelp_ds_dict['test']['label'])

In [None]:
# Re-run the whole test split through the fine-tuned model in one batch to get raw logits.
# Put the model in inference mode explicitly (disables dropout) instead of relying on
# whatever mode the last Trainer call left it in.
model.eval()
with torch.no_grad():
    # The Trainer may have moved the model to the GPU while the dataset columns are CPU
    # tensors; move the inputs to the model's device to avoid a device-mismatch error.
    outputs = model(
        input_ids=yelp_ds_dict['test']['input_ids'].to(model.device),
        attention_mask=yelp_ds_dict['test']['attention_mask'].to(model.device),
    )
# Bring the logits back to the CPU so they can be combined with the CPU label tensor
# and turned into DataFrame columns by the following cells.
outputs.logits = outputs.logits.cpu()

In [None]:
# Work on a detached CPU copy of the logits: they live on whatever device the model ran
# on, while the dataset labels are CPU tensors -- mixing devices would make cross_entropy
# fail, and GPU tensors cannot be converted to DataFrame columns later on.
logits = outputs.logits.detach().cpu()
labels = yelp_ds_dict['test']['label']

# Predicted class = index of the largest logit per example.
pred_labels = torch.argmax(logits, dim=1)
# reduction='none' keeps one loss value per example instead of averaging over the batch.
loss = cross_entropy(logits, labels, reduction='none')

In [None]:
# One row per test review: text, true label, predicted label and per-example loss,
# ordered from the highest loss to the lowest with a fresh 0..n-1 index.
df_individual_reviews = (
    pd.DataFrame(
        {
            'text': yelp_ds_dict['test']['text'],
            'label': yelp_ds_dict['test']['label'],
            'pred_label': pred_labels,
            'loss': loss,
        }
    )
    .sort_values('loss', ascending=False)
    .reset_index(drop=True)
)
df_individual_reviews

In [None]:
# Plot the per-example loss against the true label. Several reviews share each label,
# so seaborn aggregates the repeated y values per x position when drawing the line.
sns.lineplot(data=df_individual_reviews, x='label', y='loss')

In [None]:
# Write a model card for the fine-tuned model, then upload the trainer's artifacts
# to the Hugging Face Hub with the given commit message.
trainer.create_model_card(
    model_name='distilbert-base-uncased-yelp',
)
trainer.push_to_hub(
    commit_message='Yelp review classification',
)

In [None]:
# Build an inference pipeline from the published checkpoint (`pipeline` is imported in the
# notebook's top import cell together with the other transformers names).
# NOTE(review): this repo id differs from the model name used when the model card was
# created ('distilbert-base-uncased-yelp') -- confirm it is the repository that was pushed.
model_id = 'BertGollnick/distilbert-base-uncased-yelp-new'
# The tokenizer is loaded from the base checkpoint and handed to the pipeline explicitly
# rather than being resolved from the fine-tuned repository.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
classifier = pipeline('sentiment-analysis', model=model_id, tokenizer=tokenizer)

In [None]:
# Score one example sentence against every class; the pipeline returns one entry per
# input, so take the first (and only) entry: a list of {'label', 'score'} records.
all_scores = classifier('it is not so great', return_all_scores=True)
res = all_scores[0]

# One row per class, then a bar chart of the score assigned to each label.
df_res = pd.DataFrame(res)
sns.barplot(x='label', y='score', data=df_res)