In [75]:
from datasets import load_dataset
import numpy as np
import pandas as pd
import torch
import seaborn as sns
from transformers import AutoModel, DistilBertTokenizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB

In [2]:
yelp = load_dataset("yelp_review_full")
yelp

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 650000
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 50000
    })
})

In [3]:
train_ds = yelp["train"].select(range(1000))

In [4]:
train_ds

Dataset({
    features: ['label', 'text'],
    num_rows: 1000
})

In [11]:
model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"

In [12]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [13]:
device

'cuda'

In [14]:
model = AutoModel.from_pretrained(model_name).to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [16]:
text = "Hello, this is a sample sentence"

In [37]:
encoded_text = tokenizer(text, return_tensors = "pt").to(device)
encoded_text

{'input_ids': tensor([[ 101, 7592, 1010, 2023, 2003, 1037, 7099, 6251,  102]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [38]:
encoded_text["input_ids"][0]

tensor([ 101, 7592, 1010, 2023, 2003, 1037, 7099, 6251,  102], device='cuda:0')

In [39]:
tokens = tokenizer.convert_ids_to_tokens(encoded_text["input_ids"][0])

In [40]:
tokens

['[CLS]', 'hello', ',', 'this', 'is', 'a', 'sample', 'sentence', '[SEP]']

In [41]:
tokenizer.convert_tokens_to_string(tokens)

'[CLS] hello , this is a sample sentence [SEP]'

In [42]:
tokenizer.vocab_size

30522

In [43]:
max_context_length = tokenizer.model_max_length
max_context_length

512

In [44]:
def tokenize_text(batch):
    return tokenizer(batch['text'], return_tensors='pt', padding='max_length', truncation=True)

In [45]:
yelp_encodings = train_ds.map(tokenize_text, batched=True, batch_size=128)

In [47]:
yelp_encodings.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])  # encodings need to be converted to torch tensors

In [48]:
def get_last_hidden_state(batch):
    inputs = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
    with torch.no_grad():
        last_hidden_state = model(**inputs).last_hidden_state
    return {'hidden_state': last_hidden_state[:, 0]}

In [51]:
yelp_hidden_states = yelp_encodings.map(get_last_hidden_state, batched=True, batch_size=16)  # will have additional column 'hidden_state'

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [52]:
yelp_hidden_states

Dataset({
    features: ['label', 'text', 'input_ids', 'attention_mask', 'hidden_state'],
    num_rows: 1000
})

In [54]:
import joblib
joblib.dump(yelp_hidden_states, 'yelp_hidden_states.joblib')

['yelp_hidden_states.joblib']

In [55]:
cutoff = 800
x_train = np.array(yelp_hidden_states['hidden_state'][:cutoff])
y_train = np.array(yelp_hidden_states['label'][:cutoff])
x_test = np.array(yelp_hidden_states['hidden_state'][cutoff: ])
y_test = np.array(yelp_hidden_states['label'][cutoff: ])

In [57]:
print(f"x_train.shape: {x_train.shape}, y_train.shape: {y_train.shape}")
print(f"x_test.shape: {x_test.shape}, y_test.shape: {y_test.shape}")

x_train.shape: (800, 768), y_train.shape: (800,)
x_test.shape: (200, 768), y_test.shape: (200,)


In [58]:
# Dummy Model
dummy_model = DummyClassifier(strategy = "most_frequent")
dummy_model.fit(x_train, y_train)
dummy_model.score(x_test, y_test)

0.165

In [59]:
# SVM Model
svm_model = SVC(kernel = "linear", C = 1.0, random_state = 42)
svm_model.fit(x_train, y_train)
svm_model.score(x_test, y_test)

0.475

In [62]:
# Logistic Regression
lr_model = LogisticRegression(random_state = 42)
lr_model.fit(x_train, y_train)
lr_model.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.46

In [65]:
# Random Forest Classifier
rfc_model = RandomForestClassifier(random_state = 42)
rfc_model.fit(x_train, y_train)
rfc_model.score(x_test, y_test)

0.435

In [76]:
# BernoulliNB
BNB = BernoulliNB( force_alpha=True, fit_prior=True)
BNB.fit(x_train, y_train)
BNB.score(x_test, y_test)

0.36

In [77]:
# GaussianNB
GNB = GaussianNB()
GNB.fit(x_train, y_train)
GNB.score(x_test, y_test)

0.365