# Reference
* https://towardsdatascience.com/feature-extraction-with-bert-for-text-classification-533dde44dc2f

In [None]:
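# Dataset source (Google Drive): https://drive.google.com/file/d/1Rn5yjsOpA6MGNvFnmarIcD3Bb2u2wniO/view?usp=share_link
# Uncomment the next line to download it with gdown (presumably the bbc-news.csv read below - confirm):
#!gdown -q 1Rn5yjsOpA6MGNvFnmarIcD3Bb2u2wniO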

In [None]:
import numpy as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

import torch
import random

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel


In [None]:
# Load the news CSV, then hold out 25% of the rows as the test split.
# The fixed random_state keeps the split identical across runs.
df = pd.read_csv("bbc-news.csv")
split_frames = train_test_split(df, random_state=2023, test_size=0.25)
df_train, df_test = split_frames

In [None]:
# Summarise the training split: index range, columns, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1668 entries, 668 to 855
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    1668 non-null   object
 1   label   1668 non-null   object
dtypes: object(2)
memory usage: 39.1+ KB


In [None]:
# Summarise the held-out test split: index range, columns, non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 557 entries, 294 to 464
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    557 non-null    object
 1   label   557 non-null    object
dtypes: object(2)
memory usage: 13.1+ KB


In [None]:
# Materialise texts and labels as plain Python lists; later cells index them
# by position and hand the text lists to the tokenizer.
train_text, train_label = list(df_train['text']), list(df_train['label'])
test_text, test_label = list(df_test['text']), list(df_test['label'])

# Preview a few training samples. A dedicated, seeded generator makes the
# preview identical on every Restart & Run All without touching the global
# `random` state; min() keeps sampling valid when the split has fewer than
# `nshowed` rows.
nshowed = 3
idx_showed = random.Random(2023).sample(
    range(len(train_text)), min(nshowed, len(train_text))
)

print(idx_showed)
print("="*100)
print("Some samples from the dataset")
print("="*100)
for idx in idx_showed:
    print("text")
    print('-'*50)
    print(train_text[idx])
    print()
    print('-'*50)
    print("label: ", train_label[idx])
print("="*100)

[515, 1466, 604]
Some samples from the dataset
text
--------------------------------------------------
Ray Charles studio becomes museum A museum dedicated to the career of the late legendary singer Ray Charles is to open in his former recording studio in Los Angeles. His longtime publicist Jerry Digney said the museum would house "archive materials from recordings, to awards, to ephemera, to wardrobe". A tour bus used by Charles and his entourage over the years will also be on permanent display. It is hoped the museum will be ready for visitors in late 2007. Mr Digney said the recording studio and offices had been used by Charles for many years, and was where he recorded much of his last album, Genius Loves Company. It is hoped the museum will also house an education centre. The building had been declared a historic landmark by the city of Los Angeles just before Charles' death in June 2004 at the age of 73. Following his death, Charles won eight Grammy Awards, including album of the 

In [None]:
# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)

print("Tokenizer's Type: ", type(tokenizer))
print("model's Type: ", type(model))
print()


def _encode_on_device(texts):
    """Tokenize ``texts`` (padding and truncation on) and move every tensor to ``device``.

    Returns a plain dict mapping each tokenizer output name to its tensor.
    """
    encoded = tokenizer(texts, padding = True, truncation = True, return_tensors="pt")
    return {name: tensor.to(device) for name, tensor in encoded.items()}  # use GPU if possible


train_tokenized = _encode_on_device(train_text)
test_tokenized = _encode_on_device(test_text)

print(type(train_tokenized))
print(train_tokenized.keys())
for field in ('input_ids', 'attention_mask'):
    print(f"train_tokenized['{field}'].shape: ", train_tokenized[field].shape)

Tokenizer's Type:  <class 'transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast'>
model's Type:  <class 'transformers.models.distilbert.modeling_distilbert.DistilBertModel'>

<class 'dict'>
dict_keys(['input_ids', 'attention_mask'])
train_tokenized['input_ids'].shape:  torch.Size([1668, 512])
train_tokenized['attention_mask'].shape:  torch.Size([1668, 512])


In [None]:
def get_features(tokenized_set, model, batch_size=16):
    """Extract one mean-pooled feature vector per sample from ``model``.

    The samples are fed through ``model`` in consecutive slices of at most
    ``batch_size`` rows with gradient tracking disabled, and each sample's
    ``last_hidden_state`` is averaged over dim 1 (the sequence positions).

    Args:
        tokenized_set: Mapping holding tensors under ``'input_ids'`` and
            ``'attention_mask'``, both indexed by sample along dim 0.
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes a ``last_hidden_state`` tensor.
        batch_size: Maximum number of samples per forward pass; must be
            positive.

    Returns:
        A tensor with one row per sample, in the same order as the input.

    Raises:
        ValueError: If ``batch_size`` is not positive or there are no samples.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    nsamples = tokenized_set['input_ids'].shape[0]
    if nsamples == 0:
        raise ValueError("tokenized_set contains no samples")

    features = []
    with torch.no_grad():
        # Stepping by batch_size visits every sample, including a trailing
        # partial batch, and never builds a batch larger than batch_size
        # (it also works when nsamples < batch_size).
        for start_idx in range(0, nsamples, batch_size):
            end_idx = start_idx + batch_size  # slicing clamps this to nsamples
            input_ids = tokenized_set['input_ids'][start_idx:end_idx]
            # Use the real padding mask; passing input_ids here would make the
            # model treat token ids as mask values.
            attention_mask = tokenized_set['attention_mask'][start_idx:end_idx]
            output = model(input_ids, attention_mask=attention_mask)
            # Plain mean over all sequence positions (padded ones included).
            # Alternative: output.last_hidden_state[:, 0, :] keeps only the
            # first ([CLS]) position.
            features.append(output.last_hidden_state.mean(dim=1))
    return torch.cat(features, dim=0)

# Embed both splits with the same encoder and batch size (train first, then test).
train_features, test_features = (
    get_features(split_tokens, model, batch_size=16)
    for split_tokens in (train_tokenized, test_tokenized)
)



In [None]:
# Report the shape of each extracted feature matrix.
for caption, feature_matrix in (
    ("training size:", train_features),
    ("testing size:", test_features),
):
    print(caption, feature_matrix.shape)

training size: torch.Size([1668, 768])
testing size: torch.Size([557, 768])


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# scikit-learn consumes NumPy arrays, so copy the feature tensors to host memory first.
X_train = train_features.to("cpu").numpy()
X_test = test_features.to("cpu").numpy()

N_train, ndim = X_train.shape
N_test = X_test.shape[0]  # avoid binding `_`, which IPython uses for the last output

# Slice the label lists to the number of feature rows so X and y have equal length,
# then map the string labels to integer class ids (encoder fitted on training labels only).
lb_encoder = LabelEncoder().fit(train_label[:N_train])
y_train = lb_encoder.transform(train_label[:N_train])
y_test = lb_encoder.transform(test_label[:N_test])

print(f"Training set (nsamples | ndim): {N_train} | {ndim}")
print(f"Testing set (nsamples | ndim): {N_test} | {ndim}")

# Alternative classifier: LogisticRegression(max_iter=1000)
# NOTE: this rebinds `model` from the DistilBERT encoder to the classifier, so the
# feature-extraction cells must be re-run from the model-loading cell onwards.
# A fixed random_state (same seed as the train/test split) makes the score reproducible.
model = RandomForestClassifier(random_state=2023)
model.fit(X_train, y_train)
model.score(X_test, y_test)

Training set (nsamples | ndim): 1668 | 768
Testing set (nsamples | ndim): 557 | 768


0.9748653500897666