# Text clasification with DistilBERT

In [1]:
import pandas as pd
import string
import time
from tqdm import tqdm
import transformers
from transformers import DistilBertTokenizer, DistilBertModel
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import warnings
warnings.simplefilter('ignore')

torch.__version__, transformers.__version__

  from .autonotebook import tqdm as notebook_tqdm


('2.4.0+cu121', '4.44.0')

## 1.- Dataset

In [2]:
# Load the ods file
input_file = './data/processed.ods'
df = pd.read_excel(input_file, engine="odf")

In [3]:
df.head()

Unnamed: 0,text,label
0,"San Jose in California 1903, Ms Winchester is ...",1
1,so they will make some business. So they look ...,1
2,so she couldn’t own that house. After a while ...,1
3,"the mansion, also the ghost poseer the child o...",1
4,"that the ms winchester did in the past, or the...",1


In [4]:
df['label'].unique()

array([1, 0])

In [5]:
df['label'].value_counts()

label
0    240
1    230
Name: count, dtype: int64

- Remove special characters and transform to lowercase

In [6]:
string.punctuation
df['text'] = df['text'].str.lower().str.translate(str.maketrans('', '', string.punctuation))
df.head()

Unnamed: 0,text,label
0,san jose in california 1903 ms winchester is t...,1
1,so they will make some business so they look f...,1
2,so she couldn’t own that house after a while i...,1
3,the mansion also the ghost poseer the child of...,1
4,that the ms winchester did in the past or the ...,1


## 2.- Pipeline

In [7]:
maxlen = 20
batch_size = 4

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

In [8]:
class LabelDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label.astype(float)
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [9]:
train_size = 0.85
train_data = df.sample(frac=train_size,random_state=123)
test_data = df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

train_data = LabelDataset(train_data, tokenizer, maxlen)
test_data = LabelDataset(test_data, tokenizer, maxlen)
len(train_data), len(test_data)

(400, 70)

In [10]:
train_params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': batch_size,
               'num_workers': 0
               }

train_loader = DataLoader(train_data, **train_params)
test_loader = DataLoader(test_data, **test_params)

In [11]:
train_batch = next(iter(train_loader))

In [12]:
train_batch

{'ids': tensor([[  101,  2001,  1037,  8987,  2214, 10276,  2315,  2002,  6593,  7076,
           2040,  2018,  2042,  1996,  4602,  1997,  2010,  2051,  2588,   102],
         [  101,  1059, 27511,  1037,  2711,  1999,  1996,  2277,  2061,  2023,
           2711,  2202,  1996, 10658,  1999,  2014,  2482,  1998,  2175,   102],
         [  101,  1998,  2010,  3129,  3419,  2020,  1999,  1996,  2160,  2021,
           3419,  2404,  1996, 10658,  9617, 25766,  1999,  1996, 11669,   102],
         [  101,  1996,  6138,  1999,  2047,  5979,  5255,  2068,  2000,  1996,
           2598,  2021,  5573, 18718,  2726,  1996,  9191, 13555,  2008,   102]]),
 'mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0

## 3.- Model

In [13]:
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)

        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.sigmoid(output)
        return output

ids = train_batch['ids']
mask = train_batch['mask']
token_type_ids = train_batch['token_type_ids']
targets = train_batch['targets']

model = DistilBERTClass()
outputs = model(ids, mask, token_type_ids)
outputs

tensor([[0.5141],
        [0.4925],
        [0.5205],
        [0.4946]], grad_fn=<SigmoidBackward0>)

In [14]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [15]:
model.to(device)
lr = 1e-05
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## 4.- Training

In [16]:
def test(model, device, test_loader):
    start = time.time()
    running_loss = 0.0
    running_acc = 0.0
    with torch.no_grad():
        model.eval()
        for data in test_loader:
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            labels = data['targets'].to(device)

            outputs = model(ids, mask, token_type_ids)
            outputs = outputs.squeeze()
            
            loss = loss_fn(outputs, labels)

            running_loss += loss.item()
            preds = outputs.round()  # Round the outputs to get predicted labels
            running_acc += (preds == labels).sum().item()  # Compute number of correct predictions

    val_acc = running_acc / len(test_loader.dataset)
    print(f'Time for eval is {time.time()-start:.4f} sec Val loss: {running_loss / len(test_loader):.4f}')
    print(f'Val acc: {val_acc:.4f}')
    return val_acc

In [17]:
len(train_loader.dataset)

400

In [18]:
def train(epoch, train_loader, test_loader, interval=300):
    running_loss = 0.0
    model.train()
    for _, data in tqdm(enumerate(train_loader, 0)):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        labels = data['targets'].to(device)

        optimizer.zero_grad()
        
        outputs = model(ids, mask, token_type_ids)
        outputs = outputs.squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        if _ % interval == 0:
            print(f'Train loss: {loss.item():.4f}')
            val_acc = test(model, device, test_loader)
            if val_acc > 0.85:
                break

In [19]:
epochs = 20

In [20]:
for epoch in range(epochs):
    train(epoch, train_loader, test_loader)

5it [00:00, 15.90it/s]

Train loss: 0.7233
Time for eval is 0.0833 sec Val loss: 0.6998
Val acc: 0.4571


100it [00:03, 31.68it/s]
5it [00:00, 24.98it/s]

Train loss: 0.8817
Time for eval is 0.0754 sec Val loss: 0.5700
Val acc: 0.6286


100it [00:02, 33.51it/s]
2it [00:00, 16.61it/s]

Train loss: 0.0399
Time for eval is 0.0635 sec Val loss: 0.3889
Val acc: 0.8286


100it [00:02, 33.61it/s]
0it [00:00, ?it/s]


Train loss: 0.0283
Time for eval is 0.0591 sec Val loss: 0.4159
Val acc: 0.8571


0it [00:00, ?it/s]


Train loss: 0.1276
Time for eval is 0.0633 sec Val loss: 0.4119
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.1353


0it [00:00, ?it/s]


Time for eval is 0.0605 sec Val loss: 0.4086
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0181


0it [00:00, ?it/s]


Time for eval is 0.0634 sec Val loss: 0.4059
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0161


0it [00:00, ?it/s]


Time for eval is 0.0612 sec Val loss: 0.4039
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0381


0it [00:00, ?it/s]


Time for eval is 0.0620 sec Val loss: 0.4023
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0733


0it [00:00, ?it/s]


Time for eval is 0.0707 sec Val loss: 0.4015
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0193


0it [00:00, ?it/s]


Time for eval is 0.0605 sec Val loss: 0.4008
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0249


0it [00:00, ?it/s]


Time for eval is 0.0598 sec Val loss: 0.4004
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0341


0it [00:00, ?it/s]


Time for eval is 0.0621 sec Val loss: 0.4002
Val acc: 0.8857


0it [00:00, ?it/s]

Train loss: 0.0614


0it [00:00, ?it/s]


Time for eval is 0.0608 sec Val loss: 0.4008
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0248


0it [00:00, ?it/s]


Time for eval is 0.0628 sec Val loss: 0.4015
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.1414


0it [00:00, ?it/s]


Time for eval is 0.0605 sec Val loss: 0.4036
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0823


0it [00:00, ?it/s]


Time for eval is 0.0602 sec Val loss: 0.4044
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0362


0it [00:00, ?it/s]


Time for eval is 0.0658 sec Val loss: 0.4055
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0719


0it [00:00, ?it/s]


Time for eval is 0.0624 sec Val loss: 0.4074
Val acc: 0.8714


0it [00:00, ?it/s]

Train loss: 0.0135


0it [00:00, ?it/s]

Time for eval is 0.0615 sec Val loss: 0.4093
Val acc: 0.8714





In [21]:
def segment_text(text: str, segment_length: int = 20) -> list:
    words = text.split()
    segments = [(' '.join(words[i:i + segment_length])) 
                for i in range(0, len(words), segment_length)]
    return segments

In [22]:
def test_essay(essay, model, tokenizer, max_len, device):
    chunks = segment_text(essay)

    model.eval()
    all_predictions = []
    
    for chunk in chunks:
        # Tokenize and prepare the chunk
        inputs = tokenizer.encode_plus(
            chunk,
            None,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        
        ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0).to(device)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0).to(device)
        token_type_ids = torch.tensor(inputs['token_type_ids'], dtype=torch.long).unsqueeze(0).to(device)

        with torch.no_grad():
            outputs = model(ids, mask, token_type_ids)
            outputs = outputs.squeeze()

        predicted_prob = outputs
        all_predictions.append(predicted_prob)

    return all_predictions

In [23]:
essay = """They Kicked the boy out of his band and since it was the only thing he had as a job he was forced to look for a job in a school but he had no other speciality tan music so he had to pretend to be a math/sience teacher So he could work as a "teacher”. One day he saw that his students played instruments so well, and that's where it started all Would he prefer to teach them math/sience or music? After posing as a teacher and seeng how well his students played music, he decided to descover each one ability, literally this kids can play rock music.Maybe he thought “If they kicked me out of of the band I was in, why don't I make my own band"? Or maybe he just did it because music was his passion. But he has to continue keeping the secret were they going to find out what he did? what would happen if they did? The students were confused out first, but then they started to like it and with their imagination and ideas they create some Incredible things, each one has incredible qualities that can be seen during the movie, that "teacher" was crazy!! and everything is going well until... of course there has to be a problem, I think he didn't fully think about what parents say, or what EVERYONE Would say when they discovered what he was hiding. But when they demonstrated their talent a stage after secretly entering to on a a competition, the parents realized their children talent and had no problems."""
pred = test_essay(essay, model, tokenizer, maxlen, device)
print([tensor.item() for tensor in pred])

average_prediction = sum(pred) / len(pred)
percentage = average_prediction * 100
print(f"Predicted percentage: {percentage:.2f}%")

[0.9914416670799255, 0.5460850596427917, 0.9470161199569702, 0.9765793085098267, 0.9852501749992371, 0.9841912984848022, 0.9848730564117432, 0.9324618577957153, 0.9784491062164307, 0.8833802342414856, 0.9639078974723816, 0.4878084659576416, 0.5620577335357666, 0.971889317035675]
Predicted percentage: 87.11%


In [1]:
import os

In [None]:
os.makedirs('./models/', exist_ok=True)
output_model_file = './models/pytorch_distilbert_writings.bin'
output_vocab_file = './models/vocab_distilbert_writings.bin'

torch.save(model, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)