# Text classification with DistilBERT

In [1]:
import os
import pandas as pd
import string
import time
import tqdm
import torch
import transformers
import warnings
warnings.simplefilter('ignore')

torch.__version__, transformers.__version__

('2.4.0+cu121', '4.44.0')

## 1.- Dataset

In [2]:
# Load the ods file
input_file = './data/processed.ods'
df = pd.read_excel(input_file, engine="odf")

- Explore dataset

In [3]:
df.head()

Unnamed: 0,text,label
0,"San Jose in California 1903, Ms Winchester is ...",1
1,so they will make some business. So they look ...,1
2,so she couldn’t own that house. After a while ...,1
3,"the mansion, also the ghost poseer the child o...",1
4,"that the ms winchester did in the past, or the...",1


In [4]:
df.tail()

Unnamed: 0,text,label
465,"song. However, the truth comes out that Dewey ...",0
466,"leads to attempts to arrest him, and Dewey apo...",0
467,"competition, playing a song written by one of ...",0
468,"parents, who chant “School of Rock,” leads Dew...",0
469,high note.,0


In [5]:
df['label'].unique()

array([1, 0])

In [6]:
df['label'].value_counts()

label
0    240
1    230
Name: count, dtype: int64

- Remove special characters and transform to lowercase

In [7]:
df['text'] = df['text'].str.lower().str.translate(str.maketrans('', '', string.punctuation))
df.head()

Unnamed: 0,text,label
0,san jose in california 1903 ms winchester is t...,1
1,so they will make some business so they look f...,1
2,so she couldn’t own that house after a while i...,1
3,the mansion also the ghost poseer the child of...,1
4,that the ms winchester did in the past or the ...,1


## 2.- Pipeline

In [8]:
maxlen = 20
batch_size = 4 # Dataset only has 470 sentences of at most 20 words each

tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

In [9]:
class LabelDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.label.astype(float)
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [10]:
train_size = 0.85
train_data = df.sample(frac=train_size,random_state=123)
test_data = df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)

train_data = LabelDataset(train_data, tokenizer, maxlen)
test_data = LabelDataset(test_data, tokenizer, maxlen)
len(train_data), len(test_data)

(400, 70)

In [11]:
train_params = {'batch_size': batch_size,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': batch_size,
               'num_workers': 0
               }

train_loader = torch.utils.data.DataLoader(train_data, **train_params)
test_loader = torch.utils.data.DataLoader(test_data, **test_params)

In [12]:
train_batch = next(iter(train_loader))

In [13]:
train_batch

{'ids': tensor([[  101,  1996, 12841,  3185,  2003,  1037,  5469,  3185,  2013,  2760,
           2009,  2003, 17093,  2098,  1999,  2624,  4560,  2662,  5518,   102],
         [  101,  2028,  2305, 22841,  2015,  3338,  2046,  1996,  3232,  1521,
           1055,  2188,  1998,  2886,  2068,  2076,  1996,  6101,  2668,   102],
         [  101,  3602,  2007, 20309,  5719,  2000,  6570,  2600,  2189,  1999,
           2019,  2044, 11624, 13669,  2565, 14828,  1037,  2047,  2927,   102],
         [  101, 15670,  1996,  3129,  2008,  1996,  4763,  9178, 11014,  1996,
           2564,  1521,  1055,  3969,  2043,  1996,  3129,  5363,  2000,   102]]),
 'mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
         [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0

## 3.- Model

In [14]:
class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = transformers.DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)

        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.sigmoid(output)
        return output

ids = train_batch['ids']
mask = train_batch['mask']
token_type_ids = train_batch['token_type_ids']
targets = train_batch['targets']

model = DistilBERTClass()
outputs = model(ids, mask, token_type_ids)
outputs

tensor([[0.4675],
        [0.5814],
        [0.4844],
        [0.4889]], grad_fn=<SigmoidBackward0>)

In [15]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [16]:
model.to(device)
lr = 1e-05
loss_fn = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## 4.- Training

In [17]:
def test(model, device, test_loader):
    start = time.time()
    running_loss = 0.0
    running_acc = 0.0
    with torch.no_grad():
        model.eval()
        for data in test_loader:
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            token_type_ids = data['token_type_ids'].to(device)
            labels = data['targets'].to(device)

            outputs = model(ids, mask, token_type_ids)
            outputs = outputs.squeeze()
            
            loss = loss_fn(outputs, labels)

            running_loss += loss.item()
            preds = outputs.round() 
            running_acc += (preds == labels).sum().item() 

    val_acc = running_acc / len(test_loader.dataset)
    print(f'Time for eval is {time.time()-start:.4f} sec Val loss: {running_loss / len(test_loader):.4f}')
    print(f'Val acc: {val_acc:.4f}')
    return val_acc

In [18]:
len(train_loader.dataset)

400

In [19]:
def train(epoch, train_loader, test_loader, interval=300):
    running_loss = 0.0
    model.train()
    for _, data in tqdm.tqdm(enumerate(train_loader, 0)):
        ids = data['ids'].to(device)
        mask = data['mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        labels = data['targets'].to(device)

        optimizer.zero_grad()
        
        outputs = model(ids, mask, token_type_ids)
        outputs = outputs.squeeze()
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
        if _ % interval == 0:
            print(f'Train loss: {loss.item():.4f}')
            val_acc = test(model, device, test_loader)
            if val_acc > 0.82:
                break

In [20]:
epochs = 10

In [21]:
for epoch in range(epochs):
    train(epoch, train_loader, test_loader)

5it [00:00, 16.65it/s]

Train loss: 0.7028
Time for eval is 0.0670 sec Val loss: 0.6987
Val acc: 0.4714


100it [00:03, 31.47it/s]
0it [00:00, ?it/s]


Train loss: 0.1465
Time for eval is 0.0675 sec Val loss: 0.3742
Val acc: 0.8714


0it [00:00, ?it/s]


Train loss: 0.1073
Time for eval is 0.0677 sec Val loss: 0.3732
Val acc: 0.9000


0it [00:00, ?it/s]

Train loss: 0.2152


0it [00:00, ?it/s]


Time for eval is 0.0673 sec Val loss: 0.3725
Val acc: 0.9000


0it [00:00, ?it/s]

Train loss: 0.1090


0it [00:00, ?it/s]


Time for eval is 0.0682 sec Val loss: 0.3725
Val acc: 0.9000


0it [00:00, ?it/s]

Train loss: 0.5737


0it [00:00, ?it/s]


Time for eval is 0.0675 sec Val loss: 0.3719
Val acc: 0.9000


0it [00:00, ?it/s]

Train loss: 0.1307


0it [00:00, ?it/s]


Time for eval is 0.0678 sec Val loss: 0.3706
Val acc: 0.9000


0it [00:00, ?it/s]

Train loss: 0.1030


0it [00:00, ?it/s]


Time for eval is 0.0876 sec Val loss: 0.3696
Val acc: 0.9000


0it [00:00, ?it/s]

Train loss: 0.1041


0it [00:00, ?it/s]


Time for eval is 0.0883 sec Val loss: 0.3691
Val acc: 0.9000


0it [00:00, ?it/s]

Train loss: 0.6252


0it [00:00, ?it/s]

Time for eval is 0.0887 sec Val loss: 0.3707
Val acc: 0.9000





- Remove comments to save model

In [22]:
os.makedirs('./models/', exist_ok=True)
output_model_file = './models/pytorch_distilbert_writings.bin'
output_vocab_file = './models/vocab_distilbert_writings.bin'

#torch.save(model, output_model_file)
#tokenizer.save_vocabulary(output_vocab_file)

## 5.- Testing

In [23]:
def segment_text(text: str, segment_length: int = 20) -> list:
    words = text.split()
    segments = [(' '.join(words[i:i + segment_length])) 
                for i in range(0, len(words), segment_length)]
    return segments

In [24]:
def test_essay(essay, model, tokenizer, max_len, device):
    essay = essay.lower().translate(str.maketrans('', '', string.punctuation))
    chunks = segment_text(essay)

    model.eval()
    all_predictions = []
    
    for chunk in chunks:
        # Tokenize and prepare the chunk
        inputs = tokenizer.encode_plus(
            chunk,
            None,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        
        ids = torch.tensor(inputs['input_ids'], dtype=torch.long).unsqueeze(0).to(device)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long).unsqueeze(0).to(device)
        token_type_ids = torch.tensor(inputs['token_type_ids'], dtype=torch.long).unsqueeze(0).to(device)

        with torch.no_grad():
            outputs = model(ids, mask, token_type_ids)
            outputs = outputs.squeeze()

        predicted_prob = outputs
        all_predictions.append(predicted_prob)

    return all_predictions

In [25]:
essay = """They Kicked the boy out of his band and since it was the only thing he had as a job he was forced to look for a job in a school but he had no other speciality tan music so he had to pretend to be a math/sience teacher So he could work as a "teacher”. One day he saw that his students played instruments so well, and that's where it started all Would he prefer to teach them math/sience or music? After posing as a teacher and seeng how well his students played music, he decided to descover each one ability, literally this kids can play rock music.Maybe he thought “If they kicked me out of of the band I was in, why don't I make my own band"? Or maybe he just did it because music was his passion. But he has to continue keeping the secret were they going to find out what he did? what would happen if they did? The students were confused out first, but then they started to like it and with their imagination and ideas they create some Incredible things, each one has incredible qualities that can be seen during the movie, that "teacher" was crazy!! and everything is going well until... of course there has to be a problem, I think he didn't fully think about what parents say, or what EVERYONE Would say when they discovered what he was hiding. But when they demonstrated their talent a stage after secretly entering to on a a competition, the parents realized their children talent and had no problems."""
essay

'They Kicked the boy out of his band and since it was the only thing he had as a job he was forced to look for a job in a school but he had no other speciality tan music so he had to pretend to be a math/sience teacher So he could work as a "teacher”. One day he saw that his students played instruments so well, and that\'s where it started all Would he prefer to teach them math/sience or music? After posing as a teacher and seeng how well his students played music, he decided to descover each one ability, literally this kids can play rock music.Maybe he thought “If they kicked me out of of the band I was in, why don\'t I make my own band"? Or maybe he just did it because music was his passion. But he has to continue keeping the secret were they going to find out what he did? what would happen if they did? The students were confused out first, but then they started to like it and with their imagination and ideas they create some Incredible things, each one has incredible qualities that 

In [26]:
pred = test_essay(essay, model, tokenizer, maxlen, device)
print([tensor.item() for tensor in pred])

[0.931472659111023, 0.20791824162006378, 0.5809360146522522, 0.9172549843788147, 0.8047293424606323, 0.9112629294395447, 0.942256510257721, 0.7861670255661011, 0.9451174139976501, 0.4799267053604126, 0.9055496454238892, 0.8273663520812988, 0.4947417676448822, 0.6776221394538879]


In [27]:
average_prediction = sum(pred) / len(pred)
percentage = average_prediction * 100
print(f"Predicted percentages:\nAuthentic:{percentage:.2f}% Generated: {100 - percentage:.2f}%")

Predicted percentages:
Authentic:74.37% Generated: 25.63%
