# EDA corpus


In [8]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm

## Cumulating the dataset from path

In [9]:
pos_file_path = r'/kaggle/input/aclimdb/aclImdb/train/pos'
neg_file_path = r'/kaggle/input/aclimdb/aclImdb/train/neg'

def get_data(pos_folder_path, neg_folder_path):
    pos_context = []
    neg_context = []

    for filename in tqdm(os.listdir(pos_folder_path)):
        if filename.endswith('.txt'):
            file_path = os.path.join(pos_folder_path, filename)

            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                pos_context.append(content)

    for filename in tqdm(os.listdir(neg_folder_path)):
        if filename.endswith('.txt'):
            file_path = os.path.join(neg_folder_path, filename)

            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
                neg_context.append(content)
    
    pos_label = [0]*len(pos_context)
    neg_label = [1]*len(neg_context)
    text = pos_context+neg_context
    label = pos_label+neg_label
    
    data = pd.DataFrame({'Context':text, 'Label':label})

    return data

frame = get_data(pos_file_path, neg_file_path)

100%|██████████| 12500/12500 [00:09<00:00, 1270.55it/s]
100%|██████████| 12500/12500 [00:56<00:00, 221.03it/s]


In [14]:
frame

Unnamed: 0,Context,Label
0,This was one of those wonderful rare moments i...,0
1,Have you seen The Graduate? It was hailed as t...,0
2,"I don't watch a lot of TV, except for The Offi...",0
3,Kubrick again puts on display his stunning abi...,0
4,"First of all, I liked very much the central id...",0
...,...,...
24995,The first hour of the movie was boring as hell...,1
24996,"A fun concept, but poorly executed. Except for...",1
24997,I honestly don't understand how tripe like thi...,1
24998,This remake of the 1962 orginal film'o the boo...,1


## Removing unnecessary parttern

In [20]:
frame['CleanedContext'] = frame['Context'].apply(lambda x: x.replace('<br />', ' '))
frame['CleanedContext'] = frame['CleanedContext'].replace(r'\s+', ' ', regex=True)
frame['CleanedContext'] = frame['CleanedContext'].apply(lambda x: x.lower().strip())

In [21]:
i = 1000
print('Before cleaning:')
print(frame['Context'][i])
print("\n")
print('After cleaning:')
print(frame['Context'][i])

Before cleaning:
Does any one know what the 2 sports cars were? I think Robert Stack's might have been a Masseratti.Rock Hudson's character told his father he was taking a job in Iraq ,isn't that timely? I have had Dorthy Malone in my spank bank most of my life ,maybe this was the film that impressed me.Loren Bacall sure did have some chops in this film and probably out-acted Malone but Malones's part made a more sensational impact so she got the Oscar for best supporting role.Was Loren's part considered a leading role?Old man Hadley character was was probably a pretty common picture of tycoons of his era in that he was a regular guy who made it big in an emerging industry but in building a whole town he had forgotten his children to have his wife bring them up.In time,being widowed he realized that they were all he really had and they were spoiled rotten,looking for attention,so rather than try to relate to his children he blew his head off.An ancient morality tale.But seriously,what 

## Tokenizing 

In [22]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print(tokenizer)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)


## Batch-tokenizing and Padding phase

In [52]:
import torch 
from torch.nn.utils.rnn import pad_sequence

token_ids = []
attention_masks = []

for review in tqdm(frame['CleanedContext']):
    batch_encoder = tokenizer.encode_plus(
        text = review,
        max_length= 128,
        truncation= True,
        return_tensors= 'pt',
    )
    token_ids.append(batch_encoder['input_ids'])
    attention_masks.append(batch_encoder['attention_mask'])

100%|██████████| 25000/25000 [01:31<00:00, 273.05it/s]


In [53]:
token_ids = [t.squeeze(0) for t in token_ids]
attention_masks = [t.squeeze(0) for t in attention_masks]

token_ids = pad_sequence(
    token_ids, 
    batch_first= True,
    padding_value= tokenizer.pad_token_id
    )
attention_masks = pad_sequence(
    attention_masks,
    batch_first= True,
    padding_value= tokenizer.pad_token_type_id
)

# Build Dataset and DataLoader

In [54]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

train_ids, val_ids = train_test_split(
    token_ids, 
    test_size= 0.2, 
    random_state= 42
)
train_ids, test_ids = train_test_split(
    train_ids, 
    test_size= 0.125, 
    random_state= 42
)

train_masks, val_masks = train_test_split(
    attention_masks, 
    test_size= 0.2, 
    random_state= 42
)
train_masks, test_masks = train_test_split(
    train_masks, 
    test_size= 0.125, 
    random_state= 42
)

labels = torch.tensor(frame['Label'], dtype= torch.long)
train_labels, val_labels = train_test_split(
    labels,test_size= 0.2
    )
train_labels, test_labels = train_test_split(
    train_labels,test_size= 0.125
    )

In [55]:
train_data = TensorDataset(train_ids, train_masks, train_labels)
val_data = TensorDataset(val_ids, val_masks, val_labels)
test_data = TensorDataset(test_ids, test_masks, test_labels)

In [56]:
train_loader = DataLoader(
    train_data, batch_size= 16, 
    num_workers= 4
    )
val_loader = DataLoader(
    val_data, batch_size= 16, 
    num_workers= 4
    )
test_loader = DataLoader(
    test_data, batch_size= 16, 
    num_workers= 4
    )

# BERT for binary classification

In [57]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [58]:
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2).to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Optim Part

In [59]:
from torch.optim import AdamW
import torch.nn as nn
from transformers import get_linear_schedule_with_warmup

EPOCHS = 2

optimizer = AdamW(model.parameters())

loss_function = nn.CrossEntropyLoss()

num_training_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps)

# Training 

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=8,        # giảm nếu OOM
    per_device_eval_batch_size=8,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs',                 # TensorBoard logs
    logging_steps=50,
    gradient_accumulation_steps=1,        # tăng nếu muốn mô phỏng batch lớn
    fp16=True,                            # mixed precision nếu GPU support
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,                          # model BERT đã khai báo
    args=training_args,                   # training args
    train_dataset=train_data,             # dataset train (Dataset object)
    eval_dataset=val_data                 # dataset eval
)

trainer.train()

results = trainer.evaluate()
print(results)



<IPython.core.display.Javascript object>