In [8]:
import gc
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import nlpaug.augmenter.word as naw
import preprocessor as p
import multiprocessing
from contextlib import contextmanager
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, MarianTokenizer, MarianMTModel
from tqdm import tqdm
from sklearn import metrics

In [3]:
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

In [4]:
train_df = train_df[['keyword', 'text', 'target']]
test_df = test_df[['keyword', 'text']]

train_df['keyword'] = train_df['keyword'].fillna('None')
test_df['keyword'] = test_df['keyword'].fillna('None')

In [5]:
def extend_clean(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.HASHTAG, p.OPT.NUMBER, p.OPT.SMILEY)
train_df['text'] = train_df['text'].apply(lambda x: extend_clean(p.clean(x).lower()))

In [None]:
# aug = naw.ContextualWordEmbsAug(
#     model_path='xlm-roberta-base',
#     action='substitute'
# )
# aug_rows = []

# for idx, row in train_df.iterrows():
#     aug_text = aug.augment(row['text'])
#     new_row = {
#         'keyword': row['keyword'],
#         'text': aug_text,
#         'target': row['target']
#     }
#     aug_rows.append(new_row)

# aug_df = pd.DataFrame(aug_rows)
# new_df = pd.concat([train_df, aug_df], ignore_index=True)
# new_df.to_csv('./data/train.csv', index=False)

In [None]:
def translate_batch(batch, src_lang, out_lang):
    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{out_lang}'
    model = MarianMTModel.from_pretrained(model_name).to('cuda')
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512).to('cuda')
    with torch.no_grad():
        translated = model.generate(**inputs, early_stopping=True)
    outputs = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return outputs

def translate(texts, src_lang, out_lang, batch_size=32, timeout=10):
    translated_texts = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        with multiprocessing.get_context("spawn").Pool(1) as pool:
            try:
                result = pool.apply_async(translate_batch, (batch, src_lang, out_lang))
                outputs = result.get(timeout=timeout)
                translated_texts.extend(outputs)
            except multiprocessing.TimeoutError:
                print(f"Timeout on batch {i / batch_size}, skipping...")
                continue
            except Exception as e:
                print(f"Error on batch {i}: {e}")
                continue
    return translated_texts

translated_df = train_df.copy()
translated_df['text'] = translate(translated_df['text'].tolist(), 'en', 'de', batch_size=16)
translated_df['text'] = translate(translated_df['text'].tolist(), 'de', 'en', batch_size=16)
new_df2 = pd.concat([train_df, translated_df], ignore_index=True)
new_df2.to_csv('./data/train.csv', index=False)

  0%|          | 1/952 [00:10<2:38:53, 10.02s/it]

Timeout on batch 0, skipping...


  0%|          | 2/952 [00:20<2:38:36, 10.02s/it]

Timeout on batch 16, skipping...


  0%|          | 3/952 [00:30<2:38:23, 10.01s/it]

Timeout on batch 32, skipping...


  0%|          | 3/952 [00:40<3:31:11, 13.35s/it]


KeyboardInterrupt: 

In [6]:
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=128, is_test=False):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        keyword = str(self.df.iloc[index]['keyword'])
        text = str(self.df.iloc[index]['text'])
        
        comb_text = keyword + " : " + text
        
        encoding = self.tokenizer(
            comb_text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        
        inputs = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }
        
        if not self.is_test and 'target' in self.df.columns:
            labels = torch.tensor(self.df.iloc[index]['target'], dtype=torch.long)
            return inputs, labels
        else:
            return inputs

In [7]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
train_val_dataset = CustomDataset(train_df, tokenizer)
test_dataset = CustomDataset(test_df, tokenizer, is_test=True)

train_length = int(len(train_val_dataset) * 0.8)
val_length = len(train_val_dataset) - train_length

train_dataset, val_dataset = random_split(train_val_dataset, [train_length, val_length])

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [8]:
def train_model(model,
        train_dataloader,
        val_dataloader,
        optimizer, 
        scheduler,
        num_epochs=10,
        criterion=None,
        device=None
    ):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    if criterion is None:
        criterion = torch.nn.CrossEntropyLoss()
    
    model = model.to(device)
    torch.backends.cudnn.benchmark = True
    scaler = torch.GradScaler(device)

    for epoch in range(num_epochs):
        model.train()

        running_loss = 0.0
        all_labels = []
        all_preds = []
        
        tqdm_train = tqdm(train_dataloader, desc=f"Training epoch {epoch + 1}: ", leave=False)
        
        for inputs, labels in tqdm_train:
            if isinstance(inputs, dict):
                inputs = {k: v.to(device) for k, v in inputs.items()}
            else:
                inputs = inputs.to(device)
            
            labels = labels.to(device)
            
            optimizer.zero_grad()
            
            with torch.autocast(device_type="cuda"):
                outputs = model(**inputs)
                loss = criterion(outputs.logits, labels)
            
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            
            preds = torch.argmax(outputs.logits, dim=1)
            running_loss += loss.item()
            
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            
            tqdm_train.set_postfix(loss=loss.item())
        
        train_loss = running_loss / len(train_dataloader)
        train_f1 = metrics.f1_score(all_labels, all_preds, average="weighted")
        
        model.eval()
        
        running_loss = 0.0
        all_preds = []
        all_labels = []
        
        tqdm_val = tqdm(val_dataloader, desc=f"Validation epoch {epoch + 1}: ", leave=False)
        
        with torch.no_grad():
            for inputs, labels in tqdm_val:
                if isinstance(inputs, dict):
                    inputs = {k: v.to(device) for k, v in inputs.items()}
                else:
                    inputs = inputs.to(device)

                labels = labels.to(device)
                
                with torch.autocast(device_type="cuda"):
                    outputs = model(**inputs)
                    loss = criterion(outputs.logits, labels)
                
                preds = torch.argmax(outputs.logits, dim=1)
                running_loss += loss.item()
                
                all_labels.extend(labels.cpu().numpy())
                all_preds.extend(preds.cpu().numpy())
                
                tqdm_val.set_postfix(loss=loss.item())
        
        val_loss = running_loss / len(val_dataloader)
        val_f1 = metrics.f1_score(all_labels, all_preds, average="weighted")
        
        if isinstance(scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            scheduler.step(val_loss)
        else:
            scheduler.step()
        
        print(f"--- Epoch {epoch + 1}/{num_epochs}\n"
              f"Train F1: {train_f1:.4f}, Val F1: {val_f1:.4f}\n"
              f"Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")

In [9]:
model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=2)

for param in model.parameters():
    param.requires_grad = False

for param in model.classifier.parameters():
    param.requires_grad = True

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=12)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, num_epochs=12)

                                                                                 

--- Epoch 1/12
Train F1: 0.7224, Val F1: 0.7926
Train loss: 0.5638, Val loss: 0.4688


                                                                                 

--- Epoch 2/12
Train F1: 0.7785, Val F1: 0.7988
Train loss: 0.4913, Val loss: 0.4492


                                                                                 

--- Epoch 3/12
Train F1: 0.7806, Val F1: 0.8018
Train loss: 0.4885, Val loss: 0.4449


                                                                                 

--- Epoch 4/12
Train F1: 0.7817, Val F1: 0.7854
Train loss: 0.4798, Val loss: 0.4603


                                                                                 

--- Epoch 5/12
Train F1: 0.7842, Val F1: 0.8036
Train loss: 0.4731, Val loss: 0.4368


                                                                                 

--- Epoch 6/12
Train F1: 0.7900, Val F1: 0.8031
Train loss: 0.4674, Val loss: 0.4344


                                                                                 

--- Epoch 7/12
Train F1: 0.7927, Val F1: 0.8035
Train loss: 0.4587, Val loss: 0.4369


                                                                                 

--- Epoch 8/12
Train F1: 0.7920, Val F1: 0.8057
Train loss: 0.4630, Val loss: 0.4334


                                                                                 

--- Epoch 9/12
Train F1: 0.7923, Val F1: 0.8081
Train loss: 0.4601, Val loss: 0.4310


                                                                                  

--- Epoch 10/12
Train F1: 0.7969, Val F1: 0.8091
Train loss: 0.4509, Val loss: 0.4296


                                                                                  

--- Epoch 11/12
Train F1: 0.7971, Val F1: 0.8065
Train loss: 0.4543, Val loss: 0.4301


                                                                                  

--- Epoch 12/12
Train F1: 0.7970, Val F1: 0.8072
Train loss: 0.4509, Val loss: 0.4291




In [40]:
def test_model(model, test_dataloader, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model.to(device)
    model.eval()
    
    all_preds = []
    
    with torch.no_grad():
        for inputs in test_dataloader:
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = model(**inputs)
            preds = torch.argmax(outputs.logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
    
    return all_preds

In [41]:
predictions = test_model(model, test_dataloader)

In [15]:
del model
del optimizer
del scheduler

torch.cuda.empty_cache()

gc.collect()

8068

In [42]:
test_csv = pd.read_csv('./sample_submission.csv')
test_csv['target'] = predictions
test_csv.to_csv('sample_submission.csv', index=False)