## Imports

In [None]:
# !pip install numpy==1.23.5
# !pip install -U pip setuptools wheel
# !pip install -U 'spacy[cuda-autodetect]'
# !python -m spacy download en_core_web_sm
# !python -m spacy download en_core_web_md

In [None]:
#!pip install numpy requests nlpaug
#!pip install torch>=1.6.0 transformers>=4.11.3 sentencepiece

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
import unicodedata
import re
from tqdm import tqdm
import os

import itertools
import spacy
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk import ne_chunk
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist
import nlpaug.augmenter.word as naw

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', 30)

# Modeling(Model-6,7,8 - Kfold[0~3])

## Use tokenizer

In [None]:
train = pd.read_csv('./train_last_3.csv')
test = pd.read_csv('./test_last_3.csv')

In [4]:
train = pd.read_csv('./data/train_last_3.csv')
test = pd.read_csv('./data/test_last_3.csv')

In [5]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from torch.nn import BCEWithLogitsLoss

In [6]:
model_type = "sileod/deberta-v3-base-tasksource-nli"
tokenizer = AutoTokenizer.from_pretrained(model_type)
config = AutoConfig.from_pretrained(model_type)
model_01 = AutoModelForSequenceClassification.from_pretrained(model_type, config=config)
model_02 = AutoModelForSequenceClassification.from_pretrained(model_type, config=config)
model_03 = AutoModelForSequenceClassification.from_pretrained(model_type, config=config)

Downloading tokenizer_config.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/8.26M [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/704M [00:00<?, ?B/s]

In [7]:
def tokenize_texts(texts, tokenizer, max_len=352):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_len, add_special_tokens=False , return_tensors='pt')

class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenize_texts(text, self.tokenizer, self.max_len)
        return {'input_ids': encoding['input_ids'][0], 'attention_mask': encoding['attention_mask'][0], 'label': torch.tensor(label)}

## Data_loader

In [8]:
k_folds = [1, 2, 3]
train_indices = []
val_indices = []

In [9]:
train_index = train.iloc[list(range(0, 24780, 10))]

In [10]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_val_splits = list(skf.split(train_index, train_index['first_party_winner']))

In [11]:
for k_fold in k_folds:
    train_train =  list(train_index.iloc[train_val_splits[k_fold][0],:].index)
    train_val =  list(train_index.iloc[train_val_splits[k_fold][1],:].index)

    train_list = train_train[:]
    val_list = train_val[:]

    for num in train_train:
        for i in range(1, 10):
            new_value = num + i
            train_list.append(new_value)

    for num in train_val:
        for i in range(1, 10):
            new_value = num + i
            val_list.append(new_value)

    train_list = sorted(train_list)
    val_list = sorted(val_list)

    train_indices.append(sorted(train_list))
    val_indices.append(sorted(val_list))

In [12]:
train_data_01 = train.iloc[train_indices[0]]
val_data_01 = train.iloc[val_indices[0]]

train_data_02 = train.iloc[train_indices[1]]
val_data_02 = train.iloc[val_indices[1]]

train_data_03 = train.iloc[train_indices[2]]
val_data_03 = train.iloc[val_indices[2]]

In [13]:
train_dataset_01 = NewsDataset(train_data_01['facts'].to_numpy(), train_data_01['first_party_winner'].to_numpy(), tokenizer, max_len=352)
val_dataset_01 = NewsDataset(val_data_01['facts'].to_numpy(), val_data_01['first_party_winner'].to_numpy(), tokenizer, max_len=352)

train_dataset_02 = NewsDataset(train_data_02['facts'].to_numpy(), train_data_02['first_party_winner'].to_numpy(), tokenizer, max_len=352)
val_dataset_02 = NewsDataset(val_data_02['facts'].to_numpy(), val_data_02['first_party_winner'].to_numpy(), tokenizer, max_len=352)

train_dataset_03 = NewsDataset(train_data_03['facts'].to_numpy(), train_data_03['first_party_winner'].to_numpy(), tokenizer, max_len=352)
val_dataset_03 = NewsDataset(val_data_03['facts'].to_numpy(), val_data_03['first_party_winner'].to_numpy(), tokenizer, max_len=352)

In [14]:
train_loader_01 = DataLoader(train_dataset_01, batch_size=32, shuffle=True)
val_loader_01 = DataLoader(val_dataset_01, batch_size=32, shuffle=False)

train_loader_02 = DataLoader(train_dataset_02, batch_size=32, shuffle=True)
val_loader_02 = DataLoader(val_dataset_02, batch_size=32, shuffle=False)

train_loader_03 = DataLoader(train_dataset_03, batch_size=32, shuffle=True)
val_loader_03 = DataLoader(val_dataset_03, batch_size=32, shuffle=False)

# Modeling

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_01.to(device)
model_02.to(device)
model_03.to(device)

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0): DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
 

In [16]:
class LabelSmoothingLoss(torch.nn.Module):
    def __init__(self, classes, smoothing=0.1, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [17]:
optimizer_01 = AdamW(model_01.parameters(), lr=1e-5)
optimizer_02 = AdamW(model_02.parameters(), lr=1e-5)
optimizer_03 = AdamW(model_03.parameters(), lr=1e-5)

In [18]:
# Loss function
num_classes = 2
smoothing = 0.01
criterion = LabelSmoothingLoss(classes=num_classes, smoothing=smoothing).to(device)

In [19]:
# Step 7: Training and validation
def train(model, data_loader, optimizer, device):
    model.train()
    losses = []
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits

        loss = criterion(logits, labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return np.mean(losses)




def eval(model, data_loader, device):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())
    return y_true, y_pred

## Model_06 - Kfold[1]

In [None]:
train_loader = [train_loader_01, train_loader_02, train_loader_03]
val_loader = [val_loader_01, val_loader_02, val_loader_03]
model_loader = [model_01, model_02, model_03]
optimizer_loader = [optimizer_01, optimizer_02, optimizer_03]


best_accuracy_01 = 0

for epoch in range(10):
    print(f"Epoch: {epoch+1}")
    train_loss = train(model_loader[0], train_loader[0], optimizer_loader[0], device)
    print(f"Train Loss: {train_loss}")
    y_true_train, y_pred_train = eval(model_loader[0], train_loader[0], device)
    y_true_val, y_pred_val = eval(model_loader[0], val_loader[0], device)

    train_accuracy = accuracy_score(y_true_train, y_pred_train)
    val_accuracy = accuracy_score(y_true_val, y_pred_val)

    print(f"Train Accuracy: {train_accuracy}")
    print(f"Val Accuracy: {val_accuracy}")
    print(classification_report(y_true_val, y_pred_val))

    if val_accuracy > best_accuracy_01:
        best_accuracy_01 = val_accuracy
        torch.save(model_loader[0].state_dict(), f'model_loader[0]_{k_folds[0]}.pt')

    torch.cuda.empty_cache()

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')

In [None]:
model_01.load_state_dict(torch.load(f'model_loader[0]_1.pt'))
test_dataset = NewsDataset(test['facts'].to_numpy(), np.zeros(len(test)), tokenizer, max_len=352)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
_, test_predictions = eval(model_01, test_loader, device)
submit['first_party_winner_01'] = test_predictions

In [None]:
submit_01 = submit.copy()

In [None]:
submit_01.to_csv('submit_01_version2.csv', index = False)

## Model_07 - Kfold[2]

In [None]:
train_loader = [train_loader_01, train_loader_02, train_loader_03]
val_loader = [val_loader_01, val_loader_02, val_loader_03]
model_loader = [model_01, model_02, model_03]
optimizer_loader = [optimizer_01, optimizer_02, optimizer_03]


best_accuracy_02 = 0

for epoch in range(10):
    print(f"Epoch: {epoch+1}")
    train_loss = train(model_loader[1], train_loader[1], optimizer_loader[1], device)
    print(f"Train Loss: {train_loss}")
    y_true_train, y_pred_train = eval(model_loader[1], train_loader[1], device)
    y_true_val, y_pred_val = eval(model_loader[1], val_loader[1], device)

    train_accuracy = accuracy_score(y_true_train, y_pred_train)
    val_accuracy = accuracy_score(y_true_val, y_pred_val)

    print(f"Train Accuracy: {train_accuracy}")
    print(f"Val Accuracy: {val_accuracy}")
    print(classification_report(y_true_val, y_pred_val))

    if val_accuracy > best_accuracy_02:
        best_accuracy_02 = val_accuracy
        torch.save(model_loader[1].state_dict(), f'model_loader[1]_{k_folds[1]}.pt')

    torch.cuda.empty_cache()

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')

In [None]:
model_02.load_state_dict(torch.load(f'model_loader[1]_2.pt'))
test_dataset = NewsDataset(test['facts'].to_numpy(), np.zeros(len(test)), tokenizer, max_len=352)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
_, test_predictions = eval(model_02, test_loader, device)
submit['first_party_winner_02'] = test_predictions

In [None]:
submit_02 = submit.copy()

In [None]:
submit_02.to_csv('submit_02_version2.csv', index = False)

## Model_08 - Kfold[3]

In [None]:
train_loader = [train_loader_01, train_loader_02, train_loader_03]
val_loader = [val_loader_01, val_loader_02, val_loader_03]
model_loader = [model_01, model_02, model_03]
optimizer_loader = [optimizer_01, optimizer_02, optimizer_03]


best_accuracy_03 = 0

for epoch in range(10):
    print(f"Epoch: {epoch+1}")
    train_loss = train(model_loader[2], train_loader[2], optimizer_loader[2], device)
    print(f"Train Loss: {train_loss}")
    y_true_train, y_pred_train = eval(model_loader[2], train_loader[2], device)
    y_true_val, y_pred_val = eval(model_loader[2], val_loader[2], device)

    train_accuracy = accuracy_score(y_true_train, y_pred_train)
    val_accuracy = accuracy_score(y_true_val, y_pred_val)

    print(f"Train Accuracy: {train_accuracy}")
    print(f"Val Accuracy: {val_accuracy}")
    print(classification_report(y_true_val, y_pred_val))

    if val_accuracy > best_accuracy_03:
        best_accuracy_03 = val_accuracy
        torch.save(model_loader[2].state_dict(), f'model_loader[2]_{k_folds[2]}.pt')

    torch.cuda.empty_cache()

In [None]:
best_accuracy_03

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')

In [None]:
model_03.load_state_dict(torch.load(f'model_loader[2]_3.pt'))
test_dataset = NewsDataset(test['facts'].to_numpy(), np.zeros(len(test)), tokenizer, max_len=352)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
_, test_predictions = eval(model_03, test_loader, device)
submit['first_party_winner_03'] = test_predictions

In [None]:
submit_03 = submit.copy()

In [None]:
submit_03.to_csv('submit_03_version2.csv', index = False)

# Modeling(Model-6,7,8 - Kfold[0~3])

In [None]:
train = pd.read_csv('/data/train_last_3.csv')
test = pd.read_csv('/data/test_last_3.csv')

In [None]:
model_type = "sileod/deberta-v3-base-tasksource-nli"
tokenizer = AutoTokenizer.from_pretrained(model_type)
config = AutoConfig.from_pretrained(model_type)
model_01 = AutoModelForSequenceClassification.from_pretrained(model_type, config=config)
model_02 = AutoModelForSequenceClassification.from_pretrained(model_type, config=config)
model_03 = AutoModelForSequenceClassification.from_pretrained(model_type, config=config)
model_04 = AutoModelForSequenceClassification.from_pretrained(model_type, config=config)
model_05 = AutoModelForSequenceClassification.from_pretrained(model_type, config=config)

In [None]:
def tokenize_texts(texts, tokenizer, max_len=352):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=max_len, add_special_tokens=False , return_tensors='pt')

class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = tokenize_texts(text, self.tokenizer, self.max_len)
        return {'input_ids': encoding['input_ids'][0], 'attention_mask': encoding['attention_mask'][0], 'label': torch.tensor(label)}

### Data_loader

In [None]:
k_folds = [0, 1, 2, 3, 4]
train_indices = []
val_indices = []

In [None]:
train_index = train.iloc[list(range(0, 24780, 10))]

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_val_splits = list(skf.split(train_index, train_index['first_party_winner']))

In [None]:
for k_fold in k_folds:
    train_train =  list(train_index.iloc[train_val_splits[k_fold][0],:].index)
    train_val =  list(train_index.iloc[train_val_splits[k_fold][1],:].index)

    train_list = train_train[:]
    val_list = train_val[:]

    for num in train_train:
        for i in range(1, 10):
            new_value = num + i
            train_list.append(new_value)

    for num in train_val:
        for i in range(1, 10):
            new_value = num + i
            val_list.append(new_value)

    train_list = sorted(train_list)
    val_list = sorted(val_list)

    train_indices.append(sorted(train_list))
    val_indices.append(sorted(val_list))

In [None]:
train_data_01 = train.iloc[train_indices[0]]
val_data_01 = train.iloc[val_indices[0]]

train_data_02 = train.iloc[train_indices[1]]
val_data_02 = train.iloc[val_indices[1]]

train_data_03 = train.iloc[train_indices[2]]
val_data_03 = train.iloc[val_indices[2]]

train_data_04 = train.iloc[train_indices[3]]
val_data_04 = train.iloc[val_indices[3]]

train_data_05 = train.iloc[train_indices[4]]
val_data_05 = train.iloc[val_indices[4]]

In [None]:
train_dataset_01 = NewsDataset(train_data_01['facts'].to_numpy(), train_data_01['first_party_winner'].to_numpy(), tokenizer, max_len=352)
val_dataset_01 = NewsDataset(val_data_01['facts'].to_numpy(), val_data_01['first_party_winner'].to_numpy(), tokenizer, max_len=352)

train_dataset_02 = NewsDataset(train_data_02['facts'].to_numpy(), train_data_02['first_party_winner'].to_numpy(), tokenizer, max_len=352)
val_dataset_02 = NewsDataset(val_data_02['facts'].to_numpy(), val_data_02['first_party_winner'].to_numpy(), tokenizer, max_len=352)

train_dataset_03 = NewsDataset(train_data_03['facts'].to_numpy(), train_data_03['first_party_winner'].to_numpy(), tokenizer, max_len=352)
val_dataset_03 = NewsDataset(val_data_03['facts'].to_numpy(), val_data_03['first_party_winner'].to_numpy(), tokenizer, max_len=352)

train_dataset_04 = NewsDataset(train_data_04['facts'].to_numpy(), train_data_04['first_party_winner'].to_numpy(), tokenizer, max_len=352)
val_dataset_04 = NewsDataset(val_data_04['facts'].to_numpy(), val_data_04['first_party_winner'].to_numpy(), tokenizer, max_len=352)

train_dataset_05 = NewsDataset(train_data_05['facts'].to_numpy(), train_data_05['first_party_winner'].to_numpy(), tokenizer, max_len=352)
val_dataset_05 = NewsDataset(val_data_05['facts'].to_numpy(), val_data_05['first_party_winner'].to_numpy(), tokenizer, max_len=352)

In [None]:
train_loader_01 = DataLoader(train_dataset_01, batch_size=32, shuffle=True)
val_loader_01 = DataLoader(val_dataset_01, batch_size=32, shuffle=False)

train_loader_02 = DataLoader(train_dataset_02, batch_size=32, shuffle=True)
val_loader_02 = DataLoader(val_dataset_02, batch_size=32, shuffle=False)

train_loader_03 = DataLoader(train_dataset_03, batch_size=32, shuffle=True)
val_loader_03 = DataLoader(val_dataset_03, batch_size=32, shuffle=False)

train_loader_04 = DataLoader(train_dataset_04, batch_size=32, shuffle=True)
val_loader_04 = DataLoader(val_dataset_04, batch_size=32, shuffle=False)

train_loader_05 = DataLoader(train_dataset_05, batch_size=32, shuffle=True)
val_loader_05 = DataLoader(val_dataset_05, batch_size=32, shuffle=False)

# Modeling

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_01.to(device)
model_02.to(device)
model_03.to(device)
model_04.to(device)
model_05.to(device)

In [None]:
class LabelSmoothingLoss(torch.nn.Module):
    def __init__(self, classes, smoothing=0.1, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        pred = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            true_dist = torch.zeros_like(pred)
            true_dist.fill_(self.smoothing / (self.cls - 1))
            true_dist.scatter_(1, target.data.unsqueeze(1), self.confidence)
        return torch.mean(torch.sum(-true_dist * pred, dim=self.dim))

In [None]:
optimizer_01 = AdamW(model_01.parameters(), lr=1e-5)
optimizer_02 = AdamW(model_02.parameters(), lr=1e-5)
optimizer_03 = AdamW(model_03.parameters(), lr=1e-5)
optimizer_04 = AdamW(model_04.parameters(), lr=1e-5)
optimizer_05 = AdamW(model_05.parameters(), lr=1e-5)

In [None]:
# Loss function
num_classes = 2
smoothing = 0.01
criterion = LabelSmoothingLoss(classes=num_classes, smoothing=smoothing).to(device)

In [None]:
# Step 7: Training and validation
def train(model, data_loader, optimizer, device):
    model.train()
    losses = []
    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        logits = outputs.logits

        loss = criterion(logits, labels)
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    return np.mean(losses)




def eval(model, data_loader, device):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(predictions.cpu().numpy())
    return y_true, y_pred

## Model_01 - Kfold[1]

In [None]:
train_loader = [train_loader_01, train_loader_02, train_loader_03, train_loader_04, train_loader_05]
val_loader = [val_loader_01, val_loader_02, val_loader_03, val_loader_04, val_loader_05]
model_loader = [model_01, model_02, model_03, model_04, model_05]
optimizer_loader = [optimizer_01, optimizer_02, optimizer_03, optimizer_04, optimizer_05]


best_accuracy_01 = 0

for epoch in range(10):
    print(f"Epoch: {epoch+1}")
    train_loss = train(model_loader[0], train_loader[0], optimizer_loader[0], device)
    print(f"Train Loss: {train_loss}")
    y_true_train, y_pred_train = eval(model_loader[0], train_loader[0], device)
    y_true_val, y_pred_val = eval(model_loader[0], val_loader[0], device)

    train_accuracy = accuracy_score(y_true_train, y_pred_train)
    val_accuracy = accuracy_score(y_true_val, y_pred_val)

    print(f"Train Accuracy: {train_accuracy}")
    print(f"Val Accuracy: {val_accuracy}")
    print(classification_report(y_true_val, y_pred_val))

    if val_accuracy > best_accuracy_01:
        best_accuracy_01 = val_accuracy
        torch.save(model_loader[0].state_dict(), f'model_fine_loader[0]_{k_folds[0]}.pt')

    torch.cuda.empty_cache()

In [None]:
best_accuracy_01

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')

In [None]:
model_01.load_state_dict(torch.load(f'model_fine_loader[0]_1.pt'))
test_dataset = NewsDataset(test['facts'].to_numpy(), np.zeros(len(test)), tokenizer, max_len=352)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
_, test_predictions = eval(model_01, test_loader, device)
submit['first_party_winner_01'] = test_predictions

In [None]:
submit_01 = submit.copy()

In [None]:
submit_01.to_csv('submit_01_version3.csv', index = False)

## Model_02 - Kfold[2]

In [None]:
train_loader = [train_loader_01, train_loader_02, train_loader_03, train_loader_04, train_loader_05]
val_loader = [val_loader_01, val_loader_02, val_loader_03, val_loader_04, val_loader_05]
model_loader = [model_01, model_02, model_03, model_04, model_05]
optimizer_loader = [optimizer_01, optimizer_02, optimizer_03, optimizer_04, optimizer_05]


best_accuracy_02 = 0

for epoch in range(10):
    print(f"Epoch: {epoch+1}")
    train_loss = train(model_loader[1], train_loader[1], optimizer_loader[1], device)
    print(f"Train Loss: {train_loss}")
    y_true_train, y_pred_train = eval(model_loader[1], train_loader[1], device)
    y_true_val, y_pred_val = eval(model_loader[1], val_loader[1], device)

    train_accuracy = accuracy_score(y_true_train, y_pred_train)
    val_accuracy = accuracy_score(y_true_val, y_pred_val)

    print(f"Train Accuracy: {train_accuracy}")
    print(f"Val Accuracy: {val_accuracy}")
    print(classification_report(y_true_val, y_pred_val))

    if val_accuracy > best_accuracy_02:
        best_accuracy_02 = val_accuracy
        torch.save(model_loader[1].state_dict(), f'model_fine_loader[1]_{k_folds[1]}.pt')

    torch.cuda.empty_cache()

In [None]:
best_accuracy_02

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')

In [None]:
model_02.load_state_dict(torch.load(f'model_fine_loader[1]_2.pt'))
test_dataset = NewsDataset(test['facts'].to_numpy(), np.zeros(len(test)), tokenizer, max_len=352)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
_, test_predictions = eval(model_02, test_loader, device)
submit['first_party_winner_02'] = test_predictions

In [None]:
submit_02 = submit.copy()

In [None]:
submit_02.to_csv('submit_02_version3.csv', index = False)

## Model_03 - Kfold[3]

In [20]:
train_loader = [train_loader_01, train_loader_02, train_loader_03, train_loader_04, train_loader_05]
val_loader = [val_loader_01, val_loader_02, val_loader_03, val_loader_04, val_loader_05]
model_loader = [model_01, model_02, model_03, model_04, model_05]
optimizer_loader = [optimizer_01, optimizer_02, optimizer_03, optimizer_04, optimizer_05]


best_accuracy_03 = 0

for epoch in range(10):
    print(f"Epoch: {epoch+1}")
    train_loss = train(model_loader[2], train_loader[2], optimizer_loader[2], device)
    print(f"Train Loss: {train_loss}")
    y_true_train, y_pred_train = eval(model_loader[2], train_loader[2], device)
    y_true_val, y_pred_val = eval(model_loader[2], val_loader[2], device)

    train_accuracy = accuracy_score(y_true_train, y_pred_train)
    val_accuracy = accuracy_score(y_true_val, y_pred_val)

    print(f"Train Accuracy: {train_accuracy}")
    print(f"Val Accuracy: {val_accuracy}")
    print(classification_report(y_true_val, y_pred_val))

    if val_accuracy > best_accuracy_03:
        best_accuracy_03 = val_accuracy
        torch.save(model_loader[2].state_dict(), f'model_fine_loader[2]_{k_folds[2]}.pt')

    torch.cuda.empty_cache()

Epoch: 1
Train Loss: 0.758619368364734
Train Accuracy: 0.5002522704339052
Val Accuracy: 0.4993951612903226
              precision    recall  f1-score   support

           0       0.50      1.00      0.67      2480
           1       0.33      0.00      0.00      2480

    accuracy                           0.50      4960
   macro avg       0.42      0.50      0.33      4960
weighted avg       0.42      0.50      0.33      4960

Epoch: 2
Train Loss: 0.7514975108446613
Train Accuracy: 0.5050958627648839
Val Accuracy: 0.49556451612903224
              precision    recall  f1-score   support

           0       0.50      0.97      0.66      2480
           1       0.42      0.02      0.04      2480

    accuracy                           0.50      4960
   macro avg       0.46      0.50      0.35      4960
weighted avg       0.46      0.50      0.35      4960

Epoch: 3
Train Loss: 0.6018260053569271
Train Accuracy: 0.951765893037336
Val Accuracy: 0.5058467741935484
              precision

KeyboardInterrupt: 

In [21]:
best_accuracy_03

0.5381048387096774

In [23]:
submit = pd.read_csv('./data/sample_submission.csv')

In [24]:
model_03.load_state_dict(torch.load(f'model_fine_loader[2]_2.pt'))
test_dataset = NewsDataset(test['facts'].to_numpy(), np.zeros(len(test)), tokenizer, max_len=352)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
_, test_predictions = eval(model_03, test_loader, device)
submit['first_party_winner_03'] = test_predictions

In [25]:
submit_03 = submit.copy()

In [26]:
submit_03.to_csv('submit_03_version4.csv', index = False)

## Model_04 - Kfold[4]

In [None]:
train_loader = [train_loader_01, train_loader_02, train_loader_03, train_loader_04, train_loader_05]
val_loader = [val_loader_01, val_loader_02, val_loader_03, val_loader_04, val_loader_05]
model_loader = [model_01, model_02, model_03, model_04, model_05]
optimizer_loader = [optimizer_01, optimizer_02, optimizer_03, optimizer_04, optimizer_05]


best_accuracy_04 = 0

for epoch in range(10):
    print(f"Epoch: {epoch+1}")
    train_loss = train(model_loader[3], train_loader[3], optimizer_loader[3], device)
    print(f"Train Loss: {train_loss}")
    y_true_train, y_pred_train = eval(model_loader[3], train_loader[3], device)
    y_true_val, y_pred_val = eval(model_loader[3], val_loader[3], device)

    train_accuracy = accuracy_score(y_true_train, y_pred_train)
    val_accuracy = accuracy_score(y_true_val, y_pred_val)

    print(f"Train Accuracy: {train_accuracy}")
    print(f"Val Accuracy: {val_accuracy}")
    print(classification_report(y_true_val, y_pred_val))

    if val_accuracy > best_accuracy_04:
        best_accuracy_04 = val_accuracy
        torch.save(model_loader[3].state_dict(), f'model_fine_loader[3]_{k_folds[3]}.pt')

    torch.cuda.empty_cache()

In [None]:
best_accuracy_04

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')

In [None]:
model_04.load_state_dict(torch.load(f'model_fine_loader[3]_4.pt'))
test_dataset = NewsDataset(test['facts'].to_numpy(), np.zeros(len(test)), tokenizer, max_len=352)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
_, test_predictions = eval(model_04, test_loader, device)
submit['first_party_winner_04'] = test_predictions

In [None]:
submit_04 = submit.copy()

In [None]:
submit_04.to_csv('submit_04_version3.csv', index = False)

## Model_05 - Kfold[5]

In [None]:
train_loader = [train_loader_01, train_loader_02, train_loader_03, train_loader_04, train_loader_05]
val_loader = [val_loader_01, val_loader_02, val_loader_03, val_loader_04, val_loader_05]
model_loader = [model_01, model_02, model_03, model_04, model_05]
optimizer_loader = [optimizer_01, optimizer_02, optimizer_03, optimizer_04, optimizer_05]


best_accuracy_05 = 0

for epoch in range(10):
    print(f"Epoch: {epoch+1}")
    train_loss = train(model_loader[4], train_loader[4], optimizer_loader[4], device)
    print(f"Train Loss: {train_loss}")
    y_true_train, y_pred_train = eval(model_loader[4], train_loader[4], device)
    y_true_val, y_pred_val = eval(model_loader[4], val_loader[4], device)

    train_accuracy = accuracy_score(y_true_train, y_pred_train)
    val_accuracy = accuracy_score(y_true_val, y_pred_val)

    print(f"Train Accuracy: {train_accuracy}")
    print(f"Val Accuracy: {val_accuracy}")
    print(classification_report(y_true_val, y_pred_val))

    if val_accuracy > best_accuracy_05:
        best_accuracy_05 = val_accuracy
        torch.save(model_loader[4].state_dict(), f'model_fine_loader[4]_{k_folds[4]}.pt')

    torch.cuda.empty_cache()

In [None]:
best_accuracy_05

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')

In [None]:
model_05.load_state_dict(torch.load(f'model_fine_loader[4]_4.pt'))
test_dataset = NewsDataset(test['facts'].to_numpy(), np.zeros(len(test)), tokenizer, max_len=352)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
_, test_predictions = eval(model_05, test_loader, device)
submit['first_party_winner_05'] = test_predictions

In [None]:
submit_05 = submit.copy()

In [None]:
submit_05.to_csv('submit_05_version3.csv', index = False)

## Ensemble

In [None]:
from scipy.stats import mode

In [None]:
submit_01 = pd.read_csv('./data/submit_01_version4.csv')
submit_02 = pd.read_csv('./data/submit_02_version4.csv')
submit_03 = pd.read_csv('./data/submit_03_version4.csv')
submit_04 = pd.read_csv('./data/submit_04_version4.csv')
submit_05 = pd.read_csv('./data/submit_05_version4.csv')
submit_06 = pd.read_csv('/data/submit_01_version2.csv')
submit_07 = pd.read_csv('/data/submit_02_version2.csv')
submit_08 = pd.read_csv('/data/submit_03_version2.csv')

In [None]:
submit_06.columns = ['ID','first_party_winner','first_party_winner_06']
submit_07.columns = ['ID','first_party_winner','first_party_winner_07']
submit_08.columns = ['ID','first_party_winner','first_party_winner_08']

In [None]:
submission = pd.merge(submit_01, submit_02, on=['ID', 'first_party_winner'])
submission = pd.merge(submission, submit_03, on=['ID', 'first_party_winner'])
submission = pd.merge(submission, submit_04, on=['ID', 'first_party_winner'])
submission = pd.merge(submission, submit_05, on=['ID', 'first_party_winner'])
submission = pd.merge(submission, submit_06, on=['ID', 'first_party_winner'])
submission = pd.merge(submission, submit_07, on=['ID', 'first_party_winner'])
submission = pd.merge(submission, submit_08, on=['ID', 'first_party_winner'])

In [None]:
submission['first_party_winner'] = submission[['first_party_winner_01','first_party_winner_02', 'first_party_winner_03','first_party_winner_04','first_party_winner_05','first_party_winner_06','first_party_winner_07','first_party_winner_08']].mode(axis=1)[0]

In [None]:
submission['first_party_winner'] = submission['first_party_winner'].astype(int)

In [None]:
submission

In [None]:
submission = submission.iloc[:,:2]

In [None]:
submission.to_csv('submission_minsu_version_04.csv', index = False)