In [87]:
from sentence_transformers import SentenceTransformer
import torch
from torch import nn
from torch import optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from datasets import load_metric
from sklearn.metrics import classification_report
import warnings
from sklearn.utils.class_weight import compute_class_weight

warnings.filterwarnings('ignore')

In [88]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cuda')

### Pliki xml, z ktorych czytamy musza miec tag root

In [89]:
PATH = 'train_2015_10_22.utf-8/STSint.input.headlines.xml'
PATH_PROCESSED = 'train_2015_10_22.utf-8/processed_xml.xml'

In [90]:
with open(PATH, 'rb') as reader:
    with open(PATH_PROCESSED, 'wb') as writer:
        text = reader.read()
        text = text.replace(b'<==>', b'---')
        text = text.replace(b'&', b'und')
        writer.write(text)

In [91]:
def clear_sentence(text):
    text = text.replace('//', '')
    text = text.strip()
    return text

In [92]:
def clear_alignment(text):
    new_text= []
    for chunk in text.split('\n'):
        chunks = chunk.split('//')

        alignment = chunks[0]
        class_1 = chunks[1].strip()
        class_2 = chunks[2].strip()

        chunks_text = chunks[3].split('---')
        chunks_text[0] = chunks_text[0].strip()
        chunks_text[1] = chunks_text[1].strip()

        alignments = alignment.split('---')
        alignment_source = alignments[0].strip()
        alignment_translation = alignments[1].strip()

        alignment_source = tuple(map(int, alignment_source.split(' ')))
        alignment_translation = tuple(map(int, alignment_translation.split(' ')))

        row = [class_1, class_2, alignment_source, alignment_translation, chunks_text]
        new_text.append(row)
    return new_text

In [93]:
def source_to_words(text):
    text.replace('\n', '')
    chunks = text.split(':')
    result = []
    for chunk in chunks:
        chunk = chunk.strip()
        chunk = chunk[1:]
        chunk = chunk.strip()
        result.append(chunk)
    return result

In [94]:
def df_from_alignment(series):
    new_df = pd.DataFrame(columns=['class_1', 'class_2', 'chunks_source', 'chunks_translation', 'text_source', 'text_translation'])
    for data in series:
        for element in data:
            row = {}
            row['class_1'] = element[0]
            row['class_2'] = element[1]
            row['chunks_source'] = element[2]
            row['chunks_translation'] = element[3]

            sentence = element[4]
            row['text_source'] = sentence[0]
            row['text_translation'] = sentence[1]
            new_df = new_df.append(row, ignore_index=True)
    return new_df

In [95]:
class MyDataset(Dataset):
    def __init__(self, chunk_source, chunk_translation, targets):
        self.chunk_source = chunk_source
        self.chunk_translation = chunk_translation
        self.pairs = [chunk_source, chunk_translation]
        self.targets = targets
        self.classes = list(np.unique(targets))

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, idx):
        return (self.pairs[0][idx], self.pairs[1][idx]), self.classes.index(self.targets[idx])

In [96]:
class MyDatasetConnected(Dataset):
    def __init__(self, chunk_source, chunk_translation, targets_1, targets_2):
        self.chunk_source = chunk_source
        self.chunk_translation = chunk_translation
        self.pairs = [chunk_source, chunk_translation]
        self.targets_1 = targets_1
        self.targets_2 = targets_2
        self.classes_1 = list(np.unique(targets_1))
        self.classes_2 = list(np.unique(targets_2))

    def __len__(self):
        return len(self.targets_1)

    def __getitem__(self, idx):
        return (self.pairs[0][idx], self.pairs[1][idx]),\
               (self.classes_1.index(self.targets_1[idx]), self.classes_2.index(self.targets_2[idx]))

In [97]:
class ClassifierSeparate(nn.Module):
    def __init__(self, df_labels, model_name='all-mpnet-base-v2'):
        super().__init__()
        self.classes = list(np.unique(df_labels))
        self.sbert = SentenceTransformer(model_name)
        self.train_sbert(True)
        self.main = nn.Sequential(
            nn.Flatten(),
            nn.Linear(in_features=768, out_features=500),
            nn.BatchNorm1d(500),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(in_features=500, out_features=100),
            nn.BatchNorm1d(100),
            nn.Dropout(0.2),
            nn.ReLU(),
            nn.Linear(in_features=100, out_features=len(self.classes)),
            nn.Softmax()
        )

    def train_sbert(self, train=True):
        for param in self.sbert.parameters():
            param.requires_grad = train

    def forward(self, ids, mask):
        x = {
            'input_ids': ids,
            'attention_mask': mask
        }
        x = self.sbert(x)['sentence_embedding']
        x = self.main(x)
        return x

In [98]:
class ClassifierConnected(nn.Module):
    def __init__(self, df_labels_1, df_labels_2, model_name='all-mpnet-base-v2'):
        super().__init__()
        self.classes_1 = list(np.unique(df_labels_1))
        self.classes_2 = list(np.unique(df_labels_2))
        self.sbert = SentenceTransformer(model_name)
        self.train_sbert(False)
        self.class_1 = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(p=0.2),
            nn.Linear(in_features=768, out_features=len(self.classes_1)),
            nn.Softmax()
        )

        self.class_2 = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(p=0.2),
            nn.Linear(in_features=768, out_features=len(self.classes_2)),
            nn.Softmax()
        )

    def train_sbert(self, train=True):
        for param in self.sbert.parameters():
            param.requires_grad = train

    def forward(self, ids, mask):
        x = {
            'input_ids': ids,
            'attention_mask': mask
        }
        x = self.sbert(x)['sentence_embedding']
        return {
            'class_1': self.class_1(x),
            'class_2': self.class_2(x)
        }

In [99]:
df = pd.read_xml(PATH_PROCESSED)
df.drop('status', axis=1, inplace=True)

df['source_split'] = df['source'].apply(source_to_words)
df['translation_split'] = df['translation'].apply(source_to_words)

split_columns = df['sentence'].str.split(pat='\n', expand=True)
df['source'] = split_columns[0]
df['translation'] = split_columns[1]
df.drop('sentence', axis=1, inplace=True)

df['source'] = df['source'].apply(clear_sentence)
df['translation'] = df['translation'].apply(clear_sentence)
df['alignment'] = df['alignment'].apply(clear_alignment)

In [100]:
new_df = df_from_alignment(df['alignment'])

In [101]:
new_df

Unnamed: 0,class_1,class_2,chunks_source,chunks_translation,text_source,text_translation
0,EQUI,5,"(8, 9)","(11, 12)",at 91,aged 91
1,SPE1_FACT,3,"(1, 2, 3, 4, 5, 6)","(1, 2, 4, 5, 6, 7, 8)",Former Nazi death camp guard Demjanjuk,John Demjanjuk convicted Nazi death camp guard
2,EQUI,5,"(7,)","(10,)",dead,dies
3,NOALI,0,"(0,)","(3,)",-not aligned-,","
4,NOALI,0,"(0,)","(9,)",-not aligned-,","
...,...,...,...,...,...,...
3974,NOALI,NIL,"(5, 6)","(0,)",gear up,-not aligned-
3975,SPE1,4,"(2,)","(2,)",question,arrest
3976,SIMI,2,"(4, 5, 6, 7, 8)","(4, 5, 6, 7, 8)",in deadly LA boardwalk crash,in deadly LA driving attack
3977,SPE2,3,"(3,)","(3,)",man,suspect


In [102]:
def change_classes(text):
    class_1_change = {
        'SPE1_FACT': 'SPE1',
        'SIMI_FACT': 'SIMI',
        'EQUI_POL': 'EQUI',
        'EQUI_FACT': 'EQUI',
        'REL_POL': 'REL',
        'SPE2_FACT': 'SPE2',
        'NOALI_FACT': 'NOALI',
        'SPE2_POL': 'SPE2'
    }
    try:
        new_class_name = class_1_change[text]
        return new_class_name
    except KeyError as e:
        return text

new_df['class_1'] = new_df['class_1'].apply(change_classes)

In [103]:
df_labels_1 = new_df.pop('class_1')
df_labels_2 = new_df.pop('class_2')

new_df.drop(columns=['chunks_source', 'chunks_translation'], inplace=True)

In [104]:
# Ktora klase chcemy uczyc? -> df_labels_1 albo df_labels_2
df_labels = df_labels_1

In [105]:
df_train, df_test, y_train, y_test = train_test_split(new_df, df_labels, test_size=0.2)

In [106]:
train_dataset = MyDataset(df_train['text_source'].values, df_train['text_translation'].values, y_train.values)
test_dataset = MyDataset(df_test['text_source'].values, df_test['text_translation'].values, y_test.values)

BATCH_SIZE = 16
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [107]:
sbert = SentenceTransformer('all-mpnet-base-v2')

def tokenize(x):
    return sbert.tokenize(x)

In [108]:
tokenize('siema')

{'input_ids': tensor([[   0, 1059,    2],
         [   0, 1049,    2],
         [   0, 1045,    2],
         [   0, 1053,    2],
         [   0, 1041,    2]]),
 'attention_mask': tensor([[1, 1, 1],
         [1, 1, 1],
         [1, 1, 1],
         [1, 1, 1],
         [1, 1, 1]])}

In [109]:
def train_model(model, optimizer, epochs, train_dataloader):
    model.train()
    for epoch in range(epochs):
        losses = []
        for batch in train_dataloader:
            data, labels = batch
            data_batch = []
            for s1, s2 in zip(data[0], data[1]):
                data_batch.append((s1, s2))
            data_batch = tokenize(data_batch)
            labels = torch.tensor(labels, device=device)

            labels = labels.to(device)
            input_ids = data_batch['input_ids'].to(device)
            attention_mask = data_batch['attention_mask'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            outputs = outputs.to(device)
            loss = criterion(outputs, labels)
            loss.backward()
            losses.append(loss.item())
            optimizer.step()
        print(f'EPOCH {[epoch]} | LOSS: {np.mean(losses)}')

In [110]:
def get_accuracy(model, dataloader):
    y_true = []
    y_pred = []

    metric= load_metric("accuracy")
    model.eval()
    for batch in dataloader:
        data, labels = batch
        data_batch = []
        for s1, s2 in zip(data[0], data[1]):
            data_batch.append((s1, s2))
        data_batch = tokenize(data_batch)

        labels = torch.tensor(labels, device=device)
        y_true.extend(labels.cpu().numpy())
        labels = labels.to(device)
        input_ids = data_batch['input_ids'].to(device)
        attention_mask = data_batch['attention_mask'].to(device)

        predictions = model(input_ids, attention_mask)
        predictions = torch.argmax(predictions, dim=1)
        y_pred.extend(predictions.cpu().numpy())
        metric.add_batch(predictions=predictions, references=labels)
    return metric.compute()['accuracy'], y_true, y_pred

In [111]:
def get_classes_weight(df_labels):
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(df_labels), y=np.array(df_labels))
    class_weights = torch.tensor(class_weights, dtype=torch.float)
    return class_weights

In [112]:
model_separate = ClassifierSeparate(df_labels=df_labels)
model_separate.to(device)

ClassifierSeparate(
  (sbert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
    (2): Normalize()
  )
  (main): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Linear(in_features=768, out_features=500, bias=True)
    (2): BatchNorm1d(500, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.2, inplace=False)
    (4): ReLU()
    (5): Linear(in_features=500, out_features=100, bias=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.2, inplace=False)
    (8): ReLU()
    (9): Linear(in_features=100, out_features=7, bias=True)
    (10): Softmax(dim=None)
  )
)

In [113]:
criterion = nn.CrossEntropyLoss()
optimizer_separate = optim.Adam(model_separate.parameters(), lr=3e-4)

In [114]:
train_model(model=model_separate, optimizer=optimizer_separate, epochs=100, train_dataloader=train_dataloader)

EPOCH [0] | LOSS: 1.7906378022390395
EPOCH [1] | LOSS: 1.7540829121766977
EPOCH [2] | LOSS: 1.7207440113901493


KeyboardInterrupt: 

In [None]:
acc, y_true, y_pred = get_accuracy(model_separate, test_dataloader)

In [None]:
print(classification_report(y_true, y_pred, target_names=list(np.unique(df_labels))))

In [None]:
df_train, df_test, y_train_1, y_test_1, y_train_2, y_test_2 = train_test_split(new_df, df_labels_1, df_labels_2, test_size=0.2)

In [None]:
train_dataset = MyDatasetConnected(df_train['text_source'].values, df_train['text_translation'].values, y_train_1.values, y_train_2.values)
test_dataset = MyDatasetConnected(df_test['text_source'].values, df_test['text_translation'].values, y_test_1.values, y_test_2.values)

BATCH_SIZE = 16
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
model_connected = ClassifierConnected(df_labels_1=df_labels_1, df_labels_2=df_labels_2)
model_connected.to(device)

criterion = nn.CrossEntropyLoss()
optimizer_connected = optim.Adam(model_connected.parameters(), lr=3e-4)

In [None]:
model_connected.train()
for epoch in range(100):
    losses = []
    for batch in train_dataloader:
        data, labels = batch
        data_batch = []
        for s1, s2 in zip(data[0], data[1]):
            data_batch.append((s1, s2))
        data_batch = tokenize(data_batch)

        label_1 = labels[0].to(device)
        label_2 = labels[1].to(device)

        input_ids = data_batch['input_ids'].to(device)
        attention_mask = data_batch['attention_mask'].to(device)

        optimizer_connected.zero_grad()
        outputs = model_connected(input_ids, attention_mask)

        output_1 = outputs['class_1']
        output_2 = outputs['class_2']
        output_1.to(device)
        output_2.to(device)

        loss_1 = criterion(output_1, label_1)
        loss_2 = criterion(output_2, label_2)
        loss = loss_1 + loss_2
        loss.backward()
        losses.append(loss.item())
        optimizer_connected.step()
    print(f'EPOCH {[epoch]} | LOSS: {np.mean(losses)}')

In [None]:
model_connected.eval()
for c in ['class_1', 'class_2']:
    metric= load_metric("accuracy")
    y_true = []
    y_pred = []
    for batch in test_dataloader:
        data, labels = batch
        data_batch = []
        for s1, s2 in zip(data[0], data[1]):
            data_batch.append((s1, s2))
        data_batch = tokenize(data_batch)

        label_1 = labels[0].to(device)
        label_2 = labels[1].to(device)

        input_ids = data_batch['input_ids'].to(device)
        attention_mask = data_batch['attention_mask'].to(device)

        if c == 'class_1':
            y_true.extend(label_1.cpu().numpy())
            label_1.to(device)
        else:
            y_true.extend(label_2.cpu().numpy())
            label_2.to(device)


        predictions = model_connected(input_ids, attention_mask)
        predictions = predictions[c]
        predictions = torch.argmax(predictions, dim=1)
        y_pred.extend(predictions.cpu().numpy())
        if c == 'class_1':
            metric.add_batch(predictions=predictions, references=label_1)
        else:
            metric.add_batch(predictions=predictions, references=label_2)
    acc = accuracy = metric.compute()['accuracy']
    print(f'CLASS {c} accuracy: {acc}')
    print(classification_report(y_true, y_pred, target_names=list(np.unique(df_labels))))