In [1]:
import torch
from torch import nn
from torch import optim
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report
import warnings
from sklearn.utils.class_weight import compute_class_weight

warnings.filterwarnings('ignore')

from dataset import MyDataset, MyDatasetConnected
from models import ClassifierConnected, ClassifierSeparate
from train_eval import get_accuracy_connected, get_accuracy, train_model, train_model_connected

In [2]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

### Stałe

In [3]:
# Processed "headlines" XML files that get_df() loads for training / evaluation.
TRAIN_PATH = 'train/processed_headlines.xml'
TEST_PATH = 'test/processed_headlines.xml'
# Mini-batch size handed to every DataLoader below.
BATCH_SIZE = 64
# Number of epochs passed to the training helpers.
EPOCHS = 20
# Learning rate used for the Adam optimizers.
LR = 3e-4

### Zamiana plików uczących i testowych na pliki XML z niezbędnymi modyfikacjami

In [4]:
def process_data(PATH, PATH_PROCESSED):
    """Turn a raw ``.wa`` file into an XML document wrapped in a ``<root>`` element.

    The bytes of ``PATH`` are read, every ``<==>`` is replaced with ``---`` and
    every ``&`` with ``und`` (a raw ``<`` or ``&`` is not allowed inside XML
    character data), and the result is written to ``PATH_PROCESSED`` as
    ``<root>\\n`` + content + ``</root>``.

    The output is produced with a single binary write, so it does not depend
    on the platform's default text encoding or newline translation.

    NOTE(review): the ``&`` -> ``und`` replacement is lossy. ``prepare_test_wa_file``
    later maps every ``und`` back to ``&``, which also rewrites ordinary words
    containing "und". Changing it requires updating both functions together.

    Args:
        PATH: path of the raw input file.
        PATH_PROCESSED: path of the XML file to create (overwritten if present).
    """
    with open(PATH, 'rb') as reader:
        text = reader.read()

    text = text.replace(b'<==>', b'---')
    text = text.replace(b'&', b'und')

    with open(PATH_PROCESSED, 'wb') as writer:
        writer.write(b'<root>\n' + text + b'</root>')

In [5]:
# Convert every raw .wa file (train and test split of each dataset) to XML.
datasets = ['answers-students', 'headlines', 'images']
for i in ['train', 'test']:
    # Test inputs are named "STSint.testinput.*", training inputs "STSint.input.*".
    infix = 'testinput' if i == 'test' else 'input'
    for dataset in datasets:
        PATH = f'{i}/STSint.{infix}.{dataset}.wa'
        PATH_PROCESSED = f'{i}/processed_{dataset}.xml'
        process_data(PATH, PATH_PROCESSED)

In [6]:
def clear_sentence(text):
    """Return ``text`` without any ``//`` markers and without surrounding whitespace."""
    return text.replace('//', '').strip()

In [7]:
def clear_alignment(text):
    """Parse the alignment text of one sentence pair into structured rows.

    Every non-blank line of ``text`` is expected to have the form
    ``<source ids> --- <translation ids> // <class_1> // <class_2> // <source text> --- <translation text>``.

    Blank lines are skipped, and the ids on each side may be separated by any
    run of whitespace (an empty side yields an empty tuple).

    Args:
        text: the alignment text, one alignment per line.

    Returns:
        A list with one entry per parsed line:
        ``[class_1, class_2, source_ids, translation_ids, chunks_text]`` where
        the id fields are tuples of ints and ``chunks_text`` is the list
        obtained by splitting the text field on ``---`` (its first two
        elements stripped).

    Raises:
        IndexError: if a line lacks one of the ``//`` or ``---`` separated fields.
        ValueError: if an id is not an integer.
    """
    new_text = []

    for chunk in text.split('\n'):
        # Tolerate empty lines (e.g. a trailing newline) instead of failing on them.
        if not chunk.strip():
            continue

        chunks = chunk.split('//')

        alignment = chunks[0]
        class_1 = chunks[1].strip()
        class_2 = chunks[2].strip()

        chunks_text = chunks[3].split('---')
        chunks_text[0] = chunks_text[0].strip()
        chunks_text[1] = chunks_text[1].strip()

        alignments = alignment.split('---')
        # str.split() without arguments copes with repeated spaces between ids.
        alignment_source = tuple(int(token) for token in alignments[0].split())
        alignment_translation = tuple(int(token) for token in alignments[1].split())

        new_text.append([class_1, class_2, alignment_source, alignment_translation, chunks_text])

    return new_text

In [8]:
def source_to_words(text):
    """Split a ``:``-separated chunk listing into its tokens.

    Each ``:``-separated chunk is assumed to look like ``<index> <token>``
    (e.g. ``"1 Former : 2 Nazi : "``); the leading index is removed and the
    remaining token is returned stripped. The whole index is removed, so
    ``"10 guard"`` yields ``"guard"`` rather than ``"0 guard"``.

    A chunk that does not start with a whitespace-delimited number keeps the
    previous behaviour: its first character is dropped.

    Args:
        text: the raw listing; newlines between chunks are handled by stripping.

    Returns:
        A list with one string per ``:``-separated chunk (an empty string for
        an empty chunk, such as the one after a trailing ``:``).
    """
    result = []
    for chunk in text.split(':'):
        chunk = chunk.strip()
        parts = chunk.split(None, 1)
        if parts and parts[0].isdigit():
            # Drop the complete numeric index, however many digits it has.
            word = parts[1] if len(parts) > 1 else ''
        else:
            word = chunk[1:]
        result.append(word.strip())
    return result

In [9]:
def df_from_alignment(series):
    """Flatten parsed alignments into a DataFrame with one row per alignment.

    ``series`` is an iterable whose items are lists of rows shaped like
    ``[class_1, class_2, chunks_source, chunks_translation, [text_source, text_translation]]``
    (the structure returned by ``clear_alignment``).

    The rows are collected first and the DataFrame is built once: growing a
    frame row by row is quadratic, and ``DataFrame.append`` no longer exists
    in pandas 2.x.

    Returns:
        A DataFrame with the columns ``class_1``, ``class_2``, ``chunks_source``,
        ``chunks_translation``, ``text_source`` and ``text_translation`` and a
        fresh 0..n-1 index (empty, with the same columns, for empty input).
    """
    columns = ['class_1', 'class_2', 'chunks_source', 'chunks_translation', 'text_source', 'text_translation']
    records = [
        {
            'class_1': element[0],
            'class_2': element[1],
            'chunks_source': element[2],
            'chunks_translation': element[3],
            'text_source': element[4][0],
            'text_translation': element[4][1],
        }
        for data in series
        for element in data
    ]
    return pd.DataFrame(records, columns=columns)

In [10]:
def get_classes_weight(df_labels):
    """Return sklearn's 'balanced' class weights for ``df_labels`` as a float tensor.

    The weights are computed with ``compute_class_weight`` over the unique
    values of ``df_labels`` (in ``np.unique`` order).
    """
    labels = np.array(df_labels)
    weights = compute_class_weight(class_weight='balanced', classes=np.unique(df_labels), y=labels)
    return torch.tensor(weights, dtype=torch.float)

In [11]:
def collate_fn(batch):
    """Collate ``(data, label)`` pairs into a list of data items and a LongTensor of labels."""
    pairs = [(sample, target) for sample, target in batch]
    samples = [sample for sample, _ in pairs]
    targets = [target for _, target in pairs]
    return samples, torch.LongTensor(targets)

In [12]:
def get_df(path, labels=1):
    """Load a processed XML file and return the chunk-pair texts with one label column.

    Args:
        path: path of a processed XML file readable by ``pd.read_xml``.
        labels: ``1`` selects the ``class_1`` labels (with the ``*_FACT`` /
            ``*_POL`` variants mapped to their base class); any other value
            selects the ``class_2`` labels.

    Returns:
        ``(new_df, df_labels)`` where ``new_df`` holds the ``text_source`` and
        ``text_translation`` columns built by ``df_from_alignment`` and
        ``df_labels`` is the selected label Series.
    """
    # Fine-grained class_1 names and the base class each one is mapped to;
    # names missing from this table are kept as they are.
    base_class = {
        'SPE1_FACT': 'SPE1',
        'SIMI_FACT': 'SIMI',
        'EQUI_POL': 'EQUI',
        'EQUI_FACT': 'EQUI',
        'REL_POL': 'REL',
        'SPE2_FACT': 'SPE2',
        'NOALI_FACT': 'NOALI',
        'SPE2_POL': 'SPE2'
    }

    df = pd.read_xml(path).drop('status', axis=1)

    # NOTE(review): these two columns only live on the intermediate frame; the
    # returned frame is rebuilt from the alignment column alone.
    df['source_split'] = df['source'].apply(source_to_words)
    df['translation_split'] = df['translation'].apply(source_to_words)

    # The "sentence" field carries the source and the translation on two lines.
    sentence_parts = df['sentence'].str.split(pat='\n', expand=True)
    df['source'] = sentence_parts[0].apply(clear_sentence)
    df['translation'] = sentence_parts[1].apply(clear_sentence)
    df = df.drop('sentence', axis=1)

    df = df.dropna(how='any', axis=0)
    parsed_alignments = df['alignment'].apply(clear_alignment)

    new_df = df_from_alignment(parsed_alignments)

    new_df['class_1'] = new_df['class_1'].apply(lambda name: base_class.get(name, name))
    df_labels_1 = new_df.pop('class_1')
    df_labels_2 = new_df.pop('class_2')

    new_df = new_df.drop(columns=['chunks_source', 'chunks_translation'])

    return new_df, (df_labels_1 if labels == 1 else df_labels_2)

In [13]:
import re

# Generator over the class 1 predictions
def gen_1(preds_1):
    """Yield the items of ``preds_1`` one at a time, in order."""
    yield from preds_1

# Generator over the class 2 predictions
def gen_2(preds_2):
    """Yield the items of ``preds_2`` one at a time, in order."""
    yield from preds_2

def to_correct_class(idx, class_type):
    """Translate a predicted index into its label name.

    ``class_type == 1`` looks ``idx`` up in the class 1 names; any other value
    of ``class_type`` uses the class 2 names.
    """
    type_1 = ['EQUI', 'NOALI', 'OPPO', 'REL', 'SIMI', 'SPE1', 'SPE2']
    type_2 = ['0', '1', '2', '3', '4', '5', 'NIL']
    names = type_1 if class_type == 1 else type_2
    return names[idx]

# Rewrite the original alignment text with the predictions, converting each
# predicted index to its label name (e.g. NIL)
def modify_alignment(text, g_1, g_2):
    """Replace the two label fields of every alignment line with predictions.

    For each line of ``text`` one value is taken from ``g_2`` and then one
    from ``g_1`` (in that order), whether or not the corresponding pattern
    matches the line.

    NOTE(review): both patterns end in ``\\s\\b``, so the character following
    the last separator must be a word character; a line whose text field
    starts with e.g. ``-`` keeps its original class 2 value. Confirm that this
    is intended.
    """
    rewritten = []
    for line in text.split('\n'):
        # Class 2 field, e.g. NIL, 0, 1, ..., 5
        score_label = to_correct_class(next(g_2), 2)
        line = re.sub(r'\b\s\/\/\s(\bNIL\b)*[0-6]*\s\/\/\s\b', f' // {score_label} // ', line)
        # Class 1 field, e.g. EQUI, SPE1, SPE2, ...
        type_label = to_correct_class(next(g_1), 1)
        line = re.sub(r'\b\s\/\/\s[A-Z]+[1-2]*\s\/\/\s\b', f' // {type_label} // ', line)
        rewritten.append(line)

    return '\n'.join(rewritten)

def get_rows_with_nan(df):
    """Return the index labels of all rows of ``df`` that contain a missing value."""
    return [label for label, row in df.iterrows() if row.isnull().any()]

def prepare_test_wa_file(test_path, g_1, g_2, filename):
    """Write a ``.wa``-style result file whose labels are replaced by predictions.

    The processed XML at ``test_path`` is loaded, rows with missing values are
    dropped (their row positions are printed), and every alignment line is
    rewritten by ``modify_alignment`` with values drawn from ``g_1`` and
    ``g_2``. The XML that pandas writes to ``filename`` is then post-processed
    as plain text: the ``<root>`` wrapper and the inner ``<sentence>`` column
    tags are removed, each row becomes a ``<sentence id="..." status="">``
    element, ``---`` is turned back into ``<==>`` and ``und`` into ``&``.

    Sentence ids are regenerated by counting, assuming the input sentences are
    numbered 1..N in file order, so the row at 0-based position ``k`` had id
    ``k + 1``. Ids of dropped rows are skipped, including several in a row.

    NOTE(review): replacing every ``und`` with ``&`` also rewrites ordinary
    words containing "und". It mirrors the ``&`` -> ``und`` substitution in
    ``process_data`` and can only be fixed by changing both together.

    Args:
        test_path: processed XML file to read.
        g_1: iterator yielding the predicted class 1 indices.
        g_2: iterator yielding the predicted class 2 indices.
        filename: output path (overwritten).
    """
    df = pd.read_xml(test_path)
    df = df.drop(columns=['status', 'id'])
    rows_with_nan = get_rows_with_nan(df)
    print(rows_with_nan)
    df = df.dropna(axis=0, how='any')

    df = df.assign(alignment=df['alignment'].apply(modify_alignment, args=(g_1, g_2)))
    df.to_xml(filename, index=False, row_name='Sentence', root_name='root', xml_declaration=False, pretty_print=True)

    # DataFrame.to_xml writes UTF-8 by default, so read it back the same way.
    with open(filename, 'r', encoding='utf-8') as file:
        text = file.read()

    # Order matters: the inner <sentence> column tags must be removed before
    # the <Sentence> row tags are renamed to <sentence>.
    replacements = (
        ('<root>', ''),
        ('</root>', ''),
        ('<sentence>', ''),
        ('</sentence>', ''),
        ('<Sentence>', '<sentence>'),
        ('</Sentence>', '</sentence>'),
        ('---', '<==>'),
        ('und', '&'),
    )
    for old, new in replacements:
        text = text.replace(old, new)

    # Put every tag on a line of its own, then drop blank lines and trailing spaces.
    tags = ['root', 'sentence', 'source', 'translation', 'alignment']
    for tag in tags:
        text = text.replace(f'<{tag}>', f'\n<{tag}>\n')
        text = text.replace(f'</{tag}>', f'\n</{tag}>\n')
    text = text.strip()
    text = "\n".join([ll.rstrip() for ll in text.splitlines() if ll.strip()])

    skipped_rows = set(rows_with_nan)
    index = 1
    new_lines = []
    for line in text.split('\n'):
        if '<sentence>' in line:
            # rows_with_nan holds 0-based row positions while ids are 1-based,
            # so id `index` belongs to row `index - 1`.
            while (index - 1) in skipped_rows:
                index += 1
            line = line.replace('<sentence>', f'<sentence id="{index}" status="">')
            index += 1
        if '</sentence>' in line:
            line = line.replace('</sentence>', '</sentence>\n\n')
        new_lines.append(line)

    with open(filename, 'w', encoding='utf-8') as file:
        file.write('\n'.join(new_lines))
In [14]:
# Load the train and test chunk pairs with their class_1 labels (labels=1).
df_train, y_train = get_df(TRAIN_PATH, labels=1)
df_test, y_test = get_df(TEST_PATH, labels=1)

In [15]:
# train_path = 'train/processed_headlines.xml'
# df_train1, y_train1 = get_df(train_path, labels=1)
#
# train_path = 'train/processed_answers-students.xml'
# df_train2, y_train2 = get_df(train_path, labels=1)
#
# train_path = 'train/processed_images.xml'
# df_train3, y_train3 = get_df(train_path, labels=1)
#
# df_train, y_train = pd.concat([df_train1, df_train2, df_train3], axis=0), pd.concat([y_train1, y_train2, y_train3], axis=0)

In [16]:
# Datasets for the class_1 task; the trailing 1 is forwarded to MyDataset as its
# fourth argument (presumably the label type, see dataset.py).
train_dataset = MyDataset(
    df_train['text_source'].values,
    df_train['text_translation'].values,
    y_train.values,
    1,
)
test_dataset = MyDataset(
    df_test['text_source'].values,
    df_test['text_translation'].values,
    y_test.values,
    1,
)

# Training batches are shuffled and the last incomplete batch is dropped;
# the test set is iterated in order and in full.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [17]:
# Build the class_1 classifier and move it to the selected device.
model_separate = ClassifierSeparate(labels=1).to(device)

In [18]:
# Unweighted cross-entropy loss and an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer_separate = optim.Adam(model_separate.parameters(), lr=LR)

In [19]:
# Train the class_1 classifier for EPOCHS epochs.
train_model(
    model=model_separate,
    optimizer=optimizer_separate,
    epochs=EPOCHS,
    train_dataloader=train_dataloader,
    criterion=criterion,
    device=device,
)

EPOCH [0] | LOSS: 1.5865503088120492
EPOCH [1] | LOSS: 1.4108800215105857
EPOCH [2] | LOSS: 1.3723640980259064
EPOCH [3] | LOSS: 1.357496073169093
EPOCH [4] | LOSS: 1.3447282718073936
EPOCH [5] | LOSS: 1.3355387430037222
EPOCH [6] | LOSS: 1.326772064931931
EPOCH [7] | LOSS: 1.3203141439345576
EPOCH [8] | LOSS: 1.3199959185815626
EPOCH [9] | LOSS: 1.3110196186650185
EPOCH [10] | LOSS: 1.3091381903617614
EPOCH [11] | LOSS: 1.3002909806466871
EPOCH [12] | LOSS: 1.296544619144932
EPOCH [13] | LOSS: 1.2898690296757607
EPOCH [14] | LOSS: 1.2906667590141296
EPOCH [15] | LOSS: 1.2808637907428126
EPOCH [16] | LOSS: 1.2766930114838384
EPOCH [17] | LOSS: 1.2717360854148865
EPOCH [18] | LOSS: 1.265209403730208
EPOCH [19] | LOSS: 1.2606108900039428


In [20]:
# Evaluate on the test set; y_pred_1 is kept for writing the result file later.
acc, y_true, y_pred_1 = get_accuracy(model_separate, test_dataloader, device)

In [21]:
# NOTE(review): target_names are the sorted unique values of y_test; this
# presumably matches the integer label encoding used by MyDataset - confirm.
print(classification_report(y_true, y_pred_1, target_names=list(np.unique(y_test))))

              precision    recall  f1-score   support

        EQUI       0.79      0.95      0.86       686
       NOALI       1.00      1.00      1.00       869
        OPPO       0.00      0.00      0.00        13
         REL       0.39      0.14      0.21        99
        SIMI       0.50      0.63      0.56       158
        SPE1       0.29      0.14      0.19       107
        SPE2       0.37      0.19      0.25       108

    accuracy                           0.82      2040
   macro avg       0.48      0.44      0.44      2040
weighted avg       0.78      0.82      0.79      2040



In [22]:
# Same pipeline as for class_1, now for the class_2 labels (labels=2).
# This rebinds df_train, y_train, the datasets/dataloaders and model_separate.
df_train, y_train = get_df(TRAIN_PATH, labels=2)
df_test, y_test = get_df(TEST_PATH, labels=2)

train_dataset = MyDataset(
    df_train['text_source'].values,
    df_train['text_translation'].values,
    y_train.values,
    2,
)
test_dataset = MyDataset(
    df_test['text_source'].values,
    df_test['text_translation'].values,
    y_test.values,
    2,
)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

model_separate = ClassifierSeparate(labels=2)
model_separate.to(device)
criterion = nn.CrossEntropyLoss()
optimizer_separate = optim.Adam(model_separate.parameters(), lr=LR)

train_model(
    model=model_separate,
    optimizer=optimizer_separate,
    epochs=EPOCHS,
    train_dataloader=train_dataloader,
    criterion=criterion,
    device=device,
)

EPOCH [0] | LOSS: 1.6253865938032828
EPOCH [1] | LOSS: 1.4331472362241438
EPOCH [2] | LOSS: 1.3904298140156655
EPOCH [3] | LOSS: 1.3689069074969138
EPOCH [4] | LOSS: 1.3571552864966854
EPOCH [5] | LOSS: 1.3473274438611922
EPOCH [6] | LOSS: 1.3326045351643716
EPOCH [7] | LOSS: 1.3263819871410247
EPOCH [8] | LOSS: 1.3183186977140364
EPOCH [9] | LOSS: 1.313156739357979
EPOCH [10] | LOSS: 1.3055441571820168
EPOCH [11] | LOSS: 1.301166836292513
EPOCH [12] | LOSS: 1.2905806437615426
EPOCH [13] | LOSS: 1.2870256593150478
EPOCH [14] | LOSS: 1.287375921203244
EPOCH [15] | LOSS: 1.2850716171726104
EPOCH [16] | LOSS: 1.2823091207012054
EPOCH [17] | LOSS: 1.2823659489231725
EPOCH [18] | LOSS: 1.2788192745177978
EPOCH [19] | LOSS: 1.2735177316973287


In [23]:
# Evaluate the class_2 classifier; y_pred_2 is kept for writing the result file.
acc, y_true, y_pred_2 = get_accuracy(model_separate, test_dataloader, device)
# NOTE(review): target_names are the sorted unique values of y_test; this
# presumably matches the integer label encoding used by MyDataset - confirm.
print(classification_report(y_true, y_pred_2, target_names=list(np.unique(y_test))))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         5
           2       0.00      0.00      0.00        60
           3       0.45      0.40      0.43       149
           4       0.50      0.41      0.45       272
           5       0.79      0.93      0.85       686
         NIL       1.00      1.00      1.00       868

    accuracy                           0.82      2040
   macro avg       0.46      0.46      0.45      2040
weighted avg       0.79      0.82      0.80      2040



In [24]:
# Feed the predictions of the two separately trained models into the result
# file; the printed list holds the rows that were dropped for missing values.
g_1 = gen_1(y_pred_1)
g_2 = gen_2(y_pred_2)
prepare_test_wa_file(test_path=TEST_PATH, g_1=g_1, g_2=g_2, filename='separate_results.xml')

[]


### Uczone razem

In [25]:
# Load the texts once together with the class_1 labels ...
df_train, y_train_1 = get_df(TRAIN_PATH, labels=1)
df_test, y_test_1 = get_df(TEST_PATH, labels=1)

# ... and parse the same files a second time just to obtain the class_2 labels
# (get_df returns only one label column per call).
_, y_train_2 = get_df(TRAIN_PATH, labels=2)
_, y_test_2 = get_df(TEST_PATH, labels=2)

In [26]:
# Datasets that carry both label sets for the jointly trained model.
train_dataset = MyDatasetConnected(
    df_train['text_source'].values,
    df_train['text_translation'].values,
    y_train_1.values,
    y_train_2.values,
)
test_dataset = MyDatasetConnected(
    df_test['text_source'].values,
    df_test['text_translation'].values,
    y_test_1.values,
    y_test_2.values,
)

# Training batches are shuffled and the last incomplete batch is dropped;
# the test set is iterated in order and in full.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
)

In [27]:
# Joint classifier for both label sets, moved to the selected device.
model_connected = ClassifierConnected()
model_connected.to(device)

# Unweighted cross-entropy loss and an Adam optimizer over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer_connected = optim.Adam(model_connected.parameters(), lr=LR)

In [28]:
# Train the joint model for EPOCHS epochs.
train_model_connected(model_connected, optimizer_connected, EPOCHS, train_dataloader, criterion, device)

EPOCH [0] | LOSS: 3.8570422164855467
EPOCH [1] | LOSS: 3.7541945518985873
EPOCH [2] | LOSS: 3.619055813358676
EPOCH [3] | LOSS: 3.4816500794503
EPOCH [4] | LOSS: 3.371100894866451
EPOCH [5] | LOSS: 3.289502513024115
EPOCH [6] | LOSS: 3.226635190748399
EPOCH [7] | LOSS: 3.175743679846487
EPOCH [8] | LOSS: 3.1293385221112158
EPOCH [9] | LOSS: 3.0909000865874754
EPOCH [10] | LOSS: 3.0557829333889868
EPOCH [11] | LOSS: 3.0259551502043203
EPOCH [12] | LOSS: 3.0000993397928055
EPOCH [13] | LOSS: 2.978834206058133
EPOCH [14] | LOSS: 2.959145842059966
EPOCH [15] | LOSS: 2.9448704604179627
EPOCH [16] | LOSS: 2.9305404732304234
EPOCH [17] | LOSS: 2.9164858979563557
EPOCH [18] | LOSS: 2.9080808970236007
EPOCH [19] | LOSS: 2.8979041999386204


In [29]:
# Evaluate the joint model on the test set. Note that this rebinds y_pred_1 and
# y_pred_2, which previously held the predictions of the separate models.
acc_1, acc_2, y_true_1, y_true_2, y_pred_1, y_pred_2 = get_accuracy_connected(model_connected, test_dataloader, device)

CLASS_1 accuracy: 0.7622549019607843
CLASS_2 accuracy: 0.7607843137254902


In [30]:
# Feed the joint model's predictions into the result file; the printed list
# holds the rows that were dropped for missing values.
g_1 = gen_1(y_pred_1)
g_2 = gen_2(y_pred_2)
prepare_test_wa_file(test_path=TEST_PATH, g_1=g_1, g_2=g_2, filename='connected_results.xml')

[]
