In [20]:
from all_pipeline import clean, convert2idx, model_path, char_to_index, indicies_to_labels, torch, TensorDataset, DataLoader, re, textwrap, pickle

In [8]:
def calculate_diacritic_error_rate(predictions, gold):
    """
    Calculate the diacritic error rate (DER) for a set of predictions and gold labels.

    The DER is the fraction of positions at which the predicted label differs
    from the gold label at the same position.

    :param predictions: A sequence of predicted labels.
    :param gold: A sequence of gold labels, aligned position-by-position with
        ``predictions``.
    :return: The diacritic error rate as a float between 0 and 1. Returns 0.0
        when both sequences are empty (nothing to get wrong).
    :raises ValueError: If ``predictions`` and ``gold`` differ in length.
    """
    # A length mismatch means the two sequences are misaligned; comparing only
    # the common prefix would silently report a meaningless score.
    if len(predictions) != len(gold):
        raise ValueError(
            f'predictions and gold must have the same length, '
            f'got {len(predictions)} and {len(gold)}'
        )

    total = len(predictions)
    if total == 0:
        # Avoid dividing by zero on empty input.
        return 0.0

    errors = sum(1 for predicted, expected in zip(predictions, gold) if predicted != expected)
    return errors / total

In [10]:
# Load the predicted and gold labels from the CSV files. In both files the
# label is the second comma-separated column.
def _read_label_column(csv_path):
    """
    Read the second comma-separated field of every non-blank line of a file.

    :param csv_path: Path of the CSV file to read.
    :return: A list with one label string per non-blank line, in file order.
    :raises ValueError: If a non-blank line has fewer than two fields.
    """
    labels = []
    with open(csv_path, 'r') as f:
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. trailing newlines) carry no label.
                continue
            fields = stripped.split(',')
            if len(fields) < 2:
                raise ValueError(
                    f'{csv_path}:{line_number}: expected at least two '
                    f'comma-separated columns, got {stripped!r}'
                )
            labels.append(fields[1])
    return labels


# NOTE(review): every non-blank line is read, so if the files start with a
# header row it is compared like any other row - confirm that is intended.
predictions = _read_label_column('submission.csv')
gold = _read_label_column('gold.csv')

# calculate the DER
der = calculate_diacritic_error_rate(predictions, gold)
print(f'DER: {der * 100:.3f}%')
print(f'Accuracy: {(1 - der) * 100:.3f}%')

DER: 3.825%
Accuracy: 96.175%


In [18]:
def _label_to_diacritic(label):
    """Turn one entry of the label table into the diacritic string it encodes."""
    if isinstance(label, tuple):
        # A pair of code points forms a two-character diacritic.
        return chr(label[0]) + chr(label[1])
    if label == 0:
        # Label 0 stands for "no diacritic".
        return ''
    return chr(label)


def predict_single_sentence(model, sentence='', max_len=200, char_to_index={}, indicies_to_labels={}, batch_size=256):
    """
    Predict the diacritic for every character of a piece of text.

    The text is passed through the project's ``clean`` helper, has newlines,
    carriage returns and tabs removed, has whitespace runs collapsed, is split
    on ``.`` and is then wrapped into chunks of at most ``max_len`` characters
    without cutting words. Each chunk is encoded with ``convert2idx`` and fed
    to ``model`` in batches of ``batch_size``.

    :param model: Callable mapping a batch of encoded chunks to scores; the
        predicted class is taken with ``argmax`` over dimension 2.
    :param sentence: The raw text to diacritize.
    :param max_len: Maximum chunk length, also forwarded to ``convert2idx``.
    :param char_to_index: Character-to-index mapping forwarded to
        ``convert2idx``. The default dict is only read, never mutated.
    :param indicies_to_labels: Mapping from predicted class index to a label:
        a tuple of two code points, ``0`` for "no diacritic", or a single
        code point. The default dict is only read, never mutated.
    :param batch_size: Number of chunks sent through the model at once.
    :return: A list of diacritic strings, one per character of each chunk,
        concatenated in chunk order. Characters dropped while cleaning,
        splitting on dots or wrapping have no entry. Returns ``[]`` when the
        text contains nothing to predict.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # preprocess the sentence
    cleaned = clean([sentence.strip()])[0]
    cleaned = re.sub(r'[\n\r\t]', '', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # Split on dots, then wrap each part to max_len without cutting words.
    # textwrap.wrap returns [] for an empty part, so empty parts vanish here.
    chunks = []
    for part in cleaned.split('.'):
        chunks.extend(textwrap.wrap(part.strip(), max_len))

    if not chunks:
        # Nothing left after cleaning: do not run the model on an empty batch.
        return []

    sentence_sequences = convert2idx(data=chunks, char_to_index=char_to_index, max_len=max_len, device=device)
    dataloader = DataLoader(TensorDataset(sentence_sequences), batch_size=batch_size)

    # NOTE(review): the model's train/eval mode is left untouched here; the
    # caller is presumably expected to have put it in eval mode - confirm.
    predicted_rows = []
    with torch.no_grad():  # inference only, no gradients needed
        for (batch,) in dataloader:
            predicted_rows.extend(model(batch).argmax(dim=2).tolist())

    # Keep only the predictions that line up with real characters of each
    # chunk; the remaining positions of a row are padding.
    # NOTE(review): assumes convert2idx encodes characters in order and pads
    # at the end of each row - confirm against its implementation.
    diacritics = []
    for chunk, row in zip(chunks, predicted_rows):
        for index in row[:len(chunk)]:
            diacritics.append(_label_to_diacritic(indicies_to_labels[index]))
    return diacritics

In [None]:
# Restore the trained model object from the pickle file at model_path.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load model files that come from a trusted source.
with open(model_path, "rb") as file:
    model = pickle.load(file)

# TODO: call predict_single_sentence with your own sentence