In [1]:
from pathlib import Path
from conllu import parse
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

notebook_path = Path().resolve()
code_folder = notebook_path.parent
treebanks_folder = code_folder / "treebanks"
print(treebanks_folder)

nltk.download('stopwords')
nltk.download('punkt')
norwegian_stop_words = set(stopwords.words("norwegian"))

C:\Users\juste\OneDrive\Desktop\University\COMP 550\Final Project\comp550-norwegian-dialects\treebanks


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juste\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\juste\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def parse_conll_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        conll_data = file.read()
        return parse(conll_data)

In [3]:
#In this function we will have a nested list all_parsed_data (nested list of sentences where the outter list is a specific dialect while the inner list is the sentences per dialect)
#The second list all_parsed_data_filtered is the filtered version of the same thing
#Each element in the inner list is a token and there is meta data at the end
#Each token has associated info lemma,upos,etc

all_parsed_data = [] 
print(treebanks_folder)

for file_path in treebanks_folder.glob("*.conll"):
    parsed_data = parse_conll_file(file_path)
    all_parsed_data.append(parsed_data)

if all_parsed_data:
    first_file_parsed_data = all_parsed_data[0][1]
    serialized = first_file_parsed_data.serialize()
    print(first_file_parsed_data)
    print(serialized)
    token = first_file_parsed_data[0]
    # print(token['id'])
    # print(token['form'])
    # print(token['lemma'])
    # print(token['upos'])
else:
    all_parsed_data = []

C:\Users\juste\OneDrive\Desktop\University\COMP 550\Final Project\comp550-norwegian-dialects\treebanks
TokenList<kor, gammal, var, du, aal_uio_0201, da, du, begynte, å, køyre, tømmer, ?, metadata={text: "kor gammal var du aal_uio_0201 da du begynte å køyre tømmer ?", segstart: "42.422", segstop: "45.249", file: "aal_uio_02", speaker: "khs", id: "2"}>
# text = kor gammal var du aal_uio_0201 da du begynte å køyre tømmer ?
# segstart = 42.422
# segstop = 45.249
# file = aal_uio_02
# speaker = khs
# id = 2
1	kor	kor	adv	adv	_	2	ADV	_	_
2	gammal	gammal	adj	adj	eint|m/f|ub|pos	3	SPRED	_	_
3	var	vere	verb	verb	pret	0	FINV	_	_
4	du	du	pron	pron	pers|eint|2|nom|hum	3	SUBJ	_	_
5	aal_uio_0201	aal_uio_0201	subst	subst	prop	4	APP	_	_
6	da	da	sbu	sbu	_	8	SBU	_	_
7	du	du	pron	pron	pers|eint|2|nom|hum	8	SUBJ	_	_
8	begynte	begynne	verb	verb	pret	3	ADV	_	_
9	å	å	inf-merke	inf-merke	_	8	DOBJ	_	_
10	køyre	køyre	verb	verb	inf	9	INFV	_	_
11	tømmer	tømmer	subst	subst	appell|fl|nøyt|ub	10	DOBJ	_	_
12	?	$?	clb

In [4]:
def create_roberta_dataset(num_classes):
    import pandas as pd
    better_data ={}

    for dialect in all_parsed_data:

        dialect_name = dialect[0].metadata.get('file', '')
        better_data[dialect_name] = dialect
    
    ls_aal = better_data['aal_uio_02'] + better_data['aal_uio_06']
    ls_austevoll = better_data['austevoll_uib_01'] + better_data['austevoll_uib_04']

    better_data['aal_uio_02'] = ls_aal
    better_data['austevoll_uib_01'] = ls_austevoll

    del better_data['aal_uio_06']
    del better_data['austevoll_uib_04']

    temp_data = {}
    for dialect, token_list in better_data.items():
        temp_ls = []
        for token in token_list:
            sentence = token.metadata.get('text', '')
            if len(token) > 3:
                temp_ls.append(sentence)
        temp_data[dialect] = temp_ls


    len_each_dialect = {}
    for dialect, token_list in temp_data.items():
        len_each_dialect[dialect] = len(token_list)
    
    sorted_len_each_dialect = sorted(len_each_dialect.items(), key=lambda x: x[1], reverse=True)[:num_classes]

    filtered_dict = {key[0]: temp_data[key[0]] for key in sorted_len_each_dialect}
    
    #Aaustevoll, Bardu, Brandbu, Eidsberg, Fana, Lista, Flakstad, Førde, Giske, Gol, Hemsedal, Herad, Hjartdal, Høyanger, Nordli, Vardø and Ål
    map_names = {
        'bardu_uit_01': 'Bardu',
        'brandbu_uio_01_ny': 'Brandbu',
        'eidsberg_uio_03': 'Eidsberg',
        'fana_uib_03': 'Fana',
        'farsund_uib_02': 'Farsund',
        'flakstad_uib_04': 'Flakstad', 
        'foerde_uib_05': 'Foerde',
        'giske_uib_02': 'Giske',
        'gol_uio_01': 'Gol',
        'hemsedal_uio_01': 'Hemsedal',
        'herad_uio_01': 'Herad',
        'hjartdal_uio_01': 'Hjartdal',
        'hoeyanger_uib_02': 'Hoeyanger',
        'lierne_uio_01': 'Lierne', 
        'vardoe_uio_01': 'Vardoe',
        'aal_uio_02': 'Al', 
        'austevoll_uib_01': 'Austevoll',
    }
    dialect_to_index = {dialect: idx for idx, dialect in enumerate(filtered_dict.keys())}
    list_of_names_add_to_prompt = ", ".join([map_names[key] for key in filtered_dict.keys()])
    # print(list_of_names_add_to_prompt)
    prompt = f"You are tasked with being a Norwegian Dialect classifier. The goal is to train a model that can accurately distinguish between different Norwegian dialects. The primary dialects of interest are {list_of_names_add_to_prompt} and you should be able to distinguish between these dialects.\n Here is the sentence: "
    
    test_size = 0.2

    train_dataset = []
    test_dataset = []

    for dialect, sentence_list in filtered_dict.items():
        label = dialect_to_index[dialect]
        train_size = int(len(sentence_list) * (1 - test_size))
        train_dataset.extend([{"label": label, "text": sentence} for sentence in sentence_list[:train_size]])
        test_dataset.extend([{"label": label, "text": sentence} for sentence in sentence_list[train_size:]])

    return train_dataset, test_dataset

In [5]:
train_roberta_dataset3, test_roberta_dataset3 = create_roberta_dataset(3)
train_roberta_dataset6, test_roberta_dataset6 = create_roberta_dataset(6)
train_roberta_dataset12, test_roberta_dataset12 = create_roberta_dataset(12)
train_roberta_dataset17, test_roberta_dataset17 = create_roberta_dataset(17)

In [6]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
from datasets import Dataset
import torch

def tokenzie_data(train_roberta_dataset,test_roberta_dataset):

    tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/norwegian-roberta-base")

    data_training = {
        "label": [one["label"] for one in train_roberta_dataset],
        "text": [one["text"] for one in train_roberta_dataset],
    }
    data_testing = {
        "label": [one["label"] for one in test_roberta_dataset],
        "text": [one["text"] for one in test_roberta_dataset],
    }

    num_classes = len(set(data_training["label"]))
    data_training["label"] = [torch.nn.functional.one_hot(torch.tensor(label), num_classes=num_classes) for label in data_training["label"]]
    data_testing["label"] = [torch.nn.functional.one_hot(torch.tensor(label), num_classes=num_classes) for label in data_testing["label"]]

    dataset_training = Dataset.from_dict(data_training)
    dataset_testing = Dataset.from_dict(data_testing)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=50)

    tokenized_dataset_training = dataset_training.map(tokenize_function)
    tokenized_dataset_testing = dataset_testing.map(tokenize_function)
    return tokenized_dataset_training, tokenized_dataset_testing, num_classes




In [9]:
def run_roberta_model(tokenized_dataset_training,tokenized_dataset_testing,num_classes):

    model = AutoModelForSequenceClassification.from_pretrained("patrickvonplaten/norwegian-roberta-base", num_labels=num_classes)

    training_args = TrainingArguments(
        output_dir="test_trainer",
        evaluation_strategy="epoch",      
    )

    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        references = np.argmax(labels, axis=-1)
        return metric.compute(predictions=predictions, references=references)


    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset_training,
        eval_dataset=tokenized_dataset_testing,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return model

In [10]:
tokenized_dataset_training3, tokenized_dataset_testing3, num_classes3 = tokenzie_data(train_roberta_dataset3, test_roberta_dataset3)
tokenized_dataset_training6, tokenized_dataset_testing6, num_classes6 = tokenzie_data(train_roberta_dataset6, test_roberta_dataset6)
tokenized_dataset_training12, tokenized_dataset_testing12, num_classes12 = tokenzie_data(train_roberta_dataset12, test_roberta_dataset12)
tokenized_dataset_training17, tokenized_dataset_testing17, num_classes17 = tokenzie_data(train_roberta_dataset17, test_roberta_dataset17)

Map: 100%|██████████| 1665/1665 [00:00<00:00, 2861.01 examples/s]
Map: 100%|██████████| 418/418 [00:00<00:00, 2548.78 examples/s]
Map: 100%|██████████| 2595/2595 [00:00<00:00, 3409.98 examples/s]
Map: 100%|██████████| 653/653 [00:00<00:00, 2940.95 examples/s]
Map: 100%|██████████| 3999/3999 [00:01<00:00, 3570.54 examples/s]
Map: 100%|██████████| 1006/1006 [00:00<00:00, 3398.62 examples/s]
Map: 100%|██████████| 4748/4748 [00:01<00:00, 3486.05 examples/s]
Map: 100%|██████████| 1196/1196 [00:00<00:00, 3340.78 examples/s]


In [11]:
notebook_path = Path().resolve()
code_folder = notebook_path.parent
robertamodel_folder = code_folder / "robertamodels3"

In [12]:
roberta_model3 = run_roberta_model(tokenized_dataset_training3, tokenized_dataset_testing3, num_classes3)
roberta_model3.save_pretrained(robertamodel_folder)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at patrickvonplaten/norwegian-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                 
 33%|███▎      | 209/627 [08:10<13:22,  1.92s/it]

{'eval_loss': 0.38422995805740356, 'eval_accuracy': 0.7177033492822966, 'eval_runtime': 22.184, 'eval_samples_per_second': 18.842, 'eval_steps_per_second': 2.389, 'epoch': 1.0}


                                                   
 67%|██████▋   | 418/627 [16:08<06:41,  1.92s/it]

{'eval_loss': 0.3108651340007782, 'eval_accuracy': 0.8301435406698564, 'eval_runtime': 22.1133, 'eval_samples_per_second': 18.903, 'eval_steps_per_second': 2.397, 'epoch': 2.0}


 80%|███████▉  | 500/627 [19:13<04:45,  2.25s/it]

{'loss': 0.39, 'learning_rate': 1.0127591706539077e-05, 'epoch': 2.39}


                                                 
100%|██████████| 627/627 [23:55<00:00,  2.29s/it]


{'eval_loss': 0.3572394847869873, 'eval_accuracy': 0.8181818181818182, 'eval_runtime': 19.606, 'eval_samples_per_second': 21.32, 'eval_steps_per_second': 2.703, 'epoch': 3.0}
{'train_runtime': 1435.3481, 'train_samples_per_second': 3.48, 'train_steps_per_second': 0.437, 'train_loss': 0.34199358439711675, 'epoch': 3.0}


In [13]:
roberta_model6 = run_roberta_model(tokenized_dataset_training6, tokenized_dataset_testing6, num_classes6)
robertamodel_folder = code_folder / "robertamodels6"
roberta_model6.save_pretrained(robertamodel_folder)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at patrickvonplaten/norwegian-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 33%|███▎      | 325/975 [12:11<21:58,  2.03s/it]
 33%|███▎      | 325/975 [12:46<21:58,  2.03s/it]

{'eval_loss': 0.3586987555027008, 'eval_accuracy': 0.5022970903522205, 'eval_runtime': 34.1217, 'eval_samples_per_second': 19.137, 'eval_steps_per_second': 2.403, 'epoch': 1.0}


 51%|█████▏    | 500/975 [18:37<16:38,  2.10s/it]  Checkpoint destination directory test_trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.3591, 'learning_rate': 2.435897435897436e-05, 'epoch': 1.54}


 67%|██████▋   | 650/975 [23:50<10:57,  2.02s/it]
 67%|██████▋   | 650/975 [24:26<10:57,  2.02s/it]

{'eval_loss': 0.3284062445163727, 'eval_accuracy': 0.552833078101072, 'eval_runtime': 35.3552, 'eval_samples_per_second': 18.47, 'eval_steps_per_second': 2.319, 'epoch': 2.0}


100%|██████████| 975/975 [36:00<00:00,  1.72s/it]  
100%|██████████| 975/975 [36:32<00:00,  2.25s/it]


{'eval_loss': 0.33547210693359375, 'eval_accuracy': 0.5666156202143952, 'eval_runtime': 31.4267, 'eval_samples_per_second': 20.779, 'eval_steps_per_second': 2.609, 'epoch': 3.0}
{'train_runtime': 2192.2583, 'train_samples_per_second': 3.551, 'train_steps_per_second': 0.445, 'train_loss': 0.2885448044996995, 'epoch': 3.0}


In [14]:
roberta_model12 = run_roberta_model(tokenized_dataset_training12, tokenized_dataset_testing12, num_classes12)
robertamodel_folder = code_folder / "robertamodels12"
roberta_model12.save_pretrained(robertamodel_folder)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at patrickvonplaten/norwegian-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 33%|███▎      | 500/1500 [21:26<47:26,  2.85s/it]  Checkpoint destination directory test_trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2761, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}



 33%|███▎      | 500/1500 [22:35<47:26,  2.85s/it]

{'eval_loss': 0.2622585594654083, 'eval_accuracy': 0.2286282306163022, 'eval_runtime': 67.1149, 'eval_samples_per_second': 14.989, 'eval_steps_per_second': 1.877, 'epoch': 1.0}


 67%|██████▋   | 1000/1500 [40:56<18:16,  2.19s/it] 

{'loss': 0.2281, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}



 67%|██████▋   | 1000/1500 [41:53<18:16,  2.19s/it]

{'eval_loss': 0.2428758144378662, 'eval_accuracy': 0.28429423459244535, 'eval_runtime': 55.7947, 'eval_samples_per_second': 18.03, 'eval_steps_per_second': 2.258, 'epoch': 2.0}


100%|██████████| 1500/1500 [1:00:24<00:00,  1.84s/it]

{'loss': 0.19, 'learning_rate': 0.0, 'epoch': 3.0}



100%|██████████| 1500/1500 [1:01:12<00:00,  2.45s/it]


{'eval_loss': 0.23466500639915466, 'eval_accuracy': 0.3628230616302187, 'eval_runtime': 46.7623, 'eval_samples_per_second': 21.513, 'eval_steps_per_second': 2.694, 'epoch': 3.0}
{'train_runtime': 3672.8404, 'train_samples_per_second': 3.266, 'train_steps_per_second': 0.408, 'train_loss': 0.23140023803710938, 'epoch': 3.0}


In [15]:
roberta_model17 = run_roberta_model(tokenized_dataset_training17, tokenized_dataset_testing17, num_classes17)
robertamodel_folder = code_folder / "robertamodels17"
roberta_model17.save_pretrained(robertamodel_folder)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at patrickvonplaten/norwegian-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 28%|██▊       | 500/1782 [20:49<51:46,  2.42s/it]  Checkpoint destination directory test_trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2261, 'learning_rate': 3.5970819304152635e-05, 'epoch': 0.84}


                                                    
 33%|███▎      | 594/1782 [26:09<46:24,  2.34s/it]

{'eval_loss': 0.21544848382472992, 'eval_accuracy': 0.1714046822742475, 'eval_runtime': 72.5344, 'eval_samples_per_second': 16.489, 'eval_steps_per_second': 2.068, 'epoch': 1.0}


 56%|█████▌    | 1000/1782 [43:30<32:40,  2.51s/it] Checkpoint destination directory test_trainer\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2076, 'learning_rate': 2.1941638608305277e-05, 'epoch': 1.68}


                                                   
 67%|██████▋   | 1188/1782 [52:42<23:34,  2.38s/it]

{'eval_loss': 0.19714465737342834, 'eval_accuracy': 0.24665551839464883, 'eval_runtime': 70.0712, 'eval_samples_per_second': 17.068, 'eval_steps_per_second': 2.141, 'epoch': 2.0}


 84%|████████▍ | 1500/1782 [1:05:49<11:31,  2.45s/it]Checkpoint destination directory test_trainer\checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.1846, 'learning_rate': 7.912457912457913e-06, 'epoch': 2.53}


                                                     
100%|██████████| 1782/1782 [1:18:26<00:00,  2.64s/it]


{'eval_loss': 0.19099938869476318, 'eval_accuracy': 0.29347826086956524, 'eval_runtime': 67.5395, 'eval_samples_per_second': 17.708, 'eval_steps_per_second': 2.221, 'epoch': 3.0}
{'train_runtime': 4706.2931, 'train_samples_per_second': 3.027, 'train_steps_per_second': 0.379, 'train_loss': 0.2000620121640121, 'epoch': 3.0}
