In [1]:
from pathlib import Path
from conllu import parse
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Path().resolve() is the absolute current working directory
# (presumably the folder that contains this notebook -- confirm how the kernel is launched).
notebook_path = Path().resolve()
# The treebanks live in a "treebanks" folder that is a sibling of the working directory.
code_folder = notebook_path.parent
treebanks_folder = code_folder / "treebanks"
print(treebanks_folder)

# Fetch the NLTK resources (no-op when they are already up to date).
nltk.download('stopwords')
nltk.download('punkt')
# Norwegian stop words as a set, for constant-time membership checks.
norwegian_stop_words = set(stopwords.words("norwegian"))

C:\Users\juste\OneDrive\Desktop\University\COMP 550\Final Project\comp550-norwegian-dialects\treebanks


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juste\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\juste\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def parse_conll_file(file_path):
    """Read a CoNLL treebank file and parse it with ``conllu.parse``.

    Args:
        file_path: Path of the file to read; it is decoded as UTF-8.

    Returns:
        Whatever ``parse`` returns for the file's full text
        (one parsed sentence per entry).
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    # The text is fully in memory, so the file can be closed before parsing.
    return parse(raw_text)

In [3]:
# This cell builds the nested list all_parsed_data: the outer list has one entry per dialect (treebank file) and each inner list holds the sentences of that dialect.
# The second list, all_parsed_data_filtered, is the filtered version of the same structure.
# Each sentence is itself a list of tokens, with the sentence metadata attached at the end.
# Each token carries its associated info (lemma, upos, etc.).
all_parsed_data = []
print(treebanks_folder)

# Parse every treebank file; each file contributes one list of sentences.
# glob() yields paths in an OS-dependent order, so sort them to keep the
# order of all_parsed_data reproducible between machines and runs.
for file_path in sorted(treebanks_folder.glob("*.conll")):
    all_parsed_data.append(parse_conll_file(file_path))

# Sanity check: show the second sentence of the first treebank, both as a
# TokenList and in its serialized CoNLL form. The length guard avoids an
# IndexError when no file was found or the first file has fewer than two sentences.
if all_parsed_data and len(all_parsed_data[0]) > 1:
    first_file_parsed_data = all_parsed_data[0][1]
    serialized = first_file_parsed_data.serialize()
    print(first_file_parsed_data)
    print(serialized)

C:\Users\juste\OneDrive\Desktop\University\COMP 550\Final Project\comp550-norwegian-dialects\treebanks
TokenList<kor, gammal, var, du, aal_uio_0201, da, du, begynte, å, køyre, tømmer, ?, metadata={text: "kor gammal var du aal_uio_0201 da du begynte å køyre tømmer ?", segstart: "42.422", segstop: "45.249", file: "aal_uio_02", speaker: "khs", id: "2"}>
# text = kor gammal var du aal_uio_0201 da du begynte å køyre tømmer ?
# segstart = 42.422
# segstop = 45.249
# file = aal_uio_02
# speaker = khs
# id = 2
1	kor	kor	adv	adv	_	2	ADV	_	_
2	gammal	gammal	adj	adj	eint|m/f|ub|pos	3	SPRED	_	_
3	var	vere	verb	verb	pret	0	FINV	_	_
4	du	du	pron	pron	pers|eint|2|nom|hum	3	SUBJ	_	_
5	aal_uio_0201	aal_uio_0201	subst	subst	prop	4	APP	_	_
6	da	da	sbu	sbu	_	8	SBU	_	_
7	du	du	pron	pron	pers|eint|2|nom|hum	8	SUBJ	_	_
8	begynte	begynne	verb	verb	pret	3	ADV	_	_
9	å	å	inf-merke	inf-merke	_	8	DOBJ	_	_
10	køyre	køyre	verb	verb	inf	9	INFV	_	_
11	tømmer	tømmer	subst	subst	appell|fl|nøyt|ub	10	DOBJ	_	_
12	?	$?	clb

In [4]:
def create_bert_dataset(num_classes, parsed_data=None, test_size=0.2):
    """Build labelled train/test sentence lists for the largest dialects.

    Sentences are grouped per treebank file (named by the ``file`` metadata of
    the file's first sentence), the files known to belong to the same dialect
    are merged, sentences with three tokens or fewer are dropped, and only the
    ``num_classes`` dialects with the most remaining sentences are kept.
    Labels are assigned in decreasing order of dialect size (0 = largest).

    The split is positional, not shuffled: the first ``1 - test_size`` share
    of each dialect's sentences goes to the training set, the rest to the
    test set.

    Args:
        num_classes: Maximum number of dialects to keep.
        parsed_data: Iterable with one list of parsed sentences per treebank
            file. Each sentence must support ``len()`` and expose a
            ``metadata`` mapping with ``file`` and ``text`` entries.
            Defaults to the notebook-level ``all_parsed_data``.
        test_size: Fraction of each dialect reserved for testing.

    Returns:
        ``(train_dataset, test_dataset)``: two lists of
        ``{"label": int, "text": str}`` dicts.

    Raises:
        ValueError: If ``test_size`` is outside ``[0, 1]``.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")
    if parsed_data is None:
        parsed_data = all_parsed_data

    # Sentences need strictly more than three tokens to be kept.
    min_tokens = 4
    # Files merged into one dialect: secondary file -> primary file.
    merged_files = {
        'aal_uio_06': 'aal_uio_02',
        'austevoll_uib_04': 'austevoll_uib_01',
    }

    sentences_by_file = {}
    for sentences in parsed_data:
        if not sentences:
            # An empty treebank has no first sentence to read the name from.
            continue
        file_name = sentences[0].metadata.get('file', '')
        sentences_by_file[file_name] = list(sentences)

    # Merge only when both halves are present, so a missing file does not
    # abort the whole dataset construction with a KeyError.
    for secondary, primary in merged_files.items():
        if primary in sentences_by_file and secondary in sentences_by_file:
            sentences_by_file[primary] = sentences_by_file[primary] + sentences_by_file.pop(secondary)

    texts_by_dialect = {
        dialect: [
            sentence.metadata.get('text', '')
            for sentence in sentences
            if len(sentence) >= min_tokens
        ]
        for dialect, sentences in sentences_by_file.items()
    }

    # sorted() is stable, so dialects of equal size keep their input order.
    largest_dialects = sorted(
        texts_by_dialect.items(), key=lambda item: len(item[1]), reverse=True
    )[:num_classes]

    train_dataset = []
    test_dataset = []
    for label, (_dialect, texts) in enumerate(largest_dialects):
        train_size = int(len(texts) * (1 - test_size))
        train_dataset.extend({"label": label, "text": text} for text in texts[:train_size])
        test_dataset.extend({"label": label, "text": text} for text in texts[train_size:])

    return train_dataset, test_dataset


In [6]:
# Build train/test splits restricted to the 3, 6, 12 and 17 dialects with the most sentences.
# The numeric suffix of each variable is the number of dialects requested.
train_bert_dataset3, test_bert_dataset3 = create_bert_dataset(3)
train_bert_dataset6, test_bert_dataset6 = create_bert_dataset(6)
train_bert_dataset12, test_bert_dataset12 = create_bert_dataset(12)
train_bert_dataset17, test_bert_dataset17 = create_bert_dataset(17)

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets import load_dataset
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate
from datasets import Dataset
import torch


def tokenzie_data(train_bert_dataset, test_bert_dataset,
                  model_name="NbAiLab/notram-bert-norwegian-cased-080321",
                  max_length=50):
    """Convert labelled sentences into tokenized Hugging Face datasets.

    The function name (including its spelling) is kept because other cells
    call it.

    Args:
        train_bert_dataset: List of ``{"label": int, "text": str}`` dicts.
        test_bert_dataset: Same structure, for evaluation.
        model_name: Checkpoint whose tokenizer is loaded.
        max_length: Every sentence is padded or truncated to this many tokens.

    Returns:
        ``(tokenized_training, tokenized_testing, num_classes)``. Both
        datasets keep the ``text`` column, carry a one-hot ``label`` column
        and gain the columns produced by the tokenizer.
    """
    # Based on https://huggingface.co/docs/transformers/training
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    train_labels = [one["label"] for one in train_bert_dataset]
    test_labels = [one["label"] for one in test_bert_dataset]

    # Size the one-hot vectors from the highest label index over BOTH splits.
    # Counting distinct training labels breaks when a small dialect ends up
    # only in the test split: one_hot then rejects the out-of-range label.
    num_classes = max(train_labels + test_labels, default=-1) + 1

    def one_hot_encode(labels):
        return [
            torch.nn.functional.one_hot(torch.tensor(label), num_classes=num_classes)
            for label in labels
        ]

    dataset_training = Dataset.from_dict({
        "label": one_hot_encode(train_labels),
        "text": [one["text"] for one in train_bert_dataset],
    })
    dataset_testing = Dataset.from_dict({
        "label": one_hot_encode(test_labels),
        "text": [one["text"] for one in test_bert_dataset],
    })

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

    # batched=True hands the tokenizer whole batches instead of single rows;
    # with fixed-length padding the resulting columns are the same.
    tokenized_dataset_training = dataset_training.map(tokenize_function, batched=True)
    tokenized_dataset_testing = dataset_testing.map(tokenize_function, batched=True)
    return tokenized_dataset_training, tokenized_dataset_testing, num_classes




In [13]:
def run_bert_model(tokenized_dataset_training,tokenized_dataset_testing,num_classes,output_dir=None):
    """Fine-tune the Norwegian BERT checkpoint as a dialect classifier.

    Args:
        tokenized_dataset_training: Tokenized dataset used for training.
        tokenized_dataset_testing: Tokenized dataset evaluated after every epoch.
        num_classes: Number of output labels of the classification head.
        output_dir: Directory for the Trainer checkpoints. Defaults to
            ``test_trainer_<num_classes>`` so that runs with different class
            counts do not write into each other's checkpoint folders.

    Returns:
        The fine-tuned model.
    """
    # Based on https://huggingface.co/docs/transformers/training
    if output_dir is None:
        output_dir = f"test_trainer_{num_classes}"

    model = AutoModelForSequenceClassification.from_pretrained("NbAiLab/notram-bert-norwegian-cased-080321", num_labels=num_classes)

    training_args = TrainingArguments(
        output_dir=output_dir,
        evaluation_strategy="epoch",
    )

    metric = evaluate.load("accuracy")

    def compute_metrics(eval_pred):
        """Accuracy of the arg-max prediction against the reference class."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        labels = np.asarray(labels)
        # Accept one-hot label vectors as well as plain 1-D class ids.
        references = labels if labels.ndim == 1 else np.argmax(labels, axis=-1)
        return metric.compute(predictions=predictions, references=references)

    # NOTE(review): with one-hot label vectors the model presumably trains
    # with a multi-label (per-class sigmoid) loss rather than softmax
    # cross-entropy -- confirm that this objective is intended.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset_training,
        eval_dataset=tokenized_dataset_testing,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return model


In [14]:
# Tokenize each train/test pair; the third returned value is the number of classes
# that the matching model's classification head must have.
tokenized_dataset_training3, tokenized_dataset_testing3, num_classes3 = tokenzie_data(train_bert_dataset3, test_bert_dataset3)
tokenized_dataset_training6, tokenized_dataset_testing6, num_classes6 = tokenzie_data(train_bert_dataset6, test_bert_dataset6)
tokenized_dataset_training12, tokenized_dataset_testing12, num_classes12 = tokenzie_data(train_bert_dataset12, test_bert_dataset12)
tokenized_dataset_training17, tokenized_dataset_testing17, num_classes17 = tokenzie_data(train_bert_dataset17, test_bert_dataset17)

Map: 100%|██████████| 1665/1665 [00:00<00:00, 4070.90 examples/s]
Map: 100%|██████████| 418/418 [00:00<00:00, 3906.56 examples/s]
Map: 100%|██████████| 2595/2595 [00:00<00:00, 3782.50 examples/s]
Map: 100%|██████████| 653/653 [00:00<00:00, 3934.88 examples/s]
Map: 100%|██████████| 3999/3999 [00:01<00:00, 3779.76 examples/s]
Map: 100%|██████████| 1006/1006 [00:00<00:00, 3698.53 examples/s]
Map: 100%|██████████| 4748/4748 [00:01<00:00, 3465.20 examples/s]
Map: 100%|██████████| 1196/1196 [00:00<00:00, 3908.48 examples/s]


In [15]:
# Recompute the project folder (same expressions as in the first cell) and
# point bertmodel_folder at the save location of the 3-class model.
notebook_path = Path().resolve()
code_folder = notebook_path.parent
bertmodel_folder = code_folder / "bertmodels3"

In [16]:
# Fine-tune on the 3-dialect split and save the resulting model.
bert_model3 = run_bert_model(tokenized_dataset_training3, tokenized_dataset_testing3, num_classes3)
# Bind the destination in this cell, like the 6/12/17-class cells do. Those
# cells rebind bertmodel_folder, so relying on its earlier value would save
# this model into another model's folder if the cell is re-run later.
bertmodel_folder = code_folder / "bertmodels3"
bert_model3.save_pretrained(bertmodel_folder)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/notram-bert-norwegian-cased-080321 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                 
 33%|███▎      | 209/627 [09:15<14:33,  2.09s/it]

{'eval_loss': 0.4014187455177307, 'eval_accuracy': 0.7870813397129187, 'eval_runtime': 20.3091, 'eval_samples_per_second': 20.582, 'eval_steps_per_second': 2.61, 'epoch': 1.0}


                                                 
 67%|██████▋   | 418/627 [19:17<08:52,  2.55s/it]

{'eval_loss': 0.3799878656864166, 'eval_accuracy': 0.8038277511961722, 'eval_runtime': 23.8115, 'eval_samples_per_second': 17.555, 'eval_steps_per_second': 2.226, 'epoch': 2.0}


 80%|███████▉  | 500/627 [23:26<06:26,  3.05s/it]

{'loss': 0.2507, 'learning_rate': 1.0127591706539077e-05, 'epoch': 2.39}


                                                 
100%|██████████| 627/627 [30:34<00:00,  2.93s/it]


{'eval_loss': 0.4020707309246063, 'eval_accuracy': 0.8253588516746412, 'eval_runtime': 25.2609, 'eval_samples_per_second': 16.547, 'eval_steps_per_second': 2.098, 'epoch': 3.0}
{'train_runtime': 1834.7207, 'train_samples_per_second': 2.722, 'train_steps_per_second': 0.342, 'train_loss': 0.21306065128940904, 'epoch': 3.0}


In [17]:
# Fine-tune on the 6-dialect split, then save the model under <code_folder>/bertmodels6.
# Note: this rebinds the notebook-level bertmodel_folder variable.
bert_model6 = run_bert_model(tokenized_dataset_training6, tokenized_dataset_testing6, num_classes6)
bertmodel_folder = code_folder / "bertmodels6"
bert_model6.save_pretrained(bertmodel_folder)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/notram-bert-norwegian-cased-080321 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                 
 33%|███▎      | 325/975 [16:06<28:21,  2.62s/it]

{'eval_loss': 0.32797539234161377, 'eval_accuracy': 0.5283307810107197, 'eval_runtime': 37.439, 'eval_samples_per_second': 17.442, 'eval_steps_per_second': 2.19, 'epoch': 1.0}


 51%|█████▏    | 500/975 [25:38<25:36,  3.23s/it]  Checkpoint destination directory test_trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2873, 'learning_rate': 2.435897435897436e-05, 'epoch': 1.54}


                                                 
 67%|██████▋   | 650/975 [34:09<14:23,  2.66s/it]

{'eval_loss': 0.2832270562648773, 'eval_accuracy': 0.6339969372128637, 'eval_runtime': 37.9039, 'eval_samples_per_second': 17.228, 'eval_steps_per_second': 2.163, 'epoch': 2.0}


                                                   
100%|██████████| 975/975 [49:46<00:00,  3.06s/it]


{'eval_loss': 0.3057621717453003, 'eval_accuracy': 0.6600306278713629, 'eval_runtime': 34.176, 'eval_samples_per_second': 19.107, 'eval_steps_per_second': 2.399, 'epoch': 3.0}
{'train_runtime': 2986.2839, 'train_samples_per_second': 2.607, 'train_steps_per_second': 0.326, 'train_loss': 0.2051008801582532, 'epoch': 3.0}


In [18]:
# Fine-tune on the 12-dialect split, then save the model under <code_folder>/bertmodels12.
# Note: this rebinds the notebook-level bertmodel_folder variable.
bert_model12 = run_bert_model(tokenized_dataset_training12, tokenized_dataset_testing12, num_classes12)
bertmodel_folder = code_folder / "bertmodels12"
bert_model12.save_pretrained(bertmodel_folder)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/notram-bert-norwegian-cased-080321 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 33%|███▎      | 500/1500 [22:25<46:08,  2.77s/it]  Checkpoint destination directory test_trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2441, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}



 33%|███▎      | 500/1500 [23:25<46:08,  2.77s/it]

{'eval_loss': 0.22281526029109955, 'eval_accuracy': 0.3996023856858847, 'eval_runtime': 56.7044, 'eval_samples_per_second': 17.741, 'eval_steps_per_second': 2.222, 'epoch': 1.0}


 67%|██████▋   | 1000/1500 [46:57<22:41,  2.72s/it] 

{'loss': 0.1517, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}



 67%|██████▋   | 1000/1500 [47:56<22:41,  2.72s/it]

{'eval_loss': 0.2079782783985138, 'eval_accuracy': 0.48906560636182905, 'eval_runtime': 56.6574, 'eval_samples_per_second': 17.756, 'eval_steps_per_second': 2.224, 'epoch': 2.0}


100%|██████████| 1500/1500 [1:12:37<00:00,  2.90s/it]

{'loss': 0.0878, 'learning_rate': 0.0, 'epoch': 3.0}



100%|██████████| 1500/1500 [1:13:40<00:00,  2.95s/it]


{'eval_loss': 0.20189645886421204, 'eval_accuracy': 0.5218687872763419, 'eval_runtime': 59.3681, 'eval_samples_per_second': 16.945, 'eval_steps_per_second': 2.122, 'epoch': 3.0}
{'train_runtime': 4420.2015, 'train_samples_per_second': 2.714, 'train_steps_per_second': 0.339, 'train_loss': 0.16118491363525392, 'epoch': 3.0}


In [19]:
# Fine-tune on the 17-dialect split, then save the model under <code_folder>/bertmodels17.
# Note: this rebinds the notebook-level bertmodel_folder variable.
bert_model17 = run_bert_model(tokenized_dataset_training17, tokenized_dataset_testing17, num_classes17)
bertmodel_folder = code_folder / "bertmodels17"
bert_model17.save_pretrained(bertmodel_folder)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NbAiLab/notram-bert-norwegian-cased-080321 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 28%|██▊       | 500/1782 [24:05<1:11:25,  3.34s/it]Checkpoint destination directory test_trainer\checkpoint-500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.2057, 'learning_rate': 3.5970819304152635e-05, 'epoch': 0.84}


 33%|███▎      | 594/1782 [28:48<48:36,  2.45s/it]  
 33%|███▎      | 594/1782 [29:56<48:36,  2.45s/it]

{'eval_loss': 0.18718963861465454, 'eval_accuracy': 0.31187290969899667, 'eval_runtime': 68.2142, 'eval_samples_per_second': 17.533, 'eval_steps_per_second': 2.199, 'epoch': 1.0}


 56%|█████▌    | 1000/1782 [48:54<39:11,  3.01s/it] Checkpoint destination directory test_trainer\checkpoint-1000 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.1453, 'learning_rate': 2.1941638608305277e-05, 'epoch': 1.68}


 67%|██████▋   | 1188/1782 [57:40<24:42,  2.50s/it]
 67%|██████▋   | 1188/1782 [58:43<24:42,  2.50s/it]

{'eval_loss': 0.16739873588085175, 'eval_accuracy': 0.40217391304347827, 'eval_runtime': 63.233, 'eval_samples_per_second': 18.914, 'eval_steps_per_second': 2.372, 'epoch': 2.0}


 84%|████████▍ | 1500/1782 [1:10:53<12:23,  2.63s/it]Checkpoint destination directory test_trainer\checkpoint-1500 already exists and is non-empty.Saving will proceed but saved results may be invalid.


{'loss': 0.1064, 'learning_rate': 7.912457912457913e-06, 'epoch': 2.53}


100%|██████████| 1782/1782 [1:22:37<00:00,  2.05s/it]
100%|██████████| 1782/1782 [1:23:31<00:00,  2.81s/it]


{'eval_loss': 0.17034120857715607, 'eval_accuracy': 0.41387959866220736, 'eval_runtime': 54.4092, 'eval_samples_per_second': 21.982, 'eval_steps_per_second': 2.757, 'epoch': 3.0}
{'train_runtime': 5011.6074, 'train_samples_per_second': 2.842, 'train_steps_per_second': 0.356, 'train_loss': 0.14168905649923716, 'epoch': 3.0}
