In [4]:
from pathlib import Path
from conllu import parse
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import random
import openai
from openai import OpenAI

notebook_path = Path().resolve()
code_folder = notebook_path.parent
treebanks_folder = code_folder / "treebanks"
print(treebanks_folder)

nltk.download('stopwords')
nltk.download('punkt')
norwegian_stop_words = set(stopwords.words("norwegian"))

/Users/liamo/Desktop/School/550/FinalProj/comp550-norwegian-dialects/treebanks


[nltk_data] Downloading package stopwords to /Users/liamo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/liamo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def parse_conll_file(file_path):
    with open(file_path, "r", encoding="utf-8") as file:
        conll_data = file.read()
        return parse(conll_data)

In [6]:
#In this function we will have a nested list all_parsed_data (nested list of sentences where the outter list is a specific dialect while the inner list is the sentences per dialect)
#The second list all_parsed_data_filtered is the filtered version of the same thing
#Each element in the inner list is a token and there is meta data at the end
#Each token has associated info lemma,upos,etc
all_parsed_data = [] 
print(treebanks_folder)

for file_path in treebanks_folder.glob("*.conll"):
    parsed_data = parse_conll_file(file_path)
    all_parsed_data.append(parsed_data)


if all_parsed_data:
    first_file_parsed_data = all_parsed_data[0][1]
    serialized = first_file_parsed_data.serialize()
    print(first_file_parsed_data)
    print(serialized)
    token = first_file_parsed_data[0]
    # print(token['id'])
    # print(token['form'])
    # print(token['lemma'])
    # print(token['upos'])
else:
    all_parsed_data = []

/Users/liamo/Desktop/School/550/FinalProj/comp550-norwegian-dialects/treebanks
TokenList<frå, #, Førde, ?, metadata={text: "frå # Førde ?", segstart: "10.891", segstop: "11.983", file: "foerde_uib_05", speaker: "int1", id: "2"}>
# text = frå # Førde ?
# segstart = 10.891
# segstop = 11.983
# file = foerde_uib_05
# speaker = int1
# id = 2
1	frå	frå	prep	prep	_	0	FRAG	_	_
2	#	#	pause	pause	_	3	IK	_	_
3	Førde	Førde	subst	subst	prop	1	PUTFYLL	_	_
4	?	$?	clb	clb	<spm>	1	IP	_	_




In [20]:
def create_chatgpt_dataset(num_classes):

    better_data ={}

    for dialect in all_parsed_data:

        dialect_name = dialect[0].metadata.get('file', '')
        better_data[dialect_name] = dialect
    
    ls_aal = better_data['aal_uio_02'] + better_data['aal_uio_06']
    ls_austevoll = better_data['austevoll_uib_01'] + better_data['austevoll_uib_04']

    better_data['aal_uio_02'] = ls_aal
    better_data['austevoll_uib_01'] = ls_austevoll

    del better_data['aal_uio_06']
    del better_data['austevoll_uib_04']

    temp_data = {}
    for dialect, token_list in better_data.items():
        temp_ls = []
        for token in token_list:
            sentence = token.metadata.get('text', '')
            if len(token) > 3:
                temp_ls.append(sentence)
        temp_data[dialect] = temp_ls


    len_each_dialect = {}
    for dialect, token_list in temp_data.items():
        len_each_dialect[dialect] = len(token_list)
    
    sorted_len_each_dialect = sorted(len_each_dialect.items(), key=lambda x: x[1], reverse=True)[:num_classes]

    filtered_dict = {key[0]: temp_data[key[0]] for key in sorted_len_each_dialect}
    
    map_names = {
        'bardu_uit_01': 'Bardu',
        'brandbu_uio_01_ny': 'Brandbu',
        'eidsberg_uio_03': 'Eidsberg',
        'fana_uib_03': 'Fana',
        'farsund_uib_02': 'Farsund',
        'flakstad_uib_04': 'Flakstad', 
        'foerde_uib_05': 'Foerde',
        'giske_uib_02': 'Giske',
        'gol_uio_01': 'Gol',
        'hemsedal_uio_01': 'Hemsedal',
        'herad_uio_01': 'Herad',
        'hjartdal_uio_01': 'Hjartdal',
        'hoeyanger_uib_02': 'Hoeyanger',
        'lierne_uio_01': 'Lierne', 
        'vardoe_uio_01': 'Vardoe',
        'aal_uio_02': 'Al', 
        'austevoll_uib_01': 'Austevoll',
    }

    list_of_names_add_to_prompt = ", ".join([map_names[key] for key in filtered_dict.keys()])
    # print(list_of_names_add_to_prompt)
    prompt = f"\n Here is the sentence: "
    
    test_size = 0.2

    train_dataset = []
    validation_dataset = []
    test_dataset = []

    for dialect, sentence_list in filtered_dict.items():
        
        copy_sentence_list = sentence_list.copy()
        random.shuffle(copy_sentence_list)

        train_size = int(len(copy_sentence_list) * (1 - test_size))
        

        train_dataset.extend([{
            "messages": [
            {
                "role": "system", 
                "content": f'''
                You are tasked with being a Norwegian Dialect classifier. 
                The goal is to train a model that can accurately distinguish between different Norwegian dialects. 
                The primary dialects of interest are {list_of_names_add_to_prompt} and you should be able to distinguish between these dialects.'''
            },
            {
                "role": "user",
                "content": f"What dialect does this sentence belong to: {sentence}?"
            },
            {
                "role": "assistant",
                "content": f"{dialect}"
            }
            ]
        } for sentence in copy_sentence_list[:train_size*0.8]])

        validation_dataset.extend([{"messages": [
            {
                "role": "system", 
                "content": f'You are tasked with being a Norwegian Dialect classifier. The goal is to train a model that can accurately distinguish between different Norwegian dialects. The primary dialects of interest are {list_of_names_add_to_prompt} and you should be able to distinguish between these dialects.'
            },
            {
                "role": "user",
                "content": f"What dialect does this sentence belong to: {sentence}?"
            },
            {
                "role": "assistant",
                "content": f"{dialect}"
            }
            ]
        } for sentence in copy_sentence_list[train_size*0.8:train_size]])

        test_dataset.extend([{"messages": [
            {
                "role": "system", 
                "content": f'You are tasked with being a Norwegian Dialect classifier. The goal is to train a model that can accurately distinguish between different Norwegian dialects. The primary dialects of interest are {list_of_names_add_to_prompt} and you should be able to distinguish between these dialects.'
            },
            {
                "role": "user",
                "content": f"What dialect does this sentence belong to: {sentence}?"
            },
            {
                "role": "assistant",
                "content": f"{dialect}"
            }
            ]
        } for sentence in copy_sentence_list[train_size:]])

    return train_dataset, validation_dataset, test_dataset

In [21]:
import os
import json
def setup_finetune_files(train_dataset, validation_dataset, test_dataset, num_classes):
    folder_name = "gptdataset"
    with open(os.path.join(folder_name, f"training_data_{num_classes}.jsonl"), "w") as output_file:
        for entry in train_dataset:
            json.dump(entry, output_file)
            output_file.write("\n")

    with open(os.path.join(folder_name, f"validation_data_{num_classes}.jsonl"), "w") as output_file:
        for entry in validation_dataset:
            json.dump(entry, output_file)
            output_file.write("\n")
    
    with open(os.path.join(folder_name, f"test_data_{num_classes}.jsonl"), "w") as output_file:
        for entry in test_dataset:
            json.dump(entry, output_file)
            output_file.write("\n")
    return os.path.join(folder_name, f"training_data_{num_classes}.jsonl"), os.path.join(folder_name, f"validation_data_{num_classes}.jsonl"), os.path.join(folder_name, f"test_data_{num_classes}.jsonl")

In [9]:
def fine_tune_model(train_file, validation_file):
    os.environ["OPENAI_API_KEY"] = "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" 

    client = openai.Client(
    )
    
    train_file = openai.files.create(
        file=open(train_file, 'rb'),
        purpose='fine-tune',
    )

    train_file_id = train_file.id

    validation_file = openai.files.create(
        file=open(validation_file, 'rb'),
        purpose='fine-tune',
    )

    validation_file_id = validation_file.id

    client.fine_tuning.jobs.create(
        training_file=train_file_id,
        validation_file=validation_file_id,
        model="gpt-3.5-turbo",
    )

    return

In [15]:
train_chatgpt_dataset_3, validation_dataset_3, test_chatgpt_dataset_3 = create_chatgpt_dataset(3)
train_path_3, validation_path_3, test_path_3 = setup_finetune_files(train_chatgpt_dataset_3, validation_dataset_3, test_chatgpt_dataset_3, 3)
fine_tune_model(train_path_3, validation_path_3)

In [None]:
train_chatgpt_dataset_6, validation_dataset_6, test_chatgpt_dataset_6 = create_chatgpt_dataset(6)
train_path_6, validation_path_6, test_path_6 = setup_finetune_files(train_chatgpt_dataset_6, validation_dataset_6, test_chatgpt_dataset_6, 6)
fine_tune_model(train_path_6, validation_path_6)

In [None]:
train_chatgpt_dataset_12, validation_dataset_12, test_chatgpt_dataset_12 = create_chatgpt_dataset(12)
train_path_12, validation_path_12, test_path_12 = setup_finetune_files(train_chatgpt_dataset_12, validation_dataset_12, test_chatgpt_dataset_12, 12)
fine_tune_model(train_path_12, validation_path_12)

In [None]:
train_chatgpt_dataset_17, validation_dataset_17, test_chatgpt_dataset_17 = create_chatgpt_dataset(17)
train_path_17, validation_path_17, test_path_17 = setup_finetune_files(train_chatgpt_dataset_17, validation_dataset_17, test_chatgpt_dataset_17, 17)
fine_tune_model(train_path_17, validation_path_17)

In [16]:
client = openai.Client()

all_jobs = []
for job in client.fine_tuning.jobs.list(
    limit=20,
):
    # Do something with job here
    all_jobs.append(job)
    print(job)


FineTuningJob(id='ftjob-PJ89dRYXWyawbvtSaxAkus0S', created_at=1702580089, error=None, fine_tuned_model='ft:gpt-3.5-turbo-0613:personal::8VpoRtxb', finished_at=1702598026, hyperparameters=Hyperparameters(n_epochs=3, batch_size=6, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-iUdpRZhOIj1jObRe6m5Gu8vD', result_files=['file-DaTGaKDYfzLgyxbjD2KlHPEY'], status='succeeded', trained_tokens=1384029, training_file='file-rnaOF7fZgSeSGEuHlonzbRHJ', validation_file='file-o1zk0OLI32DcsDXNzAgLj6Mp')
FineTuningJob(id='ftjob-TIVM0rLZime0lnbggtzr8SSf', created_at=1702580073, error=None, fine_tuned_model='ft:gpt-3.5-turbo-0613:personal::8VpoeBUx', finished_at=1702598039, hyperparameters=Hyperparameters(n_epochs=3, batch_size=4, learning_rate_multiplier=2), model='gpt-3.5-turbo-0613', object='fine_tuning.job', organization_id='org-iUdpRZhOIj1jObRe6m5Gu8vD', result_files=['file-6DJdRFrtExRtKA9mfD8CI94s'], status='succeeded', trained_tokens=790773, t

In [10]:
def test_class_gpt(model_id, input_message):
  from openai import OpenAI
  client = OpenAI()

  response = client.chat.completions.create(
    model=model_id,
    messages=input_message,
  )
  return response

In [25]:
def run_test_gpt_model(dialect_mapping, num_classes, file_path, model_id):

    import json
    import os

    os.environ["OPENAI_API_KEY"] = "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"

    true_labels = []
    predicted_labels = []

    i = 0
    with open(file_path, 'r') as file:
        for line in file:
            # Parse each line into a dictionary
            data = json.loads(line)
            label = data["messages"][-1]["content"]
            # Test 3 classes:
            response = test_class_gpt(model_id, data["messages"][:-1])
            predicted_label = dict(dict(dict(response)["choices"][0])["message"])["content"]

            true_labels.append(label)
            predicted_labels.append(predicted_label)
            i += 1
    return true_labels, predicted_labels

In [29]:
def run_metrics(num_classes, dialect_mapping, true_labels, predicted_labels):
    accuracy_count = 0
    tp_count = [0] * num_classes
    fp_count = [0] * num_classes
    fn_count = [0] * num_classes

    for true, pred in zip(true_labels, predicted_labels):
        try:
            if true == pred:
                accuracy_count += 1
                tp_count[dialect_mapping[true]] += 1
            else:
                fp_count[dialect_mapping[pred]] += 1
                fn_count[dialect_mapping[true]] += 1
        except Exception:
            try:
                if pred.split("_")[0].lower() in true:
                    accuracy_count += 1
                    tp_count[dialect_mapping[true]] += 1
                else:
                    for key in dialect_mapping.keys():
                        if key.split("_")[0] in pred:
                            pred = key
                            break
                    fp_count[dialect_mapping[pred]] += 1
                    fn_count[dialect_mapping[true]] += 1
            except Exception:
                print(f"True: {true}, Predicted: {pred}")
                continue


    accuracy = accuracy_count / len(true_labels)
    precision = [tp / (tp + fp) if (tp + fp) != 0 else 0 for tp, fp in zip(tp_count, fp_count)]
    recall = [tp / (tp + fn) if (tp + fn) != 0 else 0 for tp, fn in zip(tp_count, fn_count)]
    f1 = [2 * (p * r) / (p + r) if (p + r) != 0 else 0 for p, r in zip(precision, recall)]

    print(f"Accuracy: {accuracy}")
    for i in range(len(tp_count)):
        print(f"Class {i + 1} - Precision: {precision[i]}, Recall: {recall[i]}, F1 Score: {f1[i]}")

In [27]:
dialect_mapping_3 ={
    'flakstad_uib_04': 0,
    'giske_uib_02': 1,
    'eidsberg_uio_03': 2,
}

test_path_3 = "gptdataset/test_data_3.jsonl"
true_labels_3, predicted_labels_3 = run_test_gpt_model(dialect_mapping_3, 3, test_path_3, "ft:gpt-3.5-turbo-0613:personal::8W6ysErn")
run_metrics(3, dialect_mapping_3, true_labels_3, predicted_labels_3)

Accuracy: 0.8397129186602871
Class 1 - Precision: 0.8265895953757225, Recall: 0.8411764705882353, F1 Score: 0.8338192419825073
Class 2 - Precision: 0.87248322147651, Recall: 0.8552631578947368, F1 Score: 0.8637873754152824
Class 3 - Precision: 0.8125, Recall: 0.8125, F1 Score: 0.8125


In [None]:
dialect_mapping_6 = {
        'flakstad_uib_04': 0,
        'giske_uib_02': 1,
        'eidsberg_uio_03': 2,
        'austevoll_uib_01': 3,
        'bardu_uit_01': 4,
        'vardoe_uio_01': 5
    }

test_path_6 = "gptdataset/test_data_6.jsonl"
true_labels_6, predicted_labels_6 = run_test_gpt_model(dialect_mapping_6, 6, test_path_6, "ft:gpt-3.5-turbo-0613:personal::8VpoeBUx")

In [14]:
run_metrics(6, dialect_mapping_6, true_labels_6, predicted_labels_6)

Accuracy: 0.666156202143951
Class 1 - Precision: 0.7485380116959064, Recall: 0.7529411764705882, F1 Score: 0.750733137829912
Class 2 - Precision: 0.6758241758241759, Recall: 0.8092105263157895, F1 Score: 0.7365269461077845
Class 3 - Precision: 0.6476190476190476, Recall: 0.7083333333333334, F1 Score: 0.6766169154228856
Class 4 - Precision: 0.5684210526315789, Recall: 0.6136363636363636, F1 Score: 0.5901639344262295
Class 5 - Precision: 0.5636363636363636, Recall: 0.41333333333333333, F1 Score: 0.47692307692307695
Class 6 - Precision: 0.6888888888888889, Recall: 0.4305555555555556, F1 Score: 0.5299145299145299


In [30]:
dialect_mapping_12 = {
    'flakstad_uib_04': 0,
    'giske_uib_02': 1,
    'eidsberg_uio_03': 2,
    'austevoll_uib_01': 3,
    'bardu_uit_01': 4,
    'vardoe_uio_01': 5,
    'aal_uio_02': 6,
    'brandbu_uio_01_ny': 7,
    'hjartdal_uio_01': 8,
    'farsund_uib_02': 9,
    'foerde_uib_05': 10,
    'lierne_uio_01': 11
}

test_path_12 = "gptdataset/test_data_12.jsonl"
true_labels_12, predicted_labels_12 = run_test_gpt_model(dialect_mapping_12, 12, test_path_12, "ft:gpt-3.5-turbo-0613:personal::8VpoRtxb")
run_metrics(12, dialect_mapping_12, true_labels_12, predicted_labels_12)

True: giske_uib_02, Predicted: fana_uib_03
True: eidsberg_uio_03, Predicted: fana_uib_03
True: eidsberg_uio_03, Predicted: fana_uib_03
True: austevoll_uib_01, Predicted: fana_uib_03
True: austevoll_uib_01, Predicted: fana_uib_03
True: austevoll_uib_01, Predicted: fana_uib_03
True: austevoll_uib_01, Predicted: fana_uib_03
True: austevoll_uib_01, Predicted: fana_uib_03
True: austevoll_uib_01, Predicted: fana_uib_03
True: austevoll_uib_01, Predicted: fana_uib_03
True: bardu_uit_01, Predicted: fana_uib_03
True: bardu_uit_01, Predicted: fana_uib_03
True: brandbu_uio_01_ny, Predicted: fana_uib_03
True: hjartdal_uio_01, Predicted: fana_uib_03
True: farsund_uib_02, Predicted: fana_uib_03
True: farsund_uib_02, Predicted: fana_uib_03
True: farsund_uib_02, Predicted: fana_uib_03
True: farsund_uib_02, Predicted: fana_uib_03
True: farsund_uib_02, Predicted: fana_uib_03
True: farsund_uib_02, Predicted: fana_uib_03
True: farsund_uib_02, Predicted: fana_uib_03
True: farsund_uib_02, Predicted: fana_uib

In [31]:
dialect_mapping_17 = {
    'flakstad_uib_04': 0,
    'giske_uib_02': 1,
    'eidsberg_uio_03': 2,
    'austevoll_uib_01': 3,
    'bardu_uit_01': 4,
    'vardoe_uio_01': 5,
    'aal_uio_02': 6,
    'brandbu_uio_01_ny': 7,
    'hjartdal_uio_01': 8,
    'farsund_uib_02': 9,
    'foerde_uib_05': 10,
    'lierne_uio_01': 11,
    'fana_uib_03': 12,
    'hemsedal_uio_01': 13,
    'herad_uio_01': 14,
    'gol_uio_01': 15,
    'hoeyanger_uib_02': 16
}

test_path_17 = "gptdataset/test_data_17.jsonl"
true_labels_17, predicted_labels_17 = run_test_gpt_model(dialect_mapping_17, 17, test_path_17, "ft:gpt-3.5-turbo-0613:personal::8W7Bc36I")
run_metrics(17, dialect_mapping_17, true_labels_17, predicted_labels_17)

True: eidsberg_uio_03, Predicted: hoyanger_uib_02
True: brandbu_uio_01_ny, Predicted: hogne
True: fana_uib_03, Predicted: hoyanger_uib_09
Accuracy: 0.3762541806020067
Class 1 - Precision: 0.5777777777777777, Recall: 0.4588235294117647, F1 Score: 0.5114754098360657
Class 2 - Precision: 0.5346534653465347, Recall: 0.7105263157894737, F1 Score: 0.6101694915254237
Class 3 - Precision: 0.40350877192982454, Recall: 0.4842105263157895, F1 Score: 0.44019138755980863
Class 4 - Precision: 0.19318181818181818, Recall: 0.19318181818181818, F1 Score: 0.19318181818181818
Class 5 - Precision: 0.4642857142857143, Recall: 0.17333333333333334, F1 Score: 0.2524271844660194
Class 6 - Precision: 0.375, Recall: 0.375, F1 Score: 0.375
Class 7 - Precision: 0.417910447761194, Recall: 0.4057971014492754, F1 Score: 0.411764705882353
Class 8 - Precision: 0.20454545454545456, Recall: 0.14754098360655737, F1 Score: 0.17142857142857143
Class 9 - Precision: 0.2980769230769231, Recall: 0.5081967213114754, F1 Score: 0.