<a href="https://colab.research.google.com/github/Mova-2020/Subworded-Explanatory-Dictionary-on-Physics-/blob/main/RevDic_v02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [87]:
from IPython import get_ipython
from IPython.display import display

!pip install transformers datasets torch



In [88]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer
from datasets import Dataset

Load the subworded text files

In [89]:
def load_subworded_file(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return [{"term": line.split(" @ ")[0].strip(), "definition": line.split(" @ ")[1].replace("^p", "").strip()}
                    for line in f if " @ " in line]
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
        return []

Uploading files

In [90]:
from google.colab import files

# Завантажуємо файл
uploaded = files.upload()
# Check if the file was uploaded successfully
if "fts_dl_ukr_subworded_combined_validation_136.txt" in uploaded:
    print("File uploaded successfully!")
else:
    print("Error: File upload failed.")

Saving fts_dl_ukr_subworded_combined_training.txt to fts_dl_ukr_subworded_combined_training (1).txt
Saving fts_dl_ukr_subworded_combined_validation_136.txt to fts_dl_ukr_subworded_combined_validation_136 (2).txt
Saving fts_dl_ukr_subworded_simple_training.txt to fts_dl_ukr_subworded_simple_training (1).txt
Saving fts_dl_ukr_subworded_simple_validation_136.txt to fts_dl_ukr_subworded_simple_validation_136 (1).txt
Error: File upload failed.


Load datasets

In [91]:

# Load the data using the load_subworded_file function
file_combined_training_path = "/content/fts_dl_ukr_subworded_combined_training.txt"
file_simple_training_path = "/content/fts_dl_ukr_subworded_simple_training.txt"
file_combined_validation_path = "/content/fts_dl_ukr_subworded_combined_validation_136.txt"
file_simple_validation_path = "/content/fts_dl_ukr_subworded_simple_validation_136.txt"



Convert data to Hugging Face Dataset objects

In [93]:
from datasets import Dataset

data_combined_training = load_subworded_file(file_combined_training_path)
data_simple_training = load_subworded_file(file_simple_training_path)
data_combined_validation = load_subworded_file(file_combined_validation_path)
data_simple_validation = load_subworded_file(file_simple_validation_path)

dataset_combined_training = Dataset.from_dict({"definition": [d["definition"] for d in data_combined_training], "term": [d["term"] for d in data_combined_training]})
dataset_simple_training = Dataset.from_dict({"definition": [d["definition"] for d in data_simple_training], "term": [d["term"] for d in data_simple_training]})
dataset_combined_validation = Dataset.from_dict({"definition": [d["definition"] for d in data_combined_validation], "term": [d["term"] for d in data_combined_validation]})
dataset_simple_validation = Dataset.from_dict({"definition": [d["definition"] for d in data_simple_validation], "term": [d["term"] for d in data_simple_validation]})

In [94]:
available_models = {
    "GPT-2": "gpt2",
    "GPT-2 Medium": "gpt2-medium",
    "GPT-2 Large": "gpt2-large",
    "GPT-3 Small": "EleutherAI/gpt-neo-125M"  # Example of an alternative model
}


Select model and tokenizer

In [95]:
model_name = "gpt2-large"  # Change model if necessary
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

Add padding token if necessary

In [96]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Tokenize the dataset

In [97]:
def tokenize_function(examples):
    inputs = tokenizer(examples["definition"], padding="max_length", truncation=True, max_length=128)
    labels = tokenizer(examples["term"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = labels["input_ids"]
    return inputs

Tokenize all datasets

In [98]:
tokenized_combined_training = dataset_combined_training.map(tokenize_function, batched=True)
tokenized_simple_training = dataset_simple_training.map(tokenize_function, batched=True)
tokenized_combined_validation = dataset_combined_validation.map(tokenize_function, batched=True)
tokenized_simple_validation = dataset_simple_validation.map(tokenize_function, batched=True)

Training arguments

In [115]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=1000000,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

Train models

In [116]:

def tokenize_function(examples):
    inputs = tokenizer(examples["definition"], padding="max_length", truncation=True, max_length=128)
    # Separate labels from inputs
    labels = tokenizer(examples["term"], padding="max_length", truncation=True, max_length=128)["input_ids"]
    # Return as a dictionary with separate keys
    return {"input_ids": inputs["input_ids"], "attention_mask": inputs["attention_mask"], "labels": labels}

Validation

In [117]:
# Load the data using the load_subworded_file function
data_combined_training = load_subworded_file(file_combined_training_path)
data_simple_training = load_subworded_file(file_simple_training_path)
data_combined_validation = load_subworded_file(file_combined_validation_path)
data_simple_validation = load_subworded_file(file_simple_validation_path)

# Convert data to Hugging Face Dataset objects
dataset_combined_training = Dataset.from_dict({"definition": [d["definition"] for d in data_combined_training], "term": [d["term"] for d in data_combined_training]})
dataset_simple_training = Dataset.from_dict({"definition": [d["definition"] for d in data_simple_training], "term": [d["term"] for d in data_simple_training]})
dataset_combined_validation = Dataset.from_dict({"definition": [d["definition"] for d in data_combined_validation], "term": [d["term"] for d in data_combined_validation]})
dataset_simple_validation = Dataset.from_dict({"definition": [d["definition"] for d in data_simple_validation], "term": [d["term"] for d in data_simple_validation]})



Generate a term from a definition

In [110]:
def generate_term(definition, tokenizer, model, max_length=256): # Increased max_length
    inputs = tokenizer(definition, return_tensors="pt", max_length=128, truncation=True)
    outputs = model.generate(**inputs, max_length=max_length, num_return_sequences=1, temperature=0.9)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

Setting validation model

In [118]:
def validate_model(definitions, terms, tokenizer, model):
    #  Calculates the accuracy of the model by comparing generated terms with actual terms.

#    Args:
#       definitions: A list of definitions.
#      terms: A list of corresponding terms.
#     tokenizer: The tokenizer used for the model.
#        model: The language model.

#    Returns:
#        The accuracy of the model (number of correct predictions / total number of predictions).

    correct_predictions = 0
    total_predictions = len(definitions)

    for definition, term in zip(definitions, terms):
        generated_term = generate_term(definition, tokenizer, model)
        print(f"Definition: {definition}")
        print(f"Actual Term: {term}")  # Corrected indentation
        print(f"Generated Term: {generated_term}")  # Corrected indentation
        print("-" * 20)  # Separator

        if generated_term == term:
            correct_predictions += 1

    accuracy = correct_predictions / total_predictions if total_predictions else 0  # Avoid division by zero
    return accuracy

Run validation

In [119]:
accuracy_combined = validate_model(
    dataset_combined_validation["definition"], dataset_combined_validation["term"], tokenizer, model
)
accuracy_simple = validate_model(
    dataset_simple_validation["definition"], dataset_simple_validation["term"], tokenizer, model
)

print(f"Validation accuracy for combined dataset: {accuracy_combined:.2%}")
print(f"Validation accuracy for simple dataset: {accuracy_simple:.2%}")


Validation accuracy for combined dataset: 0.00%
Validation accuracy for simple dataset: 0.00%


Compare generated terms for dataset_combined and dataset_simple

In [113]:
definition_combined = "&один& &із& &вид&ів& &абер&аці&й& &зображ&ення& &3&-&г&о& &порядк&у&, &за& &як&ої& &точк&а&, &що& &лежить& &на& &опт&ичн&ій& &ос&і&, &зображ&а&ється& &на& &екран&і&, &перпендикулярн&ому& &до& &опт&ичн&ої& &ос&і&, &у& &ви&гляд&і& &плям&и& &роз&с&i&яння& &кругл&ої& &форм&и& &з& &роз&поділ&ом& &освітл&ен&ос&т&i&, &що& &за&лежить& &від& &полож&ення& &екран&а&. &Ц&ю& &абер&аці&ю& &з&менш&у&ють& &ком&б&i&на&ц&іє&ю& &додатн&ої& &та& &від'&ємн&ої& &лінз& &зі& &спец&і&альн&о& &роз&рах&ован&ими& &раді&ус&ами& &кривизн&и& &заламлюва&льн&их& &поверх&онь&"
definition_simple = "&один& &із& &вид&ів& &абер&аці&й& &зображ&ення& &3&-&г&о& &порядк&у&, &за& &як&ої& &точк&а&, &що& &лежить& &на& &опт&ичн&ій& &ос&і&, &зображ&а&ється& &на& &екран&і&, &перпендикулярн&ому& &до& &опт&ичн&ої& &ос&і&, &у& &ви&гляд&і& &плям&и& &роз&с&i&яння& &кругл&ої& &форм&и& &з& &роз&поділ&ом& &освітл&ен&ос&т&i&, &що& &за&лежить& &від& &полож&ення& &екран&а&. &Ц&ю& &абер&аці&ю& &з&менш&у&ють& &ком&б&i&на&ц&іє&ю& &додатн&ої& &та& &від'&ємн&ої& &лінз& &зі& &спец&і&альн&о& &роз&рах&ован&ими& &раді&ус&ами& &кривизн&и& &заламлюва&льн&их& &поверх&онь&"

In [114]:
print("Generated term from subworded combined file: ", generate_term(definition_combined, tokenizer, model))
print("Generated term from subworded simple file: ", generate_term(definition_simple, tokenizer, model))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated term from subworded combined file:  &один& &із& &вид&ів& &абер&аці&й& &зображ&ення& &3&-&г&о& &порядк&у&, &за& &як&ої& &точк&а&, &що& &лежить& &на& &опт&ичн&із& &вид&ів& &абер&аці&й& &зображ&ення& &3&-&г&о& &порядк&у&, &за& &як&ої& &точк&а&, &що& &лежить& &на& &опт&ичн&із& &вид&ів
Generated term from subworded simple file:  &один& &із& &вид&ів& &абер&аці&й& &зображ&ення& &3&-&г&о& &порядк&у&, &за& &як&ої& &точк&а&, &що& &лежить& &на& &опт&ичн&із& &вид&ів& &абер&аці&й& &зображ&ення& &3&-&г&о& &порядк&у&, &за& &як&ої& &точк&а&, &що& &лежить& &на& &опт&ичн&із& &вид&ів
