In [None]:
import os
import pickle
import spacy
from datasets import Dataset 
from datasets import load_from_disk
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    pipeline
)
from sklearn.model_selection import train_test_split
import numpy as np
import evaluate
from tqdm.auto import tqdm
tqdm.pandas()
import wandb
wandb.init(mode="disabled") 
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -------- ------------------------------- 2.6/12.8 MB 15.1 MB/s eta 0:00:01
     ----------------- ---------------------- 5.5/12.8 MB 13.4 MB/s eta 0:00:01
     ----------------------------- ---------- 9.4/12.8 MB 15.4 MB/s eta 0:00:01
     ---------------------------------------- 12.8/12.8 MB 15.4 MB/s  0:00:00
âœ” Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


'\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nimport torch\nimport torchvision\nimport torchvision.transforms as transforms\n\nimport torch.nn as nn\nimport torch.nn.functional as F\nimport torch.optim as optim\n'

In [2]:
# Cleaning and Normalization Pipeline

import re

def clean_aristotle(text: str) -> str:

    # remove any ALL CAPS words (Titles, character names, stage directions, etc.)
    text = re.sub(r'\b[A-Z]{2,}\.', '', text)
    text = re.sub(r'\b[A-Z]{2,}\b', '', text)

    # Remove "Part X," or "BOOK X," (case-insensitive)
    text = re.sub(r'\b(?:Part|BOOK)\s+\w+,?', '', text, flags=re.IGNORECASE)
    
    # Remove things like (12), (3a), etc.
    text = re.sub(r'\(\s*[\w\d]+\s*\)', '', text)
    
    # Remove solitary quotes (")
    text = text.replace('"', '')
    
    # Remove long lines of dashes (3 or more in a row)
    text = re.sub(r'-{3,}', '', text)
    
    # Collapse multiple spaces into one
    text = re.sub(r'\s+', ' ', text).strip()

    # lowercase normalization
    text = text.lower()

    # remove standalone numbers (often sonnet numbers or line counts)
    text = re.sub(r'^\s*\d+\s*$', '', text, flags=re.MULTILINE)

    # remove play headers and all-uppercase lines (likely metadata, not verse)
    text = re.sub(r'^[A-Z\s]{3,}$', '', text, flags=re.MULTILINE)

    # collapse multiple newlines
    text = re.sub(r'\n+', '\n', text)

    # remove '\n' as our model will not need to generate new lines
    text = text.replace("\n", " ")

    # strip leading/trailing whitespace
    text = text.strip()

    return text

# Clean the file
with open("aristotle.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

cleaned_text = clean_aristotle(raw_text)

with open("aristotle_cleaned.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)


In [3]:
# Load Text
with open("aristotle_cleaned.txt", "r", encoding="utf-8") as f:
    corpus = f.read()

# Sentence segmentation (extract sentences from .txt)

nlp = spacy.load("en_core_web_sm")
nlp.max_length = len(corpus) + 1000  # handle long corpus

doc = nlp(corpus)
sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

print(f"Number of sentences: {len(sentences)}")

Number of sentences: 4267


In [4]:
# Explore sentences

print("=== Example Sentences ===")
for i, s in enumerate(sentences[:10]):  # show first 10
    print(f"{i+1}: {s}")
print("\nTotal sentences:", len(sentences))

=== Example Sentences ===
1: things are said to be named 'equivocally' when, though they have a common name, the definition corresponding with the name differs for each.
2: thus, a real man and a figure in a picture can both lay claim to the name 'animal'; yet these are equivocally so named, for, though they have a common name, the definition corresponding with the name differs for each.
3: for should any one define in what sense each is an animal, his definition in the one case will be appropriate to that case only.
4: on the other hand, things are said to be named 'univocally' which have both the name and the definition answering to the name in common.
5: a man and an ox are both 'animal', and these are univocally so named, inasmuch as not only the name, but also the definition, is the same in both cases: for if a man should state in what sense each is an animal, the statement in the one case would be identical with that in the other.
6: things are said to be named 'derivatively', wh

In [5]:
# Hugging Face Dataset

dataset = Dataset.from_dict({"text": sentences})

# Explore Dataset

print("\n=== Example Dataset Entries ===")
for i in range(10):  # show first 10
    print(f"{i+1}: {dataset[i]}")
print("\nDataset length:", len(dataset))



=== Example Dataset Entries ===
1: {'text': "things are said to be named 'equivocally' when, though they have a common name, the definition corresponding with the name differs for each."}
2: {'text': "thus, a real man and a figure in a picture can both lay claim to the name 'animal'; yet these are equivocally so named, for, though they have a common name, the definition corresponding with the name differs for each."}
3: {'text': 'for should any one define in what sense each is an animal, his definition in the one case will be appropriate to that case only.'}
4: {'text': "on the other hand, things are said to be named 'univocally' which have both the name and the definition answering to the name in common."}
5: {'text': "a man and an ox are both 'animal', and these are univocally so named, inasmuch as not only the name, but also the definition, is the same in both cases: for if a man should state in what sense each is an animal, the statement in the one case would be identical with tha

In [8]:
# Load tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load pre-trained model

model = GPT2LMHeadModel.from_pretrained("gpt2")

# GPT-2 doesnâ€™t have a padding token by default

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

def tokenize_function(examples):
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=64,
        return_tensors="pt"
    )

    tokens["labels"] = tokens["input_ids"].clone()
    return tokens

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset.save_to_disk("aristotle_tokenized_to_colab")

Map:   0%|          | 0/4267 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4267 [00:00<?, ? examples/s]

In [None]:
# Visualizations for Training

'''
import transformers
from transformers import TrainerCallback
from IPython.display import clear_output
import matplotlib.pyplot as plt

train_losses = []
eval_losses = []
eval_steps = []

def plot_metrics():
    clear_output(wait=True)
    plt.plot(train_losses, label="Train Loss")
    plt.plot(eval_steps, eval_losses, label="Eval Loss")
    plt.xlabel("Step")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

class PlotCallback(transformers.TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs is not None:
            if "loss" in logs:
                train_losses.append(logs["loss"])
            if "eval_loss" in logs:
                eval_losses.append(logs["eval_loss"])
                eval_steps.append(state.global_step)
                plot_metrics()
'''

In [None]:
# SWITCH TO COLAB, WITH TOKENIZED_DATASET UPLOADED

from datasets import load_from_disk
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    pipeline
)

# Load tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load pre-trained model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Split dataset into train/eval
tokenized_dataset = load_from_disk("aristotle_tokenized_to_colab")
split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split["train"]
eval_dataset = split["test"]

# Training arguments

training_args = TrainingArguments(
    output_dir="ARISTOTLE",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    save_strategy="no",
    eval_strategy="steps",
    logging_strategy="steps",
    logging_steps=10,
    eval_steps=50,    
    disable_tqdm=False,
    logging_dir="////your_path///////ARISTOTLE",
    logging_first_step=True,
    report_to=["none"]
)

#Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #compute_metrics=compute_metrics,
    #callbacks=[PlotCallback],   # ðŸ‘ˆ add live plotting
)

In [None]:
#Train the model

trainer.train()

In [None]:
# Save Model + Tokenizer

save_dir = "aristotle_gpt2_trained"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

import json

with open("./trained_model/training_metrics.json", "w") as f:
    json.dump(trainer.state.log_history, f)

('C:/Users/Gaels/MATH-Proj-GPT2/shakespeare_gpt2_final\\tokenizer_config.json',
 'C:/Users/Gaels/MATH-Proj-GPT2/shakespeare_gpt2_final\\special_tokens_map.json',
 'C:/Users/Gaels/MATH-Proj-GPT2/shakespeare_gpt2_final\\vocab.json',
 'C:/Users/Gaels/MATH-Proj-GPT2/shakespeare_gpt2_final\\merges.txt',
 'C:/Users/Gaels/MATH-Proj-GPT2/shakespeare_gpt2_final\\added_tokens.json')

In [None]:
# BACK TO LOCAL MACHINE AFTER COLAB TRAINING, WITH aristotle_gpt2_trained FOLDER + training_metrics.json DOWNLOADED

from transformers import GPT2LMHeadModel, GPT2TokenizerFast, Trainer, pipeline

local_path = r"////////your_entire_path/////////aristotle_trained_gpt2_model"

model = GPT2LMHeadModel.from_pretrained(local_path)
tokenizer = GPT2TokenizerFast.from_pretrained(local_path)

# Load metrics

import json

with open("////////your_entire_path/////////training_metrics.json") as f:
    history = json.load(f)

In [None]:
# Perplexity Evaluation

'''
# Metric: Perplexity (based on loss)
# Hugging Face's `evaluate` doesn't include perplexity directly,
# but we can derive it from cross-entropy loss.
def compute_metrics(eval_pred):
    loss = eval_pred.metrics["eval_loss"] if "eval_loss" in eval_pred.metrics else None
    if loss is None:
        return {}
    perplexity = np.exp(loss)
    return {"perplexity": perplexity, "loss": loss}

eval_results = trainer.evaluate()
metrics = compute_metrics(eval_results)

print("Evaluation results with GPT-2:")
print(metrics)
'''

In [None]:
#Evaluation

'''
# Load model from folder
model_path = local_path
model = GPT2LMHeadModel.from_pretrained(model_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

# BLEU/ROUGE Scores
print("\n--- BLEU/ROUGE Evaluation ---")
import evaluate
metric = evaluate.load("rouge")
'''

In [7]:
# Inference (text generation)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    pad_token_id=tokenizer.eos_token_id
)

prompt = "what if a man"

print("\n=== Sample Aristotelian Text ===\n")

outputs = generator(
    prompt,
    max_length=70,
    num_return_sequences=10,   # ðŸ‘ˆ generate 10 completions
    do_sample=True,            # ðŸ‘ˆ enable sampling
    top_k=100,                  # limit to top-k tokens
    top_p=0.95,                 # nucleus sampling
    temperature=0.8             # control creativity
)

for i, out in enumerate(outputs, 1):
    print(f"\n--- Output {i} ---\n{out['generated_text']}")


Device set to use cpu
Both `max_new_tokens` (=256) and `max_length`(=70) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



=== Sample Aristotelian Text ===


--- Output 1 ---
what if a man-for-hisself, who is not hisself, is not in that state of being but is in the same state of being, and also is not in the same state of being but is in the same state of being?

--- Output 2 ---
what if a man is not at home yet?

--- Output 3 ---
what if a man, then, is said to be one, as when the latter say he is not one, but he is one?

--- Output 4 ---
what if a man is not a man, but a horse?

--- Output 5 ---
what if a man also said that the earth is a substance?

--- Output 6 ---
what if a man were to say, 'the world is a cube'; and, again, if the people who composed these statements were to say so, the one and the same thing would have to be different; for the one would be better and the other worse, but the one and the same thing would be neither worse nor worse than the other.

--- Output 7 ---
what if a man is white or is not white?

--- Output 8 ---
what if a man is present, what must happen to him, and what wi