In [62]:
!pip install transformers datasets accelerate torch -q

In [63]:
import torch

if torch.cuda.is_available():
    print(f"GPU is available!")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("GPU not available. Using CPU.")
    device = torch.device("cpu")

GPU is available!
Device name: Tesla T4


In [64]:
from datasets import load_dataset, Dataset, DatasetDict, ClassLabel, Features, Value
from transformers import AutoTokenizer
import pandas as pd

model_checkpoint = "distilbert-base-uncased"
data_file = "poems.csv"

In [65]:
df = pd.read_csv(data_file)
if 'poem content' in df.columns and 'label' in df.columns:
    df_subset = df[['poem content', 'label']].copy()
    df_subset.rename(columns={'poem content': 'text', 'label': 'emotion'}, inplace=True)
    print("Selected and renamed columns: 'text', 'emotion'")

unique_emotions = df_subset['emotion'].unique()
label2id = {label: i for i, label in enumerate(unique_emotions)}
id2label = {i: label for label, i in label2id.items()}
df_subset['labels'] = df_subset['emotion'].map(label2id)

Selected and renamed columns: 'text', 'emotion'


In [66]:
print("Label to ID:", label2id)
print("ID to Label:", id2label)
print(df_subset.head())

Label to ID: {'sadness': 0, 'anger': 1, 'joy': 2, 'disgust': 3, 'fear': 4, 'neutral': 5, 'surprise': 6}
ID to Label: {0: 'sadness', 1: 'anger', 2: 'joy', 3: 'disgust', 4: 'fear', 5: 'neutral', 6: 'surprise'}
                                                text  emotion  labels
0  Let the bird of loudest lay\r\nOn the sole Ara...  sadness       0
1  Sir Charles into my chamber coming in,\r\nWhen...    anger       1
2  Our vice runs beyond all that old men saw,\r\n...    anger       1
3  Lo I the man, whose Muse whilome did maske,\r\...    anger       1
4  Long have I longd to see my love againe,\r\nSt...  sadness       0


In [67]:
hg_dataset = Dataset.from_pandas(df_subset)
print(hg_dataset)
emotion_class_label = ClassLabel(names=unique_emotions.tolist())
features = Features({
        'text': Value('string'),
        'emotion': Value('string'),
        'labels': emotion_class_label
    })
hg_dataset = hg_dataset.cast(features)
print("\nCasted 'labels' column to ClassLabel type:")
print(hg_dataset.features)

Dataset({
    features: ['text', 'emotion', 'labels'],
    num_rows: 450
})


Casting the dataset:   0%|          | 0/450 [00:00<?, ? examples/s]


Casted 'labels' column to ClassLabel type:
{'text': Value('string'), 'emotion': Value('string'), 'labels': ClassLabel(names=['sadness', 'anger', 'joy', 'disgust', 'fear', 'neutral', 'surprise'])}


In [68]:
train_test_split = hg_dataset.train_test_split(test_size=0.2, seed=42, stratify_by_column='labels')
raw_datasets = DatasetDict({
        'train': train_test_split['train'],
        'test': train_test_split['test']
    })
print("\nSplit dataset into training and testing sets:")
print(raw_datasets)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(f"\nLoaded tokenizer for model: {model_checkpoint}")


Split dataset into training and testing sets:
DatasetDict({
    train: Dataset({
        features: ['text', 'emotion', 'labels'],
        num_rows: 360
    })
    test: Dataset({
        features: ['text', 'emotion', 'labels'],
        num_rows: 90
    })
})

Loaded tokenizer for model: distilbert-base-uncased


In [69]:
from transformers import AutoTokenizer
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text", "emotion"])
tokenized_datasets.set_format("torch")
print("Dataset structure after tokenization:")
print(tokenized_datasets)
print()
print(tokenized_datasets["train"][0])

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/90 [00:00<?, ? examples/s]

Dataset structure after tokenization:
DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 360
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 90
    })
})

{'labels': tensor(0), 'input_ids': tensor([  101,  1045,  2253,  2000,  1996, 11278,  2012, 13814,  3077,  1010,
         1998,  2209, 10245,  1011,  2041,  2012, 12841,  1012,  2028,  2051,
         2057,  2904,  5826,  1010,  4439,  2188,  1999,  1996, 11986,  1997,
         2690,  2238,  1010,  1998,  2059,  1045,  2179,  4482,  1012,  2057,
         2020,  2496,  1998,  2973,  2362,  2005, 10920,  2086,  1010,  9107,
         1010,  2551,  1010,  6274,  1996,  4376,  2336,  1010,  2809,  1997,
         3183,  2057,  2439,  9413,  2063,  1045,  2018,  2584,  1996,  2287,
         1997,  8442,  1012,  1045,  7455,  1010,  1045, 24185,  3726,  1010,
         1045,  2921,  1996,  2160,  1010,  1045,  6821,  2094,  

In [70]:
from transformers import AutoModelForSequenceClassification
import torch

num_labels = len(id2label)
print(f"Loading pre-trained model '{model_checkpoint}' for {num_labels}-class classification...")
model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
)

model.to(device)
print(f"Model loaded successfully and moved to device: {device}")

Loading pre-trained model 'distilbert-base-uncased' for 7-class classification...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully and moved to device: cuda


In [71]:
!pip install evaluate -q

In [72]:
import numpy as np
import evaluate
metric = evaluate.load("accuracy")
print("Loaded accuracy metric using 'evaluate' library.")

Loaded accuracy metric using 'evaluate' library.


In [73]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [74]:
from transformers import TrainingArguments, Trainer
import math

train_dataset_size = len(tokenized_datasets["train"])
batch_size = 16
steps_per_epoch = math.ceil(train_dataset_size / batch_size)
print(steps_per_epoch)

training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        warmup_steps=50,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        save_steps=steps_per_epoch,
        report_to="none"
)

23


In [75]:
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

  trainer = Trainer(


In [76]:
import os
os.environ["WANDB_DISABLED"] = "true"
print("Weights & Biases logging disabled.")

Weights & Biases logging disabled.


In [77]:
print("Fine-tuning...")
train_result = trainer.train()
trainer.save_model("./fine-tuned-model")
tokenizer.save_pretrained("./fine-tuned-model")
print(train_result.metrics)

Fine-tuning...


Step,Training Loss
10,1.9284
20,1.8536
30,1.7873
40,1.6069
50,1.5341
60,1.5753


{'train_runtime': 72.7481, 'train_samples_per_second': 14.846, 'train_steps_per_second': 0.948, 'total_flos': 143077547335680.0, 'train_loss': 1.6800831089849058, 'epoch': 3.0}


In [78]:
eval_results = trainer.evaluate()
for key, value in eval_results.items():
  if "accuracy" in key:
    print(f"  {key}: {value*100:.2f}%")
  else:
    print(f"  {key}: {value}")

  eval_loss: 1.6552298069000244
  eval_accuracy: 38.89%
  eval_runtime: 1.3216
  eval_samples_per_second: 68.102
  eval_steps_per_second: 4.54
  epoch: 3.0


In [86]:
from transformers import pipeline
import torch

classifier_device = 0 if device.type == 'cuda' else -1
emotion_classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=classifier_device
)
print("Text classification pipeline created successfully.")

def predict_emotion(poem_text):
  if not poem_text.strip():
    return "Error: Input text is empty.", 0.0
  try:
    results = emotion_classifier(poem_text)
    predicted_label = results[0]['label']
    predicted_score = results[0]['score']
    return predicted_label, predicted_score

  except Exception as e:
    print(f"Error during prediction: {e}")
    return "Error during prediction", 0.0

test_poem = "The rain pours down, a somber gray, My heart feels heavy on this day."
predicted_emotion, score = predict_emotion(test_poem)
print(f"\n--- Prediction Test ---")
print(f"Poem: '{test_poem}'")
print(f"Predicted Emotion: {predicted_emotion} (Score: {score:.4f})")

test_poem_2 = "Oh, joy! The sun breaks through the clouds, A cheerful bird sings clear and loud!"
predicted_emotion_2, score_2 = predict_emotion(test_poem_2)
print(f"\nPoem: '{test_poem_2}'")
print(f"Predicted Emotion: {predicted_emotion_2} (Score: {score_2:.4f})")

Device set to use cuda:0


Text classification pipeline created successfully.

--- Prediction Test ---
Poem: 'The rain pours down, a somber gray, My heart feels heavy on this day.'
Predicted Emotion: fear (Score: 0.3170)

Poem: 'Oh, joy! The sun breaks through the clouds, A cheerful bird sings clear and loud!'
Predicted Emotion: sadness (Score: 0.3475)


In [79]:
!pip install gensim -q

In [80]:
import gensim.downloader as api
import gensim

model_name = 'word2vec-google-news-300'
wv_model = api.load(model_name)
test_word = 'sadness'

if test_word in wv_model:
  similar_words = wv_model.most_similar(test_word, topn=5)
  print(f"Words most similar to '{test_word}':")

  for word, score in similar_words:
    print(f"  - {word} (Similarity: {score:.4f})")

else:
  print(f"'{test_word}' not found in Word2Vec vocabulary.")


Words most similar to 'sadness':
  - sorrow (Similarity: 0.8653)
  - grief (Similarity: 0.7104)
  - profound_sadness (Similarity: 0.6926)
  - anguish (Similarity: 0.6862)
  - saddness (Similarity: 0.6407)


In [81]:
!pip install nltk -q

In [82]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [83]:
import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

def get_keywords(text):
    keywords = []
    words = nltk.word_tokenize(text.lower())
    tagged_words = nltk.pos_tag(words)
    allowed_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'}

    for word, tag in tagged_words:
        if tag in allowed_tags and len(word) > 2:
            keywords.append(word)
    unique_keywords = []
    for word in keywords:
        if word not in unique_keywords:
            unique_keywords.append(word)

    return unique_keywords[:5]

print("Keywords from 'The rain pours down, a somber gray':", get_keywords("The rain pours down, a somber gray"))

Keywords from 'The rain pours down, a somber gray': ['rain', 'somber', 'gray']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [91]:
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import re
import os
import sys
import string

def calculate_stats(poem_text):
    cleaned_text=poem_text.replace("--"," ")
    words=cleaned_text.lower().split()
    word_count=len(words)
    total_chars=sum(len(word) for word in words)
    if word_count > 0:
        average_word_length = total_chars/ word_count
    else:
        average_word_length = 0
    return {
        "word_count": word_count,
        "average_word_length": average_word_length
    }

try:
    with open("clean_poems.txt", "r", encoding="utf-8") as f:
        poem_text = f.read()
    stats = calculate_stats(poem_text)
    print(f"Total Word Count: {stats['word_count']}")
    print(f"Average Word Length: {stats['average_word_length']:.2f} characters")

except FileNotFoundError:
    print("Error: The file was not found.")

def load_cmudict(filepath="cmudict.txt"):
    print(f"Loading phonetic dictionary from {filepath}...")
    pronunciations = {}
    variant_regex = re.compile(r'\(\d+\)$')
    allowed_chars_regex = re.compile(r"[^A-Z']")
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line.startswith(';;;') or not line:
                    continue
                parts = line.split(maxsplit=1)
                if len(parts) < 2:
                    continue
                word = parts[0].strip()
                word = variant_regex.sub('', word)
                word = word.upper()
                word = allowed_chars_regex.sub('', word)
                if not word:
                    continue

                phonemes_str = parts[1].strip()
                phonemes = phonemes_str.split()
                if word not in pronunciations:
                    pronunciations[word] = phonemes

        if not pronunciations:
             print("Warning: The dictionary appears empty after loading. Check the file format.")
             return None
        print(f"Dictionary loaded successfully. {len(pronunciations)} words found.")
        return pronunciations

    except FileNotFoundError:
        print(f"Error: The dictionary file '{filepath}' was not found.")
        return None

def detect_rhyme_scheme(stanza, pronunciation_dict):
    if not pronunciation_dict:
        return "Error: Dictionary not loaded."
    lines = stanza.strip().split('\n')
    last_word_sounds = []

    for line in lines:
        words = line.split()
        if words:
            clean_word = words[-1].upper().strip(string.punctuation)
            if clean_word in pronunciation_dict:
                phonemes = pronunciation_dict[clean_word]
                last_stress_index = -1
                for i in range(len(phonemes) - 1, -1, -1):
                    if phonemes[i][-1] in ('1', '2'):
                        last_stress_index = i
                        break
                if last_stress_index != -1:
                    rhyming_part = tuple(phonemes[last_stress_index:])
                    last_word_sounds.append(rhyming_part)
                else:
                    last_word_sounds.append(tuple(phonemes))
            else:
                last_word_sounds.append(None)
        else:
             last_word_sounds.append(None)

    rhyme_groups = {}
    scheme = []
    next_rhyme_label = 'A'

    for sounds in last_word_sounds:
        if sounds is None:
            scheme.append('X')
            continue
        if sounds in rhyme_groups:
            scheme.append(rhyme_groups[sounds])
        else:
            rhyme_groups[sounds] = next_rhyme_label
            scheme.append(next_rhyme_label)
            next_rhyme_label = chr(ord(next_rhyme_label) + 1)

    return "".join(scheme)

cmudict = load_cmudict()

if cmudict:
    try:
        sample_stanza="""
        Because I could not stop for Death,
        He kindly stopped for me;
        The carriage held but just ourselves
        And Immortality.
        """
        rhyme_scheme = detect_rhyme_scheme(sample_stanza, cmudict)
        print(f"The rhyme scheme of the sample stanza is: {rhyme_scheme}")

    except FileNotFoundError:
        print("Error: 'clean_poems.txt' was not found.")
else:
    print("Exiting because the phonetic dictionary could not be loaded.")


def find_alliteration(line):
    stop_words = set([
        'a', 'an', 'the', 'in', 'on', 'at', 'to', 'for', 'of',
        'is', 'am', 'are', 'was', 'were', 'be', 'been', 'being',
        'and', 'or', 'but', 'if', 'as', 'by', 'with', 'from',
        'i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him',
        'her', 'us', 'them', 'my', 'your', 'his', 'its', 'our', 'their'
    ])
    translator = str.maketrans('', '', string.punctuation)
    cleaned_line = line.lower().translate(translator)
    words = cleaned_line.split()
    alliterations = []
    num_words = len(words)
    for i in range(num_words - 1):
        current_word = words[i]
        if not current_word or current_word in stop_words:
            continue
        for j in range(i + 1, num_words):
            next_word = words[j]
            if not next_word:
                continue
            if next_word in stop_words:
                continue
            if current_word[0] == next_word[0]:
                alliterations.append((current_word, next_word))
            break
    return alliterations

try:
    sample_line = "Success is counted sweetest by those who ne'er succeed."
    alliterative_pairs = find_alliteration(sample_line)

    if alliterative_pairs:
        print(f"Found alliteration in the line: '{sample_line}'")
        for pair in alliterative_pairs:
            print(f"  - {pair[0]} / {pair[1]}")
    else:
        print(f"No simple alliteration found in the line: '{sample_line}'")

except FileNotFoundError:
        print("Error: 'clean_poems.txt' was not found.")

Total Word Count: 28906
Average Word Length: 4.46 characters
Loading phonetic dictionary from cmudict.txt...
Dictionary loaded successfully. 35676 words found.
The rhyme scheme of the sample stanza is: AXXX
No simple alliteration found in the line: 'Success is counted sweetest by those who ne'er succeed.'


In [90]:
import random
import string

print("\n--- KAVI PoetBot ---")
print("Type 'analyze', 'generate', or 'quit'.")

while True:
    try:
        user_choice = input("\nWhat would you like to do? (analyze/generate/quit): ").lower().strip()

        if user_choice == 'quit':
            print("Goodbye!")
            break

        elif user_choice == 'analyze':
            print("\nPlease paste the poem stanza:")
            print("(Enter an empty line when finished):")
            stanza_lines = []
            while True:
                line = input()
                if line == "":
                    break
                stanza_lines.append(line)
            input_stanza = "\n".join(stanza_lines)

            if not input_stanza.strip():
                print("No stanza provided.")
                continue

            print("\n--- Analysis Results ---")
            predicted_emotion, score = predict_emotion(input_stanza)
            print(f"Predicted Emotion: {predicted_emotion} (Confidence: {score:.2f})")
            keywords = get_keywords(input_stanza)
            print(f"Identified Keywords: {keywords}")
            similar_word_info = {}

            if keywords:
                first_keyword = keywords[0]
                if first_keyword in wv_model:
                    try:
                        similar = wv_model.most_similar(first_keyword, topn=2)
                        similar_word_info[first_keyword] = [w for w, s in similar]
                    except KeyError:
                        print(f"(Word '{first_keyword}' caused Word2Vec error)")
                else:
                    print(f"(Keyword '{first_keyword}' not in Word2Vec vocab)")

            print("\n--- Explanation ---")
            if predicted_emotion != "Error during prediction":
                explanation = f"The poem seems to convey a primary emotion of **{predicted_emotion}**."

                if keywords:
                    explanation += f" Key terms like '**{keywords[0]}**'"
                    if first_keyword in similar_word_info and similar_word_info[first_keyword]:
                        sim_words = " or ".join([f"'**{sw}**'" for sw in similar_word_info[first_keyword]])
                        explanation += f", which relates to concepts like {sim_words},"
                    explanation += " likely contribute to this feeling."
                else:
                    explanation += " Specific keywords contributing to this were not easily identified by the current analysis."
                print(explanation)
            else:
                print("Could not generate explanation due to prediction error.")

            print("\n--- Technical Details ---")
            scheme = detect_rhyme_scheme(input_stanza, cmudict)
            print(f"Rhyme Scheme: {scheme}")
            print("Alliteration:")
            found_any_alliteration = False

            for i, line_text in enumerate(input_stanza.split('\n')):
                pairs = find_alliteration(line_text)
                if pairs:
                    found_any_alliteration = True
                    print(f"  Line {i + 1}:")
                    for pair in pairs:
                        print(f"    - {pair[0]} / {pair[1]}")
            if not found_any_alliteration:
                print("  None detected.")

        elif user_choice == 'generate':
            print("\n--- Generating Poem Snippet (Markov Chain) ---")
            generated_output = generate_text(markov_model, tokens, length=50)
            if generated_output:
                print(generated_output)
            else:
                print("Could not generate text.")

        else:
            print("Invalid choice.")

    except (EOFError, KeyboardInterrupt):
        print("\nInterrupted. Goodbye!")
        break
    except Exception as e:
        print(f"\nAn error occurred: {e}")
        break


--- KAVI PoetBot ---
Type 'analyze', 'generate', or 'quit'.

What would you like to do? (analyze/generate/quit): analyze

Please paste the poem stanza:
(Enter an empty line when finished):
I wandered lonely as a cloud
That floats on high o'er vales and hills,
When all at once I saw a crowd,
A host, of golden daffodils;
Beside the lake, beneath the trees,
Fluttering and dancing in the breeze.


--- Analysis Results ---
Predicted Emotion: sadness (Confidence: 0.43)
Identified Keywords: ['cloud', 'high', "o'er", 'vales', 'hills']

--- Explanation ---
The poem seems to convey a primary emotion of **sadness**. Key terms like '**cloud**', which relates to concepts like '**clouds**' or '**Cloud**', likely contribute to this feeling.

--- Technical Details ---
Rhyme Scheme: AXABXC
Alliteration:
  None detected.

What would you like to do? (analyze/generate/quit): quit
Goodbye!
