In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, SimpleRNN, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk # Import the main nltk library
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
# Define the URL of the Wikipedia page for LaPerm
url = "https://en.wikipedia.org/wiki/LaPerm"

# Send a GET request to the URL
response = requests.get(url)

# Parse the HTML content of the page
soup = BeautifulSoup(response.content, 'html.parser')

# Find the container element that holds the main content of the page
content_div = soup.find('div', {'class': 'mw-parser-output'})

# Find all <p> tags within the content area
if content_div:
    paragraphs = content_div.find_all('p')

    # Extract text from each paragraph and concatenate them into a single string
    text_data = ''
    for paragraph in paragraphs:
        text_data += paragraph.get_text()

    # Print the scraped text data
    print(text_data)
    all_words = text_data.split()
else:
    print("Main content container not found.")

The LaPerm is a breed of cat. A LaPerm's fur is curly (hence the name "perm"), with the tightest curls being on the throat and on the base of the ears. LaPerms come in many colors and patterns. LaPerms generally have a very affectionate personality.
The LaPerm is a rex breed which originated in the United States and is now present in many other countries worldwide. The breed is genetically unique and not related to any other rex cat varieties, having a dominant gene causing their curly coats. They have an elegant and athletic build and are affectionate, active, and outgoing in character. They are reputed to be hypoallergenic cats, provoking a significantly lower level of an allergic response in humans than normal cats. Their most significant feature is their coat, which is made up of soft waves, curls, and ringlets, resembling a shaggy perm.
The LaPerm emerged around the early 1980s as a spontaneous mutation of cats bred for pest control. The breed founders were Linda and Richard Koehl

In [4]:
def clean_text(text):
  # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()  # Lowercase
    return text

def tokenize_and_lemmatize(text):
  stop_words = set(stopwords.words('english'))
  tokens = [word for word in word_tokenize(text) if word not in stop_words]
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(word) for word in tokens]


In [5]:
cleaned_text = clean_text(text_data)
tokens = tokenize_and_lemmatize(cleaned_text)

In [6]:
import numpy as np

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([tokens])
sequences = tokenizer.texts_to_sequences([tokens])[0]  # Get sequences from single text

# Hyperparameters (adjust as needed)
max_len = 50
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128

# Create Sequences and Labels
input_sequences = []
labels = []

for i in range(0, len(sequences) - max_len):
    input_sequences.append(sequences[i:i + max_len])
    labels.append(sequences[i + max_len])

# Convert to numpy arrays
input_sequences = np.array(input_sequences)
labels = np.array(labels)

# Convert labels to one-hot encoding
labels = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)

# Pad sequences
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='post')


In [54]:
# Model Building
def build_rnn_model():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_len),
        SimpleRNN(units=128, return_sequences=False),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def build_lstm_model():
    model = Sequential([
        Embedding(vocab_size, embedding_dim, input_length=max_len),
        LSTM(units=64, return_sequences=False),
        Dense(vocab_size, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [9]:
from tensorflow.keras.optimizers import Adam

In [35]:
from sklearn.model_selection import train_test_split

train_sequences, val_sequences, train_labels, val_labels = train_test_split(input_sequences, labels, test_size=0.2)

In [48]:
lstm_model = build_lstm_model()
lstm_model.fit(input_sequences, labels, epochs=60, batch_size=32, validation_split=0.2)

Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 24/60
Epoch 25/60
Epoch 26/60
Epoch 27/60
Epoch 28/60
Epoch 29/60
Epoch 30/60
Epoch 31/60
Epoch 32/60
Epoch 33/60
Epoch 34/60
Epoch 35/60
Epoch 36/60
Epoch 37/60
Epoch 38/60
Epoch 39/60
Epoch 40/60
Epoch 41/60
Epoch 42/60
Epoch 43/60
Epoch 44/60
Epoch 45/60
Epoch 46/60
Epoch 47/60
Epoch 48/60
Epoch 49/60
Epoch 50/60
Epoch 51/60
Epoch 52/60
Epoch 53/60
Epoch 54/60
Epoch 55/60
Epoch 56/60
Epoch 57/60
Epoch 58/60
Epoch 59/60
Epoch 60/60


<keras.src.callbacks.History at 0x7a54560ddc00>

In [55]:
rnn_model = build_rnn_model()
rnn_model.fit(input_sequences, labels, epochs=25, batch_size=32, validation_split=0.2)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.src.callbacks.History at 0x7a5457045d20>

In [12]:
def predict_next_word(model, sequence):
  padded_sequence = pad_sequences([sequence], maxlen=max_len, padding='post')
  predictions = model.predict(padded_sequence)[0]
  return tokenizer.index_word[tf.math.argmax(predictions).numpy()]


In [56]:
seed_text = "Their most significant feature is their coat, which is made up of soft waves, curls, and ringlets, resembling a shaggy"
cleaned_text = clean_text(seed_text)
tokens = tokenize_and_lemmatize(cleaned_text)
seed_sequence = tokenizer.texts_to_sequences([tokens])[0]

predicted_word_rnn = predict_next_word(rnn_model, seed_sequence)
predicted_word_lstm = predict_next_word(lstm_model, seed_sequence)

print(f"RNN predicted next word: {predicted_word_rnn}")
print(f"LSTM predicted next word: {predicted_word_lstm}")

RNN predicted next word: breeding
LSTM predicted next word: 1997


In [57]:
seed_text = "The breed standard describes a muscular foreign-type body, which is medium in size with longish legs and"
cleaned_text = clean_text(seed_text)
tokens = tokenize_and_lemmatize(cleaned_text)
seed_sequence = tokenizer.texts_to_sequences([tokens])[0]

predicted_word_rnn = predict_next_word(rnn_model, seed_sequence)
predicted_word_lstm = predict_next_word(lstm_model, seed_sequence)

print(f"RNN predicted next word: {predicted_word_rnn}")
print(f"LSTM predicted next word: {predicted_word_lstm}")

RNN predicted next word: bred
LSTM predicted next word: breed


In [26]:
def calculate_perplexity(model, sequences, max_length):
    total_log_prob = 0.0
    total_words = 0

    for seq in sequences:
        input_seq = seq[:-1]
        target_seq = seq[1:]

        input_padded = pad_sequences([input_seq], maxlen=max_length, padding='post')
        preds = model.predict(input_padded, verbose=0)[0]

        # Ensure preds is 1D array and handle it correctly
        for t, word in enumerate(target_seq):
            if word == 0:
                break
            total_log_prob += np.log(preds[word])  # preds[word] instead of preds[t, word]
            total_words += 1

    perplexity = np.exp(-total_log_prob / total_words)
    return perplexity


In [27]:
def evaluate_accuracy(model, sequences, max_length):
    correct_preds = 0
    total_preds = 0

    for seq in sequences:
        input_seq = seq[:-1]
        target_seq = seq[1:]

        input_padded = pad_sequences([input_seq], maxlen=max_length, padding='post')
        preds = model.predict(input_padded, verbose=0)[0]

        for t, word in enumerate(target_seq):
            if word == 0:
                break
            predicted_word = np.argmax(preds)  # Get the predicted word index from 1D preds
            if predicted_word == word:
                correct_preds += 1
            total_preds += 1

    accuracy = correct_preds / total_preds
    return accuracy


In [58]:
rnn_perplexity = calculate_perplexity(rnn_model, val_sequences, max_len)
rnn_accuracy = evaluate_accuracy(rnn_model, val_sequences, max_len)

print(f"RNN Model Perplexity: {rnn_perplexity}")
print(f"RNN Model Accuracy: {rnn_accuracy}")


lstm_perplexity = calculate_perplexity(lstm_model, val_sequences, max_len)
lstm_accuracy = evaluate_accuracy(lstm_model, val_sequences, max_len)

print(f"LSTM Model Perplexity: {lstm_perplexity}")
print(f"LSTM Model Accuracy: {lstm_accuracy}")



RNN Model Perplexity: 4058.1804259379596
RNN Model Accuracy: 0.006388642413487134
LSTM Model Perplexity: 1715.1280243652964
LSTM Model Accuracy: 0.018278615794143745


**Conclusion** **and comparison:**

The RNN model has a higher perplexity (4058.18) compared to the LSTM model (1715.128), suggesting that the LSTM model is better at predicting the next word in the sequence.
The RNN model has a very low accuracy (0.64%) compared to the LSTM model (1.82%). While both accuracies are low, the LSTM model still outperforms the RNN model.
Some of the reasons for this are:
LSTMs have a more complex architecture with mechanisms (input, forget, and output gates) that help them retain important information over longer sequences.
Simple RNNs lack these mechanisms, making them less capable of handling dependencies in longer sequences.

To receive better performance, we might tune the hyperparameters, add more layers. And, most importantly, use bigger and more diverse dataset for training.

In [59]:
!pip install transformers torch
!pip install scikit-learn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [61]:
def preprocess_text(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

def kl_sum(similarity_matrix, sentences, num_sentences=5):
    sentence_scores = np.sum(similarity_matrix, axis=1)
    ranked_sentences = [sentences[i] for i in np.argsort(sentence_scores)[-num_sentences:]]
    return ' '.join(ranked_sentences)

sentences = preprocess_text(text_data)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(sentences)
similarity_matrix = cosine_similarity(X, X)
summary = kl_sum(similarity_matrix, sentences, num_sentences=5)

print("Summary:")
print(summary)


Summary:
In June 2008, the LaPerm gained Provisional Recognition in the GCCF and the first cat to gain an Intermediate Certificate was Aswani Miranda Keys. The first LaPerm with an Imperial title was also Aswani Miranda Keys, the title being gained at the world's first LaPerm breed show, which was held by the LaPerm Cat Club. The coat varies according to the season and the maturity of the cat but is essentially wavy or curly with the longest and most defined curls in the ruff and on the neck. The UK now has an active LaPerm breeding program and is the home of the LaPerm Cat Club. The LaPerm is a breed of cat.


# **Chat GPT model**

In [75]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/547.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m542.7/547.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 

In [3]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from datasets import load_dataset, Dataset


In [3]:
!pip install transformers torch
from google.colab import files




In [7]:
with open("laperm_corpus.txt", "w") as f:
    f.write(text_data)

# Then download the file
#files.download("laperm_corpus.txt")


In [12]:
!pip install transformers[torch]
!pip install accelerate -U
!pip install --upgrade accelerate

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.32.1


In [4]:
# Load the corpus
dataset = load_dataset('text', data_files={'train': 'laperm_corpus.txt'})

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add a padding token to the tokenizer
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [5]:
# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=256)

# Create labels by shifting inputs
def add_labels(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples


In [6]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])

tokenized_datasets = tokenized_datasets.map(add_labels, batched=True)
tokenized_datasets.set_format("torch")

In [7]:
# Define the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Masked Language Modeling is not used here
)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-laperm",
    overwrite_output_dir=True,
    num_train_epochs=3,  # Reduced number of epochs
    per_device_train_batch_size=2,  # Reduced batch size to lower memory usage
    save_steps=10_000,
    save_total_limit=2,
    fp16=True,  # Use mixed precision training
    gradient_accumulation_steps=2,  # Reduced gradient accumulation steps to reduce memory usage
    dataloader_num_workers=2,  # Reduced number of worker processes
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()

  self.pid = os.fork()


Step,Training Loss


TrainOutput(global_step=15, training_loss=3.977416229248047, metrics={'train_runtime': 317.841, 'train_samples_per_second': 0.179, 'train_steps_per_second': 0.047, 'total_flos': 7446822912000.0, 'train_loss': 3.977416229248047, 'epoch': 3.0})

In [8]:
# Save the model and tokenizer
trainer.save_model("./gpt2-laperm")
tokenizer.save_pretrained("./gpt2-laperm")

# Load the fine-tuned model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./gpt2-laperm")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-laperm")

In [9]:
# Generate completions for partial sentences
partial_sentences = [
    "The LaPerm is a rex breed which originated in the United States and is now",
    "They are reputed to be hypoallergenic cats, provoking a significantly lower level of",
    "Their most significant feature is their coat, which is made up of soft waves, curls,",
    "The LaPerm is in many ways a cat of moderation with no extremes and is still true to",
    "The coat varies according to the season and the maturity of the cat but is essentially wavy or "
]

for sentence in partial_sentences:
    inputs = tokenizer.encode(sentence, return_tensors="pt")
    attention_mask = inputs != tokenizer.pad_token_id
    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=40,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        num_beams=5,  # Adding beam search for better quality
        early_stopping=True  # Stop when an EOS token is generated
    )
    print(f"Original: {sentence}")
    print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}\n")



Original: The LaPerm is a rex breed which originated in the United States and is now
Generated: The LaPerm is a rex breed which originated in the United States and is now found in many countries around the world. The breed was first introduced to the UK in 1997 and has since spread

Original: They are reputed to be hypoallergenic cats, provoking a significantly lower level of
Generated: They are reputed to be hypoallergenic cats, provoking a significantly lower level of allergic reactions than other breeds. However, there is no scientific evidence to support the use of these cats for

Original: Their most significant feature is their coat, which is made up of soft waves, curls,
Generated: Their most significant feature is their coat, which is made up of soft waves, curls, and long, curly hair. They are also known for their ability to grow up to six feet tall.


Original: The LaPerm is in many ways a cat of moderation with no extremes and is still true to
Generated: The LaPerm is in man

# **Sentiment Analysis**

In [72]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter

# Ensure you have downloaded the VADER lexicon
nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()


# Analyze sentiment for each sentence
sentiment_labels = []

for sentence in sentences:
    scores = sid.polarity_scores(sentence)
    if scores['compound'] >= 0.05:
        sentiment_labels.append('positive')
    elif scores['compound'] <= -0.05:
        sentiment_labels.append('negative')
    else:
        sentiment_labels.append('neutral')

# Count the occurrences of each sentiment
sentiment_counts = Counter(sentiment_labels)
total_sentences = len(sentences)

# Calculate percentage distribution
sentiment_distribution = {label: count / total_sentences * 100 for label, count in sentiment_counts.items()}

print("Sentiment Distribution:")
print(f"Positive: {sentiment_distribution.get('positive', 0):.2f}%")
print(f"Negative: {sentiment_distribution.get('negative', 0):.2f}%")
print(f"Neutral: {sentiment_distribution.get('neutral', 0):.2f}%")


Sentiment Distribution:
Positive: 52.87%
Negative: 8.05%
Neutral: 39.08%


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [74]:
for sentence in sentences[:10]:  # Check the first 10 sentences
    scores = sid.polarity_scores(sentence)
    print(f"Sentence: {sentence}")
    print(f"Scores: {scores}\n")


Sentence: The LaPerm is a breed of cat.
Scores: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Sentence: A LaPerm's fur is curly (hence the name "perm"), with the tightest curls being on the throat and on the base of the ears.
Scores: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Sentence: LaPerms come in many colors and patterns.
Scores: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Sentence: LaPerms generally have a very affectionate personality.
Scores: {'neg': 0.0, 'neu': 0.61, 'pos': 0.39, 'compound': 0.4927}

Sentence: The LaPerm is a rex breed which originated in the United States and is now present in many other countries worldwide.
Scores: {'neg': 0.0, 'neu': 0.872, 'pos': 0.128, 'compound': 0.4215}

Sentence: The breed is genetically unique and not related to any other rex cat varieties, having a dominant gene causing their curly coats.
Scores: {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

Sentence: They have an elegant and athletic build an