##**Imports**

In [1]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import numpy as np
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Load pre-trained model and tokenizer


In [2]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Function to generate text for a specific field


In [3]:
def generate_document(prompt, max_length=200, temperature=0.9):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=max_length, temperature=temperature)
    return tokenizer.decode(output[0], skip_special_tokens=True)


# Function to generate documents for given fields

In [4]:
def generate_documents_for_fields(fields, gen_pipeline):
    generated_documents = {}
    for field, prompt in fields.items():
        generated_documents[field] = generate_document(prompt)
    return generated_documents


# Function to preprocess documents for TF-IDF calculation


In [5]:
def preprocess_documents_for_tfidf(documents):
    preprocessed_documents = []
    for doc in documents.values():
        # Cleaning
        cleaned_text = ''.join([char if char.isalnum() or char.isspace() else ' ' for char in doc])
        # Normalization
        normalized_text = cleaned_text.lower()
        preprocessed_documents.append(normalized_text)
    return preprocessed_documents


# Function to calculate TF-IDF matrix


In [6]:
def calculate_tfidf(preprocessed_documents):
    # TF-IDF calculation using scikit-learn
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)
    return tfidf_matrix



# Function to calculate TF-IDF from scratch

In [7]:
def calculate_tfidf_from_scratch(preprocessed_documents):
    # TF calculation using scikit-learn
    count_vectorizer = TfidfVectorizer(use_idf=False)
    term_freq_matrix = count_vectorizer.fit_transform(preprocessed_documents)

    total_docs = len(preprocessed_documents)
    word_idf = {}
    for word in count_vectorizer.vocabulary_:
        word_docs_count = sum(1 for doc in preprocessed_documents if word in doc)
        word_idf[word] = total_docs / (1 + word_docs_count)

    # Convert term_freq_matrix to float64
    term_freq_matrix = term_freq_matrix.astype(np.float64)

    # Multiply TF * IDF
    tfidf_matrix_scratch = term_freq_matrix.copy()
    for word, idx in count_vectorizer.vocabulary_.items():
        tfidf_matrix_scratch[:, idx] *= word_idf[word]

    # Normalize TF-IDF
    tfidf_matrix_scratch = normalize(tfidf_matrix_scratch)
    return tfidf_matrix_scratch

In [8]:
import math

In [9]:
# Define fields and their prompts
fields = {
    "technology": "The latest advancements in technology.",
    "healthcare": "Improving healthcare systems for better patient outcomes.",
    "business": "Strategies for successful business growth.",
}

# Generate documents for given fields
generated_documents = generate_documents_for_fields(fields, model)

# Preprocess generated documents for TF-IDF calculation
preprocessed_documents = preprocess_documents_for_tfidf(generated_documents)




The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [10]:
# Calculate TF-IDF matrix using scikit-learn
tfidf_matrix = calculate_tfidf(preprocessed_documents)

# Calculate TF-IDF matrix from scratch
tfidf_matrix_scratch = calculate_tfidf_from_scratch(preprocessed_documents)

# Print TF-IDF matrices
print("TF-IDF Matrix (scikit-learn):\n", tfidf_matrix.toarray())
print("\nTF-IDF Matrix (from scratch):\n", (tfidf_matrix_scratch.toarray()))

TF-IDF Matrix (scikit-learn):
 [[0.         0.03661862 0.         0.25633033 0.         0.19494582
  0.         0.         0.         0.         0.         0.29294895
  0.19494582 0.25633033 0.         0.         0.18309309 0.0278494
  0.         0.         0.         0.         0.         0.
  0.         0.         0.25633033 0.         0.         0.
  0.0278494  0.         0.         0.19494582 0.05569881 0.03661862
  0.21971171 0.18309309 0.         0.25953055 0.         0.
  0.25633033 0.25633033 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.03661862
  0.         0.13924702 0.         0.         0.         0.
  0.03661862 0.15139282 0.17302037 0.03661862 0.19494582 0.0278494
  0.25953055 0.18309309 0.         0.25633033 0.         0.        ]
 [0.23022222 0.         0.17508994 0.         0.         0.
  0.         0.02877778 0.         0.         0.23022222 0.
  0.         0.         0.23022222 0.         0.         0.
  0.         0

In [11]:
print("\nTF-IDF Matrix (from scratch):\n", (tfidf_matrix_scratch.toarray()))


TF-IDF Matrix (from scratch):
 [[0.         0.03877591 0.         0.27143135 0.         0.13571568
  0.         0.         0.         0.         0.         0.31020726
  0.18095423 0.27143135 0.         0.         0.19387954 0.0258506
  0.         0.         0.         0.         0.         0.
  0.         0.         0.27143135 0.         0.         0.
  0.01938795 0.         0.         0.18095423 0.03877591 0.03877591
  0.23265544 0.19387954 0.         0.23265544 0.         0.
  0.27143135 0.27143135 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.03877591
  0.         0.12925302 0.         0.         0.         0.
  0.03877591 0.13571568 0.15510363 0.03877591 0.18095423 0.0258506
  0.23265544 0.19387954 0.         0.27143135 0.         0.        ]
 [0.23873802 0.         0.15915868 0.         0.         0.
  0.         0.02984225 0.         0.         0.23873802 0.
  0.         0.         0.23873802 0.         0.         0.
  0.         

In [12]:
generated_documents = generate_documents_for_fields(fields, model)
print(generated_documents)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


{'technology': 'The latest advancements in technology.\n\nThe first of these is the new "smart" camera. This is a camera that can be used to capture images of people, places, and things. It\'s a camera that can be used to capture images of people, places, and things. It\'s a camera that can be used to capture images of people, places, and things.\n\nThe next step is to develop a new type of camera that can be used to capture images of people, places, and things.\n\nThe next step is to develop a new type of camera that can be used to capture images of people, places, and things.\n\nThe next step is to develop a new type of camera that can be used to capture images of people, places, and things.\n\nThe next step is to develop a new type of camera that can be used to capture images of people, places, and things.\n\nThe next step is to develop a new type', 'healthcare': 'Improving healthcare systems for better patient outcomes.\n\nThe report also recommends that the government take action 