##**Imports**

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
import numpy as np
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


# Load pre-trained model and tokenizer


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# Function to generate text for a specific field


In [None]:
def generate_document(prompt, max_length=200, temperature=0.9):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(input_ids, max_length=max_length, temperature=temperature)
    return tokenizer.decode(output[0], skip_special_tokens=True)


# Function to generate documents for given fields

In [None]:
def generate_documents_for_fields(fields, gen_pipeline):
    generated_documents = {}
    for field, prompt in fields.items():
        generated_documents[field] = generate_document(prompt)
    return generated_documents


# Function to preprocess documents for TF-IDF calculation


In [None]:
def preprocess_documents_for_tfidf(documents):
    preprocessed_documents = []
    for doc in documents.values():
        # Cleaning
        cleaned_text = ''.join([char if char.isalnum() or char.isspace() else ' ' for char in doc])
        # Normalization
        normalized_text = cleaned_text.lower()
        preprocessed_documents.append(normalized_text)
    return preprocessed_documents


# Function to calculate TF-IDF matrix


In [None]:
def calculate_tfidf(preprocessed_documents):
    # TF-IDF calculation using scikit-learn
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)
    return tfidf_matrix


# Function to calculate TF-IDF from scratch

In [None]:
def calculate_tfidf_from_scratch(preprocessed_documents):
    def tf(word, document):
        return document.count(word) / len(document)

    def idf(word, documents):
        num_documents_with_word = sum(1 for doc in documents if word in doc)
        return np.log(len(documents) / (1 + num_documents_with_word))

    unique_words = set(word for doc in documents for word in doc)
    tfidf_vectors = []

    for doc in documents:
        tfidf_vector = [tf(word, doc) * idf(word, documents) for word in unique_words]
        tfidf_vectors.append(tfidf_vector)

    return tfidf_vectors




In [None]:
# Define fields and their prompts
fields = {
    "technology": "The latest advancements in technology.",
    "healthcare": "Improving healthcare systems for better patient outcomes.",
    "business": "Strategies for successful business growth.",
}

# Generate documents for given fields
generated_documents = generate_documents_for_fields(fields, model)

# Preprocess generated documents for TF-IDF calculation
preprocessed_documents = preprocess_documents_for_tfidf(generated_documents)

# Calculate TF-IDF matrix using scikit-learn
tfidf_matrix = calculate_tfidf(preprocessed_documents)

# Calculate TF-IDF matrix from scratch
tfidf_matrix_scratch = calculate_tfidf_from_scratch(preprocessed_documents)

# Print TF-IDF matrices
print("TF-IDF Matrix (scikit-learn):\n", tfidf_matrix.toarray())
print("\nTF-IDF Matrix (from scratch):\n", tfidf_matrix_scratch.toarray())


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


TF-IDF Matrix (scikit-learn):
 [[0.         0.03661862 0.         0.25633033 0.         0.19494582
  0.         0.         0.         0.         0.         0.29294895
  0.19494582 0.25633033 0.         0.         0.18309309 0.0278494
  0.         0.         0.         0.         0.         0.
  0.         0.         0.25633033 0.         0.         0.
  0.0278494  0.         0.         0.19494582 0.05569881 0.03661862
  0.21971171 0.18309309 0.         0.25953055 0.         0.
  0.25633033 0.25633033 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.03661862
  0.         0.13924702 0.         0.         0.         0.
  0.03661862 0.15139282 0.17302037 0.03661862 0.19494582 0.0278494
  0.25953055 0.18309309 0.         0.25633033 0.         0.        ]
 [0.23022222 0.         0.17508994 0.         0.         0.
  0.         0.02877778 0.         0.         0.23022222 0.
  0.         0.         0.23022222 0.         0.         0.
  0.         0

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Step 2: Text processing and unique words
# You can use any text processing techniques here, such as tokenization, stop word removal, etc.
# For simplicity, let's just use the documents as they are and extract unique words.

unique_words = set(" ".join(documents).split())

# Step 3: TF-IDF feature vector
tfidf_vectorizer = TfidfVectorizer(vocabulary=unique_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# tfidf_matrix now contains the TF-IDF representation of the documents
# Each row represents a document, and each column represents a unique word
# The value in each cell represents the TF-IDF score of that word in that document

# If you want to get the TF-IDF feature vector for each document separately:
for i in range(len(documents)):
    print("Document", i+1, "TF-IDF vector:")
    print(tfidf_matrix[i].toarray().flatten())


Document 1 TF-IDF vector:
[0.         0.         0.         0.46979139 0.         0.
 0.58028582 0.38408524 0.         0.         0.38408524 0.
 0.38408524]
Document 2 TF-IDF vector:
[0.         0.         0.         0.6876236  0.         0.
 0.         0.28108867 0.         0.53864762 0.28108867 0.
 0.28108867]
Document 3 TF-IDF vector:
[0.         0.         0.         0.         0.         0.
 0.         0.38713857 0.         0.         0.38713857 0.74187006
 0.38713857]
Document 4 TF-IDF vector:
[0.         0.         0.         0.46979139 0.         0.
 0.58028582 0.38408524 0.         0.         0.38408524 0.
 0.38408524]




In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Step 2: Text processing and unique words
# You can use any text processing techniques here, such as tokenization, stop word removal, etc.
# For simplicity, let's just use the documents as they are and extract unique words.

unique_words = set(" ".join(documents).split())

# Step 3: TF-IDF feature vector
tfidf_vectorizer = TfidfVectorizer(vocabulary=unique_words)
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Convert the set of unique words to a list
unique_words_list = list(unique_words)

# Create a DataFrame to display the TF-IDF feature vectors
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=unique_words_list)

# Print the DataFrame
print(tfidf_df)


   first  And  document?       the  document  third      This      one.   Is  \
0    0.0  0.0        0.0  0.469791       0.0    0.0  0.580286  0.384085  0.0   
1    0.0  0.0        0.0  0.687624       0.0    0.0  0.000000  0.281089  0.0   
2    0.0  0.0        0.0  0.000000       0.0    0.0  0.000000  0.387139  0.0   
3    0.0  0.0        0.0  0.469791       0.0    0.0  0.580286  0.384085  0.0   

         is    second  document.      this  
0  0.000000  0.384085    0.00000  0.384085  
1  0.538648  0.281089    0.00000  0.281089  
2  0.000000  0.387139    0.74187  0.387139  
3  0.000000  0.384085    0.00000  0.384085  


In [None]:
import numpy as np

def calculate_tfidf(documents):
    def tf(word, document):
        return document.count(word) / len(document)

    def idf(word, documents):
        num_documents_with_word = sum(1 for doc in documents if word in doc)
        return np.log(len(documents) / (1 + num_documents_with_word))

    unique_words = set(word for doc in documents for word in doc)
    tfidf_vectors = []

    for doc in documents:
        tfidf_vector = [tf(word, doc) * idf(word, documents) for word in unique_words]
        tfidf_vectors.append(tfidf_vector)

    return tfidf_vectors

# Assuming 'processed_documents' is your list of processed documents
tfidf_vectors = calculate_tfidf(documents)

# Print the TF-IDF feature vector for each document
for i, vec in enumerate(tfidf_vectors):
    print(f"TF-IDF for document {i+1}: {vec}")

TF-IDF for document 1: [-0.02479372792380108, -0.033058303898401434, 0.0, -0.016529151949200717, -0.008264575974600358, 0.010654891572288178, -0.02479372792380108, 0.0, -0.016529151949200717, -0.02479372792380108, 0.0, 0.0, 0.0, -0.008264575974600358, -0.008264575974600358, 0.0, 0.010654891572288178, 0.0, 0.0]
TF-IDF for document 2: [-0.018092720376827815, -0.030154533961379692, 0.0, -0.012061813584551877, -0.018092720376827815, 0.0, -0.018092720376827815, 0.0, -0.024123627169103753, -0.012061813584551877, 0.0, 0.0, 0.0, -0.018092720376827815, -0.018092720376827815, 0.0, 0.007775191147345428, 0.0, 0.0]
TF-IDF for document 3: [-0.025747332843947274, -0.04291222140657879, 0.026659506944613283, -0.025747332843947274, -0.017164888562631517, 0.0, -0.017164888562631517, 0.0, -0.017164888562631517, -0.025747332843947274, 0.0, 0.0, 0.0, -0.008582444281315759, -0.017164888562631517, 0.0, 0.0, 0.0, 0.0]
TF-IDF for document 4: [-0.033058303898401434, -0.033058303898401434, 0.0, -0.016529151949200

▶



---



In [None]:
documents_lower = [doc.lower() for doc in documents]

# Assuming you have your documents stored in a list called 'documents'
num_documents = len(documents)
print("Number of documents:", num_documents)

Number of documents: 4


In [None]:
import re

# Define a function to preprocess each document
def preprocess_document(doc):
    # Convert to lowercase
    doc = doc.lower()
    # Remove punctuation marks
    doc = re.sub(r'[^\w\s]', '', doc)
    doc =  re.sub(r"[^a-zA-Z0-9]", " ", doc)
    re.sub(r"\s+", " ", doc)
    return doc

# Preprocess each document
documents_preprocessed = [preprocess_document(doc) for doc in documents]

# Assuming you have your documents processed and stored in a list called 'processed_docs'
unique_words = set(" ".join(documents_preprocessed).split())
print("Unique words:", unique_words)

Unique words: {'one', 'first', 'the', 'document', 'third', 'and', 'is', 'second', 'this'}


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming you have your processed documents in 'processed_docs_list'
corpus = [' '.join(doc) for doc in documents]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Print the TF-IDF feature vector for each document
for i, doc in enumerate(X.toarray()):
    print(f"TF-IDF for document {i+1}: {doc}")

In [None]:
import numpy as np

def calculate_tfidf(documents):
    def tf(word, document):
        return document.count(word) / len(document)

    def idf(word, documents):
        num_documents_with_word = sum(1 for doc in documents if word in doc)
        return np.log(len(documents) / (1 + num_documents_with_word))

    unique_words = set(word for doc in documents for word in doc)
    tfidf_vectors = []

    for doc in documents:
        tfidf_vector = [tf(word, doc) * idf(word, documents) for word in unique_words]
        tfidf_vectors.append(tfidf_vector)

    return tfidf_vectors

# Assuming 'processed_documents' is your list of processed documents
tfidf_vectors = calculate_tfidf(documents)

# Print the TF-IDF feature vector for each document
for i, vec in enumerate(tfidf_vectors):
    print(f"TF-IDF for document {i+1}: {vec}")

In [None]:
import math

def preprocess_documents(documents):
    # Preprocess documents and get unique words
    unique_words = set()
    preprocessed_documents = []
    for doc in documents:
        preprocessed_doc = doc.lower().replace('.', '').replace('?', '')  # Remove punctuation
        preprocessed_documents.append(preprocessed_doc)
        unique_words.update(preprocessed_doc.split())
    return preprocessed_documents, list(unique_words)

def calculate_tf(preprocessed_documents, unique_words):
    # Calculate TF for each document
    tf_values = []
    for doc in preprocessed_documents:
        tf_doc = {}
        words = doc.split()
        word_count = len(words)
        for word in unique_words:
            tf_doc[word] = words.count(word) / word_count
        tf_values.append(tf_doc)
    return tf_values

def calculate_idf(preprocessed_documents, unique_words):
    # Calculate IDF for each word
    idf_values = {}
    num_documents = len(preprocessed_documents)
    for word in unique_words:
        num_documents_containing_word = sum(1 for doc in preprocessed_documents if word in doc)
        idf_values[word] = math.log(num_documents / (1 + num_documents_containing_word))
    return idf_values

def calculate_tfidf(preprocessed_documents, unique_words, tf_values, idf_values):
    # Calculate TF-IDF for each document
    tfidf_vectors = []
    for i, doc in enumerate(preprocessed_documents):
        tfidf_doc = {}
        for word in unique_words:
            tfidf_doc[word] = tf_values[i][word] * idf_values[word]
        tfidf_vectors.append(tfidf_doc)
    return tfidf_vectors

# Sample documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Step 1: Preprocess documents and get unique words
preprocessed_documents, unique_words = preprocess_documents(documents)

# Step 2: Calculate TF
tf_values = calculate_tf(preprocessed_documents, unique_words)

# Step 3: Calculate IDF
idf_values = calculate_idf(preprocessed_documents, unique_words)

# Step 4: Calculate TF-IDF
tfidf_vectors = calculate_tfidf(preprocessed_documents, unique_words, tf_values, idf_values)

# Print TF-IDF vectors for each document
for i, doc_tfidf in enumerate(tfidf_vectors):
    print("TF-IDF vector for document", i+1)
    for word, tfidf_score in doc_tfidf.items():
        print(f"{word}: {tfidf_score:.4f}")
    print()


TF-IDF vector for document 1
one: 0.0000
first: 0.0575
the: -0.0446
document: 0.0000
third: 0.0000
and: 0.0000
is: -0.0446
second: 0.0000
this: -0.0446

TF-IDF vector for document 2
one: 0.0000
first: 0.0000
the: -0.0372
document: 0.0000
third: 0.0000
and: 0.0000
is: -0.0372
second: 0.1155
this: -0.0372

TF-IDF vector for document 3
one: 0.1155
first: 0.0000
the: -0.0372
document: 0.0000
third: 0.1155
and: 0.1155
is: -0.0372
second: 0.0000
this: -0.0372

TF-IDF vector for document 4
one: 0.0000
first: 0.0575
the: -0.0446
document: 0.0000
third: 0.0000
and: 0.0000
is: -0.0446
second: 0.0000
this: -0.0446



In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.preprocessing import normalize
import string
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re

# Function to generate documents using GPT model
def generate_documents(num_documents, gen_pipeline):
    documents = []
    for _ in range(num_documents):
        generated_text = gen_pipeline("Generate a document about a topic")[0]['generated_text']
        # Remove special characters and extra whitespaces
        generated_text = re.sub(r'\s+', ' ', generated_text)
        documents.append(generated_text.strip())
    return documents

# Function to calculate TF-IDF for documents
def calculate_tfidf(documents):
    # Convert documents to lowercase and remove punctuation
    preprocessed_documents = [re.sub(r'[^\w\s]', '', doc.lower()) for doc in documents]

    # Initialize TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Fit and transform documents to TF-IDF matrix
    tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)

    # Get feature names (unique words)
    unique_words = tfidf_vectorizer.get_feature_names_out()

    # Create DataFrame to display TF-IDF feature vectors
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=unique_words)

    return tfidf_df

# Initialize GPT pipeline
gen = pipeline('text-generation', model ='EleutherAI/gpt-neo-2.7B')

# Generate documents
num_documents = 5
generated_documents = generate_documents(num_documents, gen)

# Calculate TF-IDF for generated documents
tfidf_result = calculate_tfidf(generated_documents)

print("TF-IDF Feature Vectors for Generated Documents:")
print(tfidf_result)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


KeyboardInterrupt: 