In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from collections import Counter
import math
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def generate_documents(topic_name, num_documents=5, max_length=200):
    # Load pre-trained GPT-2 model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    # Encode the input text
    input_text = f"Write a document about '{topic_name}'"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    # Generate documents based on the input topic
    generated_docs = []
    for _ in range(num_documents):
        # Generate text based on the input
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode and append generated text to the list
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        generated_docs.append(generated_text)

    return generated_docs


topic_name = "sports"
num_documents = 5
documents = generate_documents(topic_name, num_documents)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nasser\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nasser\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nasser\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Function to preprocess text
def preprocess_text(text):
    # Cleaning data: Remove symbols and special characters
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Normalization: Convert text to lowercase
    normalized_text = cleaned_text.lower()
    
    # Tokenization: Split text into words
    words = word_tokenize(normalized_text)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    
    # Return preprocessed text as a string
    return ' '.join(lemmatized_words)

# Preprocess documents
preprocessed_documents = [preprocess_text(doc) for doc in documents]



In [4]:
# Calculate Term Frequency (TF) for each word in all documents
def calculate_tf(documents_tokens):
    tf_documents = []
    for doc_tokens in documents_tokens:
        word_count = Counter(doc_tokens.split())
        tf_doc = {word: count/len(doc_tokens.split()) for word, count in word_count.items()}
        tf_documents.append(tf_doc)
    return tf_documents

# Calculate Inverse Document Frequency (IDF) for each word
def calculate_idf(documents_tokens):
    idf_values = {}
    num_documents = len(documents_tokens)
    
    # Flatten list of tokenized documents
    all_words = set(word for doc in documents_tokens for word in doc.split())
    
    for word in all_words:
        num_docs_containing_word = sum([1 for doc in documents_tokens if word in doc])
        idf_values[word] = math.log(num_documents +1 / (1 + num_docs_containing_word))+1
    
    return idf_values

# Calculate TF-IDF and normalize
def calculate_tfidf(documents_tokens):
    tf_documents = calculate_tf(documents_tokens)
    idf_values = calculate_idf(documents_tokens)
    
    tfidf_normalized = []
    for tf_doc in tf_documents:
        tfidf_doc = {word: tf * idf_values[word] for word, tf in tf_doc.items()}
        # Normalize TF-IDF scores
        max_tfidf = max(tfidf_doc.values())
        tfidf_normalized.append({word: tfidf_score / max_tfidf for word, tfidf_score in tfidf_doc.items()})
    
    return tfidf_normalized

In [5]:
# Calculate TF-IDF scores manually
tfidf_scores_manual = calculate_tfidf(preprocessed_documents)

# TF-IDF vectorization using TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix_sklearn = vectorizer.fit_transform(preprocessed_documents)

# Get feature names (unique words)
feature_names = vectorizer.get_feature_names_out()

# Normalize TF-IDF scores from sklearn to match manual normalization
def normalize_tfidf(tfidf_matrix_sklearn):
    tfidf_normalized = []
    for i in range(tfidf_matrix_sklearn.shape[0]):
        max_tfidf = max(tfidf_matrix_sklearn[i].toarray()[0])
        tfidf_normalized.append({feature_names[j]: tfidf_matrix_sklearn[i, j]/max_tfidf for j in tfidf_matrix_sklearn[i].nonzero()[1]})
    return tfidf_normalized

# Normalize TF-IDF scores from sklearn
tfidf_scores_sklearn = normalize_tfidf(tfidf_matrix_sklearn)

# Display normalized TF-IDF scores (Manual)
print("TF-IDF scores calculated manually:")
for i, doc_tfidf in enumerate(tfidf_scores_manual):
    print(f"Document {i+1}: {doc_tfidf}")

# Display normalized TF-IDF scores from sklearn
print("TF-IDF scores using TfidfVectorizer (sklearn):")
for i, doc_tfidf in enumerate(tfidf_scores_sklearn):
    print(f"Document {i+1}: {doc_tfidf}")

TF-IDF scores calculated manually:
Document 1: {'write': 0.2442212403106279, 'document': 0.2442212403106279, 'aboutsports': 0.2442212403106279, 'list': 0.25, 'people': 0.25, 'play': 0.24715577406681086, 'espn': 0.7500000000000001, 'might': 0.25, 'include': 0.25, 'name': 0.25, 'person': 0.25, 'played': 0.25, 'program': 0.5, 'team': 0.4943115481336217, 'hisher': 0.25, 'teammate': 0.25, 'number': 1.0, 'player': 0.5, 'career': 0.25, 'total': 0.25, 'want': 0.24715577406681086, 'know': 0.25, 'average': 0.5, 'use': 0.9828006106119875, 'value': 0.25, 'example': 0.7414673222004325, 'mean': 0.25, 'million': 1.0, 'x': 0.24481564782206536, 'could': 0.5, 'also': 0.24570015265299688, 'would': 0.25, 'see': 0.24570015265299688, 'comparing': 0.25, 'year': 0.25, 'say': 0.25, 'nfl': 0.24715577406681086}
Document 2: {'write': 0.3256283204141705, 'document': 0.3256283204141705, 'aboutsports': 0.3256283204141705, 'link': 0.3295410320890811, 'httpsportbookcomenusarticlesnewsportschicagobillsfootballteamnewsc

In [7]:
print(tfidf_scores_sklearn[0]['nfl'])
print(tfidf_scores_manual[0]['nfl'])

0.2016984258719968
0.24715577406681086


In [8]:
print(tfidf_scores_sklearn[0]['see'])
print(tfidf_scores_manual[0]['see'])


0.16742791363812926
0.24570015265299688
