In [1]:
import nltk
import re
import numpy as np
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize

  from .autonotebook import tqdm as notebook_tqdm





# Generating Documents

In [2]:
# Initialize the text generation pipeline
text_generator = pipeline("text-generation")

No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [3]:
def generate_complete_document(topic_sentence):
    prompt = f"{topic_sentence}\n"
    generated_document = text_generator(prompt, max_length=100, do_sample=True)[0]['generated_text']
    # Split the generated document into sentences
    sentences = nltk.sent_tokenize(generated_document)
    # Remove incomplete sentences
    complete_sentences = [sentence for sentence in sentences if re.match(r'.*[.!?]$', sentence.strip())]
    # Join the complete sentences back into a document
    complete_document = ' '.join(complete_sentences)
    return complete_document

In [4]:
# Define topic sentences for each topic
topic_sentences = {
    "health_care": "The advancement of medical technology is revolutionizing healthcare.",
    "AI": "Artificial Intelligence is transforming industries and reshaping the future.",
    "football": "Football is one of the most popular sports globally, captivating millions of fans.",
    "computer_science": "Computer science drives innovation and powers technological progress."
}

# Preprocessing

In [5]:
def preprocess_text(text):
    # Step 1: Cleaning data from symbols and characters not part of the data
    text = re.sub(r'[^\w\s]', '', text)
    
    # Step 2: Normalization - Convert text to lowercase
    text = text.lower()
    
    # Step 3: Tokenization - Split the data into words
    words = word_tokenize(text)
    
    # Step 4: Remove stop words 
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]
    
    # Step 5: 
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join the processed words back into a string
    processed_text = ' '.join([word for word in words if len(word)>2])
    
    return processed_text

# Let's genrate Documents

In [6]:
# Generate documents for each topic
generated_documents = {}
for topic, topic_sentence in topic_sentences.items():
    generated_document = generate_complete_document(topic_sentence)
    generated_documents[topic] = generated_document

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [7]:
# Preprocess generated documents
preprocessed_documents = [preprocess_text(doc) for doc in generated_documents.values()]

In [8]:
# Print generated documents
for topic, document in zip(topic_sentences.keys(), preprocessed_documents):
    print(f"Topic: {topic}")
    print("Generated Document:")
    print(document)
    print()

Topic: health_care
Generated Document:
advancement medical technology revolutionizing healthcare new technology brings great benefit physical side procedure psychological side dont confuse human body bacteria prioninfected fungi human body also microcosm ancient time shared basic sacred human need new technology allow mindful health

Topic: AI
Generated Document:
artificial intelligence transforming industry reshaping future know expect begin take role industrial guard first step addressing transformation work bring value invention entirely new level see making future robotics whole lot brighter said pivatnam

Topic: football
Generated Document:
football one popular sport globally captivating million fan fan experience would probably good product well see

Topic: computer_science
Generated Document:
computer science drive innovation power technological progress information citizen scientist engineer see cosmiccores website cosmiccorecom webcast cosmiccore webcast



In [9]:
# Get unique words
all_words = ' '.join(preprocessed_documents).split()
unique_words = list(set(all_words))
print(unique_words)

['dont', 'know', 'invention', 'website', 'future', 'drive', 'revolutionizing', 'time', 'cosmiccore', 'prioninfected', 'side', 'intelligence', 'level', 'would', 'power', 'new', 'great', 'probably', 'mindful', 'also', 'body', 'bring', 'first', 'advancement', 'take', 'reshaping', 'product', 'addressing', 'transforming', 'making', 'step', 'begin', 'brighter', 'need', 'bacteria', 'procedure', 'whole', 'entirely', 'cosmiccorecom', 'benefit', 'innovation', 'health', 'physical', 'well', 'see', 'computer', 'technological', 'expect', 'said', 'psychological', 'citizen', 'guard', 'brings', 'medical', 'fungi', 'artificial', 'experience', 'progress', 'sacred', 'fan', 'popular', 'good', 'transformation', 'scientist', 'million', 'information', 'technology', 'captivating', 'healthcare', 'confuse', 'pivatnam', 'sport', 'lot', 'webcast', 'role', 'engineer', 'cosmiccores', 'microcosm', 'ancient', 'one', 'robotics', 'football', 'basic', 'work', 'allow', 'globally', 'industrial', 'human', 'value', 'science'

In [10]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

In [11]:
# Fit and transform the preprocessed corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)

In [12]:
tfidf_vectorizer.vocabulary_

{'advancement': 1,
 'medical': 49,
 'technology': 81,
 'revolutionizing': 67,
 'healthcare': 37,
 'new': 54,
 'brings': 13,
 'great': 34,
 'benefit': 9,
 'physical': 56,
 'side': 76,
 'procedure': 62,
 'psychological': 65,
 'dont': 21,
 'confuse': 17,
 'human': 38,
 'body': 10,
 'bacteria': 6,
 'prioninfected': 60,
 'fungi': 30,
 'also': 3,
 'microcosm': 50,
 'ancient': 4,
 'time': 82,
 'shared': 75,
 'basic': 7,
 'sacred': 70,
 'need': 53,
 'allow': 2,
 'mindful': 52,
 'health': 36,
 'artificial': 5,
 'intelligence': 43,
 'transforming': 84,
 'industry': 40,
 'reshaping': 66,
 'future': 31,
 'know': 45,
 'expect': 25,
 'begin': 8,
 'take': 79,
 'role': 69,
 'industrial': 39,
 'guard': 35,
 'first': 28,
 'step': 78,
 'addressing': 0,
 'transformation': 83,
 'work': 90,
 'bring': 12,
 'value': 85,
 'invention': 44,
 'entirely': 24,
 'level': 46,
 'see': 74,
 'making': 48,
 'robotics': 68,
 'whole': 89,
 'lot': 47,
 'brighter': 11,
 'said': 71,
 'pivatnam': 57,
 'football': 29,
 'one': 5

In [13]:
print(tfidf_matrix)

  (0, 36)	0.1354740329710464
  (0, 52)	0.1354740329710464
  (0, 2)	0.1354740329710464
  (0, 53)	0.1354740329710464
  (0, 70)	0.1354740329710464
  (0, 7)	0.1354740329710464
  (0, 75)	0.1354740329710464
  (0, 82)	0.1354740329710464
  (0, 4)	0.1354740329710464
  (0, 50)	0.1354740329710464
  (0, 3)	0.1354740329710464
  (0, 30)	0.1354740329710464
  (0, 60)	0.1354740329710464
  (0, 6)	0.1354740329710464
  (0, 10)	0.2709480659420928
  (0, 38)	0.40642209891313924
  (0, 17)	0.1354740329710464
  (0, 21)	0.1354740329710464
  (0, 65)	0.1354740329710464
  (0, 62)	0.1354740329710464
  (0, 76)	0.2709480659420928
  (0, 56)	0.1354740329710464
  (0, 9)	0.1354740329710464
  (0, 34)	0.1354740329710464
  (0, 13)	0.1354740329710464
  :	:
  (2, 51)	0.23968061829807585
  (2, 14)	0.23968061829807585
  (2, 32)	0.23968061829807585
  (2, 77)	0.23968061829807585
  (2, 58)	0.23968061829807585
  (2, 55)	0.23968061829807585
  (2, 29)	0.23968061829807585
  (2, 74)	0.15298503393562646
  (3, 18)	0.22699495530751052
  (3

In [14]:
# Get IDF for each term
idf = tfidf_vectorizer.idf_

In [15]:
# Convert TF-IDF matrix to array for easier manipulation
tfidf_array = tfidf_matrix.toarray()

# Calculate Term Frequency (TF) for each word in each document
term_frequency_matrix = tfidf_array / np.sum(tfidf_array, axis=1, keepdims=True)

# Normalize the TF-IDF matrix
tfidf_norm = normalize(tfidf_array, norm='l2', axis=1)

In [16]:
# Get the feature names (terms)
terms = tfidf_vectorizer.get_feature_names_out()

# Iterate over the terms to print TF, IDF, TF-IDF, and Normalized TF-IDF
for i, term in enumerate(terms):
    print(f"Term: {term}")
    print(f"  TF: {term_frequency_matrix[:, i]}")
    print(f"  IDF: {idf[i]}")
    print(f"  TF-IDF: {tfidf_array[:, i]}")
    print(f"  Normalized TF-IDF: {tfidf_norm[:, i]}")
    print()


Term: addressing
  TF: [0.         0.03083879 0.         0.        ]
  IDF: 1.916290731874155
  TF-IDF: [0.         0.17142549 0.         0.        ]
  Normalized TF-IDF: [0.         0.17142549 0.         0.        ]

Term: advancement
  TF: [0.02661215 0.         0.         0.        ]
  IDF: 1.916290731874155
  TF-IDF: [0.13547403 0.         0.         0.        ]
  Normalized TF-IDF: [0.13547403 0.         0.         0.        ]

Term: allow
  TF: [0.02661215 0.         0.         0.        ]
  IDF: 1.916290731874155
  TF-IDF: [0.13547403 0.         0.         0.        ]
  Normalized TF-IDF: [0.13547403 0.         0.         0.        ]

Term: also
  TF: [0.02661215 0.         0.         0.        ]
  IDF: 1.916290731874155
  TF-IDF: [0.13547403 0.         0.         0.        ]
  Normalized TF-IDF: [0.13547403 0.         0.         0.        ]

Term: ancient
  TF: [0.02661215 0.         0.         0.        ]
  IDF: 1.916290731874155
  TF-IDF: [0.13547403 0.         0.         0. 