In [1]:
import os
import nltk
import math
import numpy as np
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [2]:
gen = pipeline('text-generation', model ='EleutherAI/gpt-neo-2.7B')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [4]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)

    # Lowercasing
    tokens = [token.lower() for token in tokens]

    # Remove punctuation and non-alphanumeric tokens
    tokens = [token for token in tokens if token.isalnum()]

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

In [5]:
def generate_document(output_directory, prompts, num_phrases):
    documents = []  # List to store preprocessed documents

    # Ensure the output directory exists
    os.makedirs(output_directory, exist_ok=True)

    for prompt in prompts:
        # Generate a unique filename for each prompt
        filename = os.path.join(output_directory, f"{prompt[:10]}_document.txt")

        with open(filename, 'w') as file:
            # Generate text based on the prompt
            generated_text = gen(prompt, max_length=50, num_return_sequences=num_phrases)
            # Write the generated text to the file
            for i, text in enumerate(generated_text):
                generated_text_cleaned = preprocess_text(text['generated_text'])
                documents.append(generated_text_cleaned)
                # file.write(f"Prompt: {prompt}\n")
                file.write(generated_text_cleaned)

    return documents

In [7]:
prompts = ["AI", "Deep Learning", "Natural Language Processing"]
documents = generate_document("Generated Documents", prompts, num_phrases=5)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [47]:
sum = 0;
st = set()
for doc in documents:
    for word in doc.split():
      st.add(word)
print(len(st))

187


In [8]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
feature_names = tfidf_vectorizer.get_feature_names_out()

print("TFIDF Matrix:")
print(pd.DataFrame(tfidf_matrix.toarray()))
print("\nFeature Names (Words):")
print(feature_names)

TFIDF Matrix:
         0         1         2         3         4         5         6    \
0   0.000000  0.000000  0.152392  0.000000  0.000000  0.000000  0.000000   
1   0.000000  0.000000  0.000000  0.230915  0.000000  0.000000  0.000000   
2   0.000000  0.000000  0.000000  0.000000  0.615535  0.205178  0.000000   
3   0.000000  0.000000  0.107759  0.000000  0.000000  0.000000  0.000000   
4   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
5   0.000000  0.000000  0.134645  0.000000  0.000000  0.000000  0.000000   
6   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.515325   
7   0.000000  0.000000  0.187650  0.000000  0.000000  0.000000  0.000000   
8   0.208761  0.208761  0.000000  0.000000  0.000000  0.000000  0.000000   
9   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
10  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
11  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  

In [9]:
pd.DataFrame(tfidf_matrix)

Unnamed: 0,0
0,"(0, 182)\t0.21694355412709468\n (0, 68)\t0...."
1,"(0, 112)\t0.23091547932294199\n (0, 106)\t0..."
2,"(0, 165)\t0.20517839388853568\n (0, 60)\t0...."
3,"(0, 157)\t0.15340484707502275\n (0, 139)\t0..."
4,"(0, 36)\t0.42609167703084017\n (0, 138)\t0...."
5,"(0, 67)\t0.1916789940247076\n (0, 8)\t0.191..."
6,"(0, 148)\t0.5153249567249414\n (0, 83)\t0.5..."
7,"(0, 113)\t0.26713662790744325\n (0, 185)\t0..."
8,"(0, 100)\t0.20876088919062544\n (0, 38)\t0...."
9,"(0, 48)\t0.14315306223919919\n (0, 75)\t0.1..."


In [10]:
len(documents)

15

In [11]:
documents

['ai lqe ratio energy entropy production positive value lqe implies stochastic process likely lead certain outcome collapse black hole would',
 'aigocap detection protein interest database downloaded genomic integration site b27 genome datasets used study downloaded ncbi nucle',
 'aivat aivat aka automated interstitial vat aivat robotic treatment cervical cancer evaluated randomized controlled trial rct first time',
 'ai find evidence theory moreover relationship neural circuit generating subjective sense depth subjective sense depth brain known result suggest subjective sense depth brain',
 'parameter gaussian process set respectively corresponding',
 'deep learning university chicago deep learning ai perspective hype deep learning recent exponential rise popularity caused many enthusiast purchase machine learning book amazon google book',
 'deep learning algebraic kernel shen',
 'deep learning william coughlin book ai short introduction cognitive scientist artificial intelligence pio

In [105]:
def calculate_tf(documents):
    res_tf = []
    for doc in documents:
      tf = {}
      for _ in documents:
        for word in _.split():
          tf[word] = 0
      word_count = Counter(doc.split())
      total_words = len(doc.split())
      for word, count in word_count.items():
        tf[word] = round((count/total_words),3)
      res_tf.append(tf)

    # convert dict into list
    res = []
    for i in res_tf:
      res.append(list(i.values()))
    return res

def calculate_idf(documents):
    idf = {}
    total_docs = len(documents)

    # Count document frequency for each word
    for doc in documents:
        words = set(doc.split())
        for word in words:
            idf[word] = idf.get(word, 0) + 1

    # Calculate IDF for each word
    for word, freq in idf.items():
        idf[word] = round(math.log(total_docs / freq), 3)

    res = list(idf.values())

    return res

def calculate_tfidf(tf, idf):
    tfidf = []
    for i in range(len(tf)):
      tmp = []
      for j in range(len(tf[i])):
        tmp.append(tf[i][j] * idf[j])
      tfidf.append(tmp)
    return tfidf

def normalize_tfidf(tfidf):
    normalized_tfidf = []
    for tfidf_doc in tfidf:
        norm = np.linalg.norm(tfidf_doc)
        normalized_tfidf_doc = [round((tfidf_doc[i]/norm),3) for i in range(len(tfidf_doc))]
        normalized_tfidf.append(normalized_tfidf_doc)
    return normalized_tfidf

In [15]:
def get_feature_names(documents):
    feature_names = set()
    for doc in documents:
        words = doc.split()
        feature_names.update(words)
    return sorted(list(feature_names))

In [110]:
# Step 1: Calculate TF
tf = calculate_tf(documents)

# Step 2: Calculate IDF
idf = calculate_idf(documents)

# Step 3: Multiply TF * IDF
tfidf = calculate_tfidf(tf, idf)

# Step 4: Normalize TF-IDF
normalized_tfidf = normalize_tfidf(tfidf)
# print(pd.DataFrame(normalized_tfidf))

feature_names = get_feature_names(documents)
print("Feature Names (Words):", feature_names)

Feature Names (Words): ['2', '20', 'ability', 'ai', 'aigocap', 'aivat', 'aka', 'algebraic', 'algorithm', 'amazon', 'analyzing', 'artificial', 'aspect', 'automated', 'b27', 'black', 'book', 'brain', 'broad', 'called', 'cancer', 'capture', 'caused', 'certain', 'cervical', 'change', 'chicago', 'circuit', 'clay', 'cognitive', 'collapse', 'communicate', 'computer', 'concept', 'construct', 'controlled', 'converting', 'corresponding', 'coughlin', 'data', 'database', 'datasets', 'deal', 'deep', 'depth', 'detection', 'developed', 'different', 'downloaded', 'enable', 'energy', 'english', 'enthusiast', 'entropy', 'evaluated', 'evidence', 'exponential', 'family', 'field', 'find', 'fingertip', 'first', 'gaussian', 'gd', 'general', 'generating', 'genome', 'genomic', 'google', 'hole', 'human', 'hungry', 'hype', 'illness', 'implies', 'improve', 'innovative', 'integration', 'intelligence', 'interest', 'interstitial', 'introduces', 'introduction', 'javascript', 'kernel', 'key', 'known', 'language', 'las

In [111]:
pd.DataFrame(tf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,177,178,179,180,181,182,183,184,185,186
0,0.05,0.1,0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.05,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.043,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
pd.DataFrame(idf)

Unnamed: 0,0
0,2.708
1,2.708
2,2.708
3,2.708
4,2.708
...,...
182,2.708
183,2.708
184,2.708
185,2.708


In [113]:
pd.DataFrame(tfidf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,177,178,179,180,181,182,183,184,185,186
0,0.1354,0.2708,0.1354,0.1354,0.1354,0.1354,0.1354,0.1354,0.1354,0.1354,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.116444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.10832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.167896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
pd.DataFrame(normalized_tfidf)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,177,178,179,180,181,182,183,184,185,186
0,0.219,0.439,0.219,0.219,0.219,0.219,0.219,0.219,0.219,0.219,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
