In [1]:
import os
import nltk
import math
import numpy as np
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

### Data: pipeline function to generate documents based on given prompts

In [2]:
gen = pipeline('text-generation', model ='EleutherAI/gpt-neo-2.7B')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/10.7G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [5]:
prompts = ["AI", "Deep Learning", "Natural Language Processing"]
num_phrases = 3

In [6]:
documents = []
for prompt in prompts:
    documents.append(gen(prompt, max_length=50, num_return_sequences=num_phrases))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [59]:
docs = ['', '', '']

for i in range(len(prompts)):
    for j in range(num_phrases):
      docs[i] += documents[i][j]['generated_text']

In [66]:
for i in range(len(prompts)):
  print("Document", i+1 , ":\n", docs[i])
  print()

Document 1 :
 AI-Ventilator/Intra-operative/Intra-hospital monitoring - Patient specific - Heart Rate (HR); Respiration Rate (RR); Oxygen Concentration; Blood Pressure; Oxygen Delivery Index (DO2I); OxyAI-based, and that the program can detect the location of the heart based on its characteristic sound.AI-PDA

ai-PDA is a smart device that can be integrated with the iPad.ai, the smart assistant app for iPad and iPhone. PDA is short for personal digital assistant. The device can be used as a personal

Document 2 :
 Deep Learning
TensorFlow vs Theano. I’m currently using Theano as my neural network library in
my project. I’ve found very
frustratingly little in Theano’s documentation or on-Deep Learning in the Age of Machine Translation

In the summer of 2012, Stanford University’s Dr. Andrew Ng published a paper in which he argued that machine translation techniques based on statistical machine translation models do not have much to offer to aDeep Learning: A Revolution in Machine Learni

### Processing on data:

In [68]:
def preprocess_text(text):
    tokens = word_tokenize(text)

    tokens = [token.lower() for token in tokens]

    tokens = [token for token in tokens if token.isalnum()]

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

In [73]:
preprocessed_text = []
for i in docs:
  preprocessed_text.append(preprocess_text(i))

In [79]:
for i in range(len(prompts)):
  print("Document", i+1 , "After Preprocessing:\n", preprocessed_text[i])
  print()

Document 1 After Preprocessing:
 monitoring patient specific heart rate hr respiration rate rr oxygen concentration blood pressure oxygen delivery index do2i program detect location heart based characteristic smart device integrated smart assistant app ipad iphone pda short personal digital assistant device used personal

Document 2 After Preprocessing:
 deep learning tensorflow v theano currently using theano neural network library project found frustratingly little theano documentation learning age machine translation summer 2012 stanford university andrew ng published paper argued machine translation technique based statistical machine translation model much offer adeep learning revolution machine learning sec learn section present brief overview deep learning set method learning model data begin classic supervised setting

Document 3 After Preprocessing:
 natural language processing nlp technique extracting understanding making use unstructured natural language data nlp system invo

### TF-IDF (Built-in)

In [97]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_text)
feature_names = tfidf_vectorizer.get_feature_names_out()

In [98]:
tfidf_builtin = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_builtin

Unnamed: 0,2012,across,adeep,age,ai,also,anatural,andrew,app,argued,...,university,unstructured,use,used,useful,using,variety,via,way,world
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138772,0.0,...,0.0,0.0,0.0,0.10554,0.0,0.0,0.0,0.0,0.0,0.0
1,0.092417,0.0,0.092417,0.092417,0.0,0.0,0.0,0.092417,0.0,0.092417,...,0.092417,0.0,0.0,0.0,0.0,0.070285,0.0,0.0,0.0,0.0
2,0.0,0.073517,0.0,0.0,0.073517,0.073517,0.073517,0.0,0.0,0.0,...,0.0,0.073517,0.073517,0.055911,0.073517,0.055911,0.073517,0.073517,0.073517,0.073517


### TF-IDF (From Scratch)

- Get TF for each word for all documents

In [None]:
def compute_tf(documents):
    tf_matrix = []

    vocab = list(set(word for doc in documents for word in doc.split()))

    for doc in documents:
        tf_doc = {}
        words = doc.split()
        word_count = len(words)
        for word in words:
            tf_doc[word] = tf_doc.get(word, 0) + 1 / word_count
        tf_matrix.append(tf_doc)

    return tf_matrix, vocab

- Get IDF for each word

In [None]:
def compute_idf(tf_matrix, vocab):
    idf_dict = {}
    num_docs = len(tf_matrix)
    for term in vocab:
        doc_count = sum(1 for tf_doc in tf_matrix if term in tf_doc)
        idf_dict[term] = np.log(num_docs / (1 + doc_count)) + 1
    return idf_dict

- Get TF-IDF multiply TF * IDF

In [None]:
def compute_tfidf(tf_matrix, idf_dict):
    tfidf_matrix = []
    for tf_doc in tf_matrix:
        tfidf_doc = {}
        for term, tf in tf_doc.items():
            idf = idf_dict[term]
            tfidf_doc[term] = tf * idf
        tfidf_matrix.append(tfidf_doc)
    return tfidf_matrix

- Get Normalized TFIDF

In [None]:
def normalize_tfidf(tfidf_matrix):
    normalized_tfidf = []
    for doc in tfidf_matrix:
        doc_vector = np.array(list(doc.values()))
        norm = np.sqrt(np.sum(np.square(doc_vector)))
        normalized_doc = {term: score / norm for term, score in doc.items()}
        normalized_tfidf.append(normalized_doc)
    return normalized_tfidf

In [None]:
def tfidf(documents):
    tf_matrix, vocab = compute_tf(documents)
    idf_dict = compute_idf(tf_matrix, vocab)
    tfidf_matrix = compute_tfidf(tf_matrix, idf_dict)
    normalized_tfidf = normalize_tfidf(tfidf_matrix)
    return normalized_tfidf

In [99]:
tfidf_matrix = tfidf(preprocessed_text)

In [100]:
tfidf_scratch = pd.DataFrame(tfidf_matrix)
tfidf_scratch.fillna(0, inplace=True)
tfidf_scratch

Unnamed: 0,monitoring,patient,specific,heart,rate,hr,respiration,rr,oxygen,concentration,...,useful,knowledge,across,number,domain,including,business,information,retrieval,b
0,0.138992,0.138992,0.138992,0.277984,0.277984,0.138992,0.138992,0.138992,0.277984,0.138992,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.073551,0.073551,0.073551,0.073551,0.073551,0.073551,0.073551,0.073551,0.073551,0.073551


In [112]:
sorted_columns = sorted(tfidf_scratch.columns)
sorted_df = df[sorted_columns]
sorted_df

Unnamed: 0,2012,across,adeep,age,ai,also,anatural,andrew,app,argued,...,unstructured,use,used,useful,using,v,variety,via,way,world
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138992,0.0,...,0.0,0.0,0.098894,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.092145,0.0,0.092145,0.092145,0.0,0.0,0.0,0.092145,0.0,0.092145,...,0.0,0.0,0.0,0.0,0.065562,0.092145,0.0,0.0,0.0,0.0
2,0.0,0.073551,0.0,0.0,0.073551,0.073551,0.073551,0.0,0.0,0.0,...,0.073551,0.073551,0.052332,0.073551,0.052332,0.0,0.073551,0.073551,0.073551,0.073551


### Bonus: Comparing between built-in and from scratch results

In [109]:
print('TF-IDF values using built-in function: ')
print(tfidf_builtin[['2012', 'across', 'adeep',	'nlp', 'used']])

print('\nTF-IDF values from scratch: ')
print(tfidf_scratch[['2012', 'across', 'adeep',	'nlp', 'used']])

TF-IDF values using built-in function: 
       2012    across     adeep      nlp      used
0  0.000000  0.000000  0.000000  0.00000  0.105540
1  0.092417  0.000000  0.092417  0.00000  0.000000
2  0.000000  0.073517  0.000000  0.22055  0.055911

TF-IDF values from scratch: 
       2012    across     adeep       nlp      used
0  0.000000  0.000000  0.000000  0.000000  0.098894
1  0.092145  0.000000  0.092145  0.000000  0.000000
2  0.000000  0.073551  0.000000  0.220654  0.052332


In [120]:
print('TF-IDF values using built-in function: ')
tfidf_builtin

TF-IDF values using built-in function: 


Unnamed: 0,2012,across,adeep,age,ai,also,anatural,andrew,app,argued,...,university,unstructured,use,used,useful,using,variety,via,way,world
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138772,0.0,...,0.0,0.0,0.0,0.10554,0.0,0.0,0.0,0.0,0.0,0.0
1,0.092417,0.0,0.092417,0.092417,0.0,0.0,0.0,0.092417,0.0,0.092417,...,0.092417,0.0,0.0,0.0,0.0,0.070285,0.0,0.0,0.0,0.0
2,0.0,0.073517,0.0,0.0,0.073517,0.073517,0.073517,0.0,0.0,0.0,...,0.0,0.073517,0.073517,0.055911,0.073517,0.055911,0.073517,0.073517,0.073517,0.073517


In [122]:
print('TF-IDF values from scratch: ')
sorted_df

TF-IDF values from scratch: 


Unnamed: 0,2012,across,adeep,age,ai,also,anatural,andrew,app,argued,...,unstructured,use,used,useful,using,v,variety,via,way,world
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138992,0.0,...,0.0,0.0,0.098894,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.092145,0.0,0.092145,0.092145,0.0,0.0,0.0,0.092145,0.0,0.092145,...,0.0,0.0,0.0,0.0,0.065562,0.092145,0.0,0.0,0.0,0.0
2,0.0,0.073551,0.0,0.0,0.073551,0.073551,0.073551,0.0,0.0,0.0,...,0.073551,0.073551,0.052332,0.073551,0.052332,0.0,0.073551,0.073551,0.073551,0.073551
