In [190]:
import random
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import re
from nltk.corpus import reuters
import pandas as pd
from collections import Counter
import math
import numpy as np
from transformers import pipeline


In [191]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('reuters')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!


True

In [192]:
generator = pipeline('text-generation', model='gpt2')


In [193]:
def generate_documents(topics):
    generated_docs = []
    for topic in topics:
        generated_docs.append(generator(topic, max_length=100)[0]['generated_text'])
    return generated_docs

In [194]:
topics = ['write definintion about science' , 'write definintion about medical field']

In [195]:
docs = generate_documents(topics)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [196]:
docs

["write definintion about science\n\n(7) I'm a person who has never been a physicist; if I had, I would have taught him about science as an amateur-athlete, and he would have spent his whole schooling studying and practicing how to write mathematical formulas. He would have been more effective as an educator than an engineer, and he would have gone on and applied mathematics to problems at a fast pace rather than waiting for his professor to do it all over again, and you",
 "write definintion about medical field data, such as whether or not it has actually been contaminated, how many times the field reported adverse effects such as fatigue, nausea and vomiting; a range of other data that may influence how much is included in the health information, such as whether or not it is in the physical health record of the patient and what a patient's gender was for an elective, or, if applicable, whether or not a primary care physician had access to the record.\n\n"]

In [197]:
def processing(doc):
    # clean
    doc = re.sub(r'[^a-zA-Z]', ' ', doc) # non-alphabetic characters

    # normalize
    doc = doc.lower()

    # tokenization
    doc = word_tokenize(doc)

    # lemmatization
    lemmatizer = WordNetLemmatizer()
    doc = [lemmatizer.lemmatize(token) for token in doc]

    # remove stop words
    stop_words = set(stopwords.words('english'))
    doc = [token for token in doc if token not in stop_words]



    return doc

In [198]:
for i in range(len(docs)):
    docs[i] = processing(docs[i])

In [199]:
def docing(doc):
    return ' '.join(doc)

In [200]:
for i in range(len(docs)):
    docs[i] = docing(docs[i])

In [201]:
docs

['write definintion science person ha never physicist would taught science amateur athlete would spent whole schooling studying practicing write mathematical formula would effective educator engineer would gone applied mathematics problem fast pace rather waiting professor',
 'write definintion medical field data whether ha actually contaminated many time field reported adverse effect fatigue nausea vomiting range data may influence much included health information whether physical health record patient patient gender wa elective applicable whether primary care physician access record']

In [202]:
def calculate_tfidf(docs):
    # calculations
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs)
    return tfidf_matrix,vectorizer

In [203]:
# Calculate TF-IDF
tfidf_matrix,vectorizer = calculate_tfidf(docs)

In [204]:
def to_dataframe(docs,tfidf_matrix,vectorizer):
    return pd.DataFrame(
        tfidf_matrix.T.toarray(),
        index=vectorizer.get_feature_names_out(),
        columns= [f"Doc {i+1}" for i in range(len(docs))],
    )

In [205]:
tfidf_matrix.T.toarray()

array([[0.        , 0.13301597],
       [0.        , 0.13301597],
       [0.        , 0.13301597],
       [0.14428127, 0.        ],
       [0.        , 0.13301597],
       [0.14428127, 0.        ],
       [0.14428127, 0.        ],
       [0.        , 0.13301597],
       [0.        , 0.13301597],
       [0.        , 0.26603195],
       [0.10265731, 0.09464196],
       [0.14428127, 0.        ],
       [0.        , 0.13301597],
       [0.14428127, 0.        ],
       [0.        , 0.13301597],
       [0.14428127, 0.        ],
       [0.14428127, 0.        ],
       [0.        , 0.13301597],
       [0.        , 0.26603195],
       [0.14428127, 0.        ],
       [0.        , 0.13301597],
       [0.14428127, 0.        ],
       [0.10265731, 0.09464196],
       [0.        , 0.26603195],
       [0.        , 0.13301597],
       [0.        , 0.13301597],
       [0.        , 0.13301597],
       [0.        , 0.13301597],
       [0.14428127, 0.        ],
       [0.14428127, 0.        ],
       [0.

In [206]:
# representation
df = to_dataframe(docs,tfidf_matrix,vectorizer)

In [207]:
df

Unnamed: 0,Doc 1,Doc 2
access,0.000000,0.133016
actually,0.000000,0.133016
adverse,0.000000,0.133016
amateur,0.144281,0.000000
applicable,0.000000,0.133016
...,...,...
waiting,0.144281,0.000000
whether,0.000000,0.399048
whole,0.144281,0.000000
would,0.577125,0.000000


Bonus

In [208]:
for i in range(len(docs)):
    docs[i] = docs[i].split()


In [209]:
def compute_tf(documents):
    tfs = []
    for doc in documents:
        word_count = Counter(doc)
        total_words = len(doc)
        tf = {word: count / total_words for word, count in word_count.items()}
        tfs.append(tf)
    return tfs


def compute_idf(documents):
    total_docs = len(documents)
    idf = {}
    for doc in documents:
        for word in set(doc):
            idf[word] = idf.get(word, 0)

    idf = {word: math.log( ((total_docs + 1)  / (count + 1))  + 1 ) for word, count in idf.items()}
    return idf

def compute_tfidf(documents):
    tf = compute_tf(documents)
    idf = compute_idf(documents)

    tfidf = []
    for doc in tf:
        doc_tfidf = {word: tf_val * idf[word] for word, tf_val in doc.items()}
        tfidf.append(doc_tfidf)

    return tfidf


def normalize_tfidf(tfidf):
    normalized_tfidf = []
    for doc in tfidf:
        norm = np.linalg.norm(list(doc.values()))
        normalized_doc = {word: tfidf_val / norm for word, tfidf_val in doc.items()}
        normalized_tfidf.append(normalized_doc)

    return normalized_tfidf

In [210]:

tfidf = compute_tfidf(docs)
normalized_tfidf = normalize_tfidf(tfidf)

for idx, doc_tfidf in enumerate(normalized_tfidf):
    print(f"TF-IDF for document {idx + 1}: {doc_tfidf}")

TF-IDF for document 1: {'write': 0.28005601680560205, 'definintion': 0.14002800840280102, 'science': 0.28005601680560205, 'person': 0.14002800840280102, 'ha': 0.14002800840280102, 'never': 0.14002800840280102, 'physicist': 0.14002800840280102, 'would': 0.5601120336112041, 'taught': 0.14002800840280102, 'amateur': 0.14002800840280102, 'athlete': 0.14002800840280102, 'spent': 0.14002800840280102, 'whole': 0.14002800840280102, 'schooling': 0.14002800840280102, 'studying': 0.14002800840280102, 'practicing': 0.14002800840280102, 'mathematical': 0.14002800840280102, 'formula': 0.14002800840280102, 'effective': 0.14002800840280102, 'educator': 0.14002800840280102, 'engineer': 0.14002800840280102, 'gone': 0.14002800840280102, 'applied': 0.14002800840280102, 'mathematics': 0.14002800840280102, 'problem': 0.14002800840280102, 'fast': 0.14002800840280102, 'pace': 0.14002800840280102, 'rather': 0.14002800840280102, 'waiting': 0.14002800840280102, 'professor': 0.14002800840280102}
TF-IDF for docume

In [211]:
def tfidf_to_dataframe(docs,tfidf_matrix,vectorizer):
    return pd.DataFrame(
        tfidf_matrix.T.toarray(),
        index=vectorizer.get_feature_names_out(),
        columns= [f"Doc {i+1}" for i in range(len(docs))],
    )

In [212]:
print(normalized_tfidf[0].get('act'))

None


In [213]:
print(normalized_tfidf[1].get('act'))

None


In [214]:
for word , tfidf in df['Doc 1'].items():
  if tfidf == 0 : continue
  print(f"built in  {word} , value : " , df['Doc 1'][word])
  print(f"from scratch  {word} , value : " , normalized_tfidf[0].get(word) )
  print('----')

built in  amateur , value :  0.14428127278288577
from scratch  amateur , value :  0.14002800840280102
----
built in  applied , value :  0.14428127278288577
from scratch  applied , value :  0.14002800840280102
----
built in  athlete , value :  0.14428127278288577
from scratch  athlete , value :  0.14002800840280102
----
built in  definintion , value :  0.10265731390307976
from scratch  definintion , value :  0.14002800840280102
----
built in  educator , value :  0.14428127278288577
from scratch  educator , value :  0.14002800840280102
----
built in  effective , value :  0.14428127278288577
from scratch  effective , value :  0.14002800840280102
----
built in  engineer , value :  0.14428127278288577
from scratch  engineer , value :  0.14002800840280102
----
built in  fast , value :  0.14428127278288577
from scratch  fast , value :  0.14002800840280102
----
built in  formula , value :  0.14428127278288577
from scratch  formula , value :  0.14002800840280102
----
built in  gone , value :  0

In [215]:
for word , tfidf in df['Doc 2'].items():
  if tfidf == 0 : continue
  print(f"built in  {word}     , value : " , df['Doc 2'][word])
  print(f"from scratch  {word} , value : " , normalized_tfidf[1].get(word) )
  print('----')

built in  access     , value :  0.133015972934483
from scratch  access , value :  0.13130643285972254
----
built in  actually     , value :  0.133015972934483
from scratch  actually , value :  0.13130643285972254
----
built in  adverse     , value :  0.133015972934483
from scratch  adverse , value :  0.13130643285972254
----
built in  applicable     , value :  0.133015972934483
from scratch  applicable , value :  0.13130643285972254
----
built in  care     , value :  0.133015972934483
from scratch  care , value :  0.13130643285972254
----
built in  contaminated     , value :  0.133015972934483
from scratch  contaminated , value :  0.13130643285972254
----
built in  data     , value :  0.266031945868966
from scratch  data , value :  0.2626128657194451
----
built in  definintion     , value :  0.09464196027856574
from scratch  definintion , value :  0.13130643285972254
----
built in  effect     , value :  0.133015972934483
from scratch  effect , value :  0.13130643285972254
----
built in