# Importing Libraries

In [1]:
from transformers import pipeline
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np
import math
import nltk
import re

In [2]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Generate the Data

In [3]:
def generate_document(phrase):
    # Connecting to Hugging Face API
    chat = pipeline("text-generation", model="gpt2",pad_token_id=50256)
    return chat(phrase, num_return_sequences=1,max_length=300, truncation=True)[0]['generated_text']

In [4]:
phrases = ["AWS Cloud Computing Network", "History Reinforcement Learning"]

In [5]:
corpus = [generate_document(phrase) for phrase in phrases]

In [6]:
corpus[0]

'AWS Cloud Computing Network, as I believe it would be a great source of data for the future. I\'d also highly recommend this: http://sustainablecloud.com/how-do-my-cloud-services-use-an-independent-cloud-for-everyday-data/ In addition, I\'d highly advise you to use the tools available in a local server for this to be useful. For that, go to https://github.com/sustainablecloud/async-cloud-service for more information. If you want the documentation in this case, you should also read the book "Discovery, Discovery, Discovery".\n\nSUMMARY\n\nA very important component of Cloud Computing that has been an asset for a number of years has been the ability to create a cloud cloud with unlimited data, with no overhead. This can be particularly true for companies with no centralized datacenter.\n\nSome might think of this as making AWS like OpenStack more like VMware but without the need for traditional datacenters. You have access to many of what AWS had in the past and there are more and more 

In [7]:
corpus[1]

'History Reinforcement Learning (TRL) is based on the first version of the software architecture, and it is currently available only for Microsoft systems with Windows Vista, 2000, and 7.\n\nTRL is optimized for use with Office 2011 and XP. This makes it easy to migrate from desktop productivity tasks to applications in a single project.\n\nFor example, if you want to move your computer over to the new Windows 2000 or Windows 8 operating systems, you could create applications using an Office and Microsoft Office suite.\n\nThe TRL software can be updated quickly with just a single click of a button, and the user can quickly create tasks that can be easily run in PowerPoint, Office Mobile, or a Word document.\n\nTRL provides a powerful tool for a range of scenarios, such as work on a group project that requires immediate deployment to a different IT department.\n\nOne common use case for a TRL-based solution will be for teams that want to quickly manage their data and spread out resource

# Preprocessing Text

In [8]:
def preprocess_data(text):
  # Initialize Helper Objects
  stop_words = set(stopwords.words('english'))

  #Normalization - Convert to lowercase
  normalized_text = text.lower()

  #Cleaning data from symbols and characters
  cleaned_text = re.sub(r'[^a-zA-Z\s]', '', normalized_text)
  cleaned_text = re.sub(r'\\.', '', cleaned_text) #Remove \n from \nThe

  #Tokenization
  tokens = word_tokenize(cleaned_text)

  #Stop Words Removal
  filtered_tokens = [word for word in tokens if word not in stop_words]

  return filtered_tokens

In [9]:
def lemmatizer(filtered_tokens):
    lemmatizer = WordNetLemmatizer()
    word_mappings = {}
    res = []

    for word in filtered_tokens:
        # Lemmatize the word
        stemmed_word = lemmatizer.lemmatize(word)
        res.append(stemmed_word)

        # Save the word mapping if the stemmed word has length less than 3
        if len(stemmed_word) < 3:
            word_mappings[word] = stemmed_word

    return res,word_mappings

In [10]:
# Apply Lemmatizer
cleaned_corpus = [preprocess_data(document) for document in corpus]
myDict = dict()

for i in range(0,len(cleaned_corpus)):
  cleaned_corpus[i], dict_part = lemmatizer(cleaned_corpus[i])
  myDict.update(dict_part)

In [11]:
print(f"{' '.join(cleaned_corpus[0])}")
print(f"With Vector Length Equal to: {len(cleaned_corpus[0])}")

aws cloud computing network believe would great source data future id also highly recommend httpsustainablecloudcomhowdomycloudservicesuseanindependentcloudforeverydaydata addition id highly advise use tool available local server useful go httpsgithubcomsustainablecloudasynccloudservice information want documentation case also read book discovery discovery discovery summary important component cloud computing asset number year ability create cloud cloud unlimited data overhead particularly true company centralized datacenter might think making aws like openstack like vmware without need traditional datacenters access many aws past people already using technology imagine many others would use aws buy datacenter aws offering point opportunity leverage knowledge leverage application one big thing cloud computing server top
With Vector Length Equal to: 100


In [12]:
print(f"{' '.join(cleaned_corpus[1])}")
print(f"With Vector Length Equal to: {len(cleaned_corpus[1])}")

history reinforcement learning trl based first version software architecture currently available microsoft system window vista trl optimized use office xp make easy migrate desktop productivity task application single project example want move computer new window window operating system could create application using office microsoft office suite trl software updated quickly single click button user quickly create task easily run powerpoint office mobile word document trl provides powerful tool range scenario work group project requires immediate deployment different department one common use case trlbased solution team want quickly manage data spread resource across multiple network multigroup project trl provides simple yet flexible modular approach team manage existing project one place application simple install operates window office environment workflow straightforward easily configurable allows single user customize setup process run seamlessly create multiple task assignment us

In [13]:
print(myDict)

{'id': 'id', 'go': 'go', 'xp': 'xp'}


# Getting Unique Words

In [14]:
unique_words = set()

for document in cleaned_corpus :
  unique_words.update(document)

print(unique_words)
print(len(unique_words))

{'solution', 'thing', 'straightforward', 'operates', 'year', 'vista', 'scenario', 'simple', 'provides', 'making', 'button', 'flexible', 'suite', 'environment', 'book', 'highly', 'httpsgithubcomsustainablecloudasynccloudservice', 'common', 'deal', 'would', 'httpsustainablecloudcomhowdomycloudservicesuseanindependentcloudforeverydaydata', 'buy', 'workflow', 'computer', 'datacenters', 'local', 'word', 'point', 'available', 'approach', 'existing', 'addition', 'knowledge', 'move', 'allows', 'many', 'seamlessly', 'deployment', 'recommend', 'task', 'vmware', 'spread', 'trl', 'install', 'tool', 'like', 'centralized', 'history', 'based', 'aws', 'particularly', 'offering', 'multiple', 'using', 'quickly', 'go', 'currently', 'mobile', 'window', 'server', 'traditional', 'true', 'desktop', 'across', 'openstack', 'reinforcement', 'case', 'version', 'overhead', 'xp', 'optimized', 'productivity', 'range', 'powerful', 'microsoft', 'use', 'top', 'believe', 'might', 'application', 'opportunity', 'read', '

In [15]:
# Filter words with length less than 3
short_words = [word for word in unique_words if len(word) < 3]

# Print the filtered words
print("Words with length less than 3:", short_words)

# Result is same as before

Words with length less than 3: ['go', 'xp', 'id']


# Using Scikit-learn TF-IDF (Built-In)

In [16]:
# Building List of String
data = []
for document in cleaned_corpus:
  data.append(' '.join(document))


# Fitting the Model
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data)
feature_names = tfidf_vectorizer.get_feature_names_out()

In [17]:
tfidf_df_sk = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df_sk # 3 rows for each document

Unnamed: 0,ability,access,across,addition,advise,allows,already,also,application,approach,...,want,window,without,word,work,workflow,would,xp,year,yet
0,0.077707,0.077707,0.0,0.077707,0.077707,0.0,0.077707,0.155415,0.055289,0.0,...,0.055289,0.0,0.077707,0.0,0.0,0.0,0.155415,0.0,0.077707,0.0
1,0.0,0.0,0.064317,0.0,0.0,0.064317,0.0,0.0,0.137285,0.064317,...,0.091523,0.257266,0.0,0.064317,0.064317,0.064317,0.0,0.064317,0.0,0.064317


## From Scrath

## Calculate TF

In [18]:
def calculate_tf(tokenized_documents, unique_words):
    len_docs = len(tokenized_documents)
    len_words = len(unique_words)
    tf_matrix = np.zeros((len_docs, len_words), dtype=int)

    # Loop over documents and words
    for i in range(len_docs):
        for j in range(len_words):
            cur_word = unique_words[j]
            cur_doc = tokenized_documents[i]
            freq = cur_doc.count(cur_word)
            tf_matrix[i, j] += freq

    return tf_matrix

In [19]:
tf_result = calculate_tf(cleaned_corpus,list(unique_words))
print(tf_result.shape)

(2, 161)


## Calculate IDF

In [22]:
def calculate_idf(tokenized_documents, unique_words):
    len_docs = len(tokenized_documents)
    len_words = len(unique_words)
    idf_values = np.zeros((len_words))

    for i in range(len_words):
        # Calculate frequency of the word in all documents
        counter = 0
        for doc in tokenized_documents:
            if unique_words[i] in doc:
                counter += 1

        # Calculate IDF value for the word
        idf_values[i] = math.log(float(len_docs + 1) / float(counter + 1)) + 1

    return idf_values

In [23]:
idf = calculate_idf(cleaned_corpus,list(unique_words))
len(idf)

161

## Putting All Together

In [24]:
def calculate_tfidf(data):
  # Getting unique word
  unique_words = list(set(" ".join(data).split()))
  unique_words.sort()

  # Split Text
  cleaned_corpus = [text.split(' ') for text in data]

  # Calculate TF & IDF
  tf = calculate_tf(cleaned_corpus,list(unique_words))
  idf = calculate_idf(cleaned_corpus,list(unique_words))

  # Calculate TF-IDF
  tf_idf = tf * idf

  # Normalize Data
  normalization = np.sqrt((tf_idf * tf_idf).sum(axis=1))
  for i in range(len(normalization)):
    tf_idf[i] /= normalization[i]

  # Building DataFrame
  res = pd.DataFrame(tf_idf, columns=unique_words)

  return res




In [25]:
# Calculate TF-IDF
tfidf_calculation = calculate_tfidf(data)
tfidf_calculation


Unnamed: 0,ability,access,across,addition,advise,allows,already,also,application,approach,...,want,window,without,word,work,workflow,would,xp,year,yet
0,0.077707,0.077707,0.0,0.077707,0.077707,0.0,0.077707,0.155415,0.055289,0.0,...,0.055289,0.0,0.077707,0.0,0.0,0.0,0.155415,0.0,0.077707,0.0
1,0.0,0.0,0.064317,0.0,0.0,0.064317,0.0,0.0,0.137285,0.064317,...,0.091523,0.257266,0.0,0.064317,0.064317,0.064317,0.0,0.064317,0.0,0.064317


## Custom TF-IDF vs SKlearn

In [26]:
print("SKlearn Implementation:")
tfidf_df_sk

SKlearn Implementation:


Unnamed: 0,ability,access,across,addition,advise,allows,already,also,application,approach,...,want,window,without,word,work,workflow,would,xp,year,yet
0,0.077707,0.077707,0.0,0.077707,0.077707,0.0,0.077707,0.155415,0.055289,0.0,...,0.055289,0.0,0.077707,0.0,0.0,0.0,0.155415,0.0,0.077707,0.0
1,0.0,0.0,0.064317,0.0,0.0,0.064317,0.0,0.0,0.137285,0.064317,...,0.091523,0.257266,0.0,0.064317,0.064317,0.064317,0.0,0.064317,0.0,0.064317


In [27]:
print("My Implementation:")
tfidf_calculation

My Implementation:


Unnamed: 0,ability,access,across,addition,advise,allows,already,also,application,approach,...,want,window,without,word,work,workflow,would,xp,year,yet
0,0.077707,0.077707,0.0,0.077707,0.077707,0.0,0.077707,0.155415,0.055289,0.0,...,0.055289,0.0,0.077707,0.0,0.0,0.0,0.155415,0.0,0.077707,0.0
1,0.0,0.0,0.064317,0.0,0.0,0.064317,0.0,0.0,0.137285,0.064317,...,0.091523,0.257266,0.0,0.064317,0.064317,0.064317,0.0,0.064317,0.0,0.064317
