In [5]:
# Load necessary libraries
from functools import lru_cache

import nltk # Natural Language Tool Kit (NLTK)
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK data used below. download() reports packages that are
# already present as up-to-date, so re-running this cell is cheap.
#   - 'punkt' / 'punkt_tab': tokenizer models for word_tokenize. NLTK 3.9+
#     loads 'punkt_tab'; 'punkt' is kept for older releases.
#   - 'stopwords': stop-word lists used by preprocess().
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Show up to 100 characters per cell when a DataFrame is displayed.
# The full option key is used because pandas' shorthand matching can become
# ambiguous if similarly named options are added later.
pd.set_option('display.max_colwidth', 100)

# Sample sentences for building TF-IDF model
documents = [
    "Inflation has increased unemployment",
    "The company has increased its sales",
    "Fear increased his pulse",
]

# Define preprocess function for lowercasing, tokenizing, and stop words removal
@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words("english"))


def preprocess(document):
    """Lowercase a document, tokenize it, and drop English stop words.

    Args:
        document: Raw text of a single document.

    Returns:
        A string made of the remaining tokens, in their original order,
        joined by single spaces. Every token that is not an English stop
        word is kept exactly as ``word_tokenize`` produced it.
    """
    # Lowercase first so the comparison against the (lowercase) stop-word
    # list is case-insensitive.
    tokens = word_tokenize(document.lower())

    # The stop-word set is cached so that calling preprocess() for many
    # documents does not re-read the corpus on every call.
    stop_words = _english_stop_words()

    return " ".join(token for token in tokens if token not in stop_words)

# Clean every sample sentence before it is vectorized
processed_documents = list(map(preprocess, documents))
print("Processed Documents:", processed_documents, sep="\n")

# Learn the vocabulary and IDF weights from the cleaned sentences and encode
# them in one step; the result is a sparse matrix with one row per document
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_documents)

# Show the scores as a dense array (one column per vocabulary term)
print("TF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")

# Label each column with its vocabulary term for easier reading
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
print("TF-IDF DataFrame:")
# A bare expression on the last line makes the notebook render the DataFrame
tfidf_df


Processed Documents:
['inflation increased unemployment', 'company increased sales', 'fear increased pulse']
TF-IDF Matrix:
[[0.         0.         0.38537163 0.65249088 0.         0.
  0.65249088]
 [0.65249088 0.         0.38537163 0.         0.         0.65249088
  0.        ]
 [0.         0.65249088 0.38537163 0.         0.65249088 0.
  0.        ]]
TF-IDF DataFrame:


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,company,fear,increased,inflation,pulse,sales,unemployment
0,0.0,0.0,0.385372,0.652491,0.0,0.0,0.652491
1,0.652491,0.0,0.385372,0.0,0.0,0.652491,0.0
2,0.0,0.652491,0.385372,0.0,0.652491,0.0,0.0
