# TF-IDF with Preprocessing in Python

This Jupyter notebook demonstrates how to compute TF-IDF scores for text data
after preprocessing it with techniques like stemming and lemmatization.

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Fetch the NLTK data packages this notebook needs. Packages that are already
# installed are reported as up-to-date by the downloader.
for package in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(package)
# Optional: improves lemmatization. Kept as the last expression so the cell
# still displays the status returned by the final download.
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [8]:
# Toy corpus: three short sentences that share a few terms ("AI", "processing").
documents = [
    "Natural language processing is a subfield of AI.",
    "AI and machine learning are transforming industries.",
    "Text processing with TF-IDF helps in information retrieval.",
]

# English stop words, held in a set for fast membership checks while filtering.
stop_words = set(stopwords.words("english"))

# The two normalizers compared in this notebook.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [9]:
def preprocess_with_stemming(text):
    """Normalize ``text`` into a space-separated string of stemmed tokens.

    The text is lowercased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphanumeric (punctuation, or anything containing a
    character such as a hyphen) and tokens found in ``stop_words`` are
    dropped; the remaining tokens are stemmed with the module-level
    ``stemmer``.

    Args:
        text: Raw document text.

    Returns:
        The surviving stems joined by single spaces.
    """
    stems = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation / mixed-symbol tokens and stop words.
        if not token.isalnum() or token in stop_words:
            continue
        stems.append(stemmer.stem(token))
    return " ".join(stems)

In [10]:
def preprocess_with_lemmatization(text):
    """Normalize ``text`` into a space-separated string of lemmatized tokens.

    The text is lowercased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphanumeric (punctuation, or anything containing a
    character such as a hyphen) and tokens found in ``stop_words`` are
    dropped; the remaining tokens are passed to the module-level
    ``lemmatizer``. No part-of-speech tag is supplied to the lemmatizer.

    Args:
        text: Raw document text.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation / mixed-symbol tokens and stop words.
        if not token.isalnum() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

In [11]:
# Stemmed corpus -> TF-IDF matrix (one row per entry of `documents`, same order).
preprocessed_stem_docs = list(map(preprocess_with_stemming, documents))
vectorizer_stem = TfidfVectorizer()
tfidf_matrix_stem = vectorizer_stem.fit_transform(preprocessed_stem_docs)

In [12]:
# Lemmatized corpus -> TF-IDF matrix (one row per entry of `documents`, same order).
preprocessed_lem_docs = list(map(preprocess_with_lemmatization, documents))
vectorizer_lem = TfidfVectorizer()
tfidf_matrix_lem = vectorizer_lem.fit_transform(preprocessed_lem_docs)

In [13]:
def print_tfidf_scores(tfidf_matrix, feature_names, method):
    """Print each document's non-zero TF-IDF scores, highest score first.

    Args:
        tfidf_matrix: Document-term matrix with one row per document. Each
            row must support ``row.toarray()``, as the sparse matrix returned
            by ``TfidfVectorizer.fit_transform`` does.
        feature_names: Terms aligned with the columns of ``tfidf_matrix``.
        method: Label of the preprocessing variant, shown in every heading.

    Returns:
        None. The scores are written to standard output.
    """
    # Walk the rows of the matrix that was passed in instead of the
    # notebook-level `documents` list, so the function does not depend on a
    # global and stays correct for a corpus of any size.
    for row_index in range(tfidf_matrix.shape[0]):
        print(f"\nTF-IDF scores for Document {row_index + 1} ({method}):")
        row_scores = tfidf_matrix[row_index].toarray()[0]
        # sorted() is stable, so terms with equal scores keep column order.
        ranked = sorted(
            zip(feature_names, row_scores),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for word, score in ranked:
            if score > 0:
                print(f"{word}: {score:.4f}")

# Scores computed from the stemmed corpus
print_tfidf_scores(
    tfidf_matrix=tfidf_matrix_stem,
    feature_names=vectorizer_stem.get_feature_names_out(),
    method="Stemming",
)

# Scores computed from the lemmatized corpus
print_tfidf_scores(
    tfidf_matrix=tfidf_matrix_lem,
    feature_names=vectorizer_lem.get_feature_names_out(),
    method="Lemmatization",
)



TF-IDF scores for Document 1 (Stemming):
languag: 0.4905
natur: 0.4905
subfield: 0.4905
ai: 0.3730
process: 0.3730

TF-IDF scores for Document 2 (Stemming):
industri: 0.4674
learn: 0.4674
machin: 0.4674
transform: 0.4674
ai: 0.3554

TF-IDF scores for Document 3 (Stemming):
help: 0.4674
inform: 0.4674
retriev: 0.4674
text: 0.4674
process: 0.3554

TF-IDF scores for Document 1 (Lemmatization):
language: 0.4905
natural: 0.4905
subfield: 0.4905
ai: 0.3730
processing: 0.3730

TF-IDF scores for Document 2 (Lemmatization):
industry: 0.4674
learning: 0.4674
machine: 0.4674
transforming: 0.4674
ai: 0.3554

TF-IDF scores for Document 3 (Lemmatization):
help: 0.4674
information: 0.4674
retrieval: 0.4674
text: 0.4674
processing: 0.3554
