In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# One-time download of the NLTK resources used by this notebook.
# Environment setup: pip install nltk scikit-learn pandas
# Resources that are already present are simply reported as up-to-date.
nltk.download('punkt')                           # tokenizer model for word_tokenize
nltk.download('punkt_tab')                       # tokenizer tables looked up by newer NLTK releases (3.9+)
nltk.download('stopwords')                       # stop word lists for stopwords.words('english')
nltk.download('averaged_perceptron_tagger')      # tagger model required by pos_tag
nltk.download('averaged_perceptron_tagger_eng')  # tagger resource name looked up by newer NLTK releases (3.9+)
nltk.download('wordnet')                         # WordNet data for WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mitali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mitali\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mitali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
def preprocess_document(document, return_stages=False):
    """Tokenize a raw text, drop English stop words and lemmatize the rest.

    Args:
        document: Raw text to preprocess.
        return_stages: When False (default) only the lemmatized tokens are
            returned. When True, a dict with the output of every
            preprocessing stage is returned instead.

    Returns:
        By default, a list with one lemmatized string per token that survived
        stop word removal. With ``return_stages=True``, a dict with the keys
        ``'tokens'``, ``'pos_tags'``, ``'filtered_tokens'``,
        ``'stemmed_tokens'`` and ``'lemmatized_tokens'``.
    """
    # Tokenization
    tokens = word_tokenize(document)

    # Stop words removal: the comparison is case-insensitive, but the kept
    # tokens retain their original casing.
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Lemmatization: no POS is passed, so the lemmatizer falls back to its
    # default part of speech (noun).
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    if not return_stages:
        return lemmatized_tokens

    # POS tagging and stemming do not feed into the lemmatized result, so they
    # are computed only when the caller asks to inspect every stage.
    stemmer = PorterStemmer()
    return {
        'tokens': tokens,
        'pos_tags': pos_tag(tokens),
        'filtered_tokens': filtered_tokens,
        'stemmed_tokens': [stemmer.stem(word) for word in filtered_tokens],
        'lemmatized_tokens': lemmatized_tokens,
    }

def calculate_tf_idf(documents):
    """Fit a TF-IDF model on documents that are already tokenized.

    Args:
        documents: Iterable of token sequences, one sequence per document.

    Returns:
        A ``(tfidf_matrix, feature_names)`` tuple: the matrix produced by
        ``TfidfVectorizer.fit_transform`` and the terms reported by
        ``get_feature_names_out()``.
    """
    vectorizer = TfidfVectorizer()

    # The vectorizer is fed strings, so each token sequence is glued back
    # together with single spaces first.
    joined_texts = list(map(' '.join, documents))
    weights = vectorizer.fit_transform(joined_texts)

    return weights, vectorizer.get_feature_names_out()


In [3]:
import pandas as pd

# Demo input: a short paragraph describing the preprocessing steps themselves.
sample_document = """
Natural language processing (NLP) is a field of artificial intelligence 
concerned with the interaction between computers and humans in natural 
language. Tokenization is the process of splitting a text into words and 
punctuation marks. POS tagging assigns parts of speech to each word in a 
sentence. Stop words are common words that are often filtered out in text 
analysis. Stemming reduces words to their root or base form. Lemmatization 
is similar to stemming but produces valid words.
"""

# Run the text through the preprocessing pipeline.
preprocessed_tokens = preprocess_document(sample_document)

# The corpus handed to the vectorizer holds just this one document.
tfidf_matrix, feature_names = calculate_tf_idf([preprocessed_tokens])

# Tabulate the weights: one row per document, one column per term.
df_tfidf = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)
print("TF-IDF Representation:", df_tfidf, sep="\n")


TF-IDF Representation:
   analysis  artificial   assigns      base    common  computer  concerned  \
0  0.108465    0.108465  0.108465  0.108465  0.108465  0.108465   0.108465   

      field  filtered      form  ...   similar    speech  splitting  stemming  \
0  0.108465  0.108465  0.108465  ...  0.108465  0.108465   0.108465   0.21693   

       stop   tagging     text  tokenization     valid      word  
0  0.108465  0.108465  0.21693      0.108465  0.108465  0.650791  

[1 rows x 38 columns]
