In [10]:
import nltk

nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gawas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\gawas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Data handling
import pandas as pd

# Text cleaning
import re

# NLP toolkit
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer


In [12]:
data = {
    "text": [
        "I love NLP! It is amazing.",
        "NLP is used in chatbots and search engines.",
        "I do not like boring lectures.",
        "This NLP session is very interesting!"
    ]
}

df = pd.DataFrame(data)
df


Unnamed: 0,text
0,I love NLP! It is amazing.
1,NLP is used in chatbots and search engines.
2,I do not like boring lectures.
3,This NLP session is very interesting!


In [13]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


In [14]:
def clean_text(text):
    # 1. Convert to lowercase
    text = text.lower()
    
    # 2. Remove punctuation & special characters
    text = re.sub(r'[^a-z\s]', '', text)
    
    # 3. Tokenize (split into words)
    words = text.split()
    
    # 4. Remove stopwords
    words = [w for w in words if w not in stop_words]
    
    # 5. Lemmatization (root word)
    words = [lemmatizer.lemmatize(w) for w in words]
    
    # 6. Join words back to sentence
    return " ".join(words)


In [15]:
df["clean_text"] = df["text"].apply(clean_text)
df


Unnamed: 0,text,clean_text
0,I love NLP! It is amazing.,love nlp amazing
1,NLP is used in chatbots and search engines.,nlp used chatbots search engine
2,I do not like boring lectures.,like boring lecture
3,This NLP session is very interesting!,nlp session interesting


In [16]:
tfidf = TfidfVectorizer()

X = tfidf.fit_transform(df["clean_text"])


In [17]:
print("Vocabulary:\n", tfidf.get_feature_names_out())
print("\nTF-IDF Matrix:\n", X.toarray())


Vocabulary:
 ['amazing' 'boring' 'chatbots' 'engine' 'interesting' 'lecture' 'like'
 'love' 'nlp' 'search' 'session' 'used']

TF-IDF Matrix:
 [[0.64450299 0.         0.         0.         0.         0.
  0.         0.64450299 0.41137791 0.         0.         0.        ]
 [0.         0.         0.47633035 0.47633035 0.         0.
  0.         0.         0.30403549 0.47633035 0.         0.47633035]
 [0.         0.57735027 0.         0.         0.         0.57735027
  0.57735027 0.         0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.64450299 0.
  0.         0.         0.41137791 0.         0.64450299 0.        ]]


In [18]:
tfidf_df = pd.DataFrame(
    X.toarray(),
    columns=tfidf.get_feature_names_out()
)

tfidf_df


Unnamed: 0,amazing,boring,chatbots,engine,interesting,lecture,like,love,nlp,search,session,used
0,0.644503,0.0,0.0,0.0,0.0,0.0,0.0,0.644503,0.411378,0.0,0.0,0.0
1,0.0,0.0,0.47633,0.47633,0.0,0.0,0.0,0.0,0.304035,0.47633,0.0,0.47633
2,0.0,0.57735,0.0,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.644503,0.0,0.0,0.0,0.411378,0.0,0.644503,0.0
