In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Toy corpus: five short sentences stored under a single 'text' key.
data = dict(
    text=[
        "I love NLP",
        "NLP is fun",
        "Machine learning is cool",
        "I enjoy learning NLP",
        "Fun with machine learning and NLP",
    ]
)

In [3]:
# Wrap the raw corpus in a DataFrame: one row per sentence, one 'text' column.
df = pd.DataFrame(data)

In [4]:
# Echo the raw dictionary (not the DataFrame) to confirm the corpus contents.
data

{'text': ['I love NLP',
  'NLP is fun',
  'Machine learning is cool',
  'I enjoy learning NLP',
  'Fun with machine learning and NLP']}

In [5]:
# Lowercase function words that clean_text removes from every sentence.
stopwords = {
    "i", "is", "and", "the", "a", "an", "with",
    "to", "in", "on", "for", "of", "was",
}

In [6]:
# Cleaning function
def clean_text(text):
    """Lowercase ``text``, strip punctuation, and drop stopwords.

    Every character that is neither a word character nor whitespace is
    deleted, the result is split on whitespace, and any token found in the
    module-level ``stopwords`` set is discarded.

    Returns the surviving tokens joined by single spaces.
    """
    stripped = re.sub(r'[^\w\s]', '', text.lower())
    kept = []
    for token in stripped.split():
        if token in stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

In [7]:
# Apply cleaning: normalise each sentence and store it beside the original.
df['cleaned_text'] = df['text'].map(clean_text)

In [8]:
# Bag of Words: fit a count vectorizer on the cleaned sentences.
bow_vectorizer = CountVectorizer()
bow_matrix = bow_vectorizer.fit_transform(df['cleaned_text'])
# Densify the matrix and label each column with its vocabulary term.
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=bow_vectorizer.get_feature_names_out(),
)

In [9]:
# TF-IDF: fit a TF-IDF vectorizer on the same cleaned sentences.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])
# Densify the matrix and label each column with its vocabulary term.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [14]:
# Report each stage. `end=" \n\n"` emits exactly what passing "\n" as a second
# positional argument would: a separating space, that newline, then print's own.
print("Cleaned Text:")
print(df[['text', 'cleaned_text']], end=" \n\n")

print("Bag of Words Representation:")
print(bow_df, end=" \n\n")

print("TF-IDF Representation:")
print(tfidf_df)

Cleaned Text:
                                text              cleaned_text
0                         I love NLP                  love nlp
1                         NLP is fun                   nlp fun
2           Machine learning is cool     machine learning cool
3               I enjoy learning NLP        enjoy learning nlp
4  Fun with machine learning and NLP  fun machine learning nlp 

Bag of Words Representation:
   cool  enjoy  fun  learning  love  machine  nlp
0     0      0    0         0     1        0    1
1     0      0    1         0     0        0    1
2     1      0    0         1     0        1    0
3     0      1    0         1     0        0    1
4     0      0    1         1     0        1    1 

TF-IDF Representation:
       cool     enjoy       fun  learning      love   machine       nlp
0  0.000000  0.000000  0.000000  0.000000  0.871247  0.000000  0.490845
1  0.000000  0.000000  0.819887  0.000000  0.000000  0.000000  0.572526
2  0.690159  0.000000  0.000000  0.4