# **Perform tokenization, stopword removal, stemming, and lemmatization on a sample dataset. Compare how these preprocessing steps impact the quality of text representation.**

# Importing Libraries



In [None]:
!pip install nltk pandas



In [3]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data used below: the punkt_tab tokenizer models and the
# stopwords corpus first (no displayed result is needed from these two calls).
for resource in ("punkt_tab", "stopwords"):
    nltk.download(resource)
# Kept as the cell's final expression so its status value is still displayed.
nltk.download("wordnet")


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

# Loading the Dataset

In [4]:
# Read only the review text and its sentiment label, discard rows with a
# missing value in either column, and keep the first 3000 rows for speed.
df = (
    pd.read_csv("/content/steam_review.csv", usecols=["content", "is_positive"])
    .dropna()
    .head(3000)
)
df.head()


Unnamed: 0,content,is_positive
0,At least its a counter strike -1/100,Negative
1,Uh... So far my playthrough has not been great...,Negative
2,Better mechanics than cs2,Negative
3,buggy mess and NOT fun to play at all,Negative
4,"Whoever came up with this, is gonna fucking ge...",Negative


# Data Preprocessing

In [6]:
# Shared NLTK helpers, built once and reused by every preprocess() call.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words("english"))


def preprocess(text):
    """Run one piece of text through each preprocessing stage.

    Args:
        text: The raw text to process; it must be a string, because it is
            lower-cased before tokenization.

    Returns:
        A 4-tuple ``(tokens, filtered, stemmed, lemmatized)``:

        * ``tokens`` - ``word_tokenize`` output for the lower-cased text.
        * ``filtered`` - the tokens that are purely alphabetic and are not
          in the English stop-word set.
        * ``stemmed`` - the Porter stem of every filtered token.
        * ``lemmatized`` - the WordNet lemma of every filtered token. It is
          derived from ``filtered``, not from ``stemmed``, so the last two
          lists are independent alternatives rather than successive steps.
    """
    # Stage 1: tokenize the lower-cased text.
    tokens = word_tokenize(text.lower())

    # Stage 2: drop punctuation/numeric tokens and English stop words.
    filtered = [
        token
        for token in tokens
        if token.isalpha() and token not in stop_words
    ]

    # Stages 3 and 4 both start from the filtered tokens.
    # lemmatize() is called without a part-of-speech argument.
    stemmed = list(map(stemmer.stem, filtered))
    lemmatized = list(map(lemmatizer.lemmatize, filtered))

    return tokens, filtered, stemmed, lemmatized


In [8]:
# Run the pipeline once per review and fan the four stage outputs out into
# their own columns (previously preprocess() was executed four times per row,
# once for each column). nltk is already imported and punkt_tab already
# downloaded in the setup cell, so neither is repeated here.
stage_columns = ["tokens", "no_stopwords", "stemmed", "lemmatized"]

# One (tokens, filtered, stemmed, lemmatized) tuple per review.
processed = df["content"].apply(preprocess)

for position, column in enumerate(stage_columns):
    # Bind the position as a default argument so each column picks its own slot.
    df[column] = processed.map(lambda stages, i=position: stages[i])

df.head()

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,content,is_positive,tokens,no_stopwords,stemmed,lemmatized
0,At least its a counter strike -1/100,Negative,"[at, least, its, a, counter, strike, -1/100]","[least, counter, strike]","[least, counter, strike]","[least, counter, strike]"
1,Uh... So far my playthrough has not been great...,Negative,"[uh, ..., so, far, my, playthrough, has, not, ...","[uh, far, playthrough, great, glitched, textur...","[uh, far, playthrough, great, glitch, textur, ...","[uh, far, playthrough, great, glitched, textur..."
2,Better mechanics than cs2,Negative,"[better, mechanics, than, cs2]","[better, mechanics]","[better, mechan]","[better, mechanic]"
3,buggy mess and NOT fun to play at all,Negative,"[buggy, mess, and, not, fun, to, play, at, all]","[buggy, mess, fun, play]","[buggi, mess, fun, play]","[buggy, mess, fun, play]"
4,"Whoever came up with this, is gonna fucking ge...",Negative,"[whoever, came, up, with, this, ,, is, gon, na...","[whoever, came, gon, na, fucking, get, negativ...","[whoever, came, gon, na, fuck, get, neg, revie...","[whoever, came, gon, na, fucking, get, negativ..."


# Comparing Representation Quality

In [9]:
def get_vocab_size(list_of_docs):
    """Count the distinct tokens across a collection of tokenized documents.

    Args:
        list_of_docs: An iterable of documents, where each document is itself
            an iterable of hashable tokens (for example a DataFrame column
            whose cells are token lists).

    Returns:
        The number of unique tokens found over all documents; 0 when there
        are no documents or every document is empty.
    """
    distinct_tokens = {token for doc in list_of_docs for token in doc}
    return len(distinct_tokens)

# Map each report label to the DataFrame column that holds that stage's tokens.
# These columns are created by the preprocessing cell above, so they are reused
# here instead of running preprocess() over every review a second time.
vocab_sources = {
    "Original Tokens": "tokens",
    "After Stopword Removal": "no_stopwords",
    "After Stemming": "stemmed",
    "After Lemmatization": "lemmatized",
}

# Fail with a clear message if this cell is run before the preprocessing cell.
missing = [column for column in vocab_sources.values() if column not in df.columns]
assert not missing, f"Run the preprocessing cell first; missing columns: {missing}"

# Vocabulary size (number of distinct tokens) after each preprocessing stage.
results = {
    label: get_vocab_size(df[column]) for label, column in vocab_sources.items()
}

pd.DataFrame(results, index=["Vocabulary Size"])

Unnamed: 0,Original Tokens,After Stopword Removal,After Stemming,After Lemmatization
Vocabulary Size,9655,8238,6213,7550
