Assignment 03: Perform text cleaning, remove stop words (any method), perform lemmatization (any method),
and apply label encoding. Create representations using TF-IDF. Save the outputs.

In [1]:
# Toy corpus: each sample is written as a (sentence, class label) pair so the
# alignment between a text and its label is visible at a glance. The pairs are
# unzipped into the two parallel lists used by the rest of the notebook.
texts, labels = map(list, zip(
    ("Students are learning Artificial Intelligence today.", "education"),
    ("Online education makes learning flexible and easy.", "education"),
    ("Teachers guide students for better understanding.", "education"),
    ("Technology improves the quality of education.", "technology"),
    ("Learning platforms help students practice coding.", "education"),
))


1 Text Cleaning

In [2]:
import re

def clean_text(text):
    """Lower-case ``text`` and reduce it to space-separated ASCII words.

    Steps:
      1. Lower-case the input.
      2. Drop apostrophes so contractions stay a single token
         ("don't" -> "dont").
      3. Replace every other character that is not ``a-z`` or whitespace with
         a space. Replacing (rather than deleting) keeps words that were only
         separated by punctuation or digits apart ("fast,cheap" ->
         "fast cheap" instead of "fastcheap").
      4. Collapse runs of whitespace and strip both ends.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string containing only ``a-z`` words joined by single
        spaces (possibly empty).
    """
    text = text.lower()
    # Straight and typographic apostrophes are removed without leaving a gap.
    text = re.sub(r"['\u2019]", "", text)
    # Anything else that is not a letter becomes a token boundary.
    text = re.sub(r"[^a-z\s]", " ", text)
    # split()/join normalises tabs, newlines and repeated spaces.
    return " ".join(text.split())

# Apply the cleaner to every raw sentence, preserving order.
cleaned_texts = list(map(clean_text, texts))


2 Stopword Removal

In [3]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def remove_stopwords(text):
    """Return ``text`` with every token found in ENGLISH_STOP_WORDS removed.

    The input is split on whitespace; surviving tokens keep their original
    order and are re-joined with single spaces.
    """
    kept_tokens = filter(
        lambda token: token not in ENGLISH_STOP_WORDS,
        text.split(),
    )
    return " ".join(kept_tokens)

# Filter stop words out of each cleaned sentence, preserving order.
no_stopwords_texts = list(map(remove_stopwords, cleaned_texts))


3 Lemmatization

In [4]:
def simple_lemmatizer(word):
    """Reduce a lower-case English word to an approximate base form.

    This is a small rule-based suffix stripper (closer to a stemmer than a
    dictionary lemmatizer). Rules are tried in this order and the first one
    that matches decides the result:

      * ``-ing`` / ``-ed``: stripped only when the remaining stem has at
        least three letters and contains a vowel, so short words such as
        "sing", "king" or "red" are left alone. Words ending in ``-eed``
        ("need", "speed") are never treated as past tense. After stripping,
        a doubled final consonant is undone ("running" -> "run") and a
        silent ``e`` is restored on short consonant-vowel-consonant stems
        ("coding" -> "code", "making" -> "make").
      * ``-ies`` -> ``-y`` for words longer than four letters
        ("studies" -> "study").
      * ``-sses`` / ``-xes`` / ``-zzes`` / ``-ches`` / ``-shes``: drop ``es``
        ("classes" -> "class").
      * plain ``-s``: dropped unless the word ends in ``ss``, ``us`` or
        ``is`` or has three letters or fewer ("class", "bus", "analysis" and
        "gas" are kept).

    Being heuristic, it still gets some words wrong (e.g. "improved" ->
    "improv"); use a dictionary-based lemmatizer when accuracy matters.

    Args:
        word: A single token, expected to be lower-case.

    Returns:
        The token with a recognised inflectional suffix removed, or the
        token unchanged when no rule applies.
    """
    vowels = "aeiouy"
    min_stem_len = 3

    def has_vowel(stem):
        # A stem without any vowel ("str" from "string") is not a real word.
        return any(ch in vowels for ch in stem)

    def vowel_groups(stem):
        # Number of maximal runs of consecutive vowels in the stem.
        count = 0
        previous_was_vowel = False
        for ch in stem:
            is_vowel = ch in vowels
            if is_vowel and not previous_was_vowel:
                count += 1
            previous_was_vowel = is_vowel
        return count

    def repair(stem):
        # Undo consonant doubling ("runn" -> "run"); l/s/z are usually part
        # of the base word ("call", "pass", "buzz") so they stay doubled.
        if (
            len(stem) >= 4
            and stem[-1] == stem[-2]
            and stem[-1] not in vowels
            and stem[-1] not in "lsz"
        ):
            return stem[:-1]
        # Restore a silent "e" on short consonant-vowel-consonant stems
        # ("cod" -> "code"); w/x/y endings never take it ("fix", "play").
        if (
            len(stem) >= 3
            and stem[-1] not in vowels + "wx"
            and stem[-2] in vowels
            and stem[-3] not in vowels
            and vowel_groups(stem) == 1
        ):
            return stem + "e"
        return stem

    def strip_verbal(suffix_len):
        # Shared handling for "-ing" and "-ed": strip only when a plausible
        # stem remains, otherwise keep the word intact.
        stem = word[:-suffix_len]
        if len(stem) >= min_stem_len and has_vowel(stem):
            return repair(stem)
        return word

    if word.endswith("ing"):
        return strip_verbal(3)
    if word.endswith("ed"):
        # "-eed" words ("need", "feed") are base forms, not past tense.
        if word.endswith("eed"):
            return word
        return strip_verbal(2)
    if word.endswith("ies") and len(word) > 4:
        return word[:-3] + "y"
    if word.endswith(("sses", "xes", "zzes", "ches", "shes")):
        return word[:-2]
    if (
        word.endswith("s")
        and not word.endswith(("ss", "us", "is"))
        and len(word) > 3
    ):
        return word[:-1]
    return word

def lemmatize_text(text):
    return " ".join(simple_lemmatizer(w) for w in text.split())

# Lemmatize each stop-word-free sentence, preserving order.
lemmatized_texts = list(map(lemmatize_text, no_stopwords_texts))


4 Label Encoding

In [5]:
from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct label strings first, then convert every label to
# its integer code. `encoder` stays available so the codes can be mapped back
# to their string labels later.
encoder = LabelEncoder().fit(labels)
encoded_labels = encoder.transform(labels)


5 TF-IDF Representation

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the vectorizer on the lemmatized sentences and build the document-term
# matrix in one pass (one row per sentence).
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(lemmatized_texts)

# Terms of the fitted vocabulary; used later as the column headers of the
# saved TF-IDF table.
feature_names = vectorizer.get_feature_names_out()

# Dense copy of the matrix so it can be placed in a DataFrame.
tfidf_array = tfidf_matrix.toarray()


6 Save Outputs to Files

In [7]:
import pandas as pd

# Per-document table: one row per sentence showing the text after every
# preprocessing stage, next to its string label and integer-encoded label.
df_text = pd.DataFrame(
    data={
        "Original_Text": texts,
        "Cleaned_Text": cleaned_texts,
        "No_Stopwords": no_stopwords_texts,
        "Lemmatized_Text": lemmatized_texts,
        "Label": labels,
        "Encoded_Label": encoded_labels,
    }
)
df_text.to_csv("text_preprocessing_output.csv", index=False)

# TF-IDF table: one row per sentence, one column per vocabulary term.
df_tfidf = pd.DataFrame(data=tfidf_array, columns=feature_names)
df_tfidf.to_csv("tfidf_output.csv", index=False)

# Report the files that were written, one per line.
print(
    "Files saved successfully!",
    "text_preprocessing_output.csv",
    "tfidf_output.csv",
    sep="\n",
)


Files saved successfully!
text_preprocessing_output.csv
tfidf_output.csv


In [8]:
# Offer the two CSV files as browser downloads when running in Google Colab.
# Outside Colab the `google.colab` package does not exist, so the import is
# guarded: the notebook then finishes cleanly instead of failing on its last
# cell, and the files simply remain in the working directory.
try:
    from google.colab import files
except ImportError:
    print(
        "Not running in Google Colab - skipping browser download. "
        "The CSV files are in the current working directory."
    )
else:
    files.download("text_preprocessing_output.csv")
    files.download("tfidf_output.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>