In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

In [None]:
# Download required NLTK resources.
# punkt / punkt_tab back word_tokenize, stopwords backs the stopword list,
# wordnet backs WordNetLemmatizer.
NLTK_RESOURCES = ("punkt", "stopwords", "wordnet", "punkt_tab")

# nltk.download returns False when a package could not be fetched; collect those
# instead of silently ignoring the return value.
failed_downloads = [name for name in NLTK_RESOURCES if not nltk.download(name)]
if failed_downloads:
    # Only warn: the resources may already be installed locally (e.g. offline run),
    # in which case the rest of the notebook still works.
    print(f"Warning: could not download NLTK resources: {failed_downloads}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Sample dataset: each record pairs a sentence with its example category label
records = [
    ("Natural Language Processing is a branch of AI.", "NLP"),
    ("AI models are trained using data.", "AI"),
    ("Processing natural text is challenging!", "NLP"),
    ("Word embeddings capture meaning in text.", "ML"),
]

# Column-oriented view of the records, keyed by column name
data = {
    "Text": [sentence for sentence, _label in records],
    "Category": [label for _sentence, label in records],
}

In [None]:
# Build the working DataFrame from the sample dict (one column per key)
df = pd.DataFrame(data, columns=["Text", "Category"])

In [None]:
# Step 1: Text Cleaning Function
# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call to clean_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Lowercase ``text`` and strip digits and ASCII punctuation from it.

    Parameters
    ----------
    text : str or scalar
        Raw input. Missing values (``None``, ``NaN``, ``pd.NA``) produce an
        empty string; any other non-string value is converted with ``str()``
        before cleaning.

    Returns
    -------
    str
        The cleaned text. Whitespace is left untouched, so removing a token
        such as a number can leave consecutive or trailing spaces behind.
    """
    if not isinstance(text, str):
        # Empty cells reach .apply() as None/NaN and have no .lower() method.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()  # Lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text.translate(_PUNCTUATION_TABLE)  # Remove punctuation

# Clean every raw sentence element-wise and store the result in a new column
df["Cleaned_Text"] = df["Text"].map(clean_text)

In [None]:
# Step 2: Tokenization, Stopword Removal, and Lemmatization
stop_words = {*stopwords.words("english")}  # set of English stopwords for membership tests
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Tokenize ``text``, drop English stopwords, and lemmatize the rest.

    The text is split with ``word_tokenize``; tokens present in ``stop_words``
    are skipped and every remaining token is passed through
    ``lemmatizer.lemmatize`` with its default arguments.

    Returns the surviving lemmas joined by single spaces.
    """
    kept_lemmas = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue  # stopword: carries no signal for TF-IDF
        kept_lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(kept_lemmas)

# Tokenize / filter / lemmatize every cleaned sentence element-wise
df["Processed_Text"] = df["Cleaned_Text"].map(preprocess_text)

In [None]:
# Step 3: Label Encoding
# Fit the encoder on the category column and keep the integer codes it returns.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(df["Category"])
df["Label"] = encoded_labels

In [None]:
# Step 4: TF-IDF Representation
# Fit the vectorizer on the processed sentences and vectorize them in one pass.
tfidf_vectorizer = TfidfVectorizer()
processed_corpus = df["Processed_Text"]
tfidf_matrix = tfidf_vectorizer.fit_transform(processed_corpus)

In [None]:
# Convert TF-IDF matrix to DataFrame: one row per document, one column per vocabulary term
dense_scores = tfidf_matrix.toarray()
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)

In [None]:
# Step 5: Save Outputs
# Write each frame to its CSV file without the index column.
csv_outputs = (
    (df, "processed_data.csv"),  # original, cleaned and processed text plus labels
    (tfidf_df, "tfidf_representation.csv"),  # TF-IDF output
)
for frame, csv_name in csv_outputs:
    frame.to_csv(csv_name, index=False)

In [None]:
# Persist the fitted label encoder so the same category-to-integer mapping can be reloaded later
with open("label_encoder.pkl", "wb") as f:
    f.write(pickle.dumps(label_encoder))

In [None]:
# Persist the fitted TF-IDF vectorizer so new text can be vectorized with the same vocabulary later
with open("tfidf_vectorizer.pkl", "wb") as f:
    f.write(pickle.dumps(tfidf_vectorizer))

completion_message = " Text processing completed and output saved!"
print(completion_message)

 Text processing completed and output saved!
