<a href="https://colab.research.google.com/github/NSambhajiS/NLP-Labs/blob/main/Lab_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle


In [14]:
# Download necessary resources
# 'stopwords' backs the stopword list used later in the notebook; 'wordnet' and
# 'omw-1.4' are presumably the corpora the WordNetLemmatizer relies on.
NLTK_RESOURCES = ('stopwords', 'wordnet', 'omw-1.4')

# nltk.download returns a boolean status for each resource. Keep all of them so
# a failure of an early download is not masked by a later one succeeding.
download_status = [nltk.download(resource) for resource in NLTK_RESOURCES]

# Cell result: True only when every resource was downloaded (or already present).
all(download_status)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
# Sample dataset
# Four short example sentences and their sentiment labels, kept as parallel
# lists: sample_labels[i] is the label for sample_texts[i].
sample_texts = [
    "I love Natural Language Processing!",
    "Text preprocessing is an important step in NLP.",
    "Machine Learning helps improve NLP models.",
    "Data cleaning enhances text analysis.",
]
sample_labels = ['positive', 'neutral', 'positive', 'neutral']

# The dict keys become the DataFrame column names, in insertion order.
data = {'text': sample_texts, 'label': sample_labels}
df = pd.DataFrame(data)


In [16]:
# Text Cleaning
def clean_text(text):
    """Lowercase ``text`` and delete everything except ASCII letters and whitespace.

    Digits, punctuation and non-ASCII characters are removed outright (they are
    not replaced by a space), so "state-of-the-art" becomes "stateoftheart".
    Whitespace characters are kept exactly as they appear in the input.

    Args:
        text: The string to normalise.

    Returns:
        The lowercased string with all disallowed characters stripped.
    """
    unwanted_chars = re.compile(r'[^a-zA-Z\s]')
    # Lowercase first, then strip: the filter runs on the lowercased string.
    return unwanted_chars.sub('', text.lower())

# Normalise every raw sentence and store the result in a new column, leaving
# the original 'text' column untouched.
raw_text = df['text']
df['cleaned_text'] = raw_text.apply(clean_text)


In [17]:
# Stopword Removal
# A set gives constant-time membership checks for each token.
stop_words = set(stopwords.words('english'))


def remove_stopwords(sentence):
    """Return ``sentence`` with every whitespace-separated token found in ``stop_words`` dropped.

    The surviving tokens are re-joined with single spaces, so any original
    run of whitespace collapses to one space.
    """
    kept_tokens = [token for token in sentence.split() if token not in stop_words]
    return ' '.join(kept_tokens)


df['text_no_stopwords'] = df['cleaned_text'].apply(remove_stopwords)


In [18]:
# Lemmatization
lemmatizer = WordNetLemmatizer()


def lemmatize_text(sentence):
    """Lemmatize each whitespace-separated token of ``sentence`` and re-join with spaces.

    ``lemmatize`` is called with the token only (no part-of-speech argument),
    so the lemmatizer's default part of speech applies to every word.
    """
    lemmas = [lemmatizer.lemmatize(token) for token in sentence.split()]
    return ' '.join(lemmas)


df['lemmatized_text'] = df['text_no_stopwords'].apply(lemmatize_text)


In [19]:
# Label Encoding
# Fit the encoder on the string labels and map them to integer ids in one
# pass. The fitted encoder is kept because it is pickled later in the notebook.
label_encoder = LabelEncoder()
label_ids = label_encoder.fit_transform(df['label'])
df['encoded_label'] = label_ids


In [20]:
# TF-IDF Representation
# The vectorizer is fitted on the lemmatized sentences and used to transform
# them in the same call. The fitted vectorizer is kept for pickling later.
tfidf_vectorizer = TfidfVectorizer()
corpus = df['lemmatized_text']
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)


In [21]:
# Convert TF-IDF to DataFrame
# Column labels come from the fitted vectorizer; the matrix is converted to a
# dense array so it can be wrapped in a DataFrame.
feature_names = tfidf_vectorizer.get_feature_names_out()
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=feature_names)

In [22]:
# Save Outputs
# The processed frame (raw text plus every derived column) goes to CSV
# without the index column.
df.to_csv('processed_text_data.csv', index=False)

# Pickle the fitted transformers, each into its own file.
fitted_artifacts = (
    ('label_encoder.pkl', label_encoder),
    ('tfidf_vectorizer.pkl', tfidf_vectorizer),
)
for pickle_path, fitted_obj in fitted_artifacts:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(fitted_obj, handle)

# Cell result: a tuple with the first rows of both frames.
df.head(), tfidf_df.head()

(                                              text     label  \
 0              I love Natural Language Processing!  positive   
 1  Text preprocessing is an important step in NLP.   neutral   
 2       Machine Learning helps improve NLP models.  positive   
 3            Data cleaning enhances text analysis.   neutral   
 
                                      cleaned_text  \
 0              i love natural language processing   
 1  text preprocessing is an important step in nlp   
 2       machine learning helps improve nlp models   
 3            data cleaning enhances text analysis   
 
                            text_no_stopwords  \
 0           love natural language processing   
 1      text preprocessing important step nlp   
 2  machine learning helps improve nlp models   
 3       data cleaning enhances text analysis   
 
                            lemmatized_text  encoded_label  
 0         love natural language processing              1  
 1    text preprocessing importa