In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import pickle

# Download NLTK resources
# 'stopwords' backs stopwords.words(), 'wordnet' backs WordNetLemmatizer, and
# the Punkt models back nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer models from 'punkt_tab' instead
# of 'punkt'; fetch both so word_tokenize works on either version.
nltk.download('punkt_tab')

# Sample data: four toy documents, each paired with a class label
data = dict(
    Text=[
        "This is the first document.",
        "This document is the second document.",
        "And this is the third one.",
        "Is this the first document?",
    ],
    Label=['A', 'B', 'C', 'A'],
)

# Tabular view of the corpus: one row per document
df = pd.DataFrame(data)

# Text Cleaning and Lemmatization
# Shared helpers used by preprocess_text: the English stop-word list as a set
# (for fast membership tests) and a single reusable WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Lowercase, tokenize, filter and lemmatize a single document.

    Tokens that are not purely alphabetic, or that appear in ``stop_words``,
    are dropped. Each remaining token is passed through
    ``lemmatizer.lemmatize`` and the results are joined with single spaces.

    Args:
        text: The raw document string.

    Returns:
        The space-joined string of lemmatized tokens.
    """
    kept = []
    for token in nltk.word_tokenize(text.lower()):
        # Skip punctuation/numeric tokens and stop words before lemmatizing.
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)

# Run every raw document through the cleaning / lemmatization helper
df['Cleaned_Text'] = df['Text'].map(preprocess_text)

# Label Encoding: fit the encoder on the class names and store the codes
label_encoder = LabelEncoder()
df['Encoded_Label'] = label_encoder.fit_transform(df['Label'])

# TF-IDF Representation: fit the vectorizer on the cleaned corpus and transform it
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['Cleaned_Text'])

# Save Outputs: the TF-IDF matrix as a pickle, the annotated table as CSV
with open('tfidf_matrix.pkl', 'wb') as tfidf_file:
    pickle.dump(X_tfidf, tfidf_file)
df.to_csv('cleaned_data.csv', index=False)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rajkumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rajkumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rajkumar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Text Cleaning
# Definition: Removing unwanted elements like special characters, numbers, and extra spaces.
# Purpose: To prepare text for analysis and ensure consistency.
# Stop Words
# Definition: Common words with little semantic value, like "and," "the," "is."
# Removal: Enhances text processing by focusing on meaningful words.
# Label Encoding
# Definition: Maps each distinct category to an integer code so that machine learning algorithms can consume it.
# Use Case: Encoding categorical class labels (the target variable) for supervised learning.
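# TF-IDF Representation
# Definition: Represents each document as a vector of term weights, where a term's weight is its term frequency (TF) in the document multiplied by its inverse document frequency (IDF) across the corpus, so that words frequent in one document but rare overall are highlighted.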

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample Data: five example sentences, each carrying its own label
data = pd.DataFrame(
    [
        ("Text cleaning is an important step in natural language processing!", 'A'),
        ("Lemmatization reduces words to their base form, e.g., 'running' to 'run'.", 'B'),
        ("Stop words like 'and', 'the', 'is' are often removed before text analysis.", 'C'),
        ("Label encoding converts categorical labels into numerical values.", 'D'),
        ("TF-IDF is a widely used technique to represent text data numerically.", 'E'),
    ],
    columns=['text', 'label'],
)

# Text Cleaning
def clean_text(text):
    """Normalize raw text to lowercase ASCII words separated by single spaces.

    Every run of characters other than ASCII letters (punctuation, digits,
    whitespace) is replaced by one space instead of being deleted. Deleting
    them would fuse words that are joined only by punctuation, e.g.
    "Hello,world" would become "helloworld". Leading and trailing whitespace
    is stripped so no extra spaces remain.

    Args:
        text: The raw input string.

    Returns:
        The cleaned, lowercased string; an empty string if ``text`` contains
        no ASCII letters.
    """
    # Collapse each non-letter run into a single separator
    text = re.sub(r'[^a-zA-Z]+', ' ', text)
    return text.strip().lower()

data['cleaned_text'] = data['text'].apply(clean_text)

# Lemmatization
# word_tokenize needs the Punkt models. Newer NLTK releases load them from
# 'punkt_tab' instead of 'punkt', so fetch both to work on either version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Tokenize ``text`` and lemmatize every token.

    Each token produced by ``word_tokenize`` is passed to
    ``lemmatizer.lemmatize`` (no part-of-speech argument is supplied), and the
    results are joined back together with single spaces.

    Args:
        text: The string to lemmatize.

    Returns:
        The space-joined string of lemmatized tokens.
    """
    return ' '.join(map(lemmatizer.lemmatize, word_tokenize(text)))

# Lemmatize every cleaned document
data['lemmatized_text'] = data['cleaned_text'].map(lemmatize_text)

# Removing Stop Words
# Fetch the stop-word corpus, then keep the English list as a set so the
# membership test in remove_stopwords is a hash lookup.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def remove_stopwords(text):
    """Drop every token of ``text`` that appears in ``stop_words``.

    The text is split with ``word_tokenize``; tokens found in the module-level
    ``stop_words`` set are discarded and the survivors are joined with single
    spaces.

    Args:
        text: The string to filter.

    Returns:
        The space-joined string of remaining tokens.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

data['stopwords_removed'] = data['lemmatized_text'].apply(remove_stopwords)

# Label Encoding
label_encoder = LabelEncoder()
data['encoded_label'] = label_encoder.fit_transform(data['label'])

# TF-IDF Representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_representation = tfidf_vectorizer.fit_transform(data['stopwords_removed'])

# Combine dataframes
# One row per document, one column per vocabulary term. Reusing data's index
# keeps the rows aligned in the concat below, and the 'tfidf_' prefix stops
# vocabulary terms such as 'text' or 'label' from duplicating data's own
# column names.
tfidf_df = pd.DataFrame(
    tfidf_representation.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
    index=data.index,
).add_prefix('tfidf_')

# Lookup table with one row per class (not per document): original label
# next to the code the encoder assigns to it.
label_mapping = pd.DataFrame({
    'label': label_encoder.classes_,
    'encoded_label': label_encoder.transform(label_encoder.classes_),
})

# Concatenate dataframes
# label_mapping is deliberately left out of the column-wise concat: its rows
# describe classes, so joining it by row position would pair each class with
# an unrelated document and duplicate the 'label' / 'encoded_label' columns.
final_data = pd.concat([data, tfidf_df], axis=1)

# Save the combined dataframe to a CSV file, and the class lookup table to
# its own file so the encoding can still be inverted later.
final_data.to_csv('processed_data.csv', index=False)
label_mapping.to_csv('label_mapping.csv', index=False)



