In [None]:
import re
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Load spaCy's small English pipeline ('en_core_web_sm').
# `spacy` is not among the notebook's imports, so it is imported here;
# without this the cell raises NameError on a fresh kernel.
# The model package must already be installed in the environment.
import spacy

nlp = spacy.load('en_core_web_sm')

In [None]:
# Fetch the NLTK data this notebook relies on: the stop-word lists (read via
# nltk.corpus.stopwords) and the WordNet corpus (used by WordNetLemmatizer).
# The recorded run reports "already up-to-date" for stopwords, i.e. an
# existing copy is not fetched again.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Toy sentiment corpus: one raw sentence per entry.
data = [
    'I love programming!',
    'Python is amazing...',
    'I enjoy solving problems.',
    'i hate c#',
]
# Class label for the sentence at the same index in `data`.
labels = [
    'positive',
    'positive',
    'positive',
    'negative',
]

In [None]:
# Initialize the lemmatizer
# One shared WordNetLemmatizer instance; lemmatize_text() calls it once per
# token.
lemmatizer = WordNetLemmatizer()

In [None]:
# Step 1: Text Cleaning
def clean_text(text):
    """Normalize raw text to lowercase ASCII words separated by single spaces.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, symbols, accented letters) is treated as a word separator
    instead of being deleted outright. Deleting them fused punctuation-joined
    words together ("fast,cheap" became "fastcheap"); replacing them with a
    space keeps the words apart ("fast cheap"). As a consequence, apostrophes
    split contractions ("don't" -> "don t").

    Args:
        text: The raw input string.

    Returns:
        The cleaned string: lowercase, containing only a-z and single spaces,
        with no leading or trailing whitespace. May be empty.
    """
    # Substitute a space (not the empty string) so neighbouring words that
    # were only separated by punctuation or digits remain distinct tokens.
    text = re.sub(r'[^A-Za-z\s]', ' ', text)
    # Lowercase the text
    text = text.lower()
    # Collapse whitespace runs (including those introduced above) and trim.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Run the cleaner over every raw sentence, preserving order.
data_cleaned = list(map(clean_text, data))

In [None]:
# Step 2: Lemmatization using WordNetLemmatizer
def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is handed to the module-level ``lemmatizer`` on its own, with
    no part-of-speech argument, and the results are rejoined with single
    spaces in their original order.

    Args:
        text: A string of whitespace-separated tokens.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    return " ".join(lemmatizer.lemmatize(token) for token in text.split())

# Lemmatize each cleaned sentence, preserving order.
data_lemmatized = list(map(lemmatize_text, data_cleaned))

In [None]:
# Step 3: Stop Words Removal
# Built once as a set so the per-token membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))


def remove_stop_words(text):
    """Drop every token of ``text`` that appears in ``stop_words``.

    Args:
        text: A string of whitespace-separated tokens.

    Returns:
        The surviving tokens, in their original order, joined by single
        spaces (an empty string if nothing survives).
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_tokens)

# Filter stop words out of each lemmatized sentence, preserving order.
data_no_stopwords = list(map(remove_stop_words, data_lemmatized))

In [None]:
# Step 4: Label Encoding
# Encoder that turns the string class labels into integer codes.
label_encoder = LabelEncoder()


# Fit on `labels` and encode them in a single call. The recorded run prints
# [1 1 1 0], i.e. 'positive' -> 1 and 'negative' -> 0.
labels_encoded = label_encoder.fit_transform(labels)

In [None]:
# Step 5: TF-IDF Vectorization
# Default-configured vectorizer, fitted on the fully preprocessed sentences
# and applied to them in the same call.
# NOTE(review): the recorded matrix has 8 columns and the row for 'hate c'
# has a single non-zero entry, so the one-letter token 'c' did not make it
# into the vocabulary. Presumably this is TfidfVectorizer's default
# token_pattern, which only keeps tokens of 2+ characters; confirm, and
# override token_pattern if single-letter terms matter.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(data_no_stopwords)

In [None]:
# Output the results
# Each section is a heading line followed by its value; consecutive sections
# are separated by one blank line.
report_sections = (
    ("Cleaned and Lemmatized Text (no stopwords):", data_no_stopwords),
    ("Encoded Labels:", labels_encoded),
    ("TF-IDF Matrix:", tfidf_matrix.toarray()),
)
for position, (heading, payload) in enumerate(report_sections):
    if position > 0:
        print()
    print(heading)
    print(payload)

Cleaned and Lemmatized Text (no stopwords):
['love programming', 'python amazing', 'enjoy solving problem', 'hate c']

Encoded Labels:
[1 1 1 0]

TF-IDF Matrix:
[[0.         0.         0.         0.70710678 0.         0.70710678
  0.         0.        ]
 [0.70710678 0.         0.         0.         0.         0.
  0.70710678 0.        ]
 [0.         0.57735027 0.         0.         0.57735027 0.
  0.         0.57735027]
 [0.         0.         1.         0.         0.         0.
  0.         0.        ]]
