In [6]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Sample data
data = [
    {"text": "NLTK is a leading platform for building Python programs to work with human language data.", "label": "technology"},
    {"text": "Word2Vec is a popular technique for generating word embeddings.", "label": "technology"},
    {"text": "TF-IDF stands for Term Frequency-Inverse Document Frequency.", "label": "technology"},
    {"text": "Bag-of-Words is a simple yet effective approach for text representation.", "label": "technology"},
    {"text": "Apples and oranges are fruits.", "label": "fruits"},
    {"text": "Bananas are a good source of potassium.", "label": "fruits"},
    {"text": "Mangoes are delicious.", "label": "fruits"}
]

# Convert data to DataFrame
df = pd.DataFrame(data)

# Text cleaning and lemmatization
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    tokens = word_tokenize(text.lower())
    clean_tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalnum() and token not in stop_words]
    return " ".join(clean_tokens)

df['clean_text'] = df['text'].apply(clean_text)

# Label encoding
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])

# TF-IDF representation
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])

# Save TF-IDF representation
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.to_csv("tfidf_representation.csv", index=False)

# Show outputs
print("Cleaned and lemmatized text data:")
print(df['clean_text'])
print("\nLabel encoding mapping:")
print(dict(zip(df['label'], df['label_encoded'])))
print("\nTF-IDF representation:")
print(tfidf_df)


Cleaned and lemmatized text data:
0    nltk leading platform building python program ...
1    word2vec popular technique generating word emb...
2                        stand term document frequency
3    simple yet effective approach text representation
4                                   apple orange fruit
5                         banana good source potassium
6                                      mango delicious
Name: clean_text, dtype: object

Label encoding mapping:
{'technology': 1, 'fruits': 0}

TF-IDF representation:
     apple  approach  banana  building      data  delicious  document  \
0  0.00000  0.000000     0.0  0.316228  0.316228   0.000000       0.0   
1  0.00000  0.000000     0.0  0.000000  0.000000   0.000000       0.0   
2  0.00000  0.000000     0.0  0.000000  0.000000   0.000000       0.5   
3  0.00000  0.408248     0.0  0.000000  0.000000   0.000000       0.0   
4  0.57735  0.000000     0.0  0.000000  0.000000   0.000000       0.0   
5  0.00000  0.000000     0.5  0

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Snehal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Snehal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Snehal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
