1. Text Dataset + Labels

In [12]:
texts = [
    "Regular exercise improves mental health and reduces stress.",
    "Doctors recommend drinking enough water every day.",
    "Healthy eating habits prevent many lifestyle diseases.",
    "Meditation helps people stay calm and focused.",
    "Good sleep is essential for maintaining overall wellbeing.",
    "Walking daily boosts heart health and fitness.",
    "Balanced diet supports strong immunity and energy levels.",
    "Mental health awareness is important in modern society."
]

labels = [
    "health",
    "health",
    "health",
    "wellbeing",
    "wellbeing",
    "fitness",
    "health",
    "wellbeing"
]


In [None]:
2. Text Cleaning

In [13]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

cleaned_texts = [clean_text(t) for t in texts]

print("Cleaned Texts:")
for t in cleaned_texts:
    print(t)


Cleaned Texts:
regular exercise improves mental health and reduces stress
doctors recommend drinking enough water every day
healthy eating habits prevent many lifestyle diseases
meditation helps people stay calm and focused
good sleep is essential for maintaining overall wellbeing
walking daily boosts heart health and fitness
balanced diet supports strong immunity and energy levels
mental health awareness is important in modern society


In [None]:
3. Stopword Removal

In [14]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def remove_stopwords(text):
    words = text.split()
    filtered = [w for w in words if w not in ENGLISH_STOP_WORDS]
    return " ".join(filtered)

no_stopwords_texts = [remove_stopwords(t) for t in cleaned_texts]

print("\nAfter Stopword Removal:")
for t in no_stopwords_texts:
    print(t)



After Stopword Removal:
regular exercise improves mental health reduces stress
doctors recommend drinking water day
healthy eating habits prevent lifestyle diseases
meditation helps people stay calm focused
good sleep essential maintaining overall wellbeing
walking daily boosts heart health fitness
balanced diet supports strong immunity energy levels
mental health awareness important modern society


4. Lemmatization (simple, no download issues)

In [15]:
def simple_lemmatizer(word):
    if word.endswith("ing"):
        return word[:-3]
    elif word.endswith("ed"):
        return word[:-2]
    elif word.endswith("s"):
        return word[:-1]
    return word

def lemmatize_text(text):
    return " ".join(simple_lemmatizer(w) for w in text.split())

lemmatized_texts = [lemmatize_text(t) for t in no_stopwords_texts]

print("\nAfter Lemmatization:")
for t in lemmatized_texts:
    print(t)



After Lemmatization:
regular exercise improve mental health reduce stres
doctor recommend drink water day
healthy eat habit prevent lifestyle disease
meditation help people stay calm focus
good sleep essential maintain overall wellbe
walk daily boost heart health fitnes
balanc diet support strong immunity energy level
mental health awarenes important modern society


5. Label Encoding

In [16]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels)

print("\nOriginal Labels:", labels)
print("Encoded Labels:", encoded_labels)



Original Labels: ['health', 'health', 'health', 'wellbeing', 'wellbeing', 'fitness', 'health', 'wellbeing']
Encoded Labels: [1 1 1 2 2 0 1 2]


6. TF-IDF Representation

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(lemmatized_texts)

print("\nTF-IDF Feature Names:")
print(vectorizer.get_feature_names_out())

print("\nTF-IDF Matrix:")
print(tfidf_matrix.toarray())



TF-IDF Feature Names:
['awarenes' 'balanc' 'boost' 'calm' 'daily' 'day' 'diet' 'disease'
 'doctor' 'drink' 'eat' 'energy' 'essential' 'exercise' 'fitnes' 'focus'
 'good' 'habit' 'health' 'healthy' 'heart' 'help' 'immunity' 'important'
 'improve' 'level' 'lifestyle' 'maintain' 'meditation' 'mental' 'modern'
 'overall' 'people' 'prevent' 'recommend' 'reduce' 'regular' 'sleep'
 'society' 'stay' 'stres' 'strong' 'support' 'walk' 'water' 'wellbe']

TF-IDF Matrix:
[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.4007901  0.         0.         0.         0.
  0.28984843 0.         0.         0.         0.         0.
  0.4007901  0.         0.         0.         0.         0.33589338
  0.         0.         0.         0.         0.         0.4007901
  0.4007901  0.         0.         0.         0.4007901  0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.      