In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import pickle

# Download the NLTK resources this notebook relies on.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (3.9+); 'punkt' is kept so older installations keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Sample data: each record pairs one sentence with its category label
labelled_sentences = [
    ("The football match was thrilling and full of excitement.", 'sports'),
    ("Artificial intelligence is transforming technology rapidly.", 'technology'),
    ("I love making delicious pasta and trying new recipes.", 'food'),
    ("The new smartphone launch surprised the tech world.", 'technology'),
    ("The player scored a stunning goal in the last minute!", 'sports'),
    ("I love to eat pizza and sandwiches.", 'food'),
    ("Farmers are cultivating the field.", "Agriculture"),
    ("Village people will harvest the field tomorrow", "Agriculture"),
]

# Column-oriented view of the same records, in the original row order
data = {
    'text': [sentence for sentence, _ in labelled_sentences],
    'category': [label for _, label in labelled_sentences],
}

df = pd.DataFrame(data)

# Shared resources for text preprocessing: English stop words and a WordNet lemmatizer
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise a sentence into a space-joined string of lemmas.

    The text is lower-cased and tokenized. Tokens that are not purely
    alphabetic, or that appear in ``stop_words``, are dropped; every
    remaining token is lemmatized.

    Args:
        text: The raw sentence to clean.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives).
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Clean every sentence, then turn the cleaned text into TF-IDF features
df['clean_text'] = df['text'].map(preprocess)

vectorizer = TfidfVectorizer()
y = df['category']
X = vectorizer.fit_transform(df['clean_text'])

# Fit a multinomial Naive Bayes classifier on the TF-IDF matrix
model = MultinomialNB().fit(X, y)

# Persist the trained classifier and the fitted vectorizer as separate pickle files
for path, artifact in (('text_model.pkl', model), ('vectorizer.pkl', vectorizer)):
    with open(path, 'wb') as f:
        pickle.dump(artifact, f)

print("✅ Model and vectorizer are saved successfully!")


✅ Model and vectorizer are saved successfully!


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Restore the classifier and the vectorizer from their pickle files.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
with open('text_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

with open('vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

# Rebuild the preprocessing resources (same setup as training)
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Normalise a sentence into a space-joined string of lemmas.

    The text is lower-cased and tokenized. Tokens that are not purely
    alphabetic, or that appear in ``stop_words``, are dropped; every
    remaining token is lemmatized. New sentences must be cleaned the same
    way as the training text so the fitted vectorizer sees comparable tokens.

    Args:
        text: The raw sentence to clean.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        no token survives).
    """
    kept = []
    for token in word_tokenize(text.lower()):
        if not token.isalpha() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Predict Category for New Sentence
# Keeps prompting until the user types 'exit'.
while True:
    # Strip surrounding whitespace so inputs such as "exit " still stop the loop
    new_text = input("\nEnter a sentence (or type 'exit' to stop): ").strip()
    if new_text.lower() == 'exit':
        break
    clean_text = [preprocess(new_text)]
    new_vec = vectorizer.transform(clean_text)
    # transform() ignores words outside the fitted vocabulary. An all-zero
    # vector (empty input, only stop words, or only unseen words) would make
    # the classifier fall back to its class priors and print an arbitrary
    # label, so report that instead of predicting.
    if new_vec.nnz == 0:
        print("⚠️ No known words in that sentence, so no category can be predicted.")
        continue
    prediction = model.predict(new_vec)
    print("➡️ Predicted Category:", prediction[0])


➡️ Predicted Category: Agriculture
➡️ Predicted Category: technology
