In [4]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Download required NLTK resources only once
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Dataset
messages = [
    ['ham', 'Hey, lets meet up tomorrow.'],
    ['ham', 'Did you finish the report?'],
    ['ham', 'Call me when you get home.'],
    ['ham', 'The weather is nice today.'],
    ['ham', 'I enjoyed the movie last night.'],
    ['ham', 'Lets go for a walk in the park.'],
    ['ham', 'Happy birthday to you!'],
    ['ham', 'Thanks for the help yesterday.'],
    ['ham', 'See you at the office.'],
    ['ham', 'Good morning, have a great day.'],
    ['spam', 'Win a free iPhone now! Click here.'],
    ['spam', 'Congratulations, you won a lottery!'],
    ['spam', 'Get cheap viagra online.'],
    ['spam', 'Earn money from home easily.'],
    ['spam', 'Limited time offer, buy now!'],
    ['spam', 'Your account is suspended, verify now.'],
    ['spam', 'Free gift cards available.'],
    ['spam', 'Discounts on luxury watches.'],
    ['spam', 'Invest in bitcoin and get rich.'],
    ['spam', 'Claim your prize immediately.']
]

# Convert to DataFrame
data = pd.DataFrame(messages, columns=['label', 'message'])
data['label'] = data['label'].map({'ham': 0, 'spam': 1})  # Convert labels to numbers

# Text Preprocessing Function
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))
    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_message'] = data['message'].apply(preprocess_text)

# Feature Extraction
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['cleaned_message'])
y = data['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Predict New Email
new_email = ["Win a free gift!"]
cleaned = preprocess_text(new_email[0])
new_vec = vectorizer.transform([cleaned])
prediction = model.predict(new_vec)

print("Prediction (0 = ham, 1 = spam):", prediction[0])


Accuracy: 0.5

Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         2
           1       0.00      0.00      0.00         2

    accuracy                           0.50         4
   macro avg       0.25      0.50      0.33         4
weighted avg       0.25      0.50      0.33         4

Prediction (0 = ham, 1 = spam): 1


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [5]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Dataset
reviews = [
    ['positive', 'The product is amazing and works perfectly.'],
    ['positive', 'I love this item, highly recommend.'],
    ['positive', 'Great quality and fast delivery.'],
    ['positive', 'Exceeded my expectations.'],
    ['positive', 'Very satisfied with the purchase.'],
    ['positive', 'Best product Ive bought this year.'],
    ['positive', 'Excellent value for money.'],
    ['positive', 'Works like a charm.'],
    ['positive', 'Impressed with the features.'],
    ['positive', 'Would buy again.'],

    ['negative', 'The product broke after one use.'],
    ['negative', 'Poor quality, not worth it.'],
    ['negative', 'Did not work as advertised.'],
    ['negative', 'Very disappointing.'],
    ['negative', 'Waste of money.'],
    ['negative', 'Bad customer service too.'],
    ['negative', 'Arrived damaged.'],
    ['negative', 'Not recommended.'],
    ['negative', 'Cheap materials used.'],
    ['negative', 'Regret buying this.'],

    ['neutral', 'The product is okay, nothing special.'],
    ['neutral', 'It works but could be better.'],
    ['neutral', 'Average quality.'],
    ['neutral', 'Does the job but not great.'],
    ['neutral', 'Fine for the price.'],
    ['neutral', 'No complaints but no excitement.'],
    ['neutral', 'Standard item.'],
    ['neutral', 'Meets basic needs.'],
    ['neutral', 'Alright, I guess.'],
    ['neutral', 'Its just okay.']
]

# Convert to DataFrame
data = pd.DataFrame(reviews, columns=['sentiment', 'review'])
data['sentiment'] = data['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

# Preprocessing Function
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation + string.digits))
    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_review'] = data['review'].apply(preprocess_text)

# Feature Extraction
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['cleaned_review'])
y = data['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n",
      classification_report(y_test, y_pred,
                            target_names=['Negative', 'Neutral', 'Positive']))

# Predict New Review
new_review = ["The product is okay."]
cleaned = preprocess_text(new_review[0])
new_vec = vectorizer.transform([cleaned])
prediction = model.predict(new_vec)

sent_map = {0: 'Negative', 1: 'Neutral', 2: 'Positive'}
print("Predicted Sentiment:", sent_map[prediction[0]])


Accuracy: 0.16666666666666666

Classification Report:
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00         2
     Neutral       0.20      0.50      0.29         2
    Positive       0.00      0.00      0.00         2

    accuracy                           0.17         6
   macro avg       0.07      0.17      0.10         6
weighted avg       0.07      0.17      0.10         6

Predicted Sentiment: Neutral


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
