<a href="https://colab.research.google.com/github/KesteHarshada87/MachineLearning/blob/main/_NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Simple sample dataset
data = {
    "label": ["ham", "spam", "ham", "ham", "spam", "ham", "spam", "ham"],
    "message": [
        "Hey, are we still meeting today?",
        "Congratulations! You've won a free lottery ticket. Claim now!",
        "Can you send me the notes for class?",
        "I will call you in the evening.",
        "You won $1000 cash! Click here to claim.",
        "Don't forget about the meeting tomorrow.",
        "Get cheap loans now!!! Limited offer.",
        "Happy Birthday! Wish you a great day."
    ]
}

df = pd.DataFrame(data)
print(df)


  label                                            message
0   ham                   Hey, are we still meeting today?
1  spam  Congratulations! You've won a free lottery tic...
2   ham               Can you send me the notes for class?
3   ham                    I will call you in the evening.
4  spam           You won $1000 cash! Click here to claim.
5   ham           Don't forget about the meeting tomorrow.
6  spam              Get cheap loans now!!! Limited offer.
7   ham              Happy Birthday! Wish you a great day.


In [None]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download("stopwords")

# Initialize tools
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

# Preprocessing function
def preprocess_text(text):
    # 1. Lowercase
    text = text.lower()
    # 2. Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # 3. Tokenize (split by space)
    tokens = text.split()
    # 4. Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # 5. Stemming
    tokens = [stemmer.stem(word) for word in tokens]
    # Join back to string
    return " ".join(tokens)

# Apply preprocessing
df["clean_message"] = df["message"].apply(preprocess_text)

print(df[["message", "clean_message"]])

                                             message  \
0                   Hey, are we still meeting today?   
1  Congratulations! You've won a free lottery tic...   
2               Can you send me the notes for class?   
3                    I will call you in the evening.   
4           You won $1000 cash! Click here to claim.   
5           Don't forget about the meeting tomorrow.   
6              Get cheap loans now!!! Limited offer.   
7              Happy Birthday! Wish you a great day.   

                              clean_message  
0                      hey still meet today  
1  congratul youv free lotteri ticket claim  
2                           send note class  
3                                 call even  
4                     1000 cash click claim  
5                 dont forget meet tomorrow  
6                get cheap loan limit offer  
7             happi birthday wish great day  


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Use TfidfVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the cleaned messages
X = vectorizer.fit_transform(df["clean_message"])

# Labels (ham/spam)
y = df["label"]

# Show feature names (words)
print("Vocabulary:", vectorizer.get_feature_names_out())

# Show TF-IDF matrix shape
print("TF-IDF Matrix Shape:", X.shape)

# Convert first row to dense array for illustration
print("First message TF-IDF vector:\n", X[0].toarray())


Vocabulary: ['1000' 'birthday' 'call' 'cash' 'cheap' 'claim' 'class' 'click'
 'congratul' 'day' 'dont' 'even' 'forget' 'free' 'get' 'great' 'happi'
 'hey' 'limit' 'loan' 'lotteri' 'meet' 'note' 'offer' 'send' 'still'
 'ticket' 'today' 'tomorrow' 'wish' 'youv']
TF-IDF Matrix Shape: (8, 31)
First message TF-IDF vector:
 [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.51970849
  0.         0.         0.         0.43555627 0.         0.
  0.         0.51970849 0.         0.51970849 0.         0.
  0.        ]]


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)


Training set shape: (6, 31)
Testing set shape: (2, 31)


In [None]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()

nb_model.fit(X_train, y_train)

print("Model trained successfully!")


Model trained successfully!


In [None]:
# Predict on the test set
y_pred = nb_model.predict(X_test)

print("Predicted labels:", y_pred.tolist())
print("Actual labels:", y_test.tolist())


Predicted labels: ['ham', 'ham']
Actual labels: ['spam', 'ham']


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report (precision, recall, F1-score)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5

Classification Report:
               precision    recall  f1-score   support

         ham       0.50      1.00      0.67         1
        spam       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred, labels=["ham", "spam"])
print("Confusion Matrix:\n", cm)


Confusion Matrix:
 [[1 0]
 [1 0]]
