In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [2]:
df = pd.read_csv("/home/csl-4/Downloads/spam.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [6]:
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [8]:
df = df[['label', 'text', 'label_num']]  # Drop unnecessary columns
df.head()

Unnamed: 0,label,text,label_num
0,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,spam,"Subject: photoshop , windows , office . cheap ...",1
4,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [9]:
X = df['text']
y = df['label_num']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train.shape, X_test.shape

((4136,), (1035,))

In [10]:
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

X_train_vec.shape

(4136, 45240)

In [11]:
model = MultinomialNB()
model.fit(X_train_vec, y_train)

In [12]:
y_pred = model.predict(X_test_vec)
y_pred[:10]

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

In [13]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[729,  13],
       [ 16, 277]])

In [14]:
tn, fp, fn, tp = cm.ravel()
tn, fp, fn, tp

(729, 13, 16, 277)

In [15]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
error_rate = 1 - accuracy

accuracy, precision, recall, error_rate

(0.9719806763285024,
 0.9551724137931035,
 0.9453924914675768,
 0.02801932367149762)

In [16]:
# Print the metrics with good formatting
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"Error Rate: {error_rate * 100:.2f}%")

Accuracy: 97.20%
Precision: 95.52%
Recall: 94.54%
Error Rate: 2.80%


In [17]:
# Iris Dataset

In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB  # Using Gaussian Naive Bayes for continuous data
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

In [28]:
# Load the Iris dataset
df = pd.read_csv('/home/csl-4/Downloads/Iris.csv')

# Display the first few rows to inspect the data
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [29]:
# Drop the 'Id' column as it is not needed for classification
df = df.drop(columns=['Id'])

# Encode the 'Species' column (Iris-setosa: 0, Iris-versicolor: 1, Iris-virginica: 2)
df['Species'] = df['Species'].map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2})

# Display the cleaned data
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [30]:
# Features (X) and Target (y)
X = df.drop(columns=['Species'])
y = df['Species']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the size of the splits
X_train.shape, X_test.shape

((120, 4), (30, 4))

In [31]:
# Initialize and train the Multinomial Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

In [32]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Display the first 10 predictions
y_pred[:10]

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1])

In [33]:
# Confusion Matrix for multi-class classification
cm = confusion_matrix(y_test, y_pred)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm)

# Optionally, you can also calculate class-wise TP, FP, FN, and TN
for i in range(len(cm)):
    tp = cm[i, i]  # True Positives for class i
    fn = cm[i, :].sum() - tp  # False Negatives for class i
    fp = cm[:, i].sum() - tp  # False Positives for class i
    tn = cm.sum() - (tp + fn + fp)  # True Negatives for class i
    
    print(f"Class {i}: TP = {tp}, FN = {fn}, FP = {fp}, TN = {tn}")

Confusion Matrix:
[[10  0  0]
 [ 0  9  0]
 [ 0  3  8]]
Class 0: TP = 10, FN = 0, FP = 0, TN = 20
Class 1: TP = 9, FN = 0, FP = 3, TN = 18
Class 2: TP = 8, FN = 3, FP = 0, TN = 19


In [34]:
# Compute Accuracy, Precision, Recall, and Error Rate
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
error_rate = 1 - accuracy

# Print the metrics
print(f"Accuracy: {accuracy * 100:.2f}%")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"Error Rate: {error_rate * 100:.2f}%")

Accuracy: 90.00%
Precision: 91.67%
Recall: 90.91%
Error Rate: 10.00%
