# Spam vs. Ham Email Classification (TF-IDF + Multinomial Naive Bayes)

In [2]:
pip install scikit-learn numpy pandas




In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [4]:
# Read the labelled e-mail corpus from its hard-coded CSV path, then print a
# structural summary of the frame (columns, non-null counts, dtypes, memory).
data = pd.read_csv(filepath_or_buffer='/content/spam_ham_dataset.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [5]:
# Plain-text preview of the first five rows of the loaded frame.
print(data.head(n=5))

   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\r\n...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\r\n( see...   
2        3624   ham  Subject: neon retreat\r\nho ho ho , we ' re ar...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\r\nthis deal is t...   

   label_num  
0          0  
1          0  
2          0  
3          1  
4          0  


In [6]:
# Encode the string labels as integers (ham -> 0, spam -> 1) for the classifier.
#
# A bare `.map()` that overwrites its own input is not safe to re-run: on a
# second execution the column already holds 0/1, none of the dict keys match,
# and every label silently becomes NaN. The dtype guard makes this cell
# idempotent, and the explicit check fails loudly if the file ever contains a
# label other than 'ham' / 'spam' instead of passing NaN targets downstream.
label_to_int = {'ham': 0, 'spam': 1}
if not pd.api.types.is_numeric_dtype(data['label']):
    encoded_label = data['label'].map(label_to_int)
    if encoded_label.isna().any():
        unexpected = sorted(
            data.loc[encoded_label.isna(), 'label'].astype(str).unique()
        )
        raise ValueError(f"Unexpected label values: {unexpected}")
    # Same integer dtype a fully matched .map() produces.
    data['label'] = encoded_label.astype('int64')

In [7]:
# Hold out 30% of the messages for evaluation; the fixed random_state makes
# the shuffle (and therefore the reported metrics) reproducible.
# NOTE(review): the split is not stratified although the classes are
# imbalanced — consider `stratify=data['label']`; this would change the
# recorded results, so confirm before adopting.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    random_state=42,
)

In [8]:
# TF-IDF features with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto that fitted vocabulary.
X_train_transformed = vectorizer.fit_transform(raw_documents=X_train)
X_test_transformed = vectorizer.transform(raw_documents=X_test)

In [9]:
# Multinomial Naive Bayes with default hyper-parameters, trained on the
# TF-IDF matrix of the training split.
model = MultinomialNB()
model.fit(X=X_train_transformed, y=y_train)

In [10]:
# Predicted class (0 = ham, 1 = spam) for every held-out message.
y_pred = model.predict(X=X_test_transformed)

In [11]:
# Evaluate the held-out predictions: overall accuracy, the confusion matrix,
# and the per-class precision / recall / F1 report, printed in that order.
evaluation = (
    ("Accuracy:", accuracy_score(y_test, y_pred)),
    ("Confusion Matrix:\n", confusion_matrix(y_test, y_pred)),
    ("Classification Report:\n", classification_report(y_test, y_pred)),
)
for heading, value in evaluation:
    print(heading, value)

Accuracy: 0.9220360824742269
Confusion Matrix:
 [[1120    1]
 [ 120  311]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      1.00      0.95      1121
           1       1.00      0.72      0.84       431

    accuracy                           0.92      1552
   macro avg       0.95      0.86      0.89      1552
weighted avg       0.93      0.92      0.92      1552



In [12]:
# A single hand-written message used to probe the trained classifier.
new_messages = [
"""Subject: vlc , 0 dln for sale , no prior pres . crip . tion needed
enjoy up to 80 % off
buy vlc , 0 dln online"""
]

# Project the message onto the vocabulary fitted on the training split
# (transform only, no re-fitting) and classify it.
new_messages_transformed = vectorizer.transform(raw_documents=new_messages)
predictions = model.predict(X=new_messages_transformed)
print("Predictions:", predictions)

Predictions: [1]


In [13]:
# Re-vectorise and re-classify `new_messages`. This repeats the two statements
# at the end of the cell that defines `new_messages`, so both names end up
# with the same values as before.
new_messages_transformed = vectorizer.transform(raw_documents=new_messages)
predictions = model.predict(X=new_messages_transformed)

In [14]:
# Translate the numeric class ids back into their human-readable names.
label_mapping = {0: 'ham', 1: 'spam'}
predicted_labels = list(map(label_mapping.__getitem__, predictions))

# Show each message next to the label predicted for it.
for message, label in zip(new_messages, predicted_labels):
    print(f"Message: {message}\nPrediction: {label}\n")

Message: Subject: vlc , 0 dln for sale , no prior pres . crip . tion needed
enjoy up to 80 % off
buy vlc , 0 dln online
Prediction: spam



In [15]:
# Second probe: a single business-style message, replacing the previous
# contents of `new_messages`.
new_messages = [
"""Subject: adjusted deal ticket
daren / o ' neal ,
effective 1 / 18 / 01 , deal ticket 137205 has been adjusted from a daily volume"""
]

# Vectorise with the already-fitted TF-IDF vocabulary, then classify.
new_messages_transformed = vectorizer.transform(raw_documents=new_messages)
predictions = model.predict(X=new_messages_transformed)
print("Predictions:", predictions)

Predictions: [0]


In [16]:
# Re-run vectorisation and prediction for the current `new_messages`. This
# repeats the statements that close the cell defining `new_messages`, so the
# values are unchanged.
new_messages_transformed = vectorizer.transform(raw_documents=new_messages)
predictions = model.predict(X=new_messages_transformed)

In [17]:
# Numeric class id -> readable label.
label_mapping = {0: 'ham', 1: 'spam'}
predicted_labels = [label_mapping[class_id] for class_id in predictions]

# Print every message together with its predicted label.
for message, label in zip(new_messages, predicted_labels):
    print(f"Message: {message}\nPrediction: {label}\n")

Message: Subject: adjusted deal ticket
daren / o ' neal ,
effective 1 / 18 / 01 , deal ticket 137205 has been adjusted from a daily volume
Prediction: ham

