In [None]:
#import a necessary

import pandas as pd
from sklearn.preprocessing import LabelEncoder
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#load the dataset
df= pd.read_csv("spam.csv")

# Convert labels into binary values (spam = 1, ham = 0)
df["spamORham"]= df["spamORham"].map({"spam":0 , "ham":1})

df.drop(["Unnamed: 0"], axis=1 , inplace=True)

# Text Preprocessing Function
def preprocessing(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', ' ', text)  # Remove numbers
    text = text.translate(str.maketrans('', '' , string.punctuation))  # Remove punctuation
    text = text.strip() # Remove extra spaces
    return text

df["cleanedMessage"]=df["Message"].apply(preprocessing)

df[['Message', 'cleanedMessage']].head()

#  Step 3: Convert Text to Numerical Features
vector= TfidfVectorizer(stop_words="english")

x= vector.fit_transform(df["cleanedMessage"])
y = df["spamORham"]

#Split the train, test data
x_train, x_test , y_train, y_test = train_test_split(x, y , test_size=0.3 , random_state=42)

# Create the model
model = RandomForestClassifier()

model_train= model.fit(x_train,y_train) #train the model

model_predict = model.predict(x_test) #test the model

#Evaluvate the prformance
acc= accuracy_score(y_test, model_predict)

print("\nAccurancy:\n",acc)
print("\nconfusion_matrix:\n", confusion_matrix(y_test, model_predict))
print("\nClassification report:\n", classification_report(y_test, model_predict))

#test the new message

new_messages = ["You have won $1000! Claim your prize now!", "Let's meet for coffee tomorrow."]
new_messages_cleaned = [preprocessing(msg) for msg in new_messages]
new_messages_vectorized = vector.transform(new_messages_cleaned)
predictions = model.predict(new_messages_vectorized)

for msg, label in zip(new_messages, predictions):
    print(f"Message: {msg} --> {'Spam' if label == 1 else 'Ham'}")


Accurancy:
 0.9742822966507177

confusion_matrix:
 [[ 176   43]
 [   0 1453]]

Classification report:
               precision    recall  f1-score   support

           0       1.00      0.80      0.89       219
           1       0.97      1.00      0.99      1453

    accuracy                           0.97      1672
   macro avg       0.99      0.90      0.94      1672
weighted avg       0.98      0.97      0.97      1672

Message: You have won $1000! Claim your prize now! --> Ham
Message: Let's meet for coffee tomorrow. --> Spam
