In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the labelled messages; the preview below shows labels in 'v1' and text in 'v2'.
DATASET_PATH = 'Spam Email Detection.xlsx'
df = pd.read_excel(DATASET_PATH)
print(df.head())

def clean_email(text):
    """Normalise one message for bag-of-words vectorisation.

    The text is lower-cased, every character that is neither alphanumeric
    nor whitespace is removed, and leading/trailing whitespace is trimmed.

    Args:
        text: The message. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN, e.g. an empty
            spreadsheet cell) are treated as an empty message.

    Returns:
        The cleaned string (possibly empty).
    """
    # NaN is the only float that is not equal to itself. Without this guard
    # str(NaN) would inject the bogus word "nan" into the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    kept = (ch for ch in text.lower() if ch.isalnum() or ch.isspace())
    return ''.join(kept).strip()

# Clean into a separate Series so the raw 'v2' column is left untouched and
# re-running this cell always starts from the same input.
messages_clean = df['v2'].apply(clean_email)

# Split BEFORE vectorising: fitting the vocabulary on the whole dataset would
# let words that only occur in the test messages leak into the feature space,
# making the reported accuracy optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    messages_clean, df['v1'], test_size=0.2, random_state=42
)

# Learn the bag-of-words vocabulary from the training messages only, then
# project the test messages onto that same vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Multinomial Naive Bayes classifier trained on the word counts
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Predictions on the held-out test set
y_pred = nb_classifier.predict(X_test)

# Share of test messages whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

def predict_spam(email_content):
    """Classify a single message with the fitted vectorizer and classifier.

    Args:
        email_content: Raw message text; it is passed through
            ``clean_email`` before being vectorised.

    Returns:
        The first (only) label returned by ``nb_classifier.predict``.
    """
    features = vectorizer.transform([clean_email(email_content)])
    predicted_labels = nb_classifier.predict(features)
    return predicted_labels[0]

# Classify one hand-picked example message and report the verdict.
email_content = "HOT LIVE FANTASIES call now 08707509020 Just 20p per min NTT Ltd, PO Box 1327 Croydon CR9 5WB 0870..k"
prediction = predict_spam(email_content)
print("This email is likely spam." if prediction == 'spam' else "This email is not spam.")


     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
Accuracy: 97.85%
This email is likely spam.


In [9]:
def predict_spam_by_row_number(df, row_number):
    """Classify the message stored at a given row of ``df``.

    The cleaned message is printed, followed by a blank line, and then
    classified via ``predict_spam``.

    Args:
        df: DataFrame holding the message text in column 'v2'.
        row_number: Zero-based positional row index (used with ``.iloc``).

    Returns:
        "spam" if the predicted label is 'spam', otherwise "not spam".

    Raises:
        IndexError: Propagated from ``.iloc`` when ``row_number`` is out of
            range.
    """
    cleaned_content = clean_email(df['v2'].iloc[row_number])
    print(cleaned_content)
    print()

    # Delegate to predict_spam instead of fitting a fresh vectorizer and
    # classifier on this one row with its own label: a model trained on a
    # single labelled sample can only echo that label back, so it would not
    # be a prediction at all.
    # NOTE(review): if this row was used to fit the model behind
    # predict_spam, the result is not an out-of-sample prediction.
    prediction = predict_spam(cleaned_content)

    return "spam" if prediction == 'spam' else "not spam"
        
# Ask the user which row to classify, then report the outcome.
row_number = int(
    input("Enter the row number for prediction (starting from 0):")
)
result = predict_spam_by_row_number(df, row_number)
print(f"The content is {result}.")


Enter the row number for prediction (starting from 0): 226


will u meet ur dream partner soon is ur career off 2 a flyng start 2 find out free txt horo followed by ur star sign e g horo aries

The content is spam.
