In [23]:
import joblib
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

#Load the dataset
df = pd.read_csv('en_spam_data.csv')

#Data preprocessing
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['Text'] = df['Text'].apply(clean_text)

# Feature extraction using TF-IDF
vectorizer = TfidfVectorizer(stop_words="english", max_features=1500)
X = vectorizer.fit_transform(df['Text'])

# Train-test split 20% test
X_train, X_test, y_train, y_test = train_test_split(X, df['Class'], test_size=0.2, random_state=42)

# Model training using Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model evaluation
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("Training Accuracy:", accuracy_score(y_train, y_pred_train))
print("Testing Accuracy:", accuracy_score(y_test, y_pred_test))

print("Classification Report on Training Data:\n", classification_report(y_train, y_pred_train))

print("Classification Report on Testing Data:\n", classification_report(y_test, y_pred_test))

# Save the model and vectorizer for later use
joblib.dump(model, 'english_spam_classifier_rf_model.pkl')
joblib.dump(vectorizer, 'english_tfidf_vectorizer.pkl')
print("Model and vectorizer saved successfully.")

0        Supply Quality China's EXCLUSIVE dimensions at...
1                               over. SidLet me know. Thx.
2        Dear Friend,Greetings to you.I wish to accost ...
3        MR. CHEUNG PUIHANG SENG BANK LTD.DES VOEUX RD....
4                Not a surprising assessment from Embassy.
                               ...                        
11924    Travel well. I'll look forward to hearing your...
11925    Dear friend, I wish to begin by way of introdu...
11926    Follow Up Flag: Follow upFlag Status: FlaggedM...
11927    sbwhoeop B6Saturday January 23 2010 4:09 PMRe:...
11928    FYI. We are revising call sheet for call to Ka...
Name: Text, Length: 11929, dtype: object
Training Accuracy: 0.98920674840197
Testing Accuracy: 0.9803017602682313
Classification Report on Training Data:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      5408
           1       1.00      0.98      0.99      4135

    accuracy                    

In [24]:
import joblib

# Load the model and vectorizer
model = joblib.load('english_spam_classifier_rf_model.pkl')
vectorizer = joblib.load('english_tfidf_vectorizer.pkl')

def predict_spam(email_text):
    email_features = vectorizer.transform([email_text])
    prediction = model.predict(email_features)
    if prediction[0] == 1:
        return "Spam"
    else:
        return "Not Spam"

input_text = """THIS IS AN OFFICIAL NOTIFICATION OF FUNDS DEPOSITED I WANT TO PUT 1500 bucks to your account"""

result = predict_spam(input_text)
print(f"The email is: {result}")

The email is: Spam
