In [9]:
# Step 1: Import necessary libraries
import pandas as pd
import joblib  # Import joblib for model saving
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Step 2: Load the dataset
# The CSV is decoded as latin-1. Only the 'v1' and 'v2' columns are kept and
# then renamed to 'label' and 'text' respectively.
data = pd.read_csv(r"C:\Users\Dharshii\OneDrive\Desktop\samp\backend\spams.csv", encoding="latin-1")
data = data[['v1', 'v2']]
data.columns = ['label', 'text']

# Step 3: Preprocess the data (Label encoding: spam=1, ham=0)
# Series.map turns every value that is missing from the mapping into NaN.
# Case and surrounding whitespace are normalised first so that variants such
# as 'Spam' or ' ham' still encode, and anything left unmapped is reported
# here instead of reaching the split/training steps as a NaN label.
raw_labels = data['label']
encoded_labels = raw_labels.astype(str).str.strip().str.lower().map({'spam': 1, 'ham': 0})
unmapped_rows = encoded_labels.isna()
if unmapped_rows.any():
    unexpected_values = sorted(raw_labels[unmapped_rows].astype(str).unique())
    raise ValueError(
        f"Unexpected label value(s) in column 'v1': {unexpected_values}; "
        "expected only 'spam' or 'ham'."
    )
data['label'] = encoded_labels

# Step 4: Split the dataset into training and testing sets
# 30% of the rows are held out for testing; random_state fixes the shuffle so
# the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['label'],
    test_size=0.3,
    random_state=42,
)

# Step 5: Build the text classification pipeline
# Stages run in order: token counts (English stop words removed), TF-IDF
# re-weighting of those counts, then a multinomial naive Bayes classifier.
# The stage names ('vect', 'tfidf', 'clf') are part of the fitted object.
pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=pipeline_steps)

# Step 6: Train the classifier
# Fits every stage of the pipeline on the training texts and their labels.
text_clf.fit(X_train, y_train)

# Step 7: Make predictions on the test set
# The fitted pipeline is applied to the held-out texts.
predicted = text_clf.predict(X_test)

# Step 8: Evaluate the classifier performance
# 'accuracy' is kept as a variable because the save step further down
# compares it against a threshold.
accuracy = accuracy_score(y_test, predicted)
print(f"Accuracy: {accuracy}")

# Per-class precision / recall / f1 table, printed under its own heading.
print("Classification Report:")
per_class_report = classification_report(y_test, predicted)
print(per_class_report)

# Step 9: Example of classifying new text (phishing or spam detection)
# Two hand-written messages are run through the trained pipeline as a quick
# sanity check of its predictions.
new_texts = [
    "Congratulations! You've won a free iPhone. Click here to claim your prize.",
    "Hello, let's catch up for coffee sometime this weekend!",
]
new_predictions = text_clf.predict(new_texts)

# Label 1 is reported as 'Spam'; any other predicted value is shown as 'Ham'.
for message, label_id in zip(new_texts, new_predictions):
    verdict = 'Spam' if label_id == 1 else 'Ham'
    print(f"Text: {message}")
    print(f"Prediction: {verdict}\n")

# Step 10: Save the model only if accuracy is acceptable
# The fitted pipeline is written to disk with joblib only when the test-set
# accuracy is strictly greater than 0.90; otherwise a warning is printed and
# nothing is saved.
# NOTE(review): the gate looks at overall accuracy only; per-class metrics
# from the classification report are not taken into account.
model_is_acceptable = accuracy > 0.90
if not model_is_acceptable:
    print("⚠️ Model accuracy is too low. Consider improving the dataset or tuning hyperparameters.")
else:
    model_path = r"C:\Users\Dharshii\OneDrive\Desktop\samp\backend\spam_classifier.pkl"
    joblib.dump(text_clf, model_path)
    print(f"✅ Model trained and saved at {model_path}")



Accuracy: 0.9677033492822966
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1453
           1       1.00      0.75      0.86       219

    accuracy                           0.97      1672
   macro avg       0.98      0.88      0.92      1672
weighted avg       0.97      0.97      0.97      1672

Text: Congratulations! You've won a free iPhone. Click here to claim your prize.
Prediction: Spam

Text: Hello, let's catch up for coffee sometime this weekend!
Prediction: Ham

✅ Model trained and saved at C:\Users\Dharshii\OneDrive\Desktop\samp\backend\spam_classifier.pkl
