In [1]:
import pandas as pd
import numpy as np
import string
import re
import joblib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()


# ----------- TEXT PREPROCESSING ------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@w+|\#','', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# ----------- LOAD DATA ------------
df = pd.read_csv("dataset.csv")

print("Sample Data:")
print(df.head())

# Preprocessing
df['clean_text'] = df['text'].apply(clean_text)
df['label'] = df['label'].map({'spam': 1, 'not_spam': 0})  # Binary encoding

# ----------- TRAIN TEST SPLIT ------------
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

# ----------- FEATURE EXTRACTION (TF-IDF) ------------
tfidf_vectorizer = TfidfVectorizer(max_df=0.7)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# ----------- MODEL TRAINING ------------
model = PassiveAggressiveClassifier(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# ----------- EVALUATION ------------
y_pred = model.predict(X_test_tfidf)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ----------- SAVE MODEL ------------
joblib.dump(model, "spam_detector_model.pkl")
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

# ----------- PREDICTION FUNCTION ------------
def predict_news(news_text):
    cleaned = clean_text(news_text)
    vect = tfidf_vectorizer.transform([cleaned])
    pred = model.predict(vect)
    return "Spam" if pred[0] == 1 else "Not Spam"


# ----------- TEST SAMPLE ------------
sample_news = "Breaking: You won a free trip to Dubai! Click now!"
print(f"\nNews: {sample_news}")
print("Prediction:", predict_news(sample_news))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\swadh\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


FileNotFoundError: [Errno 2] No such file or directory: 'dataset.csv'

# Assistant
The error occurs because the code is trying to read a file called "dataset.csv" that doesn't exist in your current working directory.

Would you like me to provide the corrected code?

# User
yes

In [2]:
# Assistant
# ----------- PREPROCESSING ------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@w+|\#','', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# ----------- LOAD DATA ------------
# Fix: Provide the correct path to your dataset or create sample data
# Option 1: Specify the correct path to your dataset
# df = pd.read_csv("/path/to/your/dataset.csv")

# Option 2: Create a sample dataset for testing
import pandas as pd
import re
import string
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import joblib
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

# Initialize stemmer and stop words
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Create sample data
data = {
    'text': [
        "Free gift for you! Click now to claim",
        "Meeting scheduled for tomorrow at 10am",
        "Win a free iPhone! Limited time offer",
        "Please review the quarterly report",
        "URGENT: Your account has been compromised"
    ],
    'label': ['spam', 'not_spam', 'spam', 'not_spam', 'spam']
}
df = pd.DataFrame(data)

print("Sample Data:")
print(df.head())

# Preprocessing
df['clean_text'] = df['text'].apply(clean_text)
df['label'] = df['label'].map({'spam': 1, 'not_spam': 0})  # Binary encoding

# ----------- TRAIN TEST SPLIT ------------
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

# ----------- FEATURE EXTRACTION (TF-IDF) ------------
tfidf_vectorizer = TfidfVectorizer(max_df=0.7)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# ----------- MODEL TRAINING ------------
model = PassiveAggressiveClassifier(max_iter=1000)
model.fit(X_train_tfidf, y_train)

# ----------- EVALUATION ------------
y_pred = model.predict(X_test_tfidf)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ----------- SAVE MODEL ------------
joblib.dump(model, "spam_detector_model.pkl")
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

# ----------- PREDICTION FUNCTION ------------
def predict_news(news_text):
    cleaned = clean_text(news_text)
    vect = tfidf_vectorizer.transform([cleaned])
    pred = model.predict(vect)
    return "Spam" if pred[0] == 1 else "Not Spam"


# ----------- TEST SAMPLE ------------
sample_news = "Breaking: You won a free trip to Dubai! Click now!"
print(f"\nNews: {sample_news}")
print("Prediction:", predict_news(sample_news))

Sample Data:
                                        text     label
0      Free gift for you! Click now to claim      spam
1     Meeting scheduled for tomorrow at 10am  not_spam
2      Win a free iPhone! Limited time offer      spam
3         Please review the quarterly report  not_spam
4  URGENT: Your account has been compromised      spam

Accuracy: 0.0

Confusion Matrix:
 [[0 1]
 [0 0]]

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0


News: Breaking: You won a free trip to Dubai! Click now!
Prediction: Spam


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
