In [1]:
# ==============================================================================
# 1. IMPORTS AND INITIAL SETUP
# ==============================================================================
print("Step 1: Importing libraries...")

import numpy as np
import pandas as pd
import string
import nltk
import os

# For text processing
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# For machine learning
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score

# All the models to be evaluated
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

print("Libraries imported successfully.")

# ==============================================================================
# 2. DOWNLOAD NLTK STOPWORDS
# ==============================================================================
print("\nStep 2: Ensuring NLTK 'stopwords' are available...")
try:
    nltk.data.find('corpora/stopwords')
    print("'stopwords' data is already available.")
except LookupError:
    print("Downloading 'stopwords' data...")
    nltk.download('stopwords', quiet=True)
    print("'stopwords' downloaded successfully.")

# ==============================================================================
# 3. TEXT TRANSFORMATION FUNCTION
# ==============================================================================
print("\nStep 3: Defining the text transformation function...")

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def transform_text_alternative(text):
    """
    A text transformation function that avoids the problematic nltk.word_tokenize.
    """
    text = text.lower()
    words = text.split()
    
    stemmed_words = []
    for word in words:
        cleaned_word = word.translate(str.maketrans('', '', string.punctuation))
        if cleaned_word.isalnum() and cleaned_word not in stop_words:
            stemmed_words.append(ps.stem(cleaned_word))
            
    return " ".join(stemmed_words)

print("Alternative text transformation function is ready.")

# ==============================================================================
# 4. DATA LOADING AND CLEANING
# ==============================================================================
print("\nStep 4: Loading and cleaning the dataset...")

try:
    df = pd.read_csv('spam.csv', encoding='latin1')
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace=True, errors='ignore')
    df.rename(columns={'v1': 'target', 'v2': 'text'}, inplace=True)
    df.drop_duplicates(keep='first', inplace=True)
    print(f"Dataset loaded and cleaned. Shape: {df.shape}")
except FileNotFoundError:
    print("ERROR: 'spam.csv' not found. Please ensure it is in the same directory.")
    exit()

# ==============================================================================
# 5. DATA PREPROCESSING
# ==============================================================================
print("\nStep 5: Preprocessing data...")

encoder = LabelEncoder()
# THIS IS THE CORRECTED LINE:
df['target'] = encoder.fit_transform(df['target'])

# Apply the reliable transformation function
df['transformed_text'] = df['text'].apply(transform_text_alternative)
print("Text transformation and encoding complete.")

# ==============================================================================
# 6. FEATURE EXTRACTION AND SPLITTING
# ==============================================================================
print("\nStep 6: Vectorizing data and splitting into train/test sets...")
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['transformed_text']).toarray()
y = df['target'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)
print("Data is ready for model training.")

# ==============================================================================
# 7. MODEL TRAINING AND EVALUATION
# ==============================================================================
print("\nStep 7: Training and evaluating all models...")

clfs = {
    'SVC': SVC(kernel="sigmoid", gamma=1.0),
    'KNN': KNeighborsClassifier(),
    'NB': MultinomialNB(),
    'DT': DecisionTreeClassifier(max_depth=5),
    'LR': LogisticRegression(solver='liblinear', penalty='l1'),
    'RF': RandomForestClassifier(n_estimators=50, random_state=2),
    'Adaboost': AdaBoostClassifier(n_estimators=50, random_state=2),
    'Bgc': BaggingClassifier(n_estimators=50, random_state=2),
    'ETC': ExtraTreesClassifier(n_estimators=50, random_state=2),
    'GBDT': GradientBoostingClassifier(n_estimators=50, random_state=2),
    'xgb': XGBClassifier(n_estimators=50, random_state=2, use_label_encoder=False, eval_metric='logloss')
}

def train_classifier(clf, X_train, y_train, X_test, y_test):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    return accuracy, precision

for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(clf, X_train, y_train, X_test, y_test)
    print(f"\nModel: {name}")
    print(f"  - Accuracy: {current_accuracy:.4f}")
    print(f"  - Precision: {current_precision:.4f}")

print("\n==================================================")
print("All models have been trained and evaluated successfully.")
print("==================================================")

Step 1: Importing libraries...
Libraries imported successfully.

Step 2: Ensuring NLTK 'stopwords' are available...
'stopwords' data is already available.

Step 3: Defining the text transformation function...
Alternative text transformation function is ready.

Step 4: Loading and cleaning the dataset...
Dataset loaded and cleaned. Shape: (5169, 2)

Step 5: Preprocessing data...
Text transformation and encoding complete.

Step 6: Vectorizing data and splitting into train/test sets...
Data is ready for model training.

Step 7: Training and evaluating all models...

Model: SVC
  - Accuracy: 0.9739
  - Precision: 0.9664

Model: KNN
  - Accuracy: 0.9033
  - Precision: 1.0000

Model: NB
  - Accuracy: 0.9758
  - Precision: 1.0000

Model: DT
  - Accuracy: 0.9381
  - Precision: 0.8364

Model: LR
  - Accuracy: 0.9555
  - Precision: 0.9423

Model: RF
  - Accuracy: 0.9739
  - Precision: 0.9826

Model: Adaboost
  - Accuracy: 0.9246
  - Precision: 0.8409

Model: Bgc
  - Accuracy: 0.9574
  - Precisio

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Model: xgb
  - Accuracy: 0.9710
  - Precision: 0.9500

All models have been trained and evaluated successfully.
