In [1]:
# --- Safe, fast pipeline (no punkt/downloads needed) -------------------------
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# read data (specify encoding if needed)
df = pd.read_csv('spam.csv', encoding='latin-1')

# keep only useful cols, drop extras
extra_cols = [c for c in df.columns if c.startswith('Unnamed')]
if extra_cols:
    df.drop(columns=extra_cols, inplace=True)
df.rename(columns={'v1':'target','v2':'text'}, inplace=True)
df = df[['target','text']].dropna().drop_duplicates().reset_index(drop=True)

# label encode
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

# text preprocessing: regex tokenizer + cached stopwords + Porter stemmer
from nltk.stem.porter import PorterStemmer
try:
    from nltk.corpus import stopwords
    STOPWORDS = set(stopwords.words('english'))
except Exception:
    # fallback small stopword set to avoid failure if nltk data missing
    STOPWORDS = {"a","an","the","in","on","at","for","and","or","is","it","to","of"}

ps = PorterStemmer()
RE_WORD = re.compile(r'\w+')

def transform_text_safe(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    tokens = RE_WORD.findall(text)                 # word tokens only, no punkt needed
    tokens = [t for t in tokens if t not in STOPWORDS]
    stems = [ps.stem(t) for t in tokens]
    return " ".join(stems)

# apply
df['transformed_text'] = df['text'].apply(transform_text_safe)

# TF-IDF (small dataset so dense is ok)
from sklearn.feature_extraction.text import TfidfVectorizer
tfid = TfidfVectorizer(max_features=500)
X = tfid.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

# train-test split (use stratify)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=2, stratify=y)

# models (kept as you had them; change SVC kernel if desired)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

clfs = {
    'SVC': SVC(kernel='sigmoid', gamma=1.0),
    'KNN': KNeighborsClassifier(),
    'NB': MultinomialNB(),
    'DT': DecisionTreeClassifier(max_depth=5),
    'LR': LogisticRegression(solver='liblinear', penalty='l1'),
    'RF': RandomForestClassifier(n_estimators=50, random_state=2, n_jobs=-1),
    'Adaboost': AdaBoostClassifier(n_estimators=50, random_state=2),
    'Bgc': BaggingClassifier(n_estimators=50, random_state=2, n_jobs=-1),
    'ETC': ExtraTreesClassifier(n_estimators=50, random_state=2, n_jobs=-1),
    'GBDT': GradientBoostingClassifier(n_estimators=50, random_state=2),
    'xgb': XGBClassifier(n_estimators=50, random_state=2, use_label_encoder=False, eval_metric='logloss', n_jobs=-1)
}

# evaluation with safe precision (zero_division)
from sklearn.metrics import accuracy_score, precision_score
def train_classifier(model, X_tr, y_tr, X_te, y_te):
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    acc = accuracy_score(y_te, y_pred)
    prec = precision_score(y_te, y_pred, zero_division=0)
    return acc, prec

for name, model in clfs.items():
    try:
        a, p = train_classifier(model, X_train, y_train, X_test, y_test)
        print(f"{name}: accuracy={a:.4f}, precision={p:.4f}")
    except Exception as e:
        print(f"{name} failed: {e}")


SVC: accuracy=0.9826, precision=0.9748
KNN: accuracy=0.9255, precision=0.9500
NB: accuracy=0.9826, precision=0.9669
DT: accuracy=0.9468, precision=0.8585
LR: accuracy=0.9787, precision=0.9580
RF: accuracy=0.9816, precision=0.9667
Adaboost: accuracy=0.9217, precision=0.7778
Bgc failed: No module named '_posixsubprocess'
ETC: accuracy=0.9874, precision=0.9683
GBDT: accuracy=0.9623, precision=0.9694
xgb: accuracy=0.9749, precision=0.9268


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
