In [7]:

# 📚 Load the required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_auc_score

from gensim.models import Word2Vec
import warnings
# Silence every warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so it also hides any warnings the
# models or pandas might emit later in the notebook — consider narrowing it.
warnings.filterwarnings("ignore")

# Fetch the NLTK resources needed by the preprocessing step: the stop-word
# lists (read via stopwords.words("english")) and the WordNet data
# (presumably what WordNetLemmatizer relies on). Kept as two bare calls so
# the value of the last one stays the cell's displayed output.
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gultekinqwe/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gultekinqwe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [8]:
# 📦 Preprocessing function
# NLTK resources are built lazily on first use and then shared by every call,
# so the stop-word corpus is read once instead of once per document.
_STOP_WORDS = None
_LEMMATIZER = None


def preprocess(text):
    """Normalize one raw message into a space-separated string of lemmas.

    Steps: cast to ``str`` and lowercase, collapse every whitespace run
    (newlines, tabs, ...) to a single space, drop every character that is not
    an ASCII letter or a space, remove English stop words, and lemmatize the
    remaining tokens.

    Parameters
    ----------
    text : any
        Raw message; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing survives).
    """
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = set(stopwords.words("english"))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()

    text = str(text).lower()
    # Convert newlines/tabs to spaces BEFORE stripping non-letters; otherwise
    # the filter below deletes them and glues the neighbouring words together
    # ("free\nprize" -> "freeprize").
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z ]", "", text)
    tokens = text.split()
    return " ".join(
        _LEMMATIZER.lemmatize(word) for word in tokens if word not in _STOP_WORDS
    )


In [9]:
# 🔄 Feature extraction function
def extract_features(df, n_lsa_components=100, n_clusters=50):
    """Build the combined LSA + word-cluster feature matrix for a dataset.

    Pipeline:
      1. Clean ``df["text"]`` with ``preprocess`` and drop rows whose cleaned
         text is empty or has 3 characters or fewer.
      2. TF-IDF (max 5000 terms) reduced with TruncatedSVD ("LSA").
      3. Train Word2Vec on the cleaned texts, cluster the word vectors with
         KMeans, and count, per document, how many of its in-vocabulary words
         fall into each cluster.
      4. Concatenate the LSA vector and the cluster counts column-wise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``"text"`` column and a ``"label"`` column.
        Side effect (kept for backward compatibility): a ``"clean_text"``
        column is written onto the frame that was passed in.
    n_lsa_components : int, default 100
        Number of TruncatedSVD components.
    n_clusters : int, default 50
        Number of KMeans clusters, and therefore of cluster-count features.

    Returns
    -------
    X : numpy.ndarray
        One row per kept document, ``n_lsa_components + n_clusters`` columns.
    y : numpy.ndarray
        ``df["label"]`` values of the kept rows.

    Notes
    -----
    NOTE(review): every transformer here is fitted on all rows of ``df``. When
    the result is later split into train/test (as ``train_and_evaluate`` does)
    the test rows have already influenced the features.
    NOTE(review): Word2Vec is trained with its default worker count; per the
    gensim docs, fully repeatable vectors need ``workers=1`` — confirm whether
    run-to-run reproducibility matters here.
    """
    df["clean_text"] = df["text"].apply(preprocess)
    df = df[df["clean_text"].str.strip() != ""]
    df = df[df["clean_text"].str.len() > 3]

    tfidf = TfidfVectorizer(max_features=5000)
    tfidf_vectors = tfidf.fit_transform(df["clean_text"])

    lsa = TruncatedSVD(n_components=n_lsa_components, random_state=42)
    X_lsa = lsa.fit_transform(tfidf_vectors)

    sentences = [text.split() for text in df["clean_text"]]
    w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=2)
    word_vectors = w2v_model.wv

    # float64 copy of the embedding matrix, used for both fitting and predicting.
    vocab_matrix = word_vectors.vectors.astype(np.float64)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(vocab_matrix)

    # Assign every vocabulary word to its cluster once, in a single batched
    # predict, instead of calling kmeans.predict for each token of each
    # document. Rows of `vectors` are in the same order as `index_to_key`.
    word_to_cluster = dict(
        zip(word_vectors.index_to_key, kmeans.predict(vocab_matrix))
    )

    def get_cluster_features(text):
        """Count how many words of `text` fall into each word cluster."""
        cluster_count = np.zeros(n_clusters)
        if not isinstance(text, str):
            return cluster_count
        for word in text.split():
            idx = word_to_cluster.get(word)
            # Words below Word2Vec's min_count are not in the vocabulary.
            if idx is not None:
                cluster_count[idx] += 1
        return cluster_count

    semantic_vectors = np.array([get_cluster_features(text) for text in df["clean_text"]])
    X = np.hstack((X_lsa, semantic_vectors))
    y = df["label"].values

    return X, y

In [10]:
# ⚙️ Model training and evaluation function
def train_and_evaluate(X, y, dataset_name):
    """Fit three classifiers on a stratified 80/20 split and print their scores.

    For an MLP, a random forest and an SVM (in that order) this prints a
    header containing `dataset_name`, the classification report, accuracy,
    F1 score and ROC AUC measured on the held-out 20 %. The ROC AUC uses
    column 1 of ``predict_proba``. Nothing is returned.
    """
    split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = split

    classifiers = [
        ("MLP", MLPClassifier(max_iter=300, random_state=42)),
        ("Random Forest", RandomForestClassifier(random_state=42)),
        ("SVM", SVC(probability=True, random_state=42)),
    ]

    for label, clf in classifiers:
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)

        print(f"\n--- {dataset_name} | {label} ---")
        print(classification_report(y_test, predictions))
        print("Accuracy:", accuracy_score(y_test, predictions))
        print("F1 Score:", f1_score(y_test, predictions))

        # Probability of the second class, used as the ranking score.
        positive_scores = clf.predict_proba(X_test)[:, 1]
        print("ROC AUC:", roc_auc_score(y_test, positive_scores))

In [14]:
# Read the datasets
def _read_labeled_csv(path):
    """Load a ';'-separated CSV whose first two columns are label and text.

    The columns are renamed to ``label``/``text``, labels are stripped,
    lowercased and mapped ham -> 0 / spam -> 1, rows with a missing label
    (including any label outside that mapping) or missing text are dropped,
    and the label column is cast to int.
    """
    frame = pd.read_csv(path, sep=';', header=0, usecols=[0, 1])
    frame.columns = ['label', 'text']
    encoded = frame['label'].str.strip().str.lower().map({'ham': 0, 'spam': 1})
    frame = frame.assign(label=encoded).dropna(subset=['label', 'text'])
    return frame.astype({'label': int})


df1 = _read_labeled_csv("dataFile/spam_modified.csv")
df2 = _read_labeled_csv("dataFile/enron_spam_data_modified.csv")

(5571, 2)
(33345, 2)


In [12]:

# Extract features and train, one dataset at a time.

# Dataset loaded from dataFile/spam_modified.csv
X1, y1 = extract_features(df1)
train_and_evaluate(X=X1, y=y1, dataset_name="Spam Modified Dataset")

# Dataset loaded from dataFile/enron_spam_data_modified.csv
X2, y2 = extract_features(df2)
train_and_evaluate(X=X2, y=y2, dataset_name="Enron Dataset")



--- Spam Modified Dataset | MLP ---
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       956
           1       0.91      0.85      0.88       149

    accuracy                           0.97      1105
   macro avg       0.95      0.92      0.93      1105
weighted avg       0.97      0.97      0.97      1105

Accuracy: 0.9692307692307692
F1 Score: 0.8819444444444444
ROC AUC: 0.9695038050040717

--- Spam Modified Dataset | Random Forest ---
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       956
           1       0.98      0.80      0.88       149

    accuracy                           0.97      1105
   macro avg       0.98      0.90      0.93      1105
weighted avg       0.97      0.97      0.97      1105

Accuracy: 0.9710407239819004
F1 Score: 0.8814814814814815
ROC AUC: 0.9752078009603774

--- Spam Modified Dataset | SVM ---
              precision    recall  f1-score   support