In [None]:
# Mount Google Drive inside the Colab runtime; the rest of the notebook reads and
# writes files below this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [None]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy.stats import mode
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

In [None]:
# Read the project CSV from the mounted Drive folder and preview its first rows.
csv_path = '/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje2/Türkçe Spam Proje/sonuc2.csv'
data = pd.read_csv(csv_path)
data.head()

In [None]:
# Size of the loaded frame, displayed as (rows, columns).
row_count, column_count = data.shape
(row_count, column_count)

In [None]:
# Number of missing values in each column.
missing_mask = data.isna()
missing_mask.sum()

In [None]:
# How many rows carry each value of the target column "sonuc".
label_column = data["sonuc"]
label_column.value_counts()

In [None]:
# SentenceTransformers models.
# Each entry maps the short name used in file names and reports to
# (Hugging Face hub id, extra keyword arguments for the SentenceTransformer constructor).
_embedding_checkpoints = {
    "all-MiniLM-L12-v2": ("sentence-transformers/all-MiniLM-L12-v2", {}),
    "multilingual-e5-large-instruct": ("intfloat/multilingual-e5-large-instruct", {}),
    "gte-large": ("thenlper/gte-large", {}),
    "bert-base-turkish-uncased": ("dbmdz/bert-base-turkish-uncased", {}),
    # This checkpoint is loaded with trust_remote_code=True.
    "jina-embeddings-v3": ("jinaai/jina-embeddings-v3", {"trust_remote_code": True}),
}

# All checkpoints are instantiated here, in the order listed above.
sentence_transformers_models = {
    short_name: SentenceTransformer(hub_id, **extra_kwargs)
    for short_name, (hub_id, extra_kwargs) in _embedding_checkpoints.items()
}

# Machine-learning models trained on top of the embeddings; every one is seeded
# with random_state=42.
ml_models = dict(
    SVM=SVC(kernel="linear", probability=True, random_state=42),
    RF=RandomForestClassifier(n_estimators=100, random_state=42),
    MLP=MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42),
)

In [None]:
# Remove the "Unnamed: 0" column (presumably an index column written into the CSV —
# TODO confirm against the file).
# errors="ignore" keeps this cell idempotent: without it, running the cell a second
# time raises KeyError because the column has already been dropped.
data = data.drop(columns="Unnamed: 0", errors="ignore")
data.head()

In [None]:
# Hold out 20% of the rows as the test set; random_state=42 fixes the shuffle so the
# split is the same on every run.
text_column, label_column = data["text"], data["sonuc"]
X_train, X_test, y_train, y_test = train_test_split(
    text_column,
    label_column,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Class distribution of the training labels.
# Wrapping in pd.Series keeps the cell re-runnable after the label-encoding cell has
# replaced y_train with a NumPy array (a bare ndarray has no .value_counts()).
pd.Series(y_train).value_counts()

In [None]:
# Class distribution of the test labels.
# Wrapping in pd.Series keeps the cell re-runnable after the label-encoding cell has
# replaced y_test with a NumPy array (a bare ndarray has no .value_counts()).
pd.Series(y_test).value_counts()

In [None]:
# Encode the target labels as integers.
# The encoder learns its classes from the training labels and is then applied,
# unchanged, to the test labels.
label_encoder = LabelEncoder()
encoded_train_labels = label_encoder.fit_transform(y_train)
encoded_test_labels = label_encoder.transform(y_test)

# From here on y_train / y_test hold the encoded arrays instead of the raw labels.
y_train, y_test = encoded_train_labels, encoded_test_labels

In [None]:
# 3. Build embeddings with a representation model
def create_embeddings(texts, model_name, model_type, batch_size=32):
    """Encode ``texts`` into embeddings with the given representation model.

    Parameters
    ----------
    texts : list of str
        Documents to embed.
    model_name : object
        Despite its name, this is the loaded model object (not a string). It must
        expose ``encode(texts, batch_size=..., show_progress_bar=...)``.
    model_type : str
        Model family. Only ``"sentence-transformers"`` is supported.
    batch_size : int, default 32
        Batch size forwarded to ``encode``.

    Returns
    -------
    The value returned by ``model_name.encode`` for ``texts``.

    Raises
    ------
    ValueError
        If ``model_type`` is not a supported family. Failing loudly here prevents a
        silent ``None`` from being passed on (and saved to disk) by the caller.
    """
    if model_type != "sentence-transformers":
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'sentence-transformers'."
        )
    return model_name.encode(texts, batch_size=batch_size, show_progress_bar=True)

In [None]:
# Create the embeddings for every representation model and store them on Drive.
embedding_dir = "/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje2/Türkçe Spam Proje/türkçe_spam_transformers"

# The raw texts do not change between models, so convert them to lists once.
train_texts, test_texts = X_train.tolist(), X_test.tolist()

for representation_name, representation_model in sentence_transformers_models.items():
    print(f"\nTemsil yöntemi: {representation_name}")

    # Encode both splits with the current model.
    X_train_embeddings = create_embeddings(train_texts, representation_model, "sentence-transformers")
    X_test_embeddings = create_embeddings(test_texts, representation_model, "sentence-transformers")

    # Persist as <representation>_train.npy / <representation>_test.npy.
    for split_name, split_embeddings in (("train", X_train_embeddings), ("test", X_test_embeddings)):
        np.save(f"{embedding_dir}/{representation_name}_{split_name}.npy", split_embeddings)

    print(f"Embedding'ler {representation_name} için kaydedildi!")

In [None]:
# Hyper-parameter grids searched by GridSearchCV, keyed by the same short names
# that are used for the classifiers (SVM, RF, MLP).
param_grids = dict(
    SVM=dict(C=[0.1, 1], kernel=["linear", "rbf"]),
    RF=dict(n_estimators=[50, 100], max_depth=[10, None]),
    MLP=dict(hidden_layer_sizes=[(50,), (100,)], max_iter=[300, 500]),
)

In [None]:
# Dictionaries that collect the accuracies reported at the end of the notebook.
results, model_ensemble_results, final_ensemble_results = {}, {}, {}

# Folder holding the <representation>_train.npy / <representation>_test.npy files.
embedding_dir = "/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje2/Türkçe Spam Proje/türkçe_spam_transformers"


def _score_predictions(y_true, y_hat):
    """Return (accuracy, confusion matrix, weighted precision, weighted recall, weighted F1)."""
    return (
        accuracy_score(y_true, y_hat),
        confusion_matrix(y_true, y_hat),
        precision_score(y_true, y_hat, average='weighted'),
        recall_score(y_true, y_hat, average='weighted'),
        f1_score(y_true, y_hat, average='weighted'),
    )


# For every representation: tune each classifier, evaluate it on the test split,
# then combine the tuned classifiers in a hard-voting ensemble.
for representation_name in sentence_transformers_models:
    print(f"\nTemsil yöntemi: {representation_name}")

    # Load the embeddings stored for this representation.
    X_train_embeddings = np.load(f"{embedding_dir}/{representation_name}_train.npy")
    X_test_embeddings = np.load(f"{embedding_dir}/{representation_name}_test.npy")

    # Result slot for this representation; "Ensemble" is filled in after the inner loop.
    individual_scores = {}
    results[representation_name] = {"Individual": individual_scores, "Ensemble": None}
    optimized_models = []  # (name, tuned estimator) pairs for this representation only

    # Individual models
    for ml_name, ml_model in ml_models.items():
        print(f"\nModel: {ml_name}")

        # Hyper-parameter search with 3-fold cross-validation, scored by accuracy.
        grid = GridSearchCV(ml_model, param_grids[ml_name], cv=3, scoring='accuracy', verbose=1)
        grid.fit(X_train_embeddings, y_train)

        # Keep the tuned estimator for the voting ensemble below.
        tuned_model = grid.best_estimator_
        optimized_models.append((ml_name, tuned_model))
        print(tuned_model)

        # Evaluate on the held-out test split; only the accuracy is stored.
        y_pred = tuned_model.predict(X_test_embeddings)
        accuracy, conf_matrix, precision, recall, f1 = _score_predictions(y_test, y_pred)
        individual_scores[ml_name] = accuracy
        print(f"{ml_name} Accuracy: {accuracy}")
        print(f"{ml_name} Confusion Matrix:\n{conf_matrix}")
        print(f"{ml_name} Precision:\n{precision}")
        print(f"{ml_name} Recall:\n{recall}")
        print(f"{ml_name} F1 Score:\n{f1}")

    # Hard-voting ensemble of the tuned classifiers for this representation.
    representation_ensemble = VotingClassifier(estimators=optimized_models, voting='hard')
    representation_ensemble.fit(X_train_embeddings, y_train)
    y_rep_ensemble_pred = representation_ensemble.predict(X_test_embeddings)
    (
        representation_ensemble_accuracy,
        representation_ensemble_conf_matrix,
        representation_ensemble_precision,
        representation_ensemble_recall,
        representation_ensemble_f1,
    ) = _score_predictions(y_test, y_rep_ensemble_pred)
    results[representation_name]["Ensemble"] = representation_ensemble_accuracy
    print(f"Representation Ensemble Accuracy: {representation_ensemble_accuracy}")
    print(f"Representation Ensemble Confusion Matrix:\n{representation_ensemble_conf_matrix}")
    print(f"Representation Ensemble Precision: {representation_ensemble_precision}")
    print(f"Representation Ensemble Recall: {representation_ensemble_recall}")
    print(f"Representation Ensemble F1 Score: {representation_ensemble_f1}")


In [None]:
# Show the accuracies gathered so far: one entry per representation, holding the
# "Individual" per-model accuracies and the "Ensemble" accuracy.
results

In [None]:
# Ensembles per model type (SVM, RF, MLP): the same classifier family, tuned on each
# embedding representation, combined by majority vote over the test predictions.
#
# Why the votes are combined by hand: a VotingClassifier fits all of its members on
# one feature matrix, but every representation has its own embedding space. Each
# member is therefore tuned and evaluated on its own representation, and only the
# predicted labels are combined.
#
# The cell is self-contained: it reloads the saved embeddings and repeats the same
# grid search as the tuning cell (same grids, cv and seeded estimators) instead of
# relying on leftover loop variables such as `grid` or `X_train_embeddings`.
embedding_dir = "/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje2/Türkçe Spam Proje/türkçe_spam_transformers"

# test_votes[ml_name] collects one test-set prediction vector per representation.
test_votes = {model_key: [] for model_key in ml_models}

for representation_name in sentence_transformers_models:
    print(f"\nTemsil yöntemi: {representation_name}")

    # Each representation is loaded once and shared by the three model types.
    rep_train = np.load(f"{embedding_dir}/{representation_name}_train.npy")
    rep_test = np.load(f"{embedding_dir}/{representation_name}_test.npy")

    for ml_name, ml_model in ml_models.items():
        # GridSearchCV clones ml_model, so the shared prototypes stay unfitted.
        search = GridSearchCV(ml_model, param_grids[ml_name], cv=3, scoring='accuracy', verbose=1)
        search.fit(rep_train, y_train)
        # predict() uses the estimator refitted with the best parameters.
        test_votes[ml_name].append(search.predict(rep_test))

for ml_name, votes in test_votes.items():
    print(f"\nModel Türü Ensemble: {ml_name}")

    # Rows = representations, columns = test samples. The labels are the integer
    # codes produced by label_encoder, which is what np.bincount requires.
    vote_matrix = np.vstack(votes).astype(int)

    # Majority label per test sample; on a tie argmax picks the smallest label.
    y_model_pred = np.apply_along_axis(lambda column: np.bincount(column).argmax(), 0, vote_matrix)

    model_ensemble_accuracy = accuracy_score(y_test, y_model_pred)
    model_ensemble_conf_matrix = confusion_matrix(y_test, y_model_pred)
    model_ensemble_precision = precision_score(y_test, y_model_pred, average='weighted')
    model_ensemble_recall = recall_score(y_test, y_model_pred, average='weighted')
    model_ensemble_f1 = f1_score(y_test, y_model_pred, average='weighted')
    model_ensemble_results[ml_name] = model_ensemble_accuracy
    print(f"Model Ensemble ({ml_name}) Accuracy: {model_ensemble_accuracy}")
    print(f"Model Ensemble ({ml_name}) Confusion Matrix:\n{model_ensemble_conf_matrix}")
    print(f"Model Ensemble ({ml_name}) Precision: {model_ensemble_precision}")
    print(f"Model Ensemble ({ml_name}) Recall: {model_ensemble_recall}")
    print(f"Model Ensemble ({ml_name}) F1 Score: {model_ensemble_f1}")

In [None]:
# Overall ensemble (all representations and all model types), built as a two-level
# hard vote:
#   1. within each representation, the tuned SVM / RF / MLP vote on every test sample;
#   2. the per-representation winners then vote across representations.
#
# Why the votes are combined by hand: a VotingClassifier fits all of its members on
# one feature matrix, but every representation has its own embedding space, so only
# predicted labels can be combined across representations.
#
# The cell is self-contained: it reloads the saved embeddings and repeats the same
# grid search as the tuning cell (same grids, cv and seeded estimators) instead of
# relying on leftover loop variables such as `grid` or `X_train_embeddings`.
print("\nGenel Ensemble")

embedding_dir = "/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje2/Türkçe Spam Proje/türkçe_spam_transformers"


def _hard_vote(prediction_rows):
    """Return the majority label per sample from a list of equal-length prediction vectors.

    Labels must be non-negative integers (here: the codes produced by label_encoder).
    On a tie the smallest label wins.
    """
    vote_matrix = np.vstack(prediction_rows).astype(int)
    return np.apply_along_axis(lambda column: np.bincount(column).argmax(), 0, vote_matrix)


# One vote vector per representation (level 1 of the vote).
representation_votes = []

for representation_name in sentence_transformers_models:
    rep_train = np.load(f"{embedding_dir}/{representation_name}_train.npy")
    rep_test = np.load(f"{embedding_dir}/{representation_name}_test.npy")

    member_votes = []
    for ml_name, ml_model in ml_models.items():
        # GridSearchCV clones ml_model, so the shared prototypes stay unfitted.
        search = GridSearchCV(ml_model, param_grids[ml_name], cv=3, scoring='accuracy', verbose=1)
        search.fit(rep_train, y_train)
        member_votes.append(search.predict(rep_test))

    representation_votes.append(_hard_vote(member_votes))

# Level 2: majority across the representation-level votes.
y_overall_pred = _hard_vote(representation_votes)

overall_accuracy = accuracy_score(y_test, y_overall_pred)
overall_conf_matrix = confusion_matrix(y_test, y_overall_pred)
overall_precision = precision_score(y_test, y_overall_pred, average='weighted')
overall_recall = recall_score(y_test, y_overall_pred, average='weighted')
overall_f1 = f1_score(y_test, y_overall_pred, average='weighted')
final_ensemble_results["Overall"] = overall_accuracy
print(f"Overall Ensemble Accuracy: {overall_accuracy}")
print(f"Overall Ensemble Confusion Matrix:\n{overall_conf_matrix}")
print(f"Overall Ensemble Precision: {overall_precision}")
print(f"Overall Ensemble Recall: {overall_recall}")
print(f"Overall Ensemble F1 Score: {overall_f1}")


# Print the results: every accuracy stored in the three result dictionaries.
print("\nSonuçlar:")
for rep_label in results:
    rep_scores = results[rep_label]
    print(f"\nTemsil Yöntemi: {rep_label}")
    # Accuracy of each individually tuned classifier on this representation.
    for clf_label, clf_accuracy in rep_scores["Individual"].items():
        print(f"  {clf_label} Accuracy: {clf_accuracy}")
    print(f"  Representation Ensemble Accuracy: {rep_scores['Ensemble']}")

# Accuracy of the per-model-type ensembles.
print("\nModel Ensemble Sonuçları:")
for clf_label in model_ensemble_results:
    print(f"  {clf_label} Ensemble Accuracy: {model_ensemble_results[clf_label]}")

# Accuracy of the overall ensemble.
print("\nGenel Ensemble Sonucu:")
overall_score = final_ensemble_results['Overall']
print(f"  Overall Ensemble Accuracy: {overall_score}")