In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy.stats import mode
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel

  from tqdm.autonotebook import tqdm, trange


In [3]:
data = pd.read_csv('/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje2/Türkçe Spam Proje/sonuc2.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,text,sonuc
0,0,125 lira,norm
1,1,Baskanin aksam toplantısi fenaymis :),norm
2,2,Bilal yalçnlara ne zaman gidiyoruz?,norm
3,3,"BiP ile mesajlarimi aninda, daha eglenceli gon...",spam
4,4,DIGITURKTEN FIRSAT! SiZE OZEL YIL SONUNA KADAR...,spam


In [5]:
data.shape

(5768, 3)

In [6]:
data.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
text,0
sonuc,0


In [7]:
data["sonuc"].value_counts()

Unnamed: 0_level_0,count
sonuc,Unnamed: 1_level_1
spam,3051
norm,2717


In [10]:
# SentenceTransformers modelleri
sentence_transformers_models = {
    "all-MiniLM-L12-v2": SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2"),
    "multilingual-e5-large-instruct": SentenceTransformer("intfloat/multilingual-e5-large-instruct"),
    "gte-large": SentenceTransformer("thenlper/gte-large"),
    "bert-base-turkish-uncased": SentenceTransformer("dbmdz/bert-base-turkish-uncased"),
    "jina-embeddings-v3": SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
}

# Makine öğrenimi modelleri
ml_models = {
    "SVM": SVC(kernel="linear", probability=True, random_state=42),
    "RF": RandomForestClassifier(n_estimators=100, random_state=42),
    "MLP": MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=42)
}

modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]



In [12]:
data = data.drop("Unnamed: 0", axis=1)
data.head()

Unnamed: 0,text,sonuc
0,125 lira,norm
1,Baskanin aksam toplantısi fenaymis :),norm
2,Bilal yalçnlara ne zaman gidiyoruz?,norm
3,"BiP ile mesajlarimi aninda, daha eglenceli gon...",spam
4,DIGITURKTEN FIRSAT! SiZE OZEL YIL SONUNA KADAR...,spam


In [13]:
X_train, X_test, y_train, y_test = train_test_split(data["text"], data["sonuc"], test_size=0.2,  random_state=42)

In [14]:
y_train.value_counts()

Unnamed: 0_level_0,count
sonuc,Unnamed: 1_level_1
spam,2433
norm,2181


In [15]:
y_test.value_counts()

Unnamed: 0_level_0,count
sonuc,Unnamed: 1_level_1
spam,618
norm,536


In [16]:
# Hedef değişkenleri encode etme
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

In [17]:
# 3. Temsil Yöntemi ile Embedding Oluşturma
def create_embeddings(texts, model_name, model_type):
    if model_type == "sentence-transformers":
        return model_name.encode(texts, batch_size=32, show_progress_bar=True)

In [19]:
# Embedding'leri oluştur ve kaydet
for representation_name, representation_model in sentence_transformers_models.items():
    print(f"\nTemsil yöntemi: {representation_name}")

    # Embedding oluşturma
    X_train_embeddings = create_embeddings(X_train.tolist(), representation_model, "sentence-transformers")
    X_test_embeddings = create_embeddings(X_test.tolist(), representation_model, "sentence-transformers")

    # Numpy formatında kaydetme
    np.save(f"/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje2/Türkçe Spam Proje/türkçe_spam_transformers/{representation_name}_train.npy", X_train_embeddings)
    np.save(f"/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje2/Türkçe Spam Proje/türkçe_spam_transformers/{representation_name}_test.npy", X_test_embeddings)

    print(f"Embedding'ler {representation_name} için kaydedildi!")


Temsil yöntemi: all-MiniLM-L12-v2


Batches:   0%|          | 0/145 [00:00<?, ?it/s]

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Embedding'ler all-MiniLM-L12-v2 için kaydedildi!

Temsil yöntemi: multilingual-e5-large-instruct


Batches:   0%|          | 0/145 [00:00<?, ?it/s]

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Embedding'ler multilingual-e5-large-instruct için kaydedildi!

Temsil yöntemi: gte-large


Batches:   0%|          | 0/145 [00:00<?, ?it/s]

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Embedding'ler gte-large için kaydedildi!

Temsil yöntemi: bert-base-turkish-uncased


Batches:   0%|          | 0/145 [00:00<?, ?it/s]

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Embedding'ler bert-base-turkish-uncased için kaydedildi!

Temsil yöntemi: jina-embeddings-v3


Batches:   0%|          | 0/145 [00:00<?, ?it/s]

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Embedding'ler jina-embeddings-v3 için kaydedildi!


In [20]:
param_grids = {
    "SVM": {"C": [0.1, 1], "kernel": ["linear", "rbf"]},
    "RF": {"n_estimators": [50, 100], "max_depth": [10, None]},
    "MLP": {"hidden_layer_sizes": [(50,), (100,)], "max_iter": [300, 500]}
}

In [23]:
# Sonuçları saklamak için sözlükler
results = {}
model_ensemble_results = {}
final_ensemble_results = {}

# Her temsil yöntemi için ensemble oluşturma
for representation_name in sentence_transformers_models:
    print(f"\nTemsil yöntemi: {representation_name}")

    # Embedding'leri yükleme
    X_train_embeddings = np.load(f"/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje2/Türkçe Spam Proje/türkçe_spam_transformers/{representation_name}_train.npy")
    X_test_embeddings = np.load(f"/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje2/Türkçe Spam Proje/türkçe_spam_transformers/{representation_name}_test.npy")

    # Temsil yöntemi için sonuçları başlatma
    results[representation_name] = {"Individual": {}, "Ensemble": None}
    optimized_models = []  # Her temsil yöntemi için optimize modeller

    # Bireysel modeller için
    for ml_name, ml_model in ml_models.items():
        print(f"\nModel: {ml_name}")

        # GridSearchCV ile parametre optimizasyonu
        grid = GridSearchCV(ml_model, param_grids[ml_name], cv=3, scoring='accuracy', verbose=1)
        grid.fit(X_train_embeddings, y_train)

        # Optimize edilmiş modeli saklama
        optimized_models.append((ml_name, grid.best_estimator_))
        print(grid.best_estimator_)

        # Test setinde doğruluk hesaplama
        y_pred = grid.best_estimator_.predict(X_test_embeddings)
        accuracy = accuracy_score(y_test, y_pred)
        conf_matrix = confusion_matrix(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        results[representation_name]["Individual"][ml_name] = accuracy
        print(f"{ml_name} Accuracy: {accuracy}")
        print(f"{ml_name} Confusion Matrix:\n{conf_matrix}")
        print(f"{ml_name} Precision:\n{precision}")
        print(f"{ml_name} Recall:\n{recall}")
        print(f"{ml_name} F1 Score:\n{f1}")


    # Temsil yöntemi için ensemble modeli
    representation_ensemble = VotingClassifier(estimators=optimized_models, voting='hard')
    representation_ensemble.fit(X_train_embeddings, y_train)
    y_rep_ensemble_pred = representation_ensemble.predict(X_test_embeddings)
    representation_ensemble_accuracy = accuracy_score(y_test, y_rep_ensemble_pred)
    representation_ensemble_conf_matrix = confusion_matrix(y_test, y_rep_ensemble_pred)
    representation_ensemble_precision = precision_score(y_test, y_rep_ensemble_pred, average='weighted')
    representation_ensemble_recall = recall_score(y_test, y_rep_ensemble_pred, average='weighted')
    representation_ensemble_f1 = f1_score(y_test, y_rep_ensemble_pred, average='weighted')
    results[representation_name]["Ensemble"] = representation_ensemble_accuracy
    print(f"Representation Ensemble Accuracy: {representation_ensemble_accuracy}")
    print(f"Representation Ensemble Confusion Matrix:\n{representation_ensemble_conf_matrix}")
    print(f"Representation Ensemble Precision: {representation_ensemble_precision}")
    print(f"Representation Ensemble Recall: {representation_ensemble_recall}")
    print(f"Representation Ensemble F1 Score: {representation_ensemble_f1}")



Temsil yöntemi: all-MiniLM-L12-v2

Model: SVM
Fitting 3 folds for each of 4 candidates, totalling 12 fits
SVC(C=1, probability=True, random_state=42)
SVM Accuracy: 0.9618717504332756
SVM Confusion Matrix:
[[524  12]
 [ 32 586]]
SVM Precision:
0.9625214396395567
SVM Recall:
0.9618717504332756
SVM F1 Score:
0.9619073555439058

Model: RF
Fitting 3 folds for each of 4 candidates, totalling 12 fits
RandomForestClassifier(random_state=42)
RF Accuracy: 0.91421143847487
RF Confusion Matrix:
[[514  22]
 [ 77 541]]
RF Precision:
0.9185585961286816
RF Recall:
0.91421143847487
RF F1 Score:
0.9143071539205242

Model: MLP
Fitting 3 folds for each of 4 candidates, totalling 12 fits
MLPClassifier(hidden_layer_sizes=(50,), max_iter=300, random_state=42)
MLP Accuracy: 0.951473136915078
MLP Confusion Matrix:
[[516  20]
 [ 36 582]]
MLP Precision:
0.9519167389939996
MLP Recall:
0.951473136915078
MLP F1 Score:
0.9515117431465583
Representation Ensemble Accuracy: 0.9584055459272097
Representation Ensemble C



MLPClassifier(hidden_layer_sizes=(50,), max_iter=500, random_state=42)
MLP Accuracy: 0.9488734835355286
MLP Confusion Matrix:
[[509  27]
 [ 32 586]]
MLP Precision:
0.9489389256747716
MLP Recall:
0.9488734835355286
MLP F1 Score:
0.9488883303463954
Representation Ensemble Accuracy: 0.9549393414211439
Representation Ensemble Confusion Matrix:
[[519  17]
 [ 35 583]]
Representation Ensemble Precision: 0.9554828285229839
Representation Ensemble Recall: 0.9549393414211439
Representation Ensemble F1 Score: 0.9549784413734517

Temsil yöntemi: bert-base-turkish-uncased

Model: SVM
Fitting 3 folds for each of 4 candidates, totalling 12 fits
SVC(C=1, probability=True, random_state=42)
SVM Accuracy: 0.9809358752166378
SVM Confusion Matrix:
[[527   9]
 [ 13 605]]
SVM Precision:
0.9809685133227661
SVM Recall:
0.9809358752166378
SVM F1 Score:
0.9809403621382966

Model: RF
Fitting 3 folds for each of 4 candidates, totalling 12 fits
RandomForestClassifier(random_state=42)
RF Accuracy: 0.9688041594454073

In [24]:
results

{'all-MiniLM-L12-v2': {'Individual': {'SVM': 0.9618717504332756,
   'RF': 0.91421143847487,
   'MLP': 0.951473136915078},
  'Ensemble': 0.9584055459272097},
 'multilingual-e5-large-instruct': {'Individual': {'SVM': 0.987001733102253,
   'RF': 0.9766031195840554,
   'MLP': 0.9835355285961872},
  'Ensemble': 0.987001733102253},
 'gte-large': {'Individual': {'SVM': 0.951473136915078,
   'RF': 0.9211438474870017,
   'MLP': 0.9488734835355286},
  'Ensemble': 0.9549393414211439},
 'bert-base-turkish-uncased': {'Individual': {'SVM': 0.9809358752166378,
   'RF': 0.9688041594454073,
   'MLP': 0.9861351819757366},
  'Ensemble': 0.9818024263431543},
 'jina-embeddings-v3': {'Individual': {'SVM': 0.9835355285961872,
   'RF': 0.9714038128249567,
   'MLP': 0.9748700173310225},
  'Ensemble': 0.9818024263431543}}

In [25]:
# Aynı model türü için ensemble oluşturma (SVM, RF, MLP)
for ml_name in ml_models.keys():
    print(f"\nModel Türü Ensemble: {ml_name}")

    # Tüm temsil yöntemlerinden alınan bireysel modelleri birleştirme
    model_estimators = [
        (representation_name, results[representation_name]["Individual"][ml_name])
        for representation_name in sentence_transformers_models
    ]

    # Ensemble modeli
    model_ensemble = VotingClassifier(
        estimators=[
            (representation_name, grid.best_estimator_)  # Doğrudan optimize edilmiş modeller
            for representation_name in sentence_transformers_models
        ],
        voting='hard'
    )
    model_ensemble.fit(X_train_embeddings, y_train)
    y_model_pred = model_ensemble.predict(X_test_embeddings)
    model_ensemble_accuracy = accuracy_score(y_test, y_model_pred)
    model_ensemble_conf_matrix = confusion_matrix(y_test, y_model_pred)
    model_ensemble_precision = precision_score(y_test, y_model_pred, average='weighted')
    model_ensemble_recall = recall_score(y_test, y_model_pred, average='weighted')
    model_ensemble_f1 = f1_score(y_test, y_model_pred, average='weighted')
    model_ensemble_results[ml_name] = model_ensemble_accuracy
    print(f"Model Ensemble ({ml_name}) Accuracy: {model_ensemble_accuracy}")
    print(f"Representation Ensemble Confusion Matrix:\n{model_ensemble_conf_matrix}")
    print(f"Representation Ensemble Confusion Matrix:\n{model_ensemble_precision}")
    print(f"Representation Ensemble Confusion Matrix:\n{model_ensemble_recall}")
    print(f"Representation Ensemble Confusion Matrix:\n{model_ensemble_f1}")


Model Türü Ensemble: SVM
Model Ensemble (SVM) Accuracy: 0.9748700173310225
Representation Ensemble Confusion Matrix:
[[524  12]
 [ 17 601]]
Representation Ensemble Confusion Matrix:
0.9749213493663761
Representation Ensemble Confusion Matrix:
0.9748700173310225
Representation Ensemble Confusion Matrix:
0.9748773149160249

Model Türü Ensemble: RF
Model Ensemble (RF) Accuracy: 0.9748700173310225
Representation Ensemble Confusion Matrix:
[[524  12]
 [ 17 601]]
Representation Ensemble Confusion Matrix:
0.9749213493663761
Representation Ensemble Confusion Matrix:
0.9748700173310225
Representation Ensemble Confusion Matrix:
0.9748773149160249

Model Türü Ensemble: MLP
Model Ensemble (MLP) Accuracy: 0.9748700173310225
Representation Ensemble Confusion Matrix:
[[524  12]
 [ 17 601]]
Representation Ensemble Confusion Matrix:
0.9749213493663761
Representation Ensemble Confusion Matrix:
0.9748700173310225
Representation Ensemble Confusion Matrix:
0.9748773149160249


In [26]:
# Genel ensemble (Tüm temsil yöntemleri ve modeller)
print("\nGenel Ensemble")

overall_estimators = []

# Temsil yöntemi ensemble modellerini ekleme
for representation_name in sentence_transformers_models:
    representation_ensemble = VotingClassifier(
        estimators=[
            (ml_name, grid.best_estimator_)  # Optimize edilmiş modeller
            for ml_name in ml_models.keys()
        ],
        voting='hard'
    )
    representation_ensemble.fit(X_train_embeddings, y_train)
    overall_estimators.append((representation_name, representation_ensemble))

# Genel ensemble modeli
overall_ensemble = VotingClassifier(estimators=overall_estimators, voting='hard')
overall_ensemble.fit(X_train_embeddings, y_train)

# Genel ensemble modeliyle tahmin yapma
y_overall_pred = overall_ensemble.predict(X_test_embeddings)
overall_accuracy = accuracy_score(y_test, y_overall_pred)
overall_conf_matrix = confusion_matrix(y_test, y_overall_pred)
overall_precision = precision_score(y_test, y_overall_pred, average='weighted')
overall_recall = recall_score(y_test, y_overall_pred, average='weighted')
overall_f1 = f1_score(y_test, y_overall_pred, average='weighted')
final_ensemble_results["Overall"] = overall_accuracy
print(f"Overall Ensemble Accuracy: {overall_accuracy}")
print(f"Overall Ensemble Accuracy: {overall_conf_matrix}")
print(f"Overall Ensemble Accuracy: {overall_precision}")
print(f"Overall Ensemble Accuracy: {overall_recall}")
print(f"Overall Ensemble Accuracy: {overall_f1}")


# Sonuçları yazdırma
print("\nSonuçlar:")
for representation_name, metrics in results.items():
    print(f"\nTemsil Yöntemi: {representation_name}")
    for ml_name, acc in metrics["Individual"].items():
        print(f"  {ml_name} Accuracy: {acc}")
    print(f"  Representation Ensemble Accuracy: {metrics['Ensemble']}")

print("\nModel Ensemble Sonuçları:")
for ml_name, acc in model_ensemble_results.items():
    print(f"  {ml_name} Ensemble Accuracy: {acc}")

print("\nGenel Ensemble Sonucu:")
print(f"  Overall Ensemble Accuracy: {final_ensemble_results['Overall']}")


Genel Ensemble
Overall Ensemble Accuracy: 0.9748700173310225
Overall Ensemble Accuracy: [[524  12]
 [ 17 601]]
Overall Ensemble Accuracy: 0.9749213493663761
Overall Ensemble Accuracy: 0.9748700173310225
Overall Ensemble Accuracy: 0.9748773149160249

Sonuçlar:

Temsil Yöntemi: all-MiniLM-L12-v2
  SVM Accuracy: 0.9618717504332756
  RF Accuracy: 0.91421143847487
  MLP Accuracy: 0.951473136915078
  Representation Ensemble Accuracy: 0.9584055459272097

Temsil Yöntemi: multilingual-e5-large-instruct
  SVM Accuracy: 0.987001733102253
  RF Accuracy: 0.9766031195840554
  MLP Accuracy: 0.9835355285961872
  Representation Ensemble Accuracy: 0.987001733102253

Temsil Yöntemi: gte-large
  SVM Accuracy: 0.951473136915078
  RF Accuracy: 0.9211438474870017
  MLP Accuracy: 0.9488734835355286
  Representation Ensemble Accuracy: 0.9549393414211439

Temsil Yöntemi: bert-base-turkish-uncased
  SVM Accuracy: 0.9809358752166378
  RF Accuracy: 0.9688041594454073
  MLP Accuracy: 0.9861351819757366
  Represent