In [None]:
# Mount Google Drive at /content/drive; the CSV files loaded later in this notebook
# are read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,f1_score
from sentence_transformers import SentenceTransformer
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import pipeline
import torch
import pandas as pd
from tqdm import tqdm

In [None]:
# Single place to change when the dataset folder moves; every split is read from here.
DATA_DIR = "/content/drive/MyDrive/Müş/LLM_Aug/data"

X_train = pd.read_csv(f"{DATA_DIR}/X_train.csv")
X_test = pd.read_csv(f"{DATA_DIR}/X_test.csv")
y_train = pd.read_csv(f"{DATA_DIR}/y_train.csv")
y_test = pd.read_csv(f"{DATA_DIR}/y_test.csv")

# Features and labels live in separate files, so check that they are row-aligned
# before anything is trained or scored on them.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"

In [None]:
##################################

In [None]:
import torch
import pandas as pd
from sentence_transformers import SentenceTransformer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from scipy.stats import mode

In [None]:
# Data augmentation with FLAN-T5
num_aug = 4  # four sampled variations are requested for every test text

# Load the FLAN-T5 model; device index 0 is used when CUDA is available, -1 otherwise.
device = 0 if torch.cuda.is_available() else -1
generator = pipeline("text2text-generation", model="google/flan-t5-large", device=device)

# Wrap every test text in the paraphrase instruction.
prompts = [
    f"Paraphrase the following text while maintaining its original meaning: {text}"
    for text in X_test["Text"]
]

# Sampling configuration forwarded to the generation call.
sampling_options = {
    "max_length": 120,
    "num_return_sequences": num_aug,
    "do_sample": True,
    "top_k": 100,
    "top_p": 0.95,
    "temperature": 1.0,
    "repetition_penalty": 1.3,
    "early_stopping": True,
}
results = generator(prompts, **sampling_options)

# Keep only the generated strings: one inner list per element of `results`.
augmented_sentences = [
    [candidate['generated_text'] for candidate in candidates]
    for candidates in results
]

In [None]:
# Display the generated paraphrases (a list of lists of strings) for a manual check.
augmented_sentences

In [None]:
# --- Build the 3x and 5x augmented versions of the test set ---
# Each test text contributes one group: the original text first, then its paraphrases.
X_test_3x, X_test_5x = [], []

for idx, original_text in enumerate(X_test["Text"]):
    paraphrases = augmented_sentences[idx]
    # 3x set: original + the first two paraphrases.
    X_test_3x += [original_text, *paraphrases[:2]]
    # 5x set: original + every paraphrase generated for this text.
    X_test_5x += [original_text, *paraphrases]


In [None]:
# --- Build sentence embeddings ---
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# The same encoder settings are applied to the training texts, the original test texts
# and both augmented test lists, in that order.
(X_train_embeddings, X_test_embeddings, X_test_3x_embeddings, X_test_5x_embeddings) = (
    embedding_model.encode(texts, batch_size=32, show_progress_bar=True)
    for texts in (X_train["Text"], X_test["Text"], X_test_3x, X_test_5x)
)

In [None]:
# Remove and reinstall xgboost and scikit-learn; no version is pinned on the install line.
# NOTE(review): both packages are already imported near the top of the notebook —
# presumably the runtime has to be restarted for the reinstalled versions to load; confirm.
!pip uninstall -y xgboost scikit-learn
!pip install xgboost scikit-learn



In [None]:
# Replace whatever scikit-learn is installed with the pinned release 1.3.1.
!pip uninstall -y scikit-learn
!pip install scikit-learn==1.3.1

In [None]:
# Train the XGBoost classifier on the training-set sentence embeddings.
xgb_params = {
    "objective": 'multi:softmax',   # multi-class classification
    "num_class": 5,                 # number of classes
    "eval_metric": 'mlogloss',      # multi-class log loss
    "learning_rate": 0.1,
    "max_depth": 6,
    "n_estimators": 100,
    "use_label_encoder": False,
}
model_xgb = xgb.XGBClassifier(**xgb_params)
model_xgb.fit(X_train_embeddings, y_train)


In [None]:
# --- Model predictions ---
# Score the original test embeddings and both augmented sets with the trained classifier.
original_predictions, predictions_3x, predictions_5x = (
    model_xgb.predict(features)
    for features in (X_test_embeddings, X_test_3x_embeddings, X_test_5x_embeddings)
)

# --- Decision-combining helper ---
def majority_voting(original_pred, augmented_preds):
    """Return the most frequent label among the original and augmented predictions.

    Args:
        original_pred: Predicted label for the original text.
        augmented_preds: Iterable (list, tuple or NumPy array) of predicted labels
            for the augmented versions of that text.

    Returns:
        The modal label, taken from the first element of ``scipy.stats.mode(...).mode``.
    """
    # Unpack instead of using `+`: with a NumPy array, `array + [x]` adds x to every
    # element rather than appending it, which would corrupt the votes.
    all_preds = [*augmented_preds, original_pred]
    final_decision = mode(all_preds, keepdims=True).mode[0]
    return final_decision

# --- Containers for the final (voted) decisions ---
final_predictions_3x, final_predictions_5x = list(), list()


In [None]:
# Show the "Score" column of y_test as an array for reference.
y_test["Score"].values

In [None]:
from scipy.stats import mode
import numpy as np

# --- Majority vote per test sample (FLAN-T5 paraphrases) ---
# X_test_3x / X_test_5x hold one group per test text laid out as
# [original, paraphrase 1, ..., paraphrase k]. Offset 0 inside a group is therefore the
# original text again; the paraphrase predictions start at offset 1. Reading offsets
# 0..k-1 would count the original twice and never use the last paraphrase.
n_samples = len(original_predictions)
assert len(predictions_3x) == 3 * n_samples, "predictions_3x is not 3 rows per test sample"
assert len(predictions_5x) == 5 * n_samples, "predictions_5x is not 5 rows per test sample"

# Lists that collect the final decision for every test sample.
final_predictions_3x = []
final_predictions_5x = []

for i in range(n_samples):
    # 3x: original prediction + the two paraphrase predictions.
    combined_preds_3x = [original_predictions[i], *predictions_3x[i * 3 + 1 : i * 3 + 3]]
    final_predictions_3x.append(mode(combined_preds_3x, keepdims=True).mode[0])

    # 5x: original prediction + the four paraphrase predictions.
    combined_preds_5x = [original_predictions[i], *predictions_5x[i * 5 + 1 : i * 5 + 5]]
    final_predictions_5x.append(mode(combined_preds_5x, keepdims=True).mode[0])

# Print the results
print("Original Test Accuracy:", accuracy_score(y_test, original_predictions))
print("3x Augmented Test Accuracy:", accuracy_score(y_test, final_predictions_3x))
print("5x Augmented Test Accuracy:", accuracy_score(y_test, final_predictions_5x))

# Draw one confusion matrix per prediction set. The title names the set so the three
# figures can be told apart when the notebook is skimmed.
class_labels = ["0", "1", "2", "3", "4"]
for predictions, title in zip(
    [original_predictions, final_predictions_3x, final_predictions_5x],
    ["Original", "3x Augmented", "5x Augmented"],
):
    cm = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
    plt.title(f'Confusion Matrix - {title}')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

## Pegasus

In [None]:
# Load the Pegasus paraphrase model; device index 0 when CUDA is available, -1 otherwise.
device = 0 if torch.cuda.is_available() else -1
generator = pipeline("text2text-generation", model="tuner007/pegasus_paraphrase", device=device)

# tuner007/pegasus_paraphrase is a dedicated paraphrasing checkpoint, not an
# instruction-following model: its model card feeds the raw sentence. A "Paraphrase: "
# prefix would be treated as part of the sentence and rewritten along with it, so the
# test texts are passed unchanged. (`prompts` keeps its name for continuity.)
prompts = [str(text) for text in X_test["Text"]]
# NOTE(review): no truncation is requested, so long texts are passed at full length —
# confirm they fit this checkpoint's maximum input size.

# Sample `num_aug` paraphrases per text (num_aug is defined in the FLAN-T5 section).
results = generator(
    prompts,
    max_length=120,
    num_return_sequences=num_aug,
    do_sample=True,
    top_k=100,
    top_p=0.95,
    temperature=1.0,
    repetition_penalty=1.3,
    early_stopping=True
)

# Keep only the generated strings: one inner list per element of `results`.
augmented_sentences = []
for result_list in results:
    generated_texts = [item['generated_text'] for item in result_list]
    augmented_sentences.append(generated_texts)

In [None]:
# Rebuild the 3x / 5x test lists from the Pegasus paraphrases.
# Group layout per test text: original first, paraphrases after it.
X_test_3x = list()
X_test_5x = list()

for position, source_text in enumerate(X_test["Text"]):
    variants = augmented_sentences[position]

    # Original + first two paraphrases go to the 3x list.
    X_test_3x.extend([source_text] + variants[:2])

    # Original + all paraphrases go to the 5x list.
    X_test_5x.extend([source_text] + variants)

In [None]:
# Encode the training texts, the original test texts and the Pegasus-augmented lists.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
encode_options = {"batch_size": 32, "show_progress_bar": True}

X_train_embeddings = embedding_model.encode(X_train["Text"], **encode_options)
X_test_embeddings = embedding_model.encode(X_test["Text"], **encode_options)
X_test_3x_embeddings = embedding_model.encode(X_test_3x, **encode_options)
X_test_5x_embeddings = embedding_model.encode(X_test_5x, **encode_options)

In [None]:
# No retraining in this section: the `model_xgb` classifier fitted earlier in the
# notebook is reused for the predictions below.

In [None]:
# Predict on the original and the Pegasus-augmented test embeddings.
original_predictions, predictions_3x, predictions_5x = map(
    model_xgb.predict,
    (X_test_embeddings, X_test_3x_embeddings, X_test_5x_embeddings),
)


In [None]:
from scipy.stats import mode
def majority_voting(original_pred, augmented_preds):
    """Return the most frequent label among the original and augmented predictions.

    Args:
        original_pred: Predicted label for the original text.
        augmented_preds: Iterable (list, tuple or NumPy array) of predicted labels
            for the augmented versions of that text.

    Returns:
        The modal label, taken from the first element of ``scipy.stats.mode(...).mode``.
    """
    # `array + [x]` would add x element-wise on a NumPy array instead of appending it,
    # so build the vote list by unpacking.
    all_preds = [*augmented_preds, original_pred]
    final_decision = mode(all_preds, keepdims=True).mode[0]
    return final_decision

# --- Final decisions by majority vote (Pegasus paraphrases) ---
# Each group in X_test_3x / X_test_5x is [original, paraphrase 1, ..., paraphrase k],
# so the paraphrase predictions sit at offsets 1..k of the group. Offset 0 is the
# original text again and must not be counted a second time.
n_samples = len(original_predictions)
assert len(predictions_3x) == 3 * n_samples, "predictions_3x is not 3 rows per test sample"
assert len(predictions_5x) == 5 * n_samples, "predictions_5x is not 5 rows per test sample"

final_predictions_3x = []
final_predictions_5x = []

for i in range(n_samples):
    # Original prediction + both paraphrase predictions.
    combined_preds_3x = [original_predictions[i], *predictions_3x[i * 3 + 1 : i * 3 + 3]]
    final_predictions_3x.append(mode(combined_preds_3x, keepdims=True).mode[0])

    # Original prediction + all four paraphrase predictions.
    combined_preds_5x = [original_predictions[i], *predictions_5x[i * 5 + 1 : i * 5 + 5]]
    final_predictions_5x.append(mode(combined_preds_5x, keepdims=True).mode[0])

# Print the results
print("Original Test Accuracy:", accuracy_score(y_test, original_predictions))
print("3x Augmented Test Accuracy:", accuracy_score(y_test, final_predictions_3x))
print("5x Augmented Test Accuracy:", accuracy_score(y_test, final_predictions_5x))

In [None]:
# Plot one confusion matrix for each of the three prediction sets.
tick_labels = ["0", "1", "2", "3", "4"]
prediction_sets = {
    "Original": original_predictions,
    "3x Augmented": final_predictions_3x,
    "5x Augmented": final_predictions_5x,
}
for title, predictions in prediction_sets.items():
    cm = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(4, 3))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=tick_labels,
        yticklabels=tick_labels,
    )
    plt.title(f'Confusion Matrix - {title}')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

## BART

In [None]:

# Load the BART paraphrase model; device index 0 when CUDA is available, -1 otherwise.
device = 0 if torch.cuda.is_available() else -1
generator = pipeline("text2text-generation", model="eugenesiow/bart-paraphrase", device=device)

# eugenesiow/bart-paraphrase is fine-tuned for sentence-to-paraphrase generation and
# its model card passes the raw sentence. A "Paraphrase: " prefix would be treated as
# part of the input text, so the test texts are passed unchanged.
# (`prompts` keeps its name for continuity.)
prompts = [str(text) for text in X_test["Text"]]

# Sample `num_aug` paraphrases per text (num_aug is defined in the FLAN-T5 section).
results = generator(
    prompts,
    max_length=120,
    num_return_sequences=num_aug,
    do_sample=True,
    top_k=100,
    top_p=0.95,
    temperature=1.0,
    repetition_penalty=1.3,
    early_stopping=True
)

# Keep only the generated strings: one inner list per element of `results`.
augmented_sentences = []
for result_list in results:
    generated_texts = [item['generated_text'] for item in result_list]
    augmented_sentences.append(generated_texts)

In [None]:
# --- Build the 3x and 5x test lists from the BART paraphrases ---
X_test_3x, X_test_5x = [], []

for row, text in enumerate(X_test["Text"]):
    generated = augmented_sentences[row]

    # The original text opens each group in both lists.
    X_test_3x.append(text)
    X_test_5x.append(text)

    # Two paraphrases follow in the 3x list, all of them in the 5x list.
    X_test_3x += generated[:2]
    X_test_5x += generated

In [None]:
# Encode the training texts, the original test texts and the BART-augmented lists
# with the same sentence-embedding model and batch settings.
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

X_train_embeddings, X_test_embeddings, X_test_3x_embeddings, X_test_5x_embeddings = [
    embedding_model.encode(corpus, batch_size=32, show_progress_bar=True)
    for corpus in (X_train["Text"], X_test["Text"], X_test_3x, X_test_5x)
]

In [None]:
# --- Model predictions (BART-augmented test sets) ---
original_predictions, predictions_3x, predictions_5x = [
    model_xgb.predict(embedding_matrix)
    for embedding_matrix in (X_test_embeddings, X_test_3x_embeddings, X_test_5x_embeddings)
]

# --- Karar Birleştirme Fonksiyonu ---
from scipy.stats import mode
def majority_voting(original_pred, augmented_preds):
    """Return the most frequent label among the original and augmented predictions.

    Args:
        original_pred: Predicted label for the original text.
        augmented_preds: Iterable (list, tuple or NumPy array) of predicted labels
            for the augmented versions of that text.

    Returns:
        The modal label, taken from the first element of ``scipy.stats.mode(...).mode``.
    """
    # Build the vote list by unpacking: `+` between a NumPy array and a list is
    # element-wise addition, not concatenation.
    all_preds = [*augmented_preds, original_pred]
    final_decision = mode(all_preds, keepdims=True).mode[0]
    return final_decision

# --- Final decisions by majority vote (BART paraphrases) ---
# Each group in X_test_3x / X_test_5x is [original, paraphrase 1, ..., paraphrase k].
# The vote combines the original prediction with the predictions at group offsets
# 1..k; offset 0 is the original text again and is skipped to avoid counting it twice.
n_samples = len(original_predictions)
assert len(predictions_3x) == 3 * n_samples, "predictions_3x is not 3 rows per test sample"
assert len(predictions_5x) == 5 * n_samples, "predictions_5x is not 5 rows per test sample"

final_predictions_3x = []
final_predictions_5x = []

for i in range(n_samples):
    # Original prediction + both paraphrase predictions.
    combined_preds_3x = [original_predictions[i], *predictions_3x[i * 3 + 1 : i * 3 + 3]]
    final_predictions_3x.append(mode(combined_preds_3x, keepdims=True).mode[0])

    # Original prediction + all four paraphrase predictions.
    combined_preds_5x = [original_predictions[i], *predictions_5x[i * 5 + 1 : i * 5 + 5]]
    final_predictions_5x.append(mode(combined_preds_5x, keepdims=True).mode[0])

# Print the results
print("Original Test Accuracy:", accuracy_score(y_test, original_predictions))
print("3x Augmented Test Accuracy:", accuracy_score(y_test, final_predictions_3x))
print("5x Augmented Test Accuracy:", accuracy_score(y_test, final_predictions_5x))

In [None]:
# Confusion-matrix plots: one figure per (title, predictions) pair.
axis_ticks = ["0", "1", "2", "3", "4"]
labelled_predictions = [
    ("Original", original_predictions),
    ("3x Augmented", final_predictions_3x),
    ("5x Augmented", final_predictions_5x),
]
for title, predictions in labelled_predictions:
    cm = confusion_matrix(y_test, predictions)
    plt.figure(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=axis_ticks, yticklabels=axis_ticks)
    plt.title(f'Confusion Matrix - {title}')
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.show()

## Gemini

In [None]:
import google.generativeai as genai
import time

import os

# Read the API key from the environment instead of embedding it in the notebook.
# NOTE(review): the key that used to be hard-coded here should be treated as leaked
# and revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=api_key)

# Load the model
model = genai.GenerativeModel("gemini-1.5-flash")

# Data augmentation
augmented_sentences = []
num_aug = 4  # four variations are requested for every test text
max_attempts = 5  # tries per prompt before giving up on it
retry_wait_seconds = 10  # pause between failed attempts

# Build one prompt per test text
prompts = [f"Generate {num_aug} diverse paraphrases for the following text: {text}" for text in X_test["Text"]]

# Paraphrase with the model. Exactly one list of num_aug strings is appended per prompt,
# even when every attempt fails, so augmented_sentences[i] always belongs to test text i.
for prompt in tqdm(prompts, desc="Generating paraphrases with Gemini-1.5-Flash"):
    generated_texts = []
    for attempt in range(1, max_attempts + 1):
        try:
            response = model.generate_content(prompt)
            # Keep the first num_aug non-blank lines of the reply; blank separator
            # lines are not paraphrases.
            reply_lines = [line.strip() for line in (response.text or "").split("\n")]
            generated_texts = [line for line in reply_lines if line][:num_aug]
            break
        except Exception as e:
            print(f"Hata: {e}. Yeniden denemeden önce bekleniyor...")
            if attempt < max_attempts:
                time.sleep(retry_wait_seconds)

    # Pad with empty strings (the same placeholder used for an empty reply) so that
    # every test text contributes exactly num_aug entries.
    generated_texts += [""] * (num_aug - len(generated_texts))
    augmented_sentences.append(generated_texts)


In [None]:
# --- Build the 3x and 5x test lists from the Gemini paraphrases ---
# The voting code further down reads the predictions with a fixed stride of 3 and 5
# rows per test text. Every group must therefore contain exactly 2 (3x) or 4 (5x)
# paraphrases after the original, even if the model returned fewer lines.
assert len(augmented_sentences) == len(X_test["Text"]), (
    "augmented_sentences is not aligned with X_test (one entry per test text expected)"
)

X_test_3x = []
X_test_5x = []

for i, original_text in enumerate(X_test["Text"]):
    # Truncate to 4 and pad short lists with empty strings to keep the stride fixed.
    paraphrases = list(augmented_sentences[i][:4])
    paraphrases += [""] * (4 - len(paraphrases))

    # Original + first two paraphrases go to the 3x list.
    X_test_3x.append(original_text)
    X_test_3x.extend(paraphrases[:2])

    # Original + all four paraphrases go to the 5x list.
    X_test_5x.append(original_text)
    X_test_5x.extend(paraphrases)

In [None]:
# --- Build sentence embeddings for the Gemini-augmented test sets ---
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Inputs are encoded in this order: training texts, original test texts, 3x list, 5x list.
text_collections = (X_train["Text"], X_test["Text"], X_test_3x, X_test_5x)
X_train_embeddings, X_test_embeddings, X_test_3x_embeddings, X_test_5x_embeddings = (
    embedding_model.encode(collection, batch_size=32, show_progress_bar=True)
    for collection in text_collections
)

In [None]:
# Predict on the original and the Gemini-augmented test embeddings.
original_predictions, predictions_3x, predictions_5x = (
    model_xgb.predict(matrix)
    for matrix in (X_test_embeddings, X_test_3x_embeddings, X_test_5x_embeddings)
)

# --- Karar Birleştirme Fonksiyonu ---
from scipy.stats import mode
def majority_voting(original_pred, augmented_preds):
    """Return the most frequent label among the original and augmented predictions.

    Args:
        original_pred: Predicted label for the original text.
        augmented_preds: Iterable (list, tuple or NumPy array) of predicted labels
            for the augmented versions of that text.

    Returns:
        The modal label, taken from the first element of ``scipy.stats.mode(...).mode``.
    """
    # Unpacking works for any iterable; `augmented_preds + [original_pred]` would be
    # element-wise addition if a NumPy array were passed.
    all_preds = [*augmented_preds, original_pred]
    final_decision = mode(all_preds, keepdims=True).mode[0]
    return final_decision

# --- Final decisions by majority vote (Gemini paraphrases) ---
# Each group in X_test_3x / X_test_5x is [original, paraphrase 1, ..., paraphrase k].
# The vote combines the original prediction with the predictions at group offsets
# 1..k; offset 0 is the original text again and is skipped to avoid counting it twice.
# The length checks fail loudly if the augmented lists do not have the fixed 3 / 5
# rows per test text that this indexing relies on.
n_samples = len(original_predictions)
assert len(predictions_3x) == 3 * n_samples, "predictions_3x is not 3 rows per test sample"
assert len(predictions_5x) == 5 * n_samples, "predictions_5x is not 5 rows per test sample"

final_predictions_3x = []
final_predictions_5x = []

for i in range(n_samples):
    # Original prediction + both paraphrase predictions.
    combined_preds_3x = [original_predictions[i], *predictions_3x[i * 3 + 1 : i * 3 + 3]]
    final_predictions_3x.append(mode(combined_preds_3x, keepdims=True).mode[0])

    # Original prediction + all four paraphrase predictions.
    combined_preds_5x = [original_predictions[i], *predictions_5x[i * 5 + 1 : i * 5 + 5]]
    final_predictions_5x.append(mode(combined_preds_5x, keepdims=True).mode[0])

# Print the results
print("Original Test Accuracy:", accuracy_score(y_test, original_predictions))
print("3x Augmented Test Accuracy:", accuracy_score(y_test, final_predictions_3x))
print("5x Augmented Test Accuracy:", accuracy_score(y_test, final_predictions_5x))