In [None]:
import pandas as pd
import numpy as np
#from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive (Colab only); the CSV files and output
# folders used throughout this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the raw instruction dataset from Drive and preview its first rows.
data = pd.read_csv('/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje3/instructions.csv')
data.head()

In [None]:
# Column dtypes and non-null counts of the raw dataset.
data.info()

In [None]:
# Take up to the first 1000 rows whose ' giriş' (input) column is missing and
# up to the first 1000 rows where it is present.
# Note the leading space in the column name: it is part of the CSV header.
input_is_missing = data[' giriş'].isna()
data_with_nan = data.loc[input_is_missing].head(1000)
data_without_nan = data.loc[~input_is_missing].head(1000)

In [None]:
# Stack the two subsets (missing-input rows first) into one frame with a fresh 0..n-1 index.
df = pd.concat([data_with_nan,data_without_nan],ignore_index=True)

In [None]:
# Column dtypes and non-null counts of the combined subset.
df.info()

In [None]:
# Build the 'birleştirilmiş' (combined) column on a copy of df: the instruction
# followed by a space and the input when the input is present, otherwise the
# instruction alone.
data = df.copy()
data['birleştirilmiş'] = data.apply(
    lambda record: (
        record['talimat'] + ' ' + record[' giriş']
        if pd.notna(record[' giriş'])
        else record['talimat']
    ),
    axis=1,
)

In [None]:
# Preview the frame with the new combined column.
data.head()

In [None]:
# Keep only the combined query text and the answer column, then write them to
# Drive as the working dataset (without the index column).
data = data[["birleştirilmiş", " çıktı"]]
data.to_csv('/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje3/veri_seti.csv', index=False)

In [None]:
# Reload the saved working dataset from Drive (this rebinds df) and preview it.
df = pd.read_csv('/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje3/veri_seti.csv')
df.head()

In [None]:
# Column dtypes and non-null counts of the reloaded working dataset.
df.info()

In [None]:
# Plain Python lists of the query texts and the answer texts, in row order, so
# that queries[i] and answers[i] come from the same row.
queries = df["birleştirilmiş"].to_list()
answers = df[" çıktı"].to_list()

In [None]:
# Define the representation (embedding) models to compare.
# SentenceTransformer is imported in this cell because the import at the top of
# the notebook is commented out; without it this cell raises NameError on a
# fresh kernel. Keeping the import local means the cells that only read the
# saved .npy embeddings do not need sentence_transformers to be importable.
from sentence_transformers import SentenceTransformer

sentence_transformers_models = {
    "all-MiniLM-L12-v2": SentenceTransformer("sentence-transformers/all-MiniLM-L12-v2"),
    "multilingual-e5-large-instruct": SentenceTransformer("intfloat/multilingual-e5-large-instruct"),
    "gte-large": SentenceTransformer("thenlper/gte-large"),
    # Misnamed key: the checkpoint loaded here is a BERT model, not ColBERT.
    # The key is kept because it is also used as a file-name prefix below.
    "colbert": SentenceTransformer("bert-base-uncased"),
    "jina-embeddings-v3": SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
}

In [None]:
# Embedding helper shared by every representation model
def create_embeddings(texts, model):
    """Encode ``texts`` with ``model`` and return the result of ``model.encode``.

    Args:
        texts: The texts to embed; passed to ``model.encode`` unchanged.
        model: Any object exposing ``encode(texts, batch_size=..., show_progress_bar=...)``.

    Returns:
        Whatever ``model.encode`` returns for the given texts.
    """
    encode_options = {"batch_size": 32, "show_progress_bar": True}
    return model.encode(texts, **encode_options)

In [None]:
# Compute the query and answer embeddings with every representation model and
# save them to Drive as .npy files, one pair of files per model.
import os  # needed only by this cell, to create the output folder

for representation_name, representation_model in sentence_transformers_models.items():
    print(f"\nTemsil yöntemi: {representation_name}")

    query_path = f"/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje3/temsiller/{representation_name}_queries.npy"
    answer_path = f"/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje3/temsiller/{representation_name}_answers.npy"

    # np.save does not create missing folders. Create the target folder before
    # the slow encoding step so the work is not lost to a FileNotFoundError.
    os.makedirs(os.path.dirname(query_path), exist_ok=True)

    # Embed the queries and the answers with the current model
    query_embeddings = create_embeddings(queries, representation_model)
    answer_embeddings = create_embeddings(answers, representation_model)

    # Persist both embedding arrays
    np.save(query_path, query_embeddings)
    np.save(answer_path, answer_embeddings)

    print(f"Embedding'ler ve sonuçlar {representation_name} için kaydedildi!")

In [None]:
# Query-vs-answer cosine similarity, computed separately for each representation.
# Each name must match the file-name prefix under which that model's
# embeddings were saved in the "temsiller" folder.
representation_names = [
    "all-MiniLM-L12-v2",
    "multilingual-e5-large-instruct",
    "gte-large",
    "colbert",
    "jina-embeddings-v3"
]

# One similarity matrix per entry of representation_names, in the same order.
embeding_similarity_matrices = []
for representation_name in representation_names:
    query_embeddings = np.load(f"/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje3/temsiller/{representation_name}_queries.npy")
    answer_embeddings = np.load(f"/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje3/temsiller/{representation_name}_answers.npy")
    # Queries are passed first, so rows index queries and columns index answers.
    similarity_matrix = cosine_similarity(query_embeddings, answer_embeddings)
    embeding_similarity_matrices.append(similarity_matrix)

In [None]:
# Ensemble methods
def ensemble_average(similarity_matrices):
    """Return the element-wise mean of the given similarity matrices.

    Args:
        similarity_matrices: Sequence of equally shaped arrays.

    Returns:
        An array with the shape of a single input matrix.
    """
    stacked = np.asanyarray(similarity_matrices)
    return stacked.mean(axis=0)

def ensemble_max_voting(similarity_matrices):
    """Return the element-wise maximum of the given similarity matrices.

    Despite the name, no votes are counted: each cell of the result is simply
    the largest value found at that position across the inputs.

    Args:
        similarity_matrices: Sequence of equally shaped arrays.

    Returns:
        An array with the shape of a single input matrix.
    """
    stacked = np.asanyarray(similarity_matrices)
    return stacked.max(axis=0)

def ensemble_weighted_average(similarity_matrices, weights):
    """Return the weighted element-wise mean of the given similarity matrices.

    Args:
        similarity_matrices: Sequence of equally shaped arrays.
        weights: One numeric weight per matrix, in the same order.

    Returns:
        ``sum(w_i * M_i) / sum(w_i)`` as an array shaped like one input matrix.

    Raises:
        ValueError: If the number of weights differs from the number of
            matrices, or if the weights sum to zero.
    """
    matrices = list(similarity_matrices)
    weight_list = list(weights)

    # zip() would silently drop the unmatched tail while the divisor below
    # still used every weight, so a length mismatch must be rejected.
    if len(matrices) != len(weight_list):
        raise ValueError(
            f"expected one weight per similarity matrix, got "
            f"{len(weight_list)} weights for {len(matrices)} matrices"
        )

    total_weight = np.sum(weight_list)
    if total_weight == 0:
        raise ValueError("weights must not sum to zero")

    weighted_matrices = [matrix * weight for matrix, weight in zip(matrices, weight_list)]
    return np.sum(weighted_matrices, axis=0) / total_weight

In [None]:
# Top-1 and top-k retrieval
def compute_top_k_from_matrix(similarity_matrix, answers, k=5):
    """Pick the best-scoring answers for every row of a similarity matrix.

    Args:
        similarity_matrix: 2-D array; each row holds one query's scores and
            column ``j`` refers to ``answers[j]``.
        answers: Sequence of candidate answers, indexed like the columns.
        k: How many of the highest-scoring answers to keep per query.

    Returns:
        A tuple ``(top1, topk)``: ``top1[i]`` is the single best answer for
        row ``i`` and ``topk[i]`` is the list of its ``k`` best answers,
        highest score first.
    """
    best_answers = []
    best_k_answers = []
    for scores in similarity_matrix:
        # Ascending argsort, reversed, gives the column indices by descending score.
        ranking = np.argsort(scores)[::-1]
        best_answers.append(answers[ranking[0]])
        best_k_answers.append([answers[col] for col in ranking[:k]])
    return best_answers, best_k_answers

In [None]:
# Accuracy computation
def calculate_accuracy(top1_results, top5_results, ground_truths):
    """Compute top-1 and top-k accuracy by exact comparison with the ground truth.

    Args:
        top1_results: Best retrieved answer per query.
        top5_results: List of retrieved candidate answers per query.
        ground_truths: Expected answer per query; its length sets the number
            of queries evaluated.

    Returns:
        A tuple ``(top1_accuracy, top5_accuracy)`` of fractions in [0, 1].

    Raises:
        ZeroDivisionError: If ``ground_truths`` is empty.
    """
    n_queries = len(ground_truths)
    hits_at_1 = sum(
        1 for idx in range(n_queries) if top1_results[idx] == ground_truths[idx]
    )
    hits_at_k = sum(
        1 for idx in range(n_queries) if ground_truths[idx] in top5_results[idx]
    )
    return hits_at_1 / n_queries, hits_at_k / n_queries

In [None]:
# Per-model evaluation: top-1 / top-5 accuracy for each representation, plus a
# CSV with the retrieved answers for every query.
import os  # needed only by this cell, to create the results folder

individual_model_results = {}

for representation_name, similarity_matrix in zip(representation_names, embeding_similarity_matrices):
    print(f"\nPerformans analizi: {representation_name}")

    results_path = f"/content/drive/MyDrive/Yüksek Lisans 2. Dönem/Kolektif Öğrenme/Proje3/sonuclar/{representation_name}_results.csv"
    # to_csv fails when the target folder does not exist, so create it first.
    os.makedirs(os.path.dirname(results_path), exist_ok=True)

    # Retrieve the top-1 and top-5 answers for every query
    top1_results, top5_results = compute_top_k_from_matrix(similarity_matrix, answers)

    # The expected answer for query i is answers[i]
    top1_accuracy, top5_accuracy = calculate_accuracy(top1_results, top5_results, answers)
    individual_model_results[representation_name] = {
        "top1_accuracy": top1_accuracy,
        "top5_accuracy": top5_accuracy
    }

    # Report the accuracies
    print(f"Top1 Accuracy: {top1_accuracy:.4f}")
    print(f"Top5 Accuracy: {top5_accuracy:.4f}")

    # Save the per-query retrieval results; the top-5 list is stored as one "; "-joined string
    pd.DataFrame({
        "Query": queries,
        "Ground Truth": answers,
        "Top1_Result": top1_results,
        "Top5_Results": ["; ".join(top5) for top5 in top5_results]
    }).to_csv(results_path, index=False)


In [None]:
# Performance analysis: one row per model, then long form (one row per
# model/metric pair) for plotting.
performance_df = pd.DataFrame(individual_model_results).T.reset_index()
performance_df.columns = ["Model", "Top1 Accuracy", "Top5 Accuracy"]
melted_df = pd.melt(
    performance_df,
    id_vars="Model",
    var_name="Metric",
    value_name="Accuracy",
)
melted_df

In [None]:
# Relabel the misnamed 'colbert' entry so the chart shows the real model name.
# The checkpoint loaded under that key is "bert-base-uncased", so that is the
# label used here (the previous label named a Turkish BERT checkpoint that
# this notebook never loads).
melted_df['Model'] = melted_df['Model'].replace('colbert', 'bert-base-uncased')
melted_df

In [None]:
# Grouped bar chart: top-1 and top-5 accuracy for each individual model.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=melted_df, x="Model", y="Accuracy", hue="Metric", ax=ax)
plt.xticks(rotation=25)
ax.set_title("Bireysel Modellerin Performans Karşılaştırması")
ax.set_xlabel("Model")
ax.set_ylabel("Accuracy")
plt.show()

In [None]:
# Ensemble: combine the per-model similarity matrices into one matrix per method.
# The weights are positional and follow the order of representation_names.
# Earlier weight trials recorded by the author:
#   [1, 2, 1, 1, 2] -> Top1 Accuracy: 0.7865, Top5 Accuracy: 0.8920
#   [1, 2, 1, 2, 1] -> Top1 Accuracy: 0.6960, Top5 Accuracy: 0.8040
weights = [1, 10, 1, 1, 10]

# Weights are paired with matrices by position, so there must be exactly one per matrix.
assert len(weights) == len(embeding_similarity_matrices), "one weight per similarity matrix is required"

ensemble_similarity_matrices = {
    "average": ensemble_average(embeding_similarity_matrices),
    "weighted": ensemble_weighted_average(embeding_similarity_matrices, weights),
    "max_voting": ensemble_max_voting(embeding_similarity_matrices),
}
# Top-1 / top-5 retrieval and accuracy for these matrices are computed in the
# next cell, which builds ensemble_results from scratch.

In [None]:
# Accuracy of each ensemble method
ensemble_results = {}
for method_name, similarity_matrix in ensemble_similarity_matrices.items():
    # Retrieve from the combined matrix, then score against the expected answers
    top1_results, top5_results = compute_top_k_from_matrix(similarity_matrix, answers)
    top1_accuracy, top5_accuracy = calculate_accuracy(top1_results, top5_results, answers)

    ensemble_results[method_name] = {"top1_accuracy": top1_accuracy, "top5_accuracy": top5_accuracy}

    # Report the scores for this method
    print(f"\nEnsemble Yöntemi: {method_name}")
    print(f"Top1 Accuracy: {top1_accuracy:.4f}")
    print(f"Top5 Accuracy: {top5_accuracy:.4f}")

In [None]:
# Ensemble accuracies as a table (metric names land in the "index" column),
# reshaped to long form with one row per metric/method pair for plotting.
data_ensemble_results = pd.DataFrame(ensemble_results).reset_index()
ensemble_melted = pd.melt(
    data_ensemble_results,
    id_vars="index",
    var_name="Method",
    value_name="Accuracy",
)
ensemble_melted

In [None]:
# Grouped bar chart: top-1 and top-5 accuracy for each ensemble method.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=ensemble_melted, x="Method", y="Accuracy", hue="index", ax=ax)
ax.set_title("Ensemble Performans Karşılaştırması")
ax.set_xlabel("Method")
ax.set_ylabel("Accuracy")
plt.show()