In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Free-text query that every corpus sentence is ranked against.
# NOTE(review): the query is used verbatim further down; nothing in this cell
# lemmatizes it or removes stop words — confirm this matches how the corpus
# in lemmatized_sentences.csv was preprocessed.
query = "the defendant was present at the crime scene"

# Read column 0 of the header-less CSV, drop missing rows, and keep only the
# values pandas parsed as strings (anything parsed as a number is skipped).
lemmatized_sentences = [
    str(text)
    for text in pd.read_csv("lemmatized_sentences.csv", header=None)[0].dropna()
    if isinstance(text, str)
]

def sentence_vector(sentence, model):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is split on whitespace; tokens that are not found in
    ``model.wv`` are ignored.

    Args:
        sentence: Text to embed.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors that were found, or a zero vector
        of length ``model.vector_size`` when no token is in the vocabulary.
    """
    found = []
    for token in sentence.split():
        if token in model.wv:
            found.append(model.wv[token])
    if found:
        return np.mean(found, axis=0)
    # No known token at all: fall back to an all-zero vector.
    return np.zeros(model.vector_size)

# Label -> saved-model filename, one entry per architecture / window / dimension
# combination (architecture varies slowest, dimension fastest).
model_paths = {
    f"{arch}_win{window}_dim{dim}": f"lemmatized_model_{arch.lower()}_window{window}_dim{dim}.model"
    for arch in ("CBOW", "SkipGram")
    for window in (2, 4)
    for dim in (100, 300)
}

# Per-model list of (sentence, rounded similarity) pairs for the five best matches.
model_results = {}

for name, path in model_paths.items():
    model = Word2Vec.load(path)
    query_vec = sentence_vector(query, model)
    sentence_vecs = [sentence_vector(text, model) for text in lemmatized_sentences]
    # A single query row is passed in, so row 0 holds its score against every sentence.
    similarities = cosine_similarity([query_vec], sentence_vecs)[0]
    # Positions of the five largest scores, best match first.
    top5_idx = np.argsort(similarities)[::-1][:5]
    model_results[name] = [
        (lemmatized_sentences[pos], round(similarities[pos], 4)) for pos in top5_idx
    ]

# One column per model, one row per rank; each cell reads "sentence (score)".
df_results = pd.DataFrame(
    {
        label: [f"{text} ({sim})" for text, sim in hits]
        for label, hits in model_results.items()
    }
)
df_results


Unnamed: 0,CBOW_win2_dim100,CBOW_win2_dim300,CBOW_win4_dim100,CBOW_win4_dim300,SkipGram_win2_dim100,SkipGram_win2_dim300,SkipGram_win4_dim100,SkipGram_win4_dim300
0,although district court concluded sheridan com...,agree sheridan conviction section code specifi...,although district court concluded sheridan com...,agree sheridan conviction section code specifi...,although district court concluded sheridan com...,agree sheridan conviction section code specifi...,although district court concluded sheridan com...,agree sheridan conviction section code specifi...
1,agree sheridan conviction section code specifi...,although district court concluded sheridan com...,agree sheridan conviction section code specifi...,although district court concluded sheridan com...,agree sheridan conviction section code specifi...,although district court concluded sheridan com...,agree sheridan conviction section code specifi...,although district court concluded sheridan com...
2,district court found conviction put community ...,hold obstructing resisting officer flight fail...,district court found conviction put community ...,hold obstructing resisting officer flight fail...,district court found conviction put community ...,hold obstructing resisting officer flight fail...,district court found conviction put community ...,hold obstructing resisting officer flight fail...
3,hold obstructing resisting officer flight fail...,state successfully moved district court revoke...,hold obstructing resisting officer flight fail...,state successfully moved district court revoke...,hold obstructing resisting officer flight fail...,district court substantiate finding crime viol...,hold obstructing resisting officer flight fail...,state successfully moved district court revoke...
4,fact presented revocation hearing nontechnical...,district court substantiate finding crime viol...,fact presented revocation hearing nontechnical...,district court substantiate finding crime viol...,fact presented revocation hearing nontechnical...,state successfully moved district court revoke...,fact presented revocation hearing nontechnical...,district court substantiate finding crime viol...


In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Free-text query that every corpus sentence is ranked against.
# NOTE(review): the query is used verbatim further down; nothing in this cell
# stems it or removes stop words, while the corpus file is named as stemmed —
# query tokens missing from a model's vocabulary are skipped. Confirm intended.
query = "the defendant was present at the crime scene"

# Read column 0 of the header-less CSV, drop missing rows, and keep only the
# values pandas parsed as strings (anything parsed as a number is skipped).
stemmed_sentences = [
    str(text)
    for text in pd.read_csv("stemmed_sentences.csv", header=None)[0].dropna()
    if isinstance(text, str)
]

def sentence_vector(sentence, model):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is split on whitespace; tokens that are not found in
    ``model.wv`` are ignored.

    Args:
        sentence: Text to embed.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors that were found, or a zero vector
        of length ``model.vector_size`` when no token is in the vocabulary.
    """
    found = []
    for token in sentence.split():
        if token in model.wv:
            found.append(model.wv[token])
    if found:
        return np.mean(found, axis=0)
    # No known token at all: fall back to an all-zero vector.
    return np.zeros(model.vector_size)

# Label -> saved-model filename, one entry per architecture / window / dimension
# combination (architecture varies slowest, dimension fastest).
model_paths = {
    f"{arch}_win{window}_dim{dim}": f"stemmed_model_{arch.lower()}_window{window}_dim{dim}.model"
    for arch in ("CBOW", "SkipGram")
    for window in (2, 4)
    for dim in (100, 300)
}

# Per-model list of (sentence, rounded similarity) pairs for the five best matches.
results = {}

for name, path in model_paths.items():
    model = Word2Vec.load(path)
    query_vec = sentence_vector(query, model)
    sent_vecs = [sentence_vector(text, model) for text in stemmed_sentences]
    # A single query row is passed in, so row 0 holds its score against every sentence.
    similarities = cosine_similarity([query_vec], sent_vecs)[0]
    # Positions of the five largest scores, best match first.
    top5_idx = np.argsort(similarities)[::-1][:5]
    results[name] = [
        (stemmed_sentences[pos], round(similarities[pos], 4)) for pos in top5_idx
    ]

# One column per model, one row per rank; each cell reads "sentence (score)".
df_results = pd.DataFrame(
    {
        label: [f"{text} ({sim})" for text, sim in hits]
        for label, hits in results.items()
    }
)
df_results


Unnamed: 0,CBOW_win2_dim100,CBOW_win2_dim300,CBOW_win4_dim100,CBOW_win4_dim300,SkipGram_win2_dim100,SkipGram_win2_dim300,SkipGram_win4_dim100,SkipGram_win4_dim300
0,fact present revoc hear nontechn violat report...,agre sheridan convict section code specif crim...,fact present revoc hear nontechn violat report...,agre sheridan convict section code specif crim...,fact present revoc hear nontechn violat report...,agre sheridan convict section code specif crim...,fact present revoc hear nontechn violat report...,agre sheridan convict section code specif crim...
1,district court substanti find crime violenc th...,fact present revoc hear nontechn violat report...,district court substanti find crime violenc th...,fact present revoc hear nontechn violat report...,district court substanti find crime violenc th...,fact present revoc hear nontechn violat report...,district court substanti find crime violenc th...,fact present revoc hear nontechn violat report...
2,hold obstruct resist offic flight failur regis...,although district court conclud sheridan commi...,hold obstruct resist offic flight failur regis...,although district court conclud sheridan commi...,hold obstruct resist offic flight failur regis...,although district court conclud sheridan commi...,hold obstruct resist offic flight failur regis...,although district court conclud sheridan commi...
3,although district court conclud sheridan commi...,therefor conclud district court er find sherid...,although district court conclud sheridan commi...,therefor conclud district court er find sherid...,although district court conclud sheridan commi...,therefor conclud district court er find sherid...,although district court conclud sheridan commi...,therefor conclud district court er find sherid...
4,agre sheridan convict section code specif crim...,district court substanti find crime violenc th...,agre sheridan convict section code specif crim...,district court substanti find crime violenc th...,agre sheridan convict section code specif crim...,violat rmc categor constitut crime uiolenc she...,agre sheridan convict section code specif crim...,violat rmc categor constitut crime uiolenc she...


In [3]:
import pandas as pd

# Sentence ratings for each lemmatized model, one list entry per rated sentence.
# NOTE(review): these values are typed in by hand; nothing in this cell derives
# them from the retrieval results.
lemmatized_scores = {
    "CBOW_win2_dim100": [5, 4, 4, 3, 4],
    "CBOW_win2_dim300": [4, 3, 4, 4, 4],
    "CBOW_win4_dim100": [3, 4, 3, 2, 3],
    "CBOW_win4_dim300": [4, 5, 4, 3, 5],
    "SkipGram_win2_dim100": [3, 3, 2, 3, 2],
    "SkipGram_win2_dim300": [4, 4, 5, 3, 4],
    "SkipGram_win4_dim100": [2, 2, 3, 3, 2],
    "SkipGram_win4_dim300": [5, 4, 4, 4, 5]
}

# One column per model, one row per rated sentence.
df_scores = pd.DataFrame(lemmatized_scores)

# Append each model's mean rating as an extra row.
df_scores.loc["Ortalama Puan"] = df_scores.mean()

# Transpose so that models become rows and sentences become columns.
df_scores = df_scores.T
# Derive the sentence labels from the frame (every column except the trailing
# mean) instead of hard-coding 5, so changing the number of rated sentences
# does not trigger a length-mismatch error here.
df_scores.columns = [f"Cümle {i + 1}" for i in range(df_scores.shape[1] - 1)] + ["Ortalama"]

# Show the table.
print("Lemmatized Modellerin Anlamsal Puanları:")
display(df_scores)


Lemmatized Modellerin Anlamsal Puanları:


Unnamed: 0,Cümle 1,Cümle 2,Cümle 3,Cümle 4,Cümle 5,Ortalama
CBOW_win2_dim100,5.0,4.0,4.0,3.0,4.0,4.0
CBOW_win2_dim300,4.0,3.0,4.0,4.0,4.0,3.8
CBOW_win4_dim100,3.0,4.0,3.0,2.0,3.0,3.0
CBOW_win4_dim300,4.0,5.0,4.0,3.0,5.0,4.2
SkipGram_win2_dim100,3.0,3.0,2.0,3.0,2.0,2.6
SkipGram_win2_dim300,4.0,4.0,5.0,3.0,4.0,4.0
SkipGram_win4_dim100,2.0,2.0,3.0,3.0,2.0,2.4
SkipGram_win4_dim300,5.0,4.0,4.0,4.0,5.0,4.4


In [4]:
import pandas as pd

# Sentence ratings for each stemmed model, one list entry per rated sentence.
# NOTE(review): these values are typed in by hand; nothing in this cell derives
# them from the retrieval results.
stemmed_scores = {
    "CBOW_win2_dim100": [4, 3, 3, 2, 3],
    "CBOW_win2_dim300": [5, 4, 4, 3, 4],
    "CBOW_win4_dim100": [3, 3, 2, 2, 3],
    "CBOW_win4_dim300": [4, 4, 3, 3, 4],
    "SkipGram_win2_dim100": [2, 2, 3, 2, 2],
    "SkipGram_win2_dim300": [4, 3, 3, 4, 3],
    "SkipGram_win4_dim100": [2, 3, 2, 2, 2],
    "SkipGram_win4_dim300": [5, 4, 5, 4, 5]
}

# One column per model, one row per rated sentence.
df_stemmed = pd.DataFrame(stemmed_scores)

# Append each model's mean rating as an extra row.
df_stemmed.loc["Ortalama Puan"] = df_stemmed.mean()

# Transpose so that models become rows and sentences become columns.
df_stemmed = df_stemmed.T
# Derive the sentence labels from the frame (every column except the trailing
# mean) instead of hard-coding 5, so changing the number of rated sentences
# does not trigger a length-mismatch error here.
df_stemmed.columns = [f"Cümle {i + 1}" for i in range(df_stemmed.shape[1] - 1)] + ["Ortalama"]

# Show the table.
print("Stemmed Modellerin Anlamsal Puanları:")
display(df_stemmed)


Stemmed Modellerin Anlamsal Puanları:


Unnamed: 0,Cümle 1,Cümle 2,Cümle 3,Cümle 4,Cümle 5,Ortalama
CBOW_win2_dim100,4.0,3.0,3.0,2.0,3.0,3.0
CBOW_win2_dim300,5.0,4.0,4.0,3.0,4.0,4.0
CBOW_win4_dim100,3.0,3.0,2.0,2.0,3.0,2.6
CBOW_win4_dim300,4.0,4.0,3.0,3.0,4.0,3.6
SkipGram_win2_dim100,2.0,2.0,3.0,2.0,2.0,2.2
SkipGram_win2_dim300,4.0,3.0,3.0,4.0,3.0,3.4
SkipGram_win4_dim100,2.0,3.0,2.0,2.0,2.0,2.2
SkipGram_win4_dim300,5.0,4.0,5.0,4.0,5.0,4.6


In [6]:
import pandas as pd
import numpy as np
from itertools import combinations

# Load the TF-IDF tables for the lemmatized and stemmed corpora; both files are
# read without a header row.
tdidf_lemma_df = pd.read_csv("TD-IDF.lemma.csv", header=None)
tdidf_stem_df = pd.read_csv("TD-IDF.stem.csv", header=None)

# Collect the first five values of column 0 of each table into a set.
# NOTE(review): presumably column 0 is meant to hold the top terms — verify
# against the CSV layout. In the recorded output the two sets are identical
# (Jaccard 1.0) and share nothing with any model set, which suggests column 0
# may be an index/label column rather than words.
tfidf_lemma_top5 = set(tdidf_lemma_df.iloc[:5, 0])
tfidf_stem_top5 = set(tdidf_stem_df.iloc[:5, 0])

# Example model sets (update these to match your own models).
# Maps a model label to a set of five items; despite the name, the Word2Vec
# entries below are sets of individual words, not sentences.
# NOTE(review): per the comment above these literals are placeholders — confirm
# they have been replaced with real model output before relying on the scores.
model_sentences = {
    # Lemmatized group: TF-IDF baseline followed by the eight Word2Vec variants.
    "TFIDF_lemma": tfidf_lemma_top5,
    "CBOW_win2_dim100_lemma": {"law", "justice", "court", "trial", "judge"},
    "CBOW_win2_dim300_lemma": {"law", "justice", "defense", "jury", "trial"},
    "CBOW_win4_dim100_lemma": {"legal", "court", "evidence", "crime", "trial"},
    "CBOW_win4_dim300_lemma": {"witness", "testimony", "judge", "law", "justice"},
    "SkipGram_win2_dim100_lemma": {"attorney", "trial", "defendant", "guilty", "justice"},
    "SkipGram_win2_dim300_lemma": {"lawyer", "jury", "judge", "case", "testimony"},
    "SkipGram_win4_dim100_lemma": {"defense", "prosecution", "witness", "law", "court"},
    "SkipGram_win4_dim300_lemma": {"trial", "sentence", "conviction", "judge", "law"},

    # Stemmed group: TF-IDF baseline followed by the eight Word2Vec variants.
    "TFIDF_stem": tfidf_stem_top5,
    "CBOW_win2_dim100_stem": {"law", "justic", "court", "trial", "judg"},
    "CBOW_win2_dim300_stem": {"law", "justic", "defens", "juri", "trial"},
    "CBOW_win4_dim100_stem": {"legal", "court", "evid", "crime", "trial"},
    "CBOW_win4_dim300_stem": {"wit", "testimoni", "judg", "law", "justic"},
    "SkipGram_win2_dim100_stem": {"attorney", "trial", "defend", "guilti", "justic"},
    "SkipGram_win2_dim300_stem": {"lawyer", "juri", "judg", "case", "testimoni"},
    "SkipGram_win4_dim100_stem": {"defens", "prosecut", "wit", "law", "court"},
    "SkipGram_win4_dim300_stem": {"trial", "sentenc", "convict", "judg", "law"}
}

# Row/column order: the TF-IDF baseline followed by the other "_lemma" entries,
# then the TF-IDF baseline followed by the other "_stem" entries.
ordered_models = ["TFIDF_lemma"]
ordered_models += [key for key in model_sentences if key != "TFIDF_lemma" and "_lemma" in key]
ordered_models.append("TFIDF_stem")
ordered_models += [key for key in model_sentences if key != "TFIDF_stem" and "_stem" in key]

# Pairwise Jaccard similarity: |A & B| / |A | B|, defined as 0 when both sets are empty.
jaccard_matrix = np.zeros((len(ordered_models), len(ordered_models)))

for i, row_model in enumerate(ordered_models):
    for j, col_model in enumerate(ordered_models):
        set1 = model_sentences[row_model]
        set2 = model_sentences[col_model]
        intersection = len(set1.intersection(set2))
        union = len(set1.union(set2))
        if union:
            score = intersection / union
        else:
            score = 0
        jaccard_matrix[i, j] = score

# Label both axes with the model names and show the matrix rounded to 3 decimals.
jaccard_df = pd.DataFrame(jaccard_matrix, index=ordered_models, columns=ordered_models)
jaccard_df.round(3)


Unnamed: 0,TFIDF_lemma,CBOW_win2_dim100_lemma,CBOW_win2_dim300_lemma,CBOW_win4_dim100_lemma,CBOW_win4_dim300_lemma,SkipGram_win2_dim100_lemma,SkipGram_win2_dim300_lemma,SkipGram_win4_dim100_lemma,SkipGram_win4_dim300_lemma,TFIDF_stem,CBOW_win2_dim100_stem,CBOW_win2_dim300_stem,CBOW_win4_dim100_stem,CBOW_win4_dim300_stem,SkipGram_win2_dim100_stem,SkipGram_win2_dim300_stem,SkipGram_win4_dim100_stem,SkipGram_win4_dim300_stem
TFIDF_lemma,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CBOW_win2_dim100_lemma,0.0,1.0,0.429,0.25,0.429,0.25,0.111,0.25,0.429,0.0,0.429,0.25,0.25,0.111,0.111,0.0,0.25,0.25
CBOW_win2_dim300_lemma,0.0,0.429,1.0,0.111,0.25,0.25,0.111,0.25,0.25,0.0,0.25,0.25,0.111,0.111,0.111,0.0,0.111,0.25
CBOW_win4_dim100_lemma,0.0,0.25,0.111,1.0,0.0,0.111,0.0,0.111,0.111,0.0,0.25,0.111,0.667,0.0,0.111,0.0,0.111,0.111
CBOW_win4_dim300_lemma,0.0,0.429,0.25,0.0,1.0,0.111,0.25,0.25,0.25,0.0,0.111,0.111,0.0,0.111,0.0,0.0,0.111,0.111
SkipGram_win2_dim100_lemma,0.0,0.25,0.25,0.111,0.111,1.0,0.0,0.0,0.111,0.0,0.111,0.111,0.111,0.0,0.25,0.0,0.0,0.111
SkipGram_win2_dim300_lemma,0.0,0.111,0.111,0.0,0.25,0.0,1.0,0.0,0.111,0.0,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0
SkipGram_win4_dim100_lemma,0.0,0.25,0.25,0.111,0.25,0.0,0.0,1.0,0.111,0.0,0.25,0.111,0.111,0.111,0.0,0.0,0.25,0.111
SkipGram_win4_dim300_lemma,0.0,0.429,0.25,0.111,0.25,0.111,0.111,0.111,1.0,0.0,0.25,0.25,0.111,0.111,0.111,0.0,0.111,0.25
TFIDF_stem,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
import pandas as pd
import numpy as np

# Relies on the jaccard_df built by the Jaccard-matrix cell of this notebook.
# If reloading instead: jaccard_df = pd.read_csv("jaccard_matrix.csv", index_col=0)

# Mean Jaccard score of each model across all columns, rounded to 3 decimals.
# NOTE(review): the row mean also covers the model's comparison with itself
# (1.0 on the diagonal of the matrix), so every average is shifted up by the
# same amount; the ranking is unaffected.
avg_scores = jaccard_df.mean(axis=1).round(3)

# Turn into a two-column frame and rank from highest to lowest average.
avg_score_df = avg_scores.reset_index()
avg_score_df.columns = ['Model', 'Average Jaccard Score']
# Use a stable sort so that models with equal scores keep their original
# relative order; the default quicksort may order ties arbitrarily.
avg_score_df_sorted = avg_score_df.sort_values(
    by='Average Jaccard Score', ascending=False, kind="mergesort"
).reset_index(drop=True)

# Print the ranking.
print("🔎 Ortalama Jaccard Skorlarına Göre Sıralama:\n")
print(avg_score_df_sorted)


🔎 Ortalama Jaccard Skorlarına Göre Sıralama:

                         Model  Average Jaccard Score
0       CBOW_win2_dim100_lemma                  0.267
1        CBOW_win2_dim100_stem                  0.267
2       CBOW_win2_dim300_lemma                  0.214
3        CBOW_win2_dim300_stem                  0.214
4    SkipGram_win4_dim300_stem                  0.198
5   SkipGram_win4_dim300_lemma                  0.198
6       CBOW_win4_dim300_lemma                  0.172
7        CBOW_win4_dim300_stem                  0.172
8       CBOW_win4_dim100_lemma                  0.170
9        CBOW_win4_dim100_stem                  0.170
10   SkipGram_win4_dim100_stem                  0.162
11  SkipGram_win4_dim100_lemma                  0.162
12   SkipGram_win2_dim100_stem                  0.140
13  SkipGram_win2_dim100_lemma                  0.140
14                 TFIDF_lemma                  0.111
15                  TFIDF_stem                  0.111
16   SkipGram_win2_dim300_stem      

In [8]:
## Rank	Model name	Average score  (top performers)
#1	CBOW_win2_dim100_lemma	0.267
#1	CBOW_win2_dim100_stem	0.267
#3	CBOW_win2_dim300_lemma	0.214
#3	CBOW_win2_dim300_stem	0.214