In [2]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Gerekli bileşenleri indir (ilk kez çalıştıracaksan)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Dosyayı oku
with open("karar_metni_2025-05-03_16-17-56.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Cümlelere ayır
sentences = sent_tokenize(text)

# Lemmatizer ve stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# İngilizce stopwords
stop_words = set(stopwords.words('english'))

# Ön işleme fonksiyonu
def preprocess_sentence(sentence):
    tokens = word_tokenize(sentence)
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return lemmatized_tokens, stemmed_tokens

# Lemmatize ve stem yapılmış corpus
tokenized_corpus_lemmatized = []
tokenized_corpus_stemmed = []

for sentence in sentences:
    lem, stem = preprocess_sentence(sentence)
    tokenized_corpus_lemmatized.append(lem)
    tokenized_corpus_stemmed.append(stem)

# TF-IDF için metinleştirme
lemmatized_texts = [' '.join(tokens) for tokens in tokenized_corpus_lemmatized]

# TF-IDF vektörleştirme
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(lemmatized_texts)

# Kelime isimleri (sütunlar)
feature_names = vectorizer.get_feature_names_out()

# DataFrame'e çevir
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# İlk 5 satırı göster
print(tfidf_df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   aaron  absconding  absent  abuse  act  acting  administrative  \
0    0.0         0.0     0.0    0.0  0.0     0.0             0.0   
1    0.0         0.0     0.0    0.0  0.0     0.0             0.0   
2    0.0         0.0     0.0    0.0  0.0     0.0             0.0   
3    0.0         0.0     0.0    0.0  0.0     0.0             0.0   
4    0.0         0.0     0.0    0.0  0.0     0.0             0.0   

   administratively  admitted  advance  ...  violation  violence  violent  \
0               0.0       0.0   0.4349  ...        0.0       0.0      0.0   
1               0.0       0.0   0.0000  ...        0.0       0.0      0.0   
2               0.0       0.0   0.0000  ...        0.0       0.0      0.0   
3               0.0       0.0   0.0000  ...        0.0       0.0      0.0   
4               0.0       0.0   0.0000  ...        0.0       0.0      0.0   

   walk   washoe  williams  within  without        yl   ôl  
0   0.0  0.00000       0.0     0.0      0.0  0.000000  0.0  
1   0.

In [4]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Gerekli NLTK verilerini indir (ilk çalıştırmada gerekebilir)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# TXT dosyasından metni oku
with open("karar_metni_2025-05-03_16-17-56.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Cümlelere ayır
sentences = sent_tokenize(text)

# Temizleme ve önişleme fonksiyonu
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_sentence(sentence):
    tokens = word_tokenize(sentence)
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return lemmatized_tokens

# Cümleleri temizle
tokenized_corpus = [' '.join(preprocess_sentence(sentence)) for sentence in sentences]

# TF-IDF vektörizasyonu
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tokenized_corpus)
feature_names = vectorizer.get_feature_names_out()

# DataFrame formatında görelim (istersen)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# === Cosine Similarity Hesaplama === #
target_word = "justice"  # İstediğin kelimeyi burada değiştir

if target_word in feature_names:
    target_index = list(feature_names).index(target_word)
    target_vector = tfidf_matrix[:, target_index].toarray()
    all_vectors = tfidf_matrix.toarray()
    similarities = cosine_similarity(target_vector.T, all_vectors.T).flatten()

    # En yakın 5 kelimeyi bulalım (kendisi dahil, bu yüzden 6 alıp kendini dışlayacağız)
    top_indices = similarities.argsort()[-6:][::-1]
    print(f"\nKelime: '{target_word}' ile en çok ilişkili 5 kelime:")
    for idx in top_indices:
        if feature_names[idx] != target_word:
            print(f"{feature_names[idx]}: {similarities[idx]:.4f}")
else:
    print(f"'{target_word}' kelimesi TF-IDF vektörleri arasında bulunamadı.")


'justice' kelimesi TF-IDF vektörleri arasında bulunamadı.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
import nltk
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Gerekli NLTK verileri
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Metni oku
with open("karar_metni_2025-05-03_16-17-56.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Cümlelere ayır
sentences = sent_tokenize(text)

# Ön işleme fonksiyonu
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(sentence):
    tokens = word_tokenize(sentence)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

# Tüm cümleleri temizle
processed_sentences = [preprocess(sentence) for sentence in sentences]

# TF-IDF hesapla
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(processed_sentences)
feature_names = vectorizer.get_feature_names_out()

# === COSINE SIMILARITY === #
target_word = "justice"  # Burayı istediğin kelime ile değiştir: 'law', 'lawyer', 'court' vs.

if target_word in feature_names:
    word_index = feature_names.tolist().index(target_word)
    word_vector = tfidf_matrix[:, word_index].toarray()

    # Tüm kelime vektörlerini al
    all_vectors = tfidf_matrix.toarray()

    # Cosine similarity hesapla
    similarities = cosine_similarity(word_vector.T, all_vectors.T).flatten()

    # En çok benzeyen 5 kelimeyi seç (kendisi dahil olduğu için 6 alıp 1 dışlayacağız)
    top_indices = similarities.argsort()[-6:][::-1]

    print(f"\n'{target_word}' kelimesiyle en çok ilişkili 5 kelime:")
    for idx in top_indices:
        if feature_names[idx] != target_word:
            print(f"{feature_names[idx]}: {similarities[idx]:.4f}")
else:
    print(f"'{target_word}' kelimesi metinde yer almıyor.")


'justice' kelimesi metinde yer almıyor.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Örnek cümleler — test amaçlı, içinde justice geçiyor
sample_sentences = [
    "The justice system must be fair and impartial.",
    "Every citizen deserves equal justice under the law.",
    "A lawyer fights for justice in court.",
    "Law and order are essential to maintain justice.",
    "Many people trust the justice system."
]

# TF-IDF vektörleştirme
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(sample_sentences)
feature_names = vectorizer.get_feature_names_out()

# Anahtar kelime
target_word = "justice"

# Eğer kelime TF-IDF'de varsa
if target_word in feature_names:
    target_index = feature_names.tolist().index(target_word)
    word_vector = tfidf_matrix[:, target_index].toarray()
    all_vectors = tfidf_matrix.toarray()
    similarities = cosine_similarity(word_vector.T, all_vectors.T).flatten()

    top_indices = similarities.argsort()[-6:][::-1]

    print(f"\n'{target_word}' kelimesiyle en çok ilişkili 5 kelime:")
    for idx in top_indices:
        if feature_names[idx] != target_word:
            print(f"{feature_names[idx]}: {similarities[idx]:.4f}")
else:
    print(f"'{target_word}' kelimesi metinde yok.")



'justice' kelimesiyle en çok ilişkili 5 kelime:
the: 0.7859
system: 0.6650
and: 0.5964
law: 0.5877
people: 0.5065


In [7]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# Gerekli bileşenleri indir (ilk kez çalıştıracaksan)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Dosyayı oku
with open("karar_metni_2025-05-03_16-17-56.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Cümlelere ayır
sentences = sent_tokenize(text)

# Lemmatizer ve stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# İngilizce stopwords
stop_words = set(stopwords.words('english'))

# Ön işleme fonksiyonu
def preprocess_sentence(sentence):
    tokens = word_tokenize(sentence)
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return lemmatized_tokens, stemmed_tokens

# Lemmatize ve stem yapılmış corpus
tokenized_corpus_lemmatized = []
tokenized_corpus_stemmed = []

for sentence in sentences:
    lem, stem = preprocess_sentence(sentence)
    tokenized_corpus_lemmatized.append(lem)
    tokenized_corpus_stemmed.append(stem)

# TF-IDF için metinleştirme
lemmatized_texts = [' '.join(tokens) for tokens in tokenized_corpus_lemmatized]

# TF-IDF vektörleştirme
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(lemmatized_texts)

# Kelime isimleri (sütunlar)
feature_names = vectorizer.get_feature_names_out()

# DataFrame'e çevir
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Rasgele 5 cümle seç
random_sentences = random.sample(sentences, 5)

# Anahtar kelime
target_word = "justice"  # Burada istediğiniz kelimeyi yazabilirsiniz

# TF-IDF vektörleştirme
tfidf_matrix_random = vectorizer.fit_transform(random_sentences)
feature_names_random = vectorizer.get_feature_names_out()

# Eğer kelime TF-IDF'de varsa
if target_word in feature_names_random:
    target_index = feature_names_random.tolist().index(target_word)
    word_vector = tfidf_matrix_random[:, target_index].toarray()
    all_vectors = tfidf_matrix_random.toarray()
    similarities = cosine_similarity(word_vector.T, all_vectors.T).flatten()

    top_indices = similarities.argsort()[-6:][::-1]

    print(f"\n'{target_word}' kelimesiyle en çok ilişkili 5 kelime:")
    for idx in top_indices:
        if feature_names_random[idx] != target_word:
            print(f"{feature_names_random[idx]}: {similarities[idx]:.4f}")
else:
    print(f"'{target_word}' kelimesi metinde yok.")

# İlk 5 satırı göster
print("\nTF-IDF Matrisi (ilk 5 satır):")
print(tfidf_df.head())


'justice' kelimesi metinde yok.

TF-IDF Matrisi (ilk 5 satır):
   aaron  absconding  absent  abuse  act  acting  administrative  \
0    0.0         0.0     0.0    0.0  0.0     0.0             0.0   
1    0.0         0.0     0.0    0.0  0.0     0.0             0.0   
2    0.0         0.0     0.0    0.0  0.0     0.0             0.0   
3    0.0         0.0     0.0    0.0  0.0     0.0             0.0   
4    0.0         0.0     0.0    0.0  0.0     0.0             0.0   

   administratively  admitted  advance  ...  violation  violence  violent  \
0               0.0       0.0   0.4349  ...        0.0       0.0      0.0   
1               0.0       0.0   0.0000  ...        0.0       0.0      0.0   
2               0.0       0.0   0.0000  ...        0.0       0.0      0.0   
3               0.0       0.0   0.0000  ...        0.0       0.0      0.0   
4               0.0       0.0   0.0000  ...        0.0       0.0      0.0   

   walk   washoe  williams  within  without        yl   ôl  
0   

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [15]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Gerekli NLTK bileşenlerini indir
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Karar metnini oku
with open("karar_metni_2025-05-03_16-17-56.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Cümlelere ayır
sentences = sent_tokenize(text)

# Lemmatizer ve stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# İngilizce stopwords
stop_words = set(stopwords.words('english'))

# Ön işleme fonksiyonu
def preprocess_sentence(sentence):
    tokens = word_tokenize(sentence)
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return lemmatized_tokens, stemmed_tokens

# Metni işle
tokenized_corpus_lemmatized = []
tokenized_corpus_stemmed = []

for sentence in sentences:
    lem, stem = preprocess_sentence(sentence)
    tokenized_corpus_lemmatized.append(lem)
    tokenized_corpus_stemmed.append(stem)

# TF-IDF için metinleştirme
lemmatized_texts = [' '.join(tokens) for tokens in tokenized_corpus_lemmatized]

# Vektörleştirme
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(lemmatized_texts)

# Kelime isimleri (özellik adları)
feature_names = vectorizer.get_feature_names_out()

# DataFrame'e dönüştür
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Anlamlı 5 cümle (örnek)
sample_sentences = [
    "The court's decision reflects a deep understanding of the legal principles involved.",
    "Justice should be served in every case, ensuring fairness and equality.",
    "The defendant's actions were deemed illegal by the court based on the evidence presented.",
    "In legal matters, it is crucial to uphold the rule of law for a just society.",
    "The judge ruled that the evidence was sufficient to convict the defendant of the crime."
]

# Örnek cümleleri dönüştür (vectorizer.fit zaten yapıldı)
tfidf_matrix_sample = vectorizer.transform(sample_sentences)

# Cosine Similarity hesapla
similarities = cosine_similarity(tfidf_matrix_sample, tfidf_matrix)

# En çok benzeyen cümlelerin indeksleri
top_indices = similarities[0].argsort()[-5:][::-1]

# Benzerlik sonuçlarını yazdır
print(f"\n5 anlamlı cümle ile metindeki benzerlik analizi:")
for idx in top_indices:
    similarity_value = float(similarities[0, idx])  # HATA BURADA DÜZELTİLDİ
    print(f"\nBenzerlik {similarity_value:.4f}: {sentences[idx]}")

# İlk 5 satır TF-IDF matrisi
print("\nTF-IDF Matrisi (ilk 5 satır):")
print(tfidf_df.head())





5 anlamlı cümle ile metindeki benzerlik analizi:

Benzerlik 0.3989: But when, as
here,   the   district   court's   determination   was    based   on   statutory
interpretation, we review the district court's decision de novo.

Benzerlik 0.3140: DISCUSSION
A district court's decision to revoke probation is within its broad
discretion and will not be disturbed absent a clear showing of abuse.

Benzerlik 0.1630: BEFORE THE        SUPREME      COURT,    PARRAGUIRRE.

Benzerlik 0.1445: SUPREME COURT
OF
NEVADA
1 g 2-9
0) 1947A    e
OPINION
By the Court, PARRAGUIRRE, J.:
NRS 176A.510 requires the imposition of graduated sanctions
for technical probation violations.

Benzerlik 0.1420: Because the district court failed to support its findings with facts
showing that Sheridan's convictions constitute crimes of violence
amounting to nontechnical probation violations, we reverse the district
court's revocation of Sheridan's probation and remand this matter to the
district court for further proce

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Gerekli bileşenleri indir
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Dosyayı oku
with open("karar_metni_2025-05-03_16-17-56.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Cümlelere ayır
sentences = sent_tokenize(text)

# Lemmatizer ve stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# İngilizce stopwords
stop_words = set(stopwords.words('english'))

# Ön işleme fonksiyonu
def preprocess_sentence(sentence):
    tokens = word_tokenize(sentence)
    filtered_tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return lemmatized_tokens, stemmed_tokens

# Lemmatize ve stem yapılmış corpus
tokenized_corpus_lemmatized = []
tokenized_corpus_stemmed = []

for sentence in sentences:
    lem, stem = preprocess_sentence(sentence)
    tokenized_corpus_lemmatized.append(lem)
    tokenized_corpus_stemmed.append(stem)

# TF-IDF için metinleştirme
lemmatized_texts = [' '.join(tokens) for tokens in tokenized_corpus_lemmatized]
stemmed_texts = [' '.join(tokens) for tokens in tokenized_corpus_stemmed]

# Lemmatized TF-IDF
vectorizer_lem = TfidfVectorizer()
tfidf_matrix_lem = vectorizer_lem.fit_transform(lemmatized_texts)
feature_names_lem = vectorizer_lem.get_feature_names_out()
tfidf_df_lem = pd.DataFrame(tfidf_matrix_lem.toarray(), columns=feature_names_lem)

# Stemmed TF-IDF
vectorizer_stem = TfidfVectorizer()
tfidf_matrix_stem = vectorizer_stem.fit_transform(stemmed_texts)
feature_names_stem = vectorizer_stem.get_feature_names_out()
tfidf_df_stem = pd.DataFrame(tfidf_matrix_stem.toarray(), columns=feature_names_stem)

# CSV olarak kaydet
tfidf_df_lem.to_csv("TD-IDF.lemma.csv", index=False)
tfidf_df_stem.to_csv("TD-IDF.stem.csv", index=False)

print("CSV dosyaları başarıyla oluşturuldu: TD-IDF.lemma.csv ve TD-IDF.stem.csv")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


CSV dosyaları başarıyla oluşturuldu: TD-IDF.lemma.csv ve TD-IDF.stem.csv
