In [1]:
import gensim
from gensim.models import Word2Vec
import pandas as pd
import re

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [2]:
parameters = [
    {'model_type': 'cbow', 'window': 2, 'vector_size': 100},
    {'model_type': 'skipgram', 'window': 2, 'vector_size': 100},
    {'model_type': 'cbow', 'window': 4, 'vector_size': 100},
    {'model_type': 'skipgram', 'window': 4, 'vector_size': 100},
    {'model_type': 'cbow', 'window': 2, 'vector_size': 300},
    {'model_type': 'skipgram', 'window': 2, 'vector_size': 300},
    {'model_type': 'cbow', 'window': 4, 'vector_size': 300},
    {'model_type': 'skipgram', 'window': 4, 'vector_size': 300}
]

In [3]:
df1 = pd.read_csv("../data/lemmatized_sentences.csv")
df2 = pd.read_csv("../data/stemmed_sentences.csv")



In [4]:
df1.columns = ["0"]

# NaN değerleri ve boş stringleri temizle
df1 = df1.dropna()
df1 = df1[df1["0"].str.strip() != ""]

df2.columns = ["0"]

# NaN değerleri ve boş stringleri temizle
df2 = df2.dropna()
df2 = df2[df2["0"].str.strip() != ""]

In [5]:
# Doğru tokenizasyon fonksiyonu
def proper_tokenize(text):
    # Özel karakterleri kaldır ve küçük harfe çevir
    text = re.sub(r'[^a-zA-ZğüşıöçĞÜŞİÖÇ\s]', '', text.lower())
    # NLTK ile tokenize et
    tokens = word_tokenize(text)
    # Stopwords'leri ve tek karakterli kelimeleri kaldır
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words and len(word) > 1]

In [6]:
# Doğru tokenizasyon uygula
df1['tokens'] = df1['0'].apply(proper_tokenize)
df2['tokens'] = df2['0'].apply(proper_tokenize)



In [7]:
# Token listelerini oluştur
tokenized_corpus_lemmatized = df1['tokens'].tolist()
tokenized_corpus_stemmed = df2['tokens'].tolist()

In [8]:

def train_and_save_model(corpus, param, model_prefix):
    model_type = param['model_type']
    vector_size = param['vector_size']
    window = param['window']
    
    # CBOW (sg=0) veya Skip-gram (sg=1)
    sg = 0 if model_type == 'cbow' else 1

    model = Word2Vec(
        sentences=corpus,
        vector_size=vector_size,
        window=window,
        min_count=1,
        workers=4,
        sg=sg
    )

    model_filename = f"{model_prefix}_{model_type}_vs{vector_size}_w{window}.model"
    model.save(model_filename)
    print(f"Model saved as {model_filename}")

In [10]:
# Lemmatize edilmiş corpus ile modelleri eğitme ve kaydetme
for param in parameters:
    train_and_save_model(tokenized_corpus_lemmatized, param, "../models/lemmatized_model")

# Stemlenmiş corpus ile modelleri eğitme ve kaydetme
for param in parameters:
    train_and_save_model(tokenized_corpus_stemmed, param, "../models/stemmed_model")


Model saved as ../models/lemmatized_model_cbow_vs100_w2.model
Model saved as ../models/lemmatized_model_skipgram_vs100_w2.model
Model saved as ../models/lemmatized_model_cbow_vs100_w4.model
Model saved as ../models/lemmatized_model_skipgram_vs100_w4.model
Model saved as ../models/lemmatized_model_cbow_vs300_w2.model
Model saved as ../models/lemmatized_model_skipgram_vs300_w2.model
Model saved as ../models/lemmatized_model_cbow_vs300_w4.model
Model saved as ../models/lemmatized_model_skipgram_vs300_w4.model
Model saved as ../models/stemmed_model_cbow_vs100_w2.model
Model saved as ../models/stemmed_model_skipgram_vs100_w2.model
Model saved as ../models/stemmed_model_cbow_vs100_w4.model
Model saved as ../models/stemmed_model_skipgram_vs100_w4.model
Model saved as ../models/stemmed_model_cbow_vs300_w2.model
Model saved as ../models/stemmed_model_skipgram_vs300_w2.model
Model saved as ../models/stemmed_model_cbow_vs300_w4.model
Model saved as ../models/stemmed_model_skipgram_vs300_w4.model


In [11]:
# Model dosyalarını yüklemek
model_1 = Word2Vec.load("../models/lemmatized_model_cbow_vs100_w2.model")
model_2 = Word2Vec.load("../models/lemmatized_model_cbow_vs100_w4.model")
model_3 = Word2Vec.load("../models/lemmatized_model_cbow_vs300_w2.model")
model_4 = Word2Vec.load("../models/lemmatized_model_cbow_vs300_w4.model")
model_5 = Word2Vec.load("../models/lemmatized_model_skipgram_vs100_w2.model")
model_6 = Word2Vec.load("../models/lemmatized_model_skipgram_vs100_w4.model")
model_7 = Word2Vec.load("../models/lemmatized_model_skipgram_vs300_w2.model")
model_8 = Word2Vec.load("../models/lemmatized_model_skipgram_vs300_w4.model")
model_9  = Word2Vec.load("../models/stemmed_model_cbow_vs100_w2.model")
model_10 = Word2Vec.load("../models/stemmed_model_cbow_vs100_w4.model")
model_11 = Word2Vec.load("../models/stemmed_model_cbow_vs300_w2.model")
model_12 = Word2Vec.load("../models/stemmed_model_cbow_vs300_w4.model")
model_13 = Word2Vec.load("../models/stemmed_model_skipgram_vs100_w2.model")
model_14 = Word2Vec.load("../models/stemmed_model_skipgram_vs100_w4.model")
model_15 = Word2Vec.load("../models/stemmed_model_skipgram_vs300_w2.model")
model_16 = Word2Vec.load("../models/stemmed_model_skipgram_vs300_w4.model")

In [12]:
# 'angara' kelimesi ile en benzer 3 kelimeyi ve skorlarını yazdırmak
def print_similar_words(model, model_name):
    similarity = model.wv.most_similar("angara", topn=3)
    print(f"\n{model_name} Modeli - 'angara' ile En Benzer 3 Kelime:")
    for word, score in similarity:
        print(f"Kelime: {word}, Benzerlik Skoru: {score}")

In [13]:
# 16 model için benzer kelimeleri yazdır
print_similar_words(model_1, "Lemmatized CBOW Window 2 Dim 100")
print_similar_words(model_2, "Stemmed Skipgram Window 4 Dim 100")
print_similar_words(model_3, "Lemmatized Skipgram Window 2 Dim 300")
print_similar_words(model_4, "lemmatized skipgram window 4 dim 100")
print_similar_words(model_5, "lemmatized cbow window 2 dim 300")
print_similar_words(model_6, "lemmatizedskipgramwindow 2 dim300")
print_similar_words(model_7, "lemmatized_cbow_window 4_dim300")
print_similar_words(model_8, "lemmatized_skipgram_window4_dim300.model")
print_similar_words(model_9, "stemmed_cbow_window2_dim100")
print_similar_words(model_10, "stemmed_skipgram_window2_dim100")
print_similar_words(model_11, "stemmed_cbow_window4_dim100")
print_similar_words(model_12, "stemmed_skipgram_window4_dim100")
print_similar_words(model_13, "stemmed_cbow_window2_dim300")
print_similar_words(model_14, "stemmed_skipgram_window2_dim300")
print_similar_words(model_15, "stemmed_cbow_window4_dim300")
print_similar_words(model_16, "stemmed_skipgram_window4_dim300")


Lemmatized CBOW Window 2 Dim 100 Modeli - 'angara' ile En Benzer 3 Kelime:
Kelime: jamaican, Benzerlik Skoru: 0.9783456921577454
Kelime: pc, Benzerlik Skoru: 0.9757558107376099
Kelime: grilled, Benzerlik Skoru: 0.9722403883934021

Stemmed Skipgram Window 4 Dim 100 Modeli - 'angara' ile En Benzer 3 Kelime:
Kelime: jamaican, Benzerlik Skoru: 0.9821490049362183
Kelime: fry, Benzerlik Skoru: 0.9809262752532959
Kelime: grilled, Benzerlik Skoru: 0.9781044125556946

Lemmatized Skipgram Window 2 Dim 300 Modeli - 'angara' ile En Benzer 3 Kelime:
Kelime: jamaican, Benzerlik Skoru: 0.9948694705963135
Kelime: grilled, Benzerlik Skoru: 0.9799842834472656
Kelime: chicken, Benzerlik Skoru: 0.9784950017929077

lemmatized skipgram window 4 dim 100 Modeli - 'angara' ile En Benzer 3 Kelime:
Kelime: jamaican, Benzerlik Skoru: 0.9957451224327087
Kelime: fried, Benzerlik Skoru: 0.9880256056785583
Kelime: tangdi, Benzerlik Skoru: 0.9879208207130432

lemmatized cbow window 2 dim 300 Modeli - 'angara' ile En 

In [14]:
# Veri setinizde en sık geçen 20 kelime
from collections import Counter
all_words = [word for sentence in tokenized_corpus_lemmatized for word in sentence]
print("En sık kullanılan 20 kelime:", Counter(all_words).most_common(20))

En sık kullanılan 20 kelime: [('chicken', 9241), ('pizza', 8531), ('peri', 6018), ('grilled', 5230), ('garlic', 2711), ('bone', 2707), ('bread', 2525), ('tender', 2513), ('jamaican', 2505), ('paneer', 2491), ('melt', 2373), ('fry', 2083), ('bageecha', 2051), ('cheese', 1934), ('pide', 1872), ('murgh', 1731), ('amritsari', 1731), ('fried', 1728), ('seekh', 1714), ('angara', 1473)]
