In [None]:
import pandas as pd
import nltk

In [None]:
# Pandas display settings for the whole notebook: never truncate columns or
# rows, render floats with three decimals, and allow 500-character wide output.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.float_format", "{:.3f}".format)
pd.set_option("display.width", 500)

In [None]:
def check_df(dataframe):
    """
    Print a quick structural overview of a DataFrame.

    Args:
        dataframe (pd.DataFrame): Frame to inspect.

    Returns:
        None: Everything is written to stdout - shape, dtypes, the first and
        last five rows, per-column null counts, per-column distinct counts,
        the number of fully duplicated rows, and the transposed
        ``describe()`` summary (printed under the "Quantiles" banner).
    """
    # Each report is wrapped in a callable so that it is evaluated only after
    # its banner has been printed, exactly like a plain sequence of prints.
    reports = (
        ("##################### Shape #####################", lambda: dataframe.shape),
        ("##################### Types #####################", lambda: dataframe.dtypes),
        ("##################### Head #####################", lambda: dataframe.head(5)),
        ("##################### Tail #####################", lambda: dataframe.tail(5)),
        ("##################### NA #####################", lambda: dataframe.isnull().sum()),
        ('##################### Unique Values #####################', lambda: dataframe.nunique()),
        ("##################### Duplicates #####################", lambda: dataframe.duplicated().sum()),
        # The "Quantiles" section shows describe() rather than an explicit
        # quantile table.
        ("##################### Quantiles #####################", lambda: dataframe.describe().T),
    )
    for banner, build_report in reports:
        print(banner)
        print(build_report())


In [None]:
def _read_csv_lenient(path):
    """Read a UTF-8 CSV, substituting U+FFFD for undecodable bytes instead of raising."""
    return pd.read_csv(path, encoding="UTF-8", engine="python", encoding_errors="replace")


def load_train(path="data/train.csv"):
    """
    Load the training split.

    Args:
        path (str): Location of the CSV file. Defaults to "data/train.csv".

    Returns:
        pd.DataFrame: The parsed file; bytes that are not valid UTF-8 show up
        as the replacement character U+FFFD in the affected cells.
    """
    return _read_csv_lenient(path)


def load_test(path="data/test.csv"):
    """
    Load the test split.

    Args:
        path (str): Location of the CSV file. Defaults to "data/test.csv".

    Returns:
        pd.DataFrame: The parsed file; bytes that are not valid UTF-8 show up
        as the replacement character U+FFFD in the affected cells.
    """
    return _read_csv_lenient(path)

In [None]:
# Read both splits from disk; these raw frames feed every cleaning step below.
df_train = load_train()
df_test = load_test()

In [None]:
# Baseline overview of the training split exactly as loaded (no cleaning yet).
check_df(df_train)

In [None]:
# Baseline overview of the test split exactly as loaded (no cleaning yet).
check_df(df_test)

- Damaged rows are filtered first: rows whose text contains the replacement character "\ufffd" are candidates for dropping.
- Another kind of damage ("#NAME?") is also visible when the data is read; these rows will be dropped.

In [None]:
# Select the rows whose text contains U+FFFD, the character the loaders
# substitute for bytes that could not be decoded (encoding_errors="replace").
# na=False treats missing text as "not damaged" instead of producing NaN in the mask.
damaged_rows_train = df_train[df_train["text"].str.contains("\ufffd", na=False)]
damaged_rows_test = df_test[df_test["text"].str.contains("\ufffd", na=False)]

# Report how many rows are affected in each split and show a sample of them.
print(f"Total damaged rows in train: {len(damaged_rows_train)}")

print(damaged_rows_train.head())

print(f"Total damaged rows in test: {len(damaged_rows_test)}")

print(damaged_rows_test.head())

In [None]:
# Remove the rows flagged as damaged from the training split only; the test
# split keeps its damaged rows.
# Reassigning (instead of inplace=True) and passing errors="ignore" keeps the
# cell safe to re-execute: index labels that were already removed are skipped
# rather than raising KeyError.
df_train = df_train.drop(index=damaged_rows_train.index, errors="ignore")

In [None]:
# Drop the rows whose text is the literal "#NAME?" placeholder.
# .copy() gives each filtered frame its own data, so the column assignments
# performed in the following cells do not act on a slice of the original
# frame (which pandas reports as SettingWithCopyWarning).
df_train = df_train[df_train['text'] != "#NAME?"].copy()
df_test = df_test[df_test['text'] != "#NAME?"].copy()

In [None]:
# Normalise every column of the training frame in one chained pass:
# lowercase -> strip everything that is neither a word character nor
# whitespace (punctuation) -> strip digit runs.
# NOTE(review): this runs on ALL columns, including the label column, and
# requires each of them to hold strings - confirm that is intended.
# NOTE(review): str.lower() is not locale-aware (Turkish dotted/dotless I) - confirm acceptable.
for col in df_train.columns:
    df_train[col] = (
        df_train[col]
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(r'\d+', '', regex=True)
    )

In [None]:
# Same normalisation as for the training frame, applied to the test frame:
# lowercase -> strip everything that is neither a word character nor
# whitespace (punctuation) -> strip digit runs.
# NOTE(review): this runs on ALL columns, including the label column, and
# requires each of them to hold strings - confirm that is intended.
for col in df_test.columns:
    df_test[col] = (
        df_test[col]
        .str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(r'\d+', '', regex=True)
    )

In [None]:
# Re-inspect the training split after normalisation; lowercasing and stripping
# punctuation/digits can make previously distinct rows identical.
check_df(df_train)

In [None]:
# Re-inspect the test split after normalisation (same reason as for train).
check_df(df_test)

In [None]:
# Remove rows that are exact duplicates across all columns, separately within
# each split. Reassignment is used instead of inplace=True because both frames
# were produced by boolean filtering, and in-place mutation of such frames
# can trigger SettingWithCopyWarning.
df_train = df_train.drop_duplicates()
df_test = df_test.drop_duplicates()

In [None]:
# Confirm that neither split contains duplicated rows any more.
check_df(df_train)
check_df(df_test)

**TASK**

4 different models ([TF-IDF with Multinomial Naive Bayes and Binary Naive Bayes] + [ANN with Word2Vec and FastText]) will be trained and compared.

**ROADMAP**

Preprocessing steps will be applied to the data according to the model it will be fed to.

***For Bayesian Model:***
- Lowercase transformation
- Special characters cleaning (Punctuations etc.)

In [None]:
def concat_df_on_y_axis(df_1, df_2):
    """
    Stack two DataFrames vertically, ``df_1`` on top of ``df_2``.

    The original index labels of both frames are kept, so the result may
    contain repeated index values.

    Args:
        df_1 (pd.DataFrame): Frame whose rows come first.
        df_2 (pd.DataFrame): Frame whose rows are appended below.

    Returns:
        pd.DataFrame: New frame holding the rows of both inputs.
    """
    stacked = pd.concat((df_1, df_2), axis=0)
    return stacked

In [None]:
# Stack train on top of test so that rows shared by both splits show up as duplicates.
df_train_test = concat_df_on_y_axis(df_train, df_test)

In [None]:
# Each split is duplicate-free on its own, so any duplicates reported here are
# rows that occur in both train and test.
check_df(df_train_test)

**OBSERVATIONS**
- df_train has 0 duplicates, duplicates dropped.
- df_test has 0 duplicates, duplicates dropped.
- df_train_test has 515 duplicates.
- **Data Leakage observed**
- Set of {df_train INTERSECT df_test} has to be removed from df_train.

In [None]:
# Collect every distinct test text, then keep only the training rows whose
# text never occurs in the test split.
# Matching is on the 'text' column alone, so a training row is removed even
# when its label differs from the matching test row.
test_texts = set(df_test['text'])
df_train = df_train[~df_train['text'].isin(test_texts)]

In [None]:
# Rebuild the combined frame from the leakage-free training split; train rows
# come first, test rows second.
df_train_test = concat_df_on_y_axis(df_train, df_test)

In [None]:
# Verify that the combined frame no longer reports duplicated rows.
check_df(df_train_test)

**Data Leakage problem solved**

## Naive Bayes Modeling

**STOPWORDS REMOVAL**

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK stopword corpus (needs network access the first time).
nltk.download('stopwords')

# Turkish stopword list used by the removal step below.
sw = stopwords.words('turkish')

In [None]:
# Work on a copy so that df_train_test keeps the text without stopword
# removal or stemming.
df_train_test_sw_removed = df_train_test.copy()

In [None]:
# Drop every whitespace-separated token that is a stopword.
# The lookup is built once as a set so each membership test is O(1) instead
# of scanning the whole stopword list for every token.
# str(text) keeps the step from failing on non-string cells (e.g. NaN becomes "nan").
stopword_lookup = set(sw)
df_train_test_sw_removed['text'] = df_train_test_sw_removed['text'].apply(
    lambda text: " ".join(token for token in str(text).split() if token not in stopword_lookup)
)

**STEMMING**
- Stemming is simple to apply and should be effective enough for the Bayesian models
- Lemmatization is a possible alternative

In [None]:
from TurkishStemmer import TurkishStemmer
# Single stemmer instance reused for every token in the stemming step below.
stemmer = TurkishStemmer()

In [None]:
# Replace each whitespace-separated token with its stem and re-join the
# tokens with single spaces.
df_train_test_sw_removed['text'] = df_train_test_sw_removed['text'].apply(
    lambda text: " ".join(map(stemmer.stem, text.split()))
)

In [None]:
# Overview of the combined frame after stopword removal and stemming.
check_df(df_train_test_sw_removed)

In [None]:
# The combined frame was built as train rows followed by test rows, so the
# first len(df_train) rows are the training rows.
len_train = len(df_train)

# Split positionally with iloc: the combined frame keeps the original index
# labels of both splits, so label-based selection would be ambiguous.
df_train_sw_removed_stemmed = df_train_test_sw_removed.iloc[:len_train].copy()

df_test_sw_removed_stemmed = df_train_test_sw_removed.iloc[len_train:].copy()

In [None]:
# Features are the stemmed, stopword-free text; targets are the 'label' column.
X_train = df_train_sw_removed_stemmed['text']
y_train = df_train_sw_removed_stemmed['label']
X_test = df_test_sw_removed_stemmed['text']
y_test = df_test_sw_removed_stemmed['label']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [None]:
from sklearn.metrics import classification_report

### Multinomial Naive Bayes

In [None]:
# TF-IDF features over unigrams and bigrams.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

**TF-IDF Vectorization**

In [None]:
# for multinomial NB
# The vectorizer is fitted on the training text only; the test text is
# transformed with that fitted vectorizer.
X_train_nb = tfidf_vectorizer.fit_transform(X_train)
X_test_nb = tfidf_vectorizer.transform(X_test)

**Multinomial NB Model**

In [None]:
# Multinomial Naive Bayes with default hyperparameters, trained on the TF-IDF features.
nb_model = MultinomialNB().fit(X_train_nb, y_train)

**Multinomial NB Model Evaluation**

In [None]:
# Predicted labels of the multinomial model on the held-out test features.
nb_model_pred = nb_model.predict(X_test_nb)

In [None]:
# Per-class precision/recall/F1 of the multinomial model, three decimals.
print(classification_report(y_test, nb_model_pred, digits=3))

### Binary Naive Bayes

In [None]:
# Same n-gram range as above, but with binary=True.
# NOTE(review): binary=True presumably only binarises the term-frequency part,
# so values are still IDF-weighted rather than 0/1 - confirm against the
# scikit-learn documentation and BernoulliNB's own `binarize` threshold.
tfidf_vectorizer_binary = TfidfVectorizer(ngram_range=(1,2), binary=True)

**Binary TF-IDF Vectorization**

In [None]:
# for binary NB
# Fitted on the training text only; the test text is transformed with the
# fitted vectorizer.
X_train_nb_binary = tfidf_vectorizer_binary.fit_transform(X_train)
X_test_nb_binary = tfidf_vectorizer_binary.transform(X_test)

**Binary NB Model**

In [None]:
# Bernoulli Naive Bayes with default hyperparameters, trained on the binary-TF features.
nb_model_binary = BernoulliNB().fit(X_train_nb_binary, y_train)

**Binary NB Model Evaluation**

In [None]:
# Predict with the Bernoulli model trained above. Using `nb_model` here would
# silently evaluate the multinomial model on the binary features, because both
# vectorizers produce matrices of the same width.
nb_binary_model_pred = nb_model_binary.predict(X_test_nb_binary)

In [None]:
# Per-class precision/recall/F1 for the binary-model predictions, three decimals.
print(classification_report(y_test, nb_binary_model_pred, digits=3))

## ANN MODELING

In [None]:
from nltk.tokenize import word_tokenize
# Tokenizer data required by word_tokenize (needs network access the first time).
nltk.download('punkt_tab') #necessary for tokenization method

In [None]:
# Tokenizing sentences
# Input is the cleaned text of df_train / df_test (lowercased, punctuation and
# digits stripped), NOT the stopword-free, stemmed text used for Naive Bayes.
# The text was already lowercased during cleaning, so .lower() is only a safeguard.
tokenized_sentences_train = [word_tokenize(sentence.lower()) for sentence in df_train['text']]
tokenized_sentences_test = [word_tokenize(sentence.lower()) for sentence in df_test['text']]

### SKIPGRAM-ANN MODEL
- Skip-gram performs better at modeling semantics, which we badly need in this case

In [None]:
from collections import Counter

In [None]:
# Hyperparameters
# NOTE(review): an earlier comment said the embedding size was raised to 100,
# but the value actually used is 10 - confirm which one is intended.
EMBEDDING_DIM = 10      # Length of each learned word vector
WINDOW_SIZE = 2          # Context words taken on each side of the center word
MAX_VOCAB_SIZE = 20000   # Limit vocabulary to top 20k words to prevent OOM errors
BATCH_SIZE = 128         # Training pairs per optimisation step
NUM_EPOCHS = 10       # Full passes over the generated training pairs
LEARNING_RATE = 0.001    # Step size shared by the TensorFlow and PyTorch optimizers

In [None]:
# Only the training sentences are used to build the vocabulary and the skip-gram pairs.
sentences = tokenized_sentences_train

In [None]:
# Flatten the list of sentences to a single list of words
# (one entry per token occurrence, so the frequency counts below are corpus-wide).
all_words = [word for sentence in sentences for word in sentence]

In [None]:
# Select only the most common words to keep the vocabulary size manageable
# We reserve index 0 for <UNK>, so we take MAX_VOCAB_SIZE - 1
# Result: list of (word, count) tuples ordered from most to least frequent.
word_counts = Counter(all_words).most_common(MAX_VOCAB_SIZE - 1)

In [None]:
# Create vocabulary mapping: <UNK> is always at index 0
# Words receive consecutive indices in frequency order (most frequent word -> 1).
word_to_ix = {"<UNK>": 0}
for word, count in word_counts:
    word_to_ix[word] = len(word_to_ix)

In [None]:
# Create reverse mapping (Index -> Word)
ix_to_word = {ix: word for word, ix in word_to_ix.items()}
# Vocabulary size includes the <UNK> entry at index 0.
VOCAB_SIZE = len(word_to_ix)

print(f"Total words scanned: {len(all_words)}")
print(f"Final Vocabulary Size: {VOCAB_SIZE}")

In [None]:
# Generate Skip-gram Pairs (Input -> Target)
# `inputs` holds center-word indices and `targets` holds the index of one
# context word each; the two lists are aligned element by element.
inputs = []
targets = []

print("Generating training pairs...")
for sentence in sentences:
    # Convert words to indices. If a word is not in top 20k, it becomes 0 (<UNK>)
    sentence_indices = [word_to_ix.get(word, 0) for word in sentence]
    
    for i in range(len(sentence_indices)):
        target_word_idx = sentence_indices[i] # Center word
        
        # Optimization: If the target word is unknown (<UNK>), 
        # we skip training on it to avoid noise.
        if target_word_idx == 0:
            continue
            
        # Define context window
        # (clamped to the sentence boundaries, so windows never cross sentences)
        start_idx = max(0, i - WINDOW_SIZE)
        end_idx = min(len(sentence_indices), i + WINDOW_SIZE + 1)
        
        for j in range(start_idx, end_idx):
            if i != j: # Skip the target word itself
                # Context words are NOT filtered: an <UNK> (0) context word
                # still produces a training pair.
                context_word_idx = sentence_indices[j]
                inputs.append(target_word_idx)
                targets.append(context_word_idx)

print(f"Total training pairs generated: {len(inputs)}")

#### Tensorflow

In [None]:
import os
import site

# --- 1. POINT THE ENVIRONMENT AT THE PIP-INSTALLED NVIDIA LIBRARIES ---
# Best-effort step: any failure is only reported, never raised.
try:
    site_packages = site.getsitepackages()[0]
    nvidia_path = os.path.join(site_packages, 'nvidia')
    
    cudnn_path = os.path.join(nvidia_path, 'cudnn', 'lib')
    cuda_path = os.path.join(nvidia_path, 'cuda_runtime', 'lib')
    
    # Prepend the cuDNN and CUDA runtime directories to LD_LIBRARY_PATH.
    # NOTE(review): the dynamic loader usually reads LD_LIBRARY_PATH when the
    # process starts, so changing it inside a running kernel may have no
    # effect on library lookup - confirm this is actually needed/effective.
    old_ld = os.environ.get('LD_LIBRARY_PATH', '')
    os.environ['LD_LIBRARY_PATH'] = f"{cudnn_path}:{cuda_path}:{old_ld}"
    
    # This specific flag often fixes 'DNN library initialization failed' errors
    # by disabling some auto-tuning features that might crash on certain GPUs.
    os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0' 
    
    print("NVIDIA Library paths arranged successfully")
    
except Exception as e:
    print(f"Path warning: {e}")

# --- 2. IMPORT TENSORFLOW AND CONFIGURE GPU MEMORY ---
# TensorFlow is imported only after the environment variables above are set.
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, models, optimizers, callbacks

print(f"TensorFlow Version: {tf.__version__}")

# GPU Memory Growth
# This is CRITICAL. It prevents TensorFlow from hogging all VRAM at start-up.
# Must be run immediately after importing TF.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU Detected and memory growth set: {gpus}")
    except RuntimeError as e:
        # A RuntimeError from set_memory_growth is reported and the notebook continues.
        print(f"GPU Error: {e}")
else:
    print("No GPU detected.")

In [None]:
# Convert lists to NumPy arrays (TensorFlow prefers typed arrays)
# Note: this rebinds `inputs` / `targets` from Python lists to int32 arrays,
# so cells further down see arrays, not lists.
inputs = np.array(inputs, dtype=np.int32)
targets = np.array(targets, dtype=np.int32)

In [None]:
# Use tf.data.Dataset for efficient Batching and Prefetching on GPU
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))

# Shuffle buffer size should ideally be >= number of training samples
# Prefetch allows the CPU to prepare the next batch while GPU processes the current one
# NOTE(review): the buffer used here is 1024, far below the guideline stated
# above; the pairs were generated sentence by sentence, so batches will mostly
# contain pairs from neighbouring sentences. A larger buffer costs memory -
# confirm the trade-off.
dataset = dataset.shuffle(buffer_size=1024).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
def build_skipgram_model(vocab_size, embedding_dim):
    """
    Build the skip-gram network: word index -> embedding -> softmax over the vocabulary.

    Args:
        vocab_size (int): Number of entries in the vocabulary; used both as
            the embedding table height and as the size of the output layer.
        embedding_dim (int): Length of each word vector.

    Returns:
        Uncompiled Keras model whose embedding layer is named
        "embedding_layer" and whose output layer is named "context_prediction".
    """
    # Input layer: receives a single integer (word index) per sample.
    input_word = layers.Input(shape=(1,), name="target_word_input")

    # Embedding layer: converts the index to a dense vector.
    # The sequence length is already fixed by the Input shape above, so the
    # deprecated `input_length` argument is not passed.
    x = layers.Embedding(input_dim=vocab_size,
                         output_dim=embedding_dim,
                         name="embedding_layer")(input_word)

    # Flatten: converts (Batch, 1, Dim) -> (Batch, Dim)
    x = layers.Flatten()(x)

    # Output layer: one probability per vocabulary entry (softmax sums to 1).
    output = layers.Dense(vocab_size, activation='softmax', name="context_prediction")(x)

    return models.Model(inputs=input_word, outputs=output)

In [None]:
class WordSimilarityCallback(callbacks.Callback):
    """
    Print the words most similar to a query word at the end of every epoch.

    Similarity is the cosine similarity between rows of the weight matrix of
    the layer named "embedding_layer". This helps in monitoring how the
    learned vectors evolve during training.

    Args:
        test_word (str): Query word. If it is not a key of ``word_to_ix`` the
            callback does nothing.
        word_to_ix (dict): Mapping word -> row index in the embedding matrix.
        ix_to_word (dict): Mapping row index -> word.
        top_k (int): Maximum number of neighbours to print.
    """
    def __init__(self, test_word, word_to_ix, ix_to_word, top_k=5):
        super().__init__()
        self.test_word = test_word
        self.word_to_ix = word_to_ix
        self.ix_to_word = ix_to_word
        self.top_k = top_k

    def on_epoch_end(self, epoch, logs=None):
        """Print up to ``top_k`` nearest neighbours of the query word."""
        # Nothing to report when the query word is out of vocabulary.
        if self.test_word not in self.word_to_ix:
            return

        # 1. Retrieve the weights from the embedding layer
        embeddings = self.model.get_layer("embedding_layer").get_weights()[0]

        # 2. Get the vector for the test word
        test_idx = self.word_to_ix[self.test_word]
        test_vector = embeddings[test_idx]

        # 3. Cosine similarity = dot product of unit-length vectors
        norm_embeddings = tf.math.l2_normalize(embeddings, axis=1)
        norm_test_vector = tf.math.l2_normalize(test_vector, axis=0)
        cosine_similarities = tf.tensordot(norm_embeddings, norm_test_vector, axes=1)

        # 4. Ask for one extra hit because the query word normally matches
        #    itself; never ask for more rows than the matrix has.
        k = min(self.top_k + 1, embeddings.shape[0])
        top_indices = tf.math.top_k(cosine_similarities, k=k).indices.numpy()

        # 5. Drop the query word itself and cap the list at top_k, so the
        #    output never exceeds top_k words even if the query word is not
        #    among the hits (e.g. because of ties).
        closest_words = [self.ix_to_word[idx] for idx in top_indices if idx != test_idx][:self.top_k]
        print(f"\n[Validation] End of Epoch {epoch+1} - Closest words to '{self.test_word}':")
        print(f"  -> {', '.join(closest_words)}")

In [None]:
# Initialize the model
# (vocabulary size and vector length come from the cells above)
model = build_skipgram_model(VOCAB_SIZE, EMBEDDING_DIM)

In [None]:
# Counts a prediction as correct when the true context word is among the five
# highest-scored vocabulary entries.
top_k_metric = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top_5_acc')

In [None]:
# We use 'sparse_categorical_crossentropy' because our targets are integers (indexes),
# not one-hot encoded vectors. This saves memory and is computationally efficient.
# Optimizer: Adam with the shared LEARNING_RATE; metric: top-5 accuracy.
model.compile(optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
              loss='sparse_categorical_crossentropy',
              metrics=[top_k_metric])

In [None]:
# Define the custom callback: after every epoch, print the 3 nearest
# neighbours of the word "araba".
# Note: the callback silently does nothing if the word is missing from word_to_ix.
visual_callback = WordSimilarityCallback(test_word="araba", 
                                         word_to_ix=word_to_ix, 
                                         ix_to_word=ix_to_word,
                                         top_k=3)

In [None]:
# Print model architecture
# (layer names, output shapes and parameter counts)
model.summary()

In [None]:
# Start Training
# `dataset` already yields shuffled batches, so no batch size is passed here;
# `history` keeps the per-epoch loss and metric values.
print("\nStarting Training...")
history = model.fit(dataset, epochs=NUM_EPOCHS, callbacks=[visual_callback])
print("Training Complete.")

In [None]:
# Get weights from the embedding layer
# The shape will be (VOCAB_SIZE, EMBEDDING_DIM)
# Row i is the learned vector of ix_to_word[i]; row 0 belongs to <UNK>.
vectors = model.get_layer("embedding_layer").get_weights()[0]

#### Torch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

In [None]:
from tqdm import tqdm

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

In [None]:
# Skip-gram Model
class SkipGramModel(nn.Module):
    """
    Skip-gram network: embedding lookup followed by a linear projection onto
    the vocabulary, returning log-probabilities.

    Args:
        vocab_size (int): Number of rows in the embedding table and number of
            output classes.
        embedding_dim (int): Length of each word vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word_idx):
        """
        Score every vocabulary entry as a context word of the given words.

        Args:
            target_word_idx (torch.Tensor): Integer word indices, shape [batch_size].

        Returns:
            torch.Tensor: Log-probabilities, shape [batch_size, vocab_size];
            each row is a log-softmax over the vocabulary.
        """
        # [batch_size] -> [batch_size, embedding_dim] -> [batch_size, vocab_size]
        scores = self.linear(self.embeddings(target_word_idx))
        return F.log_softmax(scores, dim=1)

In [None]:
# Convert the skip-gram pairs to torch tensors so they can be wrapped in a
# TensorDataset / DataLoader, which also takes care of batching.
# Indices are stored as int64 (torch.long).
inputs_tensor = torch.tensor(inputs, dtype=torch.long)
targets_tensor = torch.tensor(targets, dtype=torch.long)

In [None]:
# defining data loader for batching
# Pairs are reshuffled every epoch (shuffle=True) and served in batches of BATCH_SIZE.
train_data = TensorDataset(inputs_tensor, targets_tensor)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Initialize models
# The model is moved to the selected device before the optimizer is created.
skipgram_model = SkipGramModel(VOCAB_SIZE, EMBEDDING_DIM).to(device)

In [None]:
# Define loss function and optimizer
# NLLLoss pairs with the log-probabilities returned by SkipGramModel.forward.
criterion = nn.NLLLoss()
# Adam is used so that this implementation matches the TensorFlow version,
# which trains with Adam and the same LEARNING_RATE; plain SGD at this small
# step size would make the two implementations hard to compare.
skipgram_optimizer = optim.Adam(skipgram_model.parameters(), lr=LEARNING_RATE)

In [None]:
# Mini-batch training loop: for every epoch, iterate once over train_loader,
# update the model after each batch and report the mean batch loss.
print("Training Skip-gram model (Conceptual)...")
for epoch in tqdm(range(NUM_EPOCHS), desc="Epochs", position=0, leave=True):
    # Sum of the per-batch losses of the current epoch.
    total_loss = 0
    
    # DataLoader fetches batch by batch
    for batch_inputs, batch_targets in tqdm(train_loader, desc=f"Epoch {epoch+1} Batches", leave=False):
        
        # Move the batch to the same device as the model
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        
        # Zeroing grads
        # (gradients accumulate across backward() calls unless reset)
        skipgram_optimizer.zero_grad()
        
        # forward prop
        log_probs = skipgram_model(batch_inputs)
        
        # loss calculation
        loss = criterion(log_probs, batch_targets)
        
        # backward prop
        loss.backward()
        skipgram_optimizer.step()
        
        # .item() extracts a plain Python number from the loss tensor
        total_loss += loss.item()
    
    # Mean loss per batch (len(train_loader) is the number of batches).
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")
print("Skip-gram training complete (Conceptual).")