In [None]:
import os
import site

try:
    site_packages = site.getsitepackages()[0]
    nvidia_path = os.path.join(site_packages, 'nvidia')
    
    cudnn_path = os.path.join(nvidia_path, 'cudnn', 'lib')
    cuda_path = os.path.join(nvidia_path, 'cuda_runtime', 'lib')
    
    old_ld = os.environ.get('LD_LIBRARY_PATH', '')
    os.environ['LD_LIBRARY_PATH'] = f"{cudnn_path}:{cuda_path}:{old_ld}"
    
    os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0'
    
    print("NVIDIA Library paths arranged successfully")
    
except Exception as e:
    print(f"Path warning: {e}")


import tensorflow as tf

print(f"TensorFlow Version: {tf.__version__}")
gpus = tf.config.list_physical_devices('GPU')
print(f"GPUs: {gpus}")

if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import RMSprop

# GPU Bellek Ayarı (Memory Growth)
# Bu kısım importlardan hemen sonra çalışmalı
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU Detected and memory setting done: {gpus}")
    except RuntimeError as e:
        print(e)

NVIDIA Library paths arranged successfully


2025-12-18 19:43:14.940379: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-12-18 19:43:14.940471: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-18 19:43:15.011949: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-12-18 19:43:15.182644: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow Version: 2.15.0
GPUs: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


2025-12-18 19:43:18.612951: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-12-18 19:43:19.100256: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-12-18 19:43:19.101084: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

GPU Detected and memory setting done: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [188]:
import pandas as pd
import numpy as np

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)

In [4]:
def check_df(dataframe):
    """
    Checks the overall structure and key metrics of a DataFrame.

    Args:
        dataframe (pd.DataFrame): DataFrame to inspect.

    Returns:
        None: Prints shape, data types, head, tail, missing values, and quantiles.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(5))
    print("##################### Tail #####################")
    print(dataframe.tail(5))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print('##################### Unique Values #####################')
    print(dataframe.nunique())
    print("##################### Duplicates #####################")
    print(dataframe.duplicated().sum())
    print("##################### Quantiles #####################")
    # Uncomment below to include quantile information
    #print(dataframe[[col for col in dataframe.columns if dataframe[col].dtypes != "O"]].quantile([0, 0.05, 0.50, 0.75, 0.95, 0.99, 1]).T)
    print(dataframe.describe().T)


In [5]:
def load_train():
    df_train = pd.read_csv("data/train.csv", encoding="UTF-8", engine="python", encoding_errors="replace")#replaces damaged bytes with "\ufffd"
    return df_train

def load_test():
    df_test = pd.read_csv("data/test.csv", encoding="UTF-8", engine="python", encoding_errors="replace")
    return df_test

In [169]:
df_train = load_train()
df_test = load_test()

In [170]:
check_df(df_train)

##################### Shape #####################
(440679, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürünü hepsiburadadan alalı 3 hafta oldu. orjin...  Positive
1  ürünlerden çok memnunum, kesinlikle herkese ta...  Positive
2      hızlı kargo, temiz alışveriş.teşekkür ederim.  Positive
3               Çünkü aranan tapınak bu bölgededir .      Notr
4  bu telefonu başlıca alma nedenlerim ise elimde...  Positive
##################### Tail #####################
                                                     text     label
440674  Ayrıca burç yorumları ve çapraz bulmaca da der...      Notr
440675  günümüz de ssd olmazsa olmaz bir donanım artık...  Positive
440676  kullandım ve çok memnun kaldım. ocak başında d...  Positive
440677                Adını Lenkeran şehrinden almıştır .      Notr
440678  Bu dergilerde sosy

In [171]:
check_df(df_test)

##################### Shape #####################
(48965, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0      Kral akbaba dikkat çekici renklere sahiptir .      Notr
1   ısrarla korkutmayı başarıyor. sanki korku çok...  Positive
2  Neşe ve Üzüntü köprünün kırılmaya başlamasıyla...      Notr
3  i phone 5 ten sonra gene 4'' ekranı tercih ett...  Positive
4    Beşinci sezonda diziye yeni oyuncular katıldı .      Notr
##################### Tail #####################
                                                    text     label
48960  Fransa bayrağı diğer kırmızı zeminden beyaz bi...      Notr
48961  Yine aynı yıl türkü dalında Murat Çobanoğlu il...      Notr
48962                           Kurgunu skiyim oç evladı  Negative
48963  Şarkı daha sonrasında Damian Marley tarafından...      Notr
48964  berrak bir ürün ancak ken

- damaged rows filtering, these can be considered to be dropped
- also other (#NAME?) damage can be seen during data read, these rows will be dropped

In [172]:
damaged_rows_train = df_train[df_train["text"].str.contains("\ufffd", na=False)]
damaged_rows_test = df_test[df_test["text"].str.contains("\ufffd", na=False)]

print(f"Total damaged rows in train: {len(damaged_rows_train)}")

print(damaged_rows_train.head())

print(f"Total damaged rows in test: {len(damaged_rows_test)}")

print(damaged_rows_test.head())

Total damaged rows in train: 7
                                                     text     label
31512   - su akıtmıyor: adamlar kullanam klavuzuna yaz...  Positive
55634   -kargocu arkadaşlar ürünü bir bayan olarak taş...  Positive
64093   - kullanım tarifindeki 'hazneye sıcak su koyun...  Positive
102817  -kamerasına laf edilmiş. çıktığı dönemin en iy...  Positive
332479  - karşı taraf sesimden çok memnun ama ben karş...  Positive
Total damaged rows in test: 0
Empty DataFrame
Columns: [text, label]
Index: []


In [173]:
df_train.drop(index=damaged_rows_train.index, inplace=True)

In [174]:
df_train = df_train[df_train['text'] != "#NAME?"]
df_test = df_test[df_test['text'] != "#NAME?"]

In [175]:
for col in df_train.columns:
    df_train[col] = df_train[col].str.lower() # Normalizing Case Folding
    df_train[col] = df_train[col].str.replace(r'[^\w\s]', '', regex=True) # Punctuations
    df_train[col] = df_train[col].str.replace(r'\d+', '', regex=True) # Numbers

In [176]:
for col in df_test.columns:
    df_test[col] = df_test[col].str.lower() # Normalizing Case Folding
    df_test[col] = df_test[col].str.replace(r'[^\w\s]', '', regex=True) # Punctuations
    df_test[col] = df_test[col].str.replace(r'\d+', '', regex=True) # Numbers

In [177]:
check_df(df_train)

##################### Shape #####################
(439610, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürünü hepsiburadadan alalı  hafta oldu orjinal...  positive
1  ürünlerden çok memnunum kesinlikle herkese tav...  positive
2         hızlı kargo temiz alışverişteşekkür ederim  positive
3                çünkü aranan tapınak bu bölgededir       notr
4  bu telefonu başlıca alma nedenlerim ise elimde...  positive
##################### Tail #####################
                                                     text     label
440674  ayrıca burç yorumları ve çapraz bulmaca da der...      notr
440675  günümüz de ssd olmazsa olmaz bir donanım artık...  positive
440676  kullandım ve çok memnun kaldım ocak başında da...  positive
440677                 adını lenkeran şehrinden almıştır       notr
440678  bu dergilerde sosy

In [178]:
check_df(df_test)

##################### Shape #####################
(48846, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0       kral akbaba dikkat çekici renklere sahiptir       notr
1   ısrarla korkutmayı başarıyor sanki korku çok ...  positive
2  neşe ve üzüntü köprünün kırılmaya başlamasıyla...      notr
3  i phone  ten sonra gene  ekranı tercih ettim t...  positive
4     beşinci sezonda diziye yeni oyuncular katıldı       notr
##################### Tail #####################
                                                    text     label
48960  fransa bayrağı diğer kırmızı zeminden beyaz bi...      notr
48961  yine aynı yıl türkü dalında murat çobanoğlu il...      notr
48962                           kurgunu skiyim oç evladı  negative
48963  şarkı daha sonrasında damian marley tarafından...      notr
48964  berrak bir ürün ancak ken

In [179]:
df_train.drop_duplicates(inplace=True)
df_test.drop_duplicates(inplace=True)

In [180]:
check_df(df_train)
check_df(df_test)

##################### Shape #####################
(436611, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürünü hepsiburadadan alalı  hafta oldu orjinal...  positive
1  ürünlerden çok memnunum kesinlikle herkese tav...  positive
2         hızlı kargo temiz alışverişteşekkür ederim  positive
3                çünkü aranan tapınak bu bölgededir       notr
4  bu telefonu başlıca alma nedenlerim ise elimde...  positive
##################### Tail #####################
                                                     text     label
440674  ayrıca burç yorumları ve çapraz bulmaca da der...      notr
440675  günümüz de ssd olmazsa olmaz bir donanım artık...  positive
440676  kullandım ve çok memnun kaldım ocak başında da...  positive
440677                 adını lenkeran şehrinden almıştır       notr
440678  bu dergilerde sosy

**"kullanam klavuzu", encoding="hırt"**

**TASK**

4 different models ([TF-IDF with Multinomial Naive Bayes and Binary Naive Bayes] + [ANN with Word2Vec and FastText]) will be trained and compared.

**ROADMAP**

Preprocessing steps will be applied on data according to models they will be fed to.

***For Bayesian Model:***
- Lowecase transformation
- Special characters cleaning (Punctuations etc.)

In [181]:
def concat_df_on_y_axis(df_1, df_2):
    """
    Concatenates two DataFrames along the Y-axis (rows).

    Args:
        df_1 (pd.DataFrame): First DataFrame.
        df_2 (pd.DataFrame): Second DataFrame.

    Returns:
        pd.DataFrame: Concatenated DataFrame.
    """
    return pd.concat([df_1, df_2])

In [182]:
df_train_test = concat_df_on_y_axis(df_train, df_test)

In [183]:
check_df(df_train_test)

##################### Shape #####################
(485387, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürünü hepsiburadadan alalı  hafta oldu orjinal...  positive
1  ürünlerden çok memnunum kesinlikle herkese tav...  positive
2         hızlı kargo temiz alışverişteşekkür ederim  positive
3                çünkü aranan tapınak bu bölgededir       notr
4  bu telefonu başlıca alma nedenlerim ise elimde...  positive
##################### Tail #####################
                                                    text     label
48960  fransa bayrağı diğer kırmızı zeminden beyaz bi...      notr
48961  yine aynı yıl türkü dalında murat çobanoğlu il...      notr
48962                           kurgunu skiyim oç evladı  negative
48963  şarkı daha sonrasında damian marley tarafından...      notr
48964  berrak bir ürün ancak ke

**OBSERVATIONS**
- df_train has 0 duplicates, duplicates dropped.
- df_test has 0 duplicates, duplicates dropped.
- df_train_test has 515 duplicates.
- **Data Leakage observed**
- Set of {df_train INTERSECT df_test} has to be removed from df_train.

In [184]:
test_texts = set(df_test['text'])
df_train = df_train[~df_train['text'].isin(test_texts)]

In [185]:
df_train_test = concat_df_on_y_axis(df_train, df_test)

In [186]:
check_df(df_train_test)

##################### Shape #####################
(484835, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürünü hepsiburadadan alalı  hafta oldu orjinal...  positive
1  ürünlerden çok memnunum kesinlikle herkese tav...  positive
2         hızlı kargo temiz alışverişteşekkür ederim  positive
3                çünkü aranan tapınak bu bölgededir       notr
4  bu telefonu başlıca alma nedenlerim ise elimde...  positive
##################### Tail #####################
                                                    text     label
48960  fransa bayrağı diğer kırmızı zeminden beyaz bi...      notr
48961  yine aynı yıl türkü dalında murat çobanoğlu il...      notr
48962                           kurgunu skiyim oç evladı  negative
48963  şarkı daha sonrasında damian marley tarafından...      notr
48964  berrak bir ürün ancak ke

**Data Leakage problem solved**

## Naive Bayes Modeling

**STOPWORDS REMOVAL**

In [189]:
import nltk
from nltk.corpus import stopwords

In [190]:
nltk.download('stopwords')

sw = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to /home/ghost/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [196]:
df_train_test_sw_removed = df_train_test.copy()

In [197]:
df_train_test_sw_removed['text'] = df_train_test_sw_removed['text'].apply(lambda x: " ".join(x for x in str(x).split() if x not in sw))

**STEMMING**
- Stemming is easy and will produce enough efficiency with bayesian models
- Lemmatization can be alternative

In [198]:
from TurkishStemmer import TurkishStemmer
stemmer = TurkishStemmer()

In [199]:
df_train_test_sw_removed['text'] = df_train_test_sw_removed['text'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))

In [201]:
check_df(df_train_test_sw_removed)

##################### Shape #####################
(484835, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürün hepsiburada alal haf olt orjinal eksiks ş...  positive
1  ürün memnu kesinlik herk tavsi eder ayrı hepsi...  positive
2              hızl kargo tem alışverişteşekkür eder  positive
3                               aranan tapınak bölge      notr
4  telefon başl al neden elim samsung j ar yeters...  positive
##################### Tail #####################
                                                    text     label
48960  fran bayrak diğer kır zem beyaz bir çerçev ayr...      notr
48961  yin aynı yıl türk dal murat çobanok birlik bir...      notr
48962                                  kurg skiy oç evla  negative
48963         şark sonra damian marley taraf seslendiril      notr
48964  berrak bir ürün ancak ke

In [215]:
len_train = len(df_train)

df_train_sw_removed_stemmed = df_train_test_sw_removed.iloc[:len_train].copy()

df_test_sw_removed_stemmed = df_train_test_sw_removed.iloc[len_train:].copy()

In [216]:
X_train = df_train_sw_removed_stemmed['text']
y_train = df_train_sw_removed_stemmed['label']
X_test = df_test_sw_removed_stemmed['text']
y_test = df_test_sw_removed_stemmed['label']

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [225]:
from sklearn.metrics import classification_report

### Multinomial Naive Bayes

In [218]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))

**TF-IDF Vectorization**

In [219]:
# for multinomial NB
X_train_nb = tfidf_vectorizer.fit_transform(X_train)
X_test_nb = tfidf_vectorizer.transform(X_test)

**Multinomial NB Model**

In [220]:
nb_model = MultinomialNB().fit(X_train_nb, y_train)

**Multinomial NB Model Evaluation**

In [224]:
nb_model_pred = nb_model.predict(X_test_nb)

In [227]:
print(classification_report(y_test, nb_model_pred))

              precision    recall  f1-score   support

    negative       0.99      0.13      0.22      5636
        notr       0.98      0.89      0.94     17087
    positive       0.79      0.99      0.88     26053

    accuracy                           0.86     48776
   macro avg       0.92      0.67      0.68     48776
weighted avg       0.88      0.86      0.82     48776



### Binary Naive Bayes

In [221]:
tfidf_vectorizer_binary = TfidfVectorizer(ngram_range=(1,2), binary=True)

**Binary TF-IDF Vectorization**

In [228]:
# for binary NB
X_train_nb_binary = tfidf_vectorizer_binary.fit_transform(X_train)
X_test_nb_binary = tfidf_vectorizer_binary.transform(X_test)

**Binary NB Model**

In [229]:
nb_model_binary = BernoulliNB().fit(X_train_nb_binary, y_train)

**Binary NB Model Evaluation**

In [231]:
nb_binary_model_pred = nb_model.predict(X_test_nb_binary)

In [232]:
print(classification_report(y_test, nb_binary_model_pred))

              precision    recall  f1-score   support

    negative       0.99      0.13      0.22      5636
        notr       0.98      0.89      0.94     17087
    positive       0.79      0.99      0.88     26053

    accuracy                           0.86     48776
   macro avg       0.92      0.67      0.68     48776
weighted avg       0.88      0.86      0.82     48776

