In [1]:
import pandas as pd
import nltk
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)

In [3]:
def check_df(dataframe):
    """
    Checks the overall structure and key metrics of a DataFrame.

    Args:
        dataframe (pd.DataFrame): DataFrame to inspect.

    Returns:
        None: Prints shape, data types, head, tail, missing values, and quantiles.
    """
    print("##################### Shape #####################")
    print(dataframe.shape)
    print("##################### Types #####################")
    print(dataframe.dtypes)
    print("##################### Head #####################")
    print(dataframe.head(5))
    print("##################### Tail #####################")
    print(dataframe.tail(5))
    print("##################### NA #####################")
    print(dataframe.isnull().sum())
    print('##################### Unique Values #####################')
    print(dataframe.nunique())
    print("##################### Duplicates #####################")
    print(dataframe.duplicated().sum())
    print("##################### Quantiles #####################")
    # Uncomment below to include quantile information
    #print(dataframe[[col for col in dataframe.columns if dataframe[col].dtypes != "O"]].quantile([0, 0.05, 0.50, 0.75, 0.95, 0.99, 1]).T)
    print(dataframe.describe().T)


In [4]:
def load_train():
    df_train = pd.read_csv("data/train.csv", encoding="UTF-8", engine="python", encoding_errors="replace")#replaces damaged bytes with "\ufffd"
    return df_train

def load_test():
    df_test = pd.read_csv("data/test.csv", encoding="UTF-8", engine="python", encoding_errors="replace")
    return df_test

In [5]:
df_train = load_train()
df_test = load_test()

In [6]:
check_df(df_train)

##################### Shape #####################
(440679, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürünü hepsiburadadan alalı 3 hafta oldu. orjin...  Positive
1  ürünlerden çok memnunum, kesinlikle herkese ta...  Positive
2      hızlı kargo, temiz alışveriş.teşekkür ederim.  Positive
3               Çünkü aranan tapınak bu bölgededir .      Notr
4  bu telefonu başlıca alma nedenlerim ise elimde...  Positive
##################### Tail #####################
                                                     text     label
440674  Ayrıca burç yorumları ve çapraz bulmaca da der...      Notr
440675  günümüz de ssd olmazsa olmaz bir donanım artık...  Positive
440676  kullandım ve çok memnun kaldım. ocak başında d...  Positive
440677                Adını Lenkeran şehrinden almıştır .      Notr
440678  Bu dergilerde sosy

In [7]:
check_df(df_test)

##################### Shape #####################
(48965, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0      Kral akbaba dikkat çekici renklere sahiptir .      Notr
1   ısrarla korkutmayı başarıyor. sanki korku çok...  Positive
2  Neşe ve Üzüntü köprünün kırılmaya başlamasıyla...      Notr
3  i phone 5 ten sonra gene 4'' ekranı tercih ett...  Positive
4    Beşinci sezonda diziye yeni oyuncular katıldı .      Notr
##################### Tail #####################
                                                    text     label
48960  Fransa bayrağı diğer kırmızı zeminden beyaz bi...      Notr
48961  Yine aynı yıl türkü dalında Murat Çobanoğlu il...      Notr
48962                           Kurgunu skiyim oç evladı  Negative
48963  Şarkı daha sonrasında Damian Marley tarafından...      Notr
48964  berrak bir ürün ancak ken

- damaged rows filtering, these can be considered to be dropped
- also other (#NAME?) damage can be seen during data read, these rows will be dropped

In [8]:
damaged_rows_train = df_train[df_train["text"].str.contains("\ufffd", na=False)]
damaged_rows_test = df_test[df_test["text"].str.contains("\ufffd", na=False)]

print(f"Total damaged rows in train: {len(damaged_rows_train)}")

print(damaged_rows_train.head())

print(f"Total damaged rows in test: {len(damaged_rows_test)}")

print(damaged_rows_test.head())

Total damaged rows in train: 7
                                                     text     label
31512   - su akıtmıyor: adamlar kullanam klavuzuna yaz...  Positive
55634   -kargocu arkadaşlar ürünü bir bayan olarak taş...  Positive
64093   - kullanım tarifindeki 'hazneye sıcak su koyun...  Positive
102817  -kamerasına laf edilmiş. çıktığı dönemin en iy...  Positive
332479  - karşı taraf sesimden çok memnun ama ben karş...  Positive
Total damaged rows in test: 0
Empty DataFrame
Columns: [text, label]
Index: []


In [9]:
df_train.drop(index=damaged_rows_train.index, inplace=True)

In [10]:
df_train = df_train[df_train['text'] != "#NAME?"]
df_test = df_test[df_test['text'] != "#NAME?"]

In [11]:
for col in df_train.columns:
    df_train[col] = df_train[col].str.lower() # Normalizing Case Folding
    df_train[col] = df_train[col].str.replace(r'[^\w\s]', '', regex=True) # Punctuations
    df_train[col] = df_train[col].str.replace(r'\d+', '', regex=True) # Numbers

In [12]:
for col in df_test.columns:
    df_test[col] = df_test[col].str.lower() # Normalizing Case Folding
    df_test[col] = df_test[col].str.replace(r'[^\w\s]', '', regex=True) # Punctuations
    df_test[col] = df_test[col].str.replace(r'\d+', '', regex=True) # Numbers

In [13]:
check_df(df_train)

##################### Shape #####################
(439610, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürünü hepsiburadadan alalı  hafta oldu orjinal...  positive
1  ürünlerden çok memnunum kesinlikle herkese tav...  positive
2         hızlı kargo temiz alışverişteşekkür ederim  positive
3                çünkü aranan tapınak bu bölgededir       notr
4  bu telefonu başlıca alma nedenlerim ise elimde...  positive
##################### Tail #####################
                                                     text     label
440674  ayrıca burç yorumları ve çapraz bulmaca da der...      notr
440675  günümüz de ssd olmazsa olmaz bir donanım artık...  positive
440676  kullandım ve çok memnun kaldım ocak başında da...  positive
440677                 adını lenkeran şehrinden almıştır       notr
440678  bu dergilerde sosy

In [14]:
check_df(df_test)

##################### Shape #####################
(48846, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0       kral akbaba dikkat çekici renklere sahiptir       notr
1   ısrarla korkutmayı başarıyor sanki korku çok ...  positive
2  neşe ve üzüntü köprünün kırılmaya başlamasıyla...      notr
3  i phone  ten sonra gene  ekranı tercih ettim t...  positive
4     beşinci sezonda diziye yeni oyuncular katıldı       notr
##################### Tail #####################
                                                    text     label
48960  fransa bayrağı diğer kırmızı zeminden beyaz bi...      notr
48961  yine aynı yıl türkü dalında murat çobanoğlu il...      notr
48962                           kurgunu skiyim oç evladı  negative
48963  şarkı daha sonrasında damian marley tarafından...      notr
48964  berrak bir ürün ancak ken

In [15]:
df_train.drop_duplicates(inplace=True)
df_test.drop_duplicates(inplace=True)

In [16]:
check_df(df_train)
check_df(df_test)

##################### Shape #####################
(436611, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürünü hepsiburadadan alalı  hafta oldu orjinal...  positive
1  ürünlerden çok memnunum kesinlikle herkese tav...  positive
2         hızlı kargo temiz alışverişteşekkür ederim  positive
3                çünkü aranan tapınak bu bölgededir       notr
4  bu telefonu başlıca alma nedenlerim ise elimde...  positive
##################### Tail #####################
                                                     text     label
440674  ayrıca burç yorumları ve çapraz bulmaca da der...      notr
440675  günümüz de ssd olmazsa olmaz bir donanım artık...  positive
440676  kullandım ve çok memnun kaldım ocak başında da...  positive
440677                 adını lenkeran şehrinden almıştır       notr
440678  bu dergilerde sosy

**TASK**

4 different models ([TF-IDF with Multinomial Naive Bayes and Binary Naive Bayes] + [ANN with Word2Vec and FastText]) will be trained and compared.

**ROADMAP**

Preprocessing steps will be applied on data according to models they will be fed to.

***For Bayesian Model:***
- Lowecase transformation
- Special characters cleaning (Punctuations etc.)

In [17]:
def concat_df_on_y_axis(df_1, df_2):
    """
    Concatenates two DataFrames along the Y-axis (rows).

    Args:
        df_1 (pd.DataFrame): First DataFrame.
        df_2 (pd.DataFrame): Second DataFrame.

    Returns:
        pd.DataFrame: Concatenated DataFrame.
    """
    return pd.concat([df_1, df_2])

In [18]:
df_train_test = concat_df_on_y_axis(df_train, df_test)

In [19]:
check_df(df_train_test)

##################### Shape #####################
(485387, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürünü hepsiburadadan alalı  hafta oldu orjinal...  positive
1  ürünlerden çok memnunum kesinlikle herkese tav...  positive
2         hızlı kargo temiz alışverişteşekkür ederim  positive
3                çünkü aranan tapınak bu bölgededir       notr
4  bu telefonu başlıca alma nedenlerim ise elimde...  positive
##################### Tail #####################
                                                    text     label
48960  fransa bayrağı diğer kırmızı zeminden beyaz bi...      notr
48961  yine aynı yıl türkü dalında murat çobanoğlu il...      notr
48962                           kurgunu skiyim oç evladı  negative
48963  şarkı daha sonrasında damian marley tarafından...      notr
48964  berrak bir ürün ancak ke

**OBSERVATIONS**
- df_train has 0 duplicates, duplicates dropped.
- df_test has 0 duplicates, duplicates dropped.
- df_train_test has 515 duplicates.
- **Data Leakage observed**
- Set of {df_train INTERSECT df_test} has to be removed from df_train.

In [20]:
test_texts = set(df_test['text'])
df_train = df_train[~df_train['text'].isin(test_texts)]

In [21]:
df_train_test = concat_df_on_y_axis(df_train, df_test)

In [22]:
check_df(df_train_test)

##################### Shape #####################
(484835, 2)
##################### Types #####################
text     object
label    object
dtype: object
##################### Head #####################
                                                text     label
0  ürünü hepsiburadadan alalı  hafta oldu orjinal...  positive
1  ürünlerden çok memnunum kesinlikle herkese tav...  positive
2         hızlı kargo temiz alışverişteşekkür ederim  positive
3                çünkü aranan tapınak bu bölgededir       notr
4  bu telefonu başlıca alma nedenlerim ise elimde...  positive
##################### Tail #####################
                                                    text     label
48960  fransa bayrağı diğer kırmızı zeminden beyaz bi...      notr
48961  yine aynı yıl türkü dalında murat çobanoğlu il...      notr
48962                           kurgunu skiyim oç evladı  negative
48963  şarkı daha sonrasında damian marley tarafından...      notr
48964  berrak bir ürün ancak ke

**Data Leakage problem solved**

## Naive Bayes Modeling

**STOPWORDS REMOVAL**

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')

sw = stopwords.words('turkish')

In [None]:
df_train_test_sw_removed = df_train_test.copy()

In [None]:
df_train_test_sw_removed['text'] = df_train_test_sw_removed['text'].apply(lambda x: " ".join(x for x in str(x).split() if x not in sw))

#### Stemming
- Stemming is easy and will produce enough efficiency with bayesian models
- Lemmatization can be alternative

In [None]:
from TurkishStemmer import TurkishStemmer
stemmer = TurkishStemmer()

In [None]:
df_train_test_sw_removed['text'] = df_train_test_sw_removed['text'].apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))

In [None]:
check_df(df_train_test_sw_removed)

In [None]:
len_train = len(df_train)

df_train_sw_removed_stemmed = df_train_test_sw_removed.iloc[:len_train].copy()

df_test_sw_removed_stemmed = df_train_test_sw_removed.iloc[len_train:].copy()

In [None]:
X_train = df_train_sw_removed_stemmed['text']
y_train = df_train_sw_removed_stemmed['label']
X_test = df_test_sw_removed_stemmed['text']
y_test = df_test_sw_removed_stemmed['label']

#### Lemmatization

In [23]:
import stanza

In [24]:
# Download tr model
stanza.download('tr') 

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 6.72MB/s]                    
2025-12-21 19:25:00 INFO: Downloaded file to C:\Users\borab\stanza_resources\resources.json
2025-12-21 19:25:00 INFO: Downloading default packages for language: tr (Turkish) ...
2025-12-21 19:25:00 INFO: File exists: C:\Users\borab\stanza_resources\tr\default.zip
2025-12-21 19:25:02 INFO: Finished downloading models and saved to C:\Users\borab\stanza_resources


In [25]:
# Pipeline (use_gpu=True for gpu usage)
nlp = stanza.Pipeline('tr', processors='tokenize,lemma', use_gpu=True)

def stanza_lemmatizer(text):
    doc = nlp(text)
    lemmas = []
    # Stanza splits documents into sentences and words
    for sentence in doc.sentences:
        for word in sentence.words:
            if word.lemma is not None:
                lemmas.append(word.lemma)
            else:
                lemmas.append(word.text)
    return " ".join(lemmas)

2025-12-21 19:25:06 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json: 435kB [00:00, 7.81MB/s]                    
2025-12-21 19:25:06 INFO: Downloaded file to C:\Users\borab\stanza_resources\resources.json
2025-12-21 19:25:06 INFO: Loading these models for language: tr (Turkish):
| Processor | Package       |
-----------------------------
| tokenize  | imst          |
| mwt       | imst          |
| lemma     | imst_nocharlm |

2025-12-21 19:25:06 INFO: Using device: cuda
2025-12-21 19:25:06 INFO: Loading: tokenize
2025-12-21 19:25:07 INFO: Loading: mwt
2025-12-21 19:25:07 INFO: Loading: lemma
2025-12-21 19:25:07 INFO: Done loading processors!


In [26]:
df_train_test_lemmatized = df_train_test.copy()

In [27]:
texts = df_train_test_lemmatized['text'].tolist()

In [28]:
tqdm.pandas()

**STANZA FOR LEMMATIZATION**
- In order to perform Lemmatization contextually, stanza library is used.
- stanza utilizes cuda and gpu technology to accelerate computation.
- Still, pandas methods creates bottleneck so we say hi to our old friend, batching.

In [None]:
BATCH_SIZE = 512 

processed_results = []

print(f"Total {len(texts)} row, {BATCH_SIZE}: batch size")

for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="GPU is processing"):
    batch_texts = texts[i : i + BATCH_SIZE]
    
    in_docs = [stanza.Document([], text=d) for d in batch_texts]
    
    out_docs = nlp(in_docs)
    
    for doc in out_docs:
        lemmas = []
        for sentence in doc.sentences:
            for word in sentence.words:
                if word.lemma:
                    lemmas.append(word.lemma)
                else:
                    lemmas.append(word.text)
        
        processed_results.append(" ".join(lemmas))

df_train_test_lemmatized['text'] = processed_results

In [29]:
df_train_test_lemmatized.to_csv("data/df_train_test_prep.csv", index=False)

In [None]:
#df_train_test_lemmatized['text'] = df_train_test_lemmatized['text'].progress_apply(lambda x: stanza_lemmatizer(x))

In [30]:
len_train = len(df_train)

df_train_lemmatized = df_train_test_lemmatized.iloc[:len_train].copy()

df_test_lemmatized = df_train_test_lemmatized.iloc[len_train:].copy()

In [31]:
X_train = df_train_lemmatized['text']
y_train = df_train_lemmatized['label']
X_test = df_test_lemmatized['text']
y_test = df_test_lemmatized['label']

### Multinomial Naive Bayes

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [34]:
from sklearn.metrics import classification_report

In [35]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))# unigram and bigrams

**TF-IDF Vectorization**

In [36]:
# for multinomial NB
X_train_nb = tfidf_vectorizer.fit_transform(X_train)
X_test_nb = tfidf_vectorizer.transform(X_test)

**Multinomial NB Model**

In [37]:
nb_model = MultinomialNB().fit(X_train_nb, y_train)

**Multinomial NB Model Evaluation**

In [38]:
nb_model_pred = nb_model.predict(X_test_nb)

In [58]:
nb_model_pred_train = nb_model.predict(X_train_nb)

In [39]:
print(classification_report(y_test, nb_model_pred, digits=3))

              precision    recall  f1-score   support

    negative      0.996     0.120     0.214      5636
        notr      0.988     0.918     0.952     17087
    positive      0.804     0.995     0.889     26053

    accuracy                          0.867     48776
   macro avg      0.929     0.677     0.685     48776
weighted avg      0.891     0.867     0.833     48776



### Binary Naive Bayes

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [41]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

In [42]:
from sklearn.metrics import classification_report

In [43]:
tfidf_vectorizer_binary = TfidfVectorizer(ngram_range=(1,2), binary=True)#unigrams and bigrams

**Binary TF-IDF Vectorization**

In [44]:
# for binary NB
X_train_nb_binary = tfidf_vectorizer_binary.fit_transform(X_train)
X_test_nb_binary = tfidf_vectorizer_binary.transform(X_test)

**Binary NB Model**

In [45]:
nb_model_binary = BernoulliNB().fit(X_train_nb_binary, y_train)

**Binary NB Model Evaluation**

In [46]:
nb_binary_model_pred = nb_model.predict(X_test_nb_binary)

In [57]:
nb_binary_model_pred_train = nb_model.predict(X_train_nb_binary)

In [47]:
print(classification_report(y_test, nb_binary_model_pred, digits=3))

              precision    recall  f1-score   support

    negative      0.996     0.123     0.219      5636
        notr      0.988     0.919     0.952     17087
    positive      0.805     0.995     0.890     26053

    accuracy                          0.867     48776
   macro avg      0.930     0.679     0.687     48776
weighted avg      0.891     0.867     0.834     48776



In [54]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def get_classification_report(y_true, y_pred, algorithm_name="Algorithm Model Name"):

    target_labels = ['positive', 'negative', 'notr'] 
    
    # A) Micro Average
    # In multi-class classification, Micro Precision = Micro Recall = Micro F1 = Accuracy
    micro_p, micro_r, micro_f, _ = precision_recall_fscore_support(
        y_true, y_pred, average='micro'
    )
    
    # B) Macro Average
    macro_p, macro_r, macro_f, _ = precision_recall_fscore_support(
        y_true, y_pred, average='macro'
    )
    
    # C) Class-Specific Metrics (Positive, Negative, Notr)
    # average=None returns the scores for each class in the order of 'labels'
    class_p, class_r, class_f, _ = precision_recall_fscore_support(
        y_true, y_pred, average=None, labels=target_labels
    )
    
    # We create a dictionary to hold the data
    report_data = {
        "Metric": [
            "Micro Average Recall", "Micro Average Precision", "Micro Average F-Score",
            "Macro Average Recall", "Macro Average Precision", "Macro Average F-Score",
            "Class: Positive Recall", "Class: Positive Precision", "Class: Positive F-Score",
            "Class: Negative Recall", "Class: Negative Precision", "Class: Negative F-Score",
            "Class: Notr Recall", "Class: Notr Precision", "Class: Notr F-Score"
        ],
        algorithm_name: [
            # Micro
            f"{micro_r:.3f}", f"{micro_p:.3f}", f"{micro_f:.3f}",
            # Macro
            f"{macro_r:.3f}", f"{macro_p:.3f}", f"{macro_f:.3f}",
            # Positive (Index 0 in target_labels)
            f"{class_r[0]:.3f}", f"{class_p[0]:.3f}", f"{class_f[0]:.3f}",
            # Negative (Index 1 in target_labels)
            f"{class_r[1]:.3f}", f"{class_p[1]:.3f}", f"{class_f[1]:.3f}",
            # Notr (Index 2 in target_labels)
            f"{class_r[2]:.3f}", f"{class_p[2]:.3f}", f"{class_f[2]:.3f}"
        ]
    }
    
    # Create DataFrame and set metric as the index
    df_report = pd.DataFrame(report_data)
    df_report = df_report.set_index("Metric")
    
    return df_report

In [55]:
cls_rep_MNB = get_classification_report(y_test, nb_model_pred, "Multinomial Naive Bayes")
cls_rep_BNB = get_classification_report(y_test, nb_binary_model_pred, "Binary Naive Bayes")

In [60]:
cls_rep_MNB_train = get_classification_report(y_train, nb_model_pred_train, "Multinomial Naive Bayes")
cls_rep_BNB_train = get_classification_report(y_train, nb_binary_model_pred_train, "Binary Naive Bayes")

**TEST**

In [59]:
final_table_test = pd.concat([cls_rep_MNB, cls_rep_BNB], axis=1)
print(final_table_test)

                          Multinomial Naive Bayes Binary Naive Bayes
Metric                                                              
Micro Average Recall                        0.867              0.867
Micro Average Precision                     0.867              0.867
Micro Average F-Score                       0.867              0.867
Macro Average Recall                        0.677              0.679
Macro Average Precision                     0.929              0.930
Macro Average F-Score                       0.685              0.687
Class: Positive Recall                      0.995              0.995
Class: Positive Precision                   0.804              0.805
Class: Positive F-Score                     0.889              0.890
Class: Negative Recall                      0.120              0.123
Class: Negative Precision                   0.996              0.996
Class: Negative F-Score                     0.214              0.219
Class: Notr Recall                

**TRAIN**

In [61]:
final_table_train = pd.concat([cls_rep_MNB_train, cls_rep_BNB_train], axis=1)
print(final_table_train)

                          Multinomial Naive Bayes Binary Naive Bayes
Metric                                                              
Micro Average Recall                        0.898              0.900
Micro Average Precision                     0.898              0.900
Micro Average F-Score                       0.898              0.900
Macro Average Recall                        0.735              0.740
Macro Average Precision                     0.945              0.945
Macro Average F-Score                       0.761              0.767
Class: Positive Recall                      0.997              0.997
Class: Positive Precision                   0.843              0.845
Class: Positive F-Score                     0.913              0.915
Class: Negative Recall                      0.243              0.257
Class: Negative Precision                   0.999              0.999
Class: Negative F-Score                     0.391              0.409
Class: Notr Recall                

## ANN MODELİNG

In [None]:
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab') #necessary for tokenization method

In [None]:
# Tokenizing sentences
tokenized_sentences_train = [word_tokenize(sentence.lower()) for sentence in df_train['text']]
tokenized_sentences_test = [word_tokenize(sentence.lower()) for sentence in df_test['text']]

In [None]:
import pickle
# Save training tokens
with open('data/tokenized_train.pkl', 'wb') as f: # 'wb' stands for write binary
    pickle.dump(tokenized_sentences_train, f)

# Save test tokens
with open('data/tokenized_test.pkl', 'wb') as f:
    pickle.dump(tokenized_sentences_test, f)

### SKIPGRAM-ANN MODEL
- Skipgram has better performance modeling semantics, which we desperately need in this case

In [None]:
from collections import Counter

In [None]:
# Hyperparameters
EMBEDDING_DIM = 150      # Increased from 5 to 100 for better representation
WINDOW_SIZE = 2
MAX_VOCAB_SIZE = 20000   # Limit vocabulary to top 20k words to prevent OOM errors
BATCH_SIZE = 1024         
NUM_EPOCHS = 10       
LEARNING_RATE = 0.001

In [None]:
sentences = tokenized_sentences_train

In [None]:
# Flatten the list of sentences to a single list of words
all_words = [word for sentence in sentences for word in sentence]

In [None]:
# Select only the most common words to keep the vocabulary size manageable
# We reserve index 0 for <UNK>, so we take MAX_VOCAB_SIZE - 1
word_counts = Counter(all_words).most_common(MAX_VOCAB_SIZE - 1)

In [None]:
# Create vocabulary mapping: <UNK> is always at index 0
word_to_ix = {"<UNK>": 0}
for word, count in word_counts:
    word_to_ix[word] = len(word_to_ix)

In [None]:
# Create reverse mapping (Index -> Word)
ix_to_word = {ix: word for word, ix in word_to_ix.items()}
VOCAB_SIZE = len(word_to_ix)

print(f"Total words scanned: {len(all_words)}")
print(f"Final Vocabulary Size: {VOCAB_SIZE}")

In [None]:
# Generate Skip-gram Pairs (Input -> Target)
inputs = []
targets = []

print("Generating training pairs...")
for sentence in sentences:
    # Convert words to indices. If a word is not in top 20k, it becomes 0 (<UNK>)
    sentence_indices = [word_to_ix.get(word, 0) for word in sentence]
    
    for i in range(len(sentence_indices)):
        target_word_idx = sentence_indices[i] # Center word
        
        # Optimization: If the target word is unknown (<UNK>), 
        # we skip training on it to avoid noise.
        if target_word_idx == 0:
            continue
            
        # Define context window
        start_idx = max(0, i - WINDOW_SIZE)
        end_idx = min(len(sentence_indices), i + WINDOW_SIZE + 1)
        
        for j in range(start_idx, end_idx):
            if i != j: # Skip the target word itself
                context_word_idx = sentence_indices[j]
                inputs.append(target_word_idx)
                targets.append(context_word_idx)

print(f"Total training pairs generated: {len(inputs)}")

#### Tensorflow

In [None]:
import os
import site

try:
    site_packages = site.getsitepackages()[0]
    nvidia_path = os.path.join(site_packages, 'nvidia')
    
    cudnn_path = os.path.join(nvidia_path, 'cudnn', 'lib')
    cuda_path = os.path.join(nvidia_path, 'cuda_runtime', 'lib')
    
    old_ld = os.environ.get('LD_LIBRARY_PATH', '')
    os.environ['LD_LIBRARY_PATH'] = f"{cudnn_path}:{cuda_path}:{old_ld}"
    
    # This specific flag often fixes 'DNN library initialization failed' errors
    # by disabling some auto-tuning features that might crash on certain GPUs.
    os.environ['TF_CUDNN_USE_AUTOTUNE'] = '0' 
    
    print("NVIDIA Library paths arranged successfully")
    
except Exception as e:
    print(f"Path warning: {e}")

# --- 2. IMPORT TENSORFLOW AND CONFIGURE GPU MEMORY ---
import tensorflow as tf
import numpy as np
from tensorflow.keras import layers, models, optimizers, callbacks

print(f"TensorFlow Version: {tf.__version__}")

# GPU Memory Growth
# This is CRITICAL. It prevents TensorFlow from hogging all VRAM at start-up.
# Must be run immediately after importing TF.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU Detected and memory growth set: {gpus}")
    except RuntimeError as e:
        print(f"GPU Error: {e}")
else:
    print("No GPU detected.")

In [None]:
# Convert lists to NumPy arrays (TensorFlow prefers typed arrays)
inputs = np.array(inputs, dtype=np.int32)
targets = np.array(targets, dtype=np.int32)

In [None]:
# Use tf.data.Dataset for efficient Batching and Prefetching on GPU
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))

# Shuffle buffer size should ideally be >= number of training samples
# Prefetch allows the CPU to prepare the next batch while GPU processes the current one
dataset = dataset.shuffle(buffer_size=1024).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

In [None]:
def build_skipgram_model(vocab_size, embedding_dim):
    # Input layer: Receives a single integer (word index)
    input_word = layers.Input(shape=(1,), name="target_word_input")
    
    # Embedding layer: Converts index to dense vector
    # input_dim: Vocabulary size
    # output_dim: Size of the vector space
    x = layers.Embedding(input_dim=vocab_size, 
                         output_dim=embedding_dim, 
                         input_length=1, 
                         name="embedding_layer")(input_word)
    
    # Flatten: Converts (Batch, 1, Dim) -> (Batch, Dim)
    x = layers.Flatten()(x)
    
    # Output layer: Predicts probability for every word in vocabulary
    # Softmax ensures output sums to 1 (probability distribution)
    output = layers.Dense(vocab_size, activation='softmax', name="context_prediction")(x)
    
    model = models.Model(inputs=input_word, outputs=output)
    return model

In [None]:
class WordSimilarityCallback(callbacks.Callback):
    """
    A custom callback to print the most similar words to a specific query word
    at the end of each epoch. This helps in monitoring the semantic learning progress.
    """
    def __init__(self, test_word, word_to_ix, ix_to_word, top_k=5):
        super(WordSimilarityCallback, self).__init__()
        self.test_word = test_word
        self.word_to_ix = word_to_ix
        self.ix_to_word = ix_to_word
        self.top_k = top_k

    def on_epoch_end(self, epoch, logs=None):
        # Check if the test word is in vocabulary
        if self.test_word not in self.word_to_ix:
            return

        # 1. Retrieve the weights from the embedding layer
        embedding_layer = self.model.get_layer("embedding_layer")
        embeddings = embedding_layer.get_weights()[0]
        
        # 2. Get the vector for the test word
        test_idx = self.word_to_ix[self.test_word]
        test_vector = embeddings[test_idx]
        
        # 3. Calculate Cosine Similarity
        # Normalize embeddings and the test vector to unit length
        norm_embeddings = tf.math.l2_normalize(embeddings, axis=1)
        norm_test_vector = tf.math.l2_normalize(test_vector, axis=0)
        
        # Dot product of normalized vectors equals cosine similarity
        cosine_similarities = tf.tensordot(norm_embeddings, norm_test_vector, axes=1)
        
        # 4. Find the indices of the words with the highest similarity scores
        # We take top_k + 1 because the most similar word is the word itself (score=1.0)
        top_indices = tf.math.top_k(cosine_similarities, k=self.top_k + 1).indices.numpy()
        
        # 5. Print the results
        closest_words = [self.ix_to_word[idx] for idx in top_indices if idx != test_idx]
        print(f"\n[Validation] End of Epoch {epoch+1} - Closest words to '{self.test_word}':")
        print(f"  -> {', '.join(closest_words)}")

In [None]:
# Initialize the model
model = build_skipgram_model(VOCAB_SIZE, EMBEDDING_DIM)

In [None]:
top_k_metric = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name='top_5_acc')

In [None]:
# We use 'sparse_categorical_crossentropy' because our targets are integers (indexes),
# not one-hot encoded vectors. This saves memory and is computationally efficient.
model.compile(optimizer=optimizers.Adam(learning_rate=LEARNING_RATE),
              loss='sparse_categorical_crossentropy',
              metrics=[top_k_metric])

In [None]:
# Define the custom callback (e.g., check neighbors of "learning")
# Note: Ensure the test_word exists in your training data
visual_callback = WordSimilarityCallback(test_word="araba", 
                                         word_to_ix=word_to_ix, 
                                         ix_to_word=ix_to_word,
                                         top_k=3)

In [None]:
# Print model architecture
model.summary()

In [None]:
# Start Training
print("\nStarting Training...")
history = model.fit(dataset, epochs=NUM_EPOCHS, callbacks=[visual_callback])
print("Training Complete.")

In [None]:
# Get weights from the embedding layer
# The shape will be (VOCAB_SIZE, EMBEDDING_DIM)
vectors = model.get_layer("embedding_layer").get_weights()[0]

#### Torch

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
import json


In [None]:
from tqdm import tqdm
import numpy as np

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print(f'Using device: {device}')

In [None]:
# Skip-gram Model
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word_idx):
        # target_word_idx shape: [batch_size]
        embed = self.embeddings(target_word_idx) # shape: [batch_size, embedding_dim]
        output = self.linear(embed)              # shape: [batch_size, vocab_size]
        log_probs = torch.log_softmax(output, dim=1)
        return log_probs

In [None]:
# transforming into torch tensors for compatibility with data loaders which will be implementing batching also
inputs_tensor = torch.tensor(inputs, dtype=torch.long)
targets_tensor = torch.tensor(targets, dtype=torch.long)

In [None]:
# defining data loader for batching
train_data = TensorDataset(inputs_tensor, targets_tensor)
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Initialize model
skipgram_model = SkipGramModel(VOCAB_SIZE, EMBEDDING_DIM).to(device) # to cuda

In [None]:
# Define loss function and optimizer
criterion = nn.NLLLoss()
skipgram_optimizer = optim.Adam(skipgram_model.parameters(), lr=LEARNING_RATE)

In [None]:
print("Training Skip-gram model (Conceptual)...")
for epoch in tqdm(range(NUM_EPOCHS), desc="Epochs", position=0, leave=True):
    total_loss = 0
    
    # DataLoader fetches batch by batch
    for batch_inputs, batch_targets in tqdm(train_loader, desc=f"Epoch {epoch+1} Batches", leave=False):
        
        # Place data on GPU
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        
        # Zeroing grads
        skipgram_optimizer.zero_grad()
        
        # forward prop
        log_probs = skipgram_model(batch_inputs)
        
        # loss calculation
        loss = criterion(log_probs, batch_targets)
        
        # backward prop
        loss.backward()
        skipgram_optimizer.step()
        
        total_loss += loss.item()
    
    print(f"Epoch {epoch+1}/{NUM_EPOCHS}, Loss: {total_loss/len(train_loader):.4f}")
print("Skip-gram training complete (Conceptual).")

In [None]:
VOCAB_PATH = "data/word_to_ix.json"
with open(VOCAB_PATH, 'w', encoding='utf-8') as f:
    json.dump(word_to_ix, f, ensure_ascii=False)

In [None]:
SKIPGRAM_PATH = "model/skipgram_model.pth"
torch.save(skipgram_model.state_dict(), SKIPGRAM_PATH)

**ANN PREDICTION MODEL**

In [None]:
# Skip-gram Model
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, target_word_idx):
        # target_word_idx shape: [batch_size]
        embed = self.embeddings(target_word_idx) # shape: [batch_size, embedding_dim]
        output = self.linear(embed)              # shape: [batch_size, vocab_size]
        log_probs = torch.log_softmax(output, dim=1)
        return log_probs

In [None]:
VOCAB_SIZE = 20000
EMBEDDING_DIM = 150
device = torch.device('cuda')

In [None]:
import pickle

In [None]:
# reload tokens
with open('data/tokenized_train.pkl', 'rb') as f: # 'rb' stands for read binary
    tokenized_sentences_train = pickle.load(f)

with open('data/tokenized_test.pkl', 'rb') as f:
    tokenized_sentences_test = pickle.load(f)

In [None]:
# reload index dictionary
with open("data/word_to_ix.json", "r", encoding="utf-8") as f:
    word_to_ix = json.load(f)

In [None]:
# Re-initialize the SkipGram model structure (Must match training config)
loaded_skipgram = SkipGramModel(len(word_to_ix), EMBEDDING_DIM).to(device)

In [None]:
# Load the trained weights
loaded_skipgram.load_state_dict(torch.load("model/skipgram_model.pth"))
loaded_skipgram.eval() # Set to evaluation mode (no gradient calculation for this part)

In [None]:
def get_sentence_embedding(sentence_tokens, model, word_to_ix, embedding_dim, device):
    """
    Converts a list of tokens (sentence) into a single vector by averaging
    the word embeddings of its constituent words.
    """
    # Convert words to indices, using 0 (<UNK>) for unknown words
    indices = [word_to_ix.get(word, 0) for word in sentence_tokens]
    
    # Convert to tensor and move to device
    indices_tensor = torch.tensor(indices, dtype=torch.long).to(device)
    
    # Get embeddings for these words from the pre-trained Skip-gram model
    with torch.no_grad(): # We don't need gradients here just the values
        word_vectors = model.embeddings(indices_tensor)
    
    # Handle empty sentences or sentences with no known words
    if len(word_vectors) == 0:
        return np.zeros(embedding_dim)
    
    # Average the vectors to get one vector per sentence
    # shape: [sentence_length, embedding_dim] -> [embedding_dim]
    sentence_vector = torch.mean(word_vectors, dim=0).cpu().numpy()
    
    return sentence_vector

In [None]:
X_train_list = [get_sentence_embedding(s, loaded_skipgram, word_to_ix, EMBEDDING_DIM, device) 
                for s in tokenized_sentences_train]
X_test_list = [get_sentence_embedding(s, loaded_skipgram, word_to_ix, EMBEDDING_DIM, device) 
               for s in tokenized_sentences_test]

In [None]:
# Convert lists to NumPy arrays first then to PyTorch Tensors
X_train_tensor = torch.tensor(np.array(X_train_list), dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(np.array(X_test_list), dtype=torch.float32).to(device)

In [None]:
# Map text labels to integers
label_mapping = {
    'negative': 0,
    'notr': 1,     
    'positive': 2
}

In [None]:
# Apply mapping to DataFrames
df_train['label_idx'] = df_train['label'].map(label_mapping)
df_test['label_idx'] = df_test['label'].map(label_mapping)

In [None]:
# Prepare Target Tensors
# IMPORTANT: For Multi-class CrossEntropyLoss, targets must be 1D LongTensor (int64)
y_train_tensor = torch.tensor(df_train['label_idx'].values, dtype=torch.long).to(device)
y_test_tensor = torch.tensor(df_test['label_idx'].values, dtype=torch.long).to(device)

In [None]:
class SentimentClassifierMLP(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_classes):
        super(SentimentClassifierMLP, self).__init__()
        
        # input -> hidden
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        
        # Activation ReLu
        self.relu = nn.ReLU()
        
        # Dropout
        self.dropout = nn.Dropout(p=0.3) 
        
        # hidden -> output
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        return out

In [None]:
# Initialize Model
HIDDEN_DIM = 64
NUM_CLASSES = 3
sentiment_model = SentimentClassifierMLP(EMBEDDING_DIM, HIDDEN_DIM, NUM_CLASSES).to(device)

In [None]:
BATCH_SIZE = 512

model_train_data = TensorDataset(X_train_tensor, y_train_tensor)
model_train_loader = DataLoader(model_train_data, batch_size=BATCH_SIZE, shuffle=True)

model_test_data = TensorDataset(X_test_tensor, y_test_tensor)
model_test_loader = DataLoader(model_test_data, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# CrossEntropyLoss is used for multi-class classification
criterion_cls = nn.CrossEntropyLoss()
optimizer_cls = optim.Adam(sentiment_model.parameters(), lr=0.01)

In [None]:
EPOCHS_CLS = 50

# Training Loop with Batches
print("Starting Batch Training for Sentiment Analysis...")

for epoch in tqdm(range(EPOCHS_CLS), desc="Epochs", position=0, leave=True):
    sentiment_model.train() # Set model to training mode
    total_loss = 0
    
    # Batch Loop
    for batch_inputs, batch_targets in model_train_loader:
        
        # Move data to GPU/CPU
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device) # Must be torch.long dtype
        
        # Zero gradients
        optimizer_cls.zero_grad()
        
        # Forward prop
        outputs = sentiment_model(batch_inputs) # Shape: [Batch_Size, 3]
        
        # Calculate loss
        loss = criterion(outputs, batch_targets)
        
        # Backward prop
        loss.backward()
        optimizer_cls.step()
        
        # Accumulate loss
        total_loss += loss.item()
    
    # Calculate average loss for the epoch
    avg_loss = total_loss / len(model_train_loader)
    
    # Print every 10 epochs
    if (epoch + 1) % 10 == 0:
        tqdm.write(f"Epoch {epoch+1}/{EPOCHS_CLS}, Average Loss: {avg_loss:.4f}")

print("Training complete.")

In [None]:
#Evaluation Loop with Batches
print("\nStarting Evaluation on Test Set...")

sentiment_model.eval() # Set model to evaluation mode
total_correct = 0
total_samples = 0

with torch.no_grad(): # Disable gradient calculation for efficiency
    for batch_inputs, batch_targets in model_test_loader:
        
        # Move data to device
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        
        # Forward pass
        outputs = sentiment_model(batch_inputs)
        
        # Get predictions: Find the index with the highest score
        # torch.max returns (values, indices) -> we need indices (predicted class)
        _, predicted = torch.max(outputs, 1)
        
        # Update counts
        total_samples += batch_targets.size(0) # Batch size
        total_correct += (predicted == batch_targets).sum().item()

# Calculate final accuracy
accuracy = (total_correct / total_samples) * 100
print(f"Final Test Accuracy: {accuracy:.2f}%")

In [None]:
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

EPOCHS_CLS = 50

print("Starting Training with Live Validation...")

for epoch in tqdm(range(EPOCHS_CLS), desc="Training Progress"):
    

    # TRAINING PHASE
    sentiment_model.train() # Switch to training mode
    total_train_loss = 0
    correct_train = 0
    total_train_samples = 0
    
    for batch_inputs, batch_targets in model_train_loader:
        # Move to device
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)
        
        # Zero gradients
        optimizer_cls.zero_grad()
        
        # Forward pass
        outputs = sentiment_model(batch_inputs)
        loss = criterion_cls(outputs, batch_targets)
        
        # Backward pass
        loss.backward()
        optimizer_cls.step()
        
        # Track Loss & Accuracy
        total_train_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        correct_train += (predicted == batch_targets).sum().item()
        total_train_samples += batch_targets.size(0)
        
    # Calculate average loss and accuracy for this epoch
    avg_train_loss = total_train_loss / len(model_train_loader)
    avg_train_acc = correct_train / total_train_samples
    
    # Append to history
    train_losses.append(avg_train_loss)
    train_accuracies.append(avg_train_acc)

    # VALIDATION PHASE
    sentiment_model.eval() # Switch to evaluation mode
    total_val_loss = 0
    correct_val = 0
    total_val_samples = 0
    
    with torch.no_grad(): # No gradient needed for validation
        for batch_inputs, batch_targets in model_test_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            
            outputs = sentiment_model(batch_inputs)
            loss = criterion_cls(outputs, batch_targets)
            
            total_val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct_val += (predicted == batch_targets).sum().item()
            total_val_samples += batch_targets.size(0)
            
    # Calculate average validation loss and accuracy
    avg_val_loss = total_val_loss / len(model_test_loader)
    avg_val_acc = correct_val / total_val_samples
    
    # Append to history
    val_losses.append(avg_val_loss)
    val_accuracies.append(avg_val_acc)
    
    # Optional: Print stats every 5 epochs
    if (epoch + 1) % 5 == 0:
        tqdm.write(f"Epoch {epoch+1} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {avg_val_acc:.4f}")

print("Training Complete.")

In [None]:
SKIPGRAM_ANN_PATH = "model/skipgram_ann_model.pth"
torch.save(sentiment_model.state_dict(), SKIPGRAM_ANN_PATH)

In [None]:
import matplotlib.pyplot as plt

# --- Plotting the Learning Curves ---

plt.figure(figsize=(12, 5))

# Plot 1: Loss Curve (Overfitting Detection)
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss', color='blue')
plt.plot(val_losses, label='Validation (Test) Loss', color='orange')
plt.title('Training vs Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

# Plot 2: Accuracy Curve
plt.subplot(1, 2, 2)
plt.plot(train_accuracies, label='Train Accuracy', color='green')
plt.plot(val_accuracies, label='Validation (Test) Accuracy', color='red')
plt.title('Training vs Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

### FASTTEXT-ANN MODEL