## IMPORT LIBRARY

In [1]:
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm
from gensim.models import Word2Vec, KeyedVectors
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

## LOADING DATASET

In [8]:
# Load the preprocessed dataset from a local parquet file into `df`.
df = pd.read_parquet('./dataset/svm_data_preprocessed.parquet')

In [2]:
# Load a single parquet part file from ./svm_data_lemma into `df_big_data`.
df_big_data = pd.read_parquet('./svm_data_lemma/part-00000-7f47dc80-a5b2-4bb3-b23a-c4a18142a727-c000.snappy.parquet')

In [3]:
# Number of non-null values per column.
df_big_data.count()

clean_text    14892505
sentiment      4722234
lemma_text     4722117
dtype: int64

In [4]:
# Element-wise null mask of the whole frame (True where a value is missing).
df_big_data.isnull()

Unnamed: 0,clean_text,sentiment,lemma_text
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False
4,False,False,False
...,...,...,...
14892617,False,True,True
14892618,False,False,False
14892619,False,True,True
14892620,False,True,True


In [5]:
# Drop every row that has at least one missing value. This rebinds
# `df_big_data`, so the unfiltered frame is no longer available afterwards.
df_big_data = df_big_data.dropna()

In [5]:
# Number of non-null values per column of the small dataset.
df.count()

clean_text    60000
lemma_text    60000
pos_tags      60000
sentiment     60000
dtype: int64

In [6]:
# Preview the first rows of the small dataset.
df.head()

Unnamed: 0,clean_text,lemma_text,pos_tags,sentiment
0,if you decide to eat here just be aware it is ...,if you decide to eat here just be aware it be ...,"[[if, IN], [you, PRP], [decide, VBP], [to, TO]...",1
1,a couple friends and i stopped by for some lat...,a couple friend and i stop by for some late ni...,"[[a, DT], [couple, JJ], [friends, NNS], [and, ...",1
2,sometimes this food is very very good unfortu...,sometimes this food be very very good unfortun...,"[[sometimes, RB], [this, DT], [food, NN], [is,...",1
3,after trying a few ramen places with crazy var...,after try a few ramen place with crazy variety...,"[[after, IN], [trying, VBG], [a, DT], [few, JJ...",1
4,great food terrible customer service ive been ...,great food terrible customer service ive be th...,"[[great, JJ], [food, NN], [terrible, JJ], [cus...",1


## Model's Architecture

**Glove Model**

In [7]:
def load_glove_model(file_path):
    """Load GloVe word vectors from a whitespace-separated text file.

    Every non-empty line is read as ``<word> <v1> <v2> ...``: the first
    field is the word and the remaining fields are parsed as float32
    components of its vector. Blank or whitespace-only lines are skipped.

    Parameters
    ----------
    file_path : str or path-like
        Path to the GloVe text file; it is opened as UTF-8.

    Returns
    -------
    dict
        Mapping of word -> 1-D ``np.float32`` array. If a word occurs more
        than once in the file, the last occurrence wins.
    """
    glove_model = {}
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.split()
            # A blank line (for example a stray newline at the end of the
            # file) has no fields; skip it instead of failing on parts[0].
            if not parts:
                continue
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            glove_model[word] = vector
    print(f"Loaded {len(glove_model):,} words from {file_path}")
    return glove_model

# Read the glove.6B.300d text file from the local ./glove directory into a
# {word: vector} dict.
glove_model = load_glove_model("./glove/glove.6B.300d.txt")

Loaded 400,000 words from ./glove/glove.6B.300d.txt


In [8]:
def glove_to_keyedvectors(glove_model):
    """Wrap a ``{word: vector}`` dict in a gensim ``KeyedVectors`` object.

    The vector size is taken from the length of the first value in the
    dict; all words and vectors are then added in dict iteration order.

    Parameters
    ----------
    glove_model : dict
        Mapping of word -> vector, as returned by ``load_glove_model``.

    Returns
    -------
    KeyedVectors
        The populated keyed-vector store.
    """
    first_vector = next(iter(glove_model.values()))
    keyed = KeyedVectors(vector_size=len(first_vector))

    words = list(glove_model.keys())
    vectors = list(glove_model.values())
    keyed.add_vectors(words, vectors)
    return keyed

# Convert the in-memory GloVe dict into a KeyedVectors object.
glove_kv = glove_to_keyedvectors(glove_model)

Code cell ini memuat word embedding GloVe, yaitu vektor kata yang sudah dilatih sebelumnya, dari file teks ke dalam memori. Setiap kata dipetakan ke vektor berdimensi 300. Setelah itu, data GloVe diubah ke format KeyedVectors untuk training SVM.

**Word2Vec GoogleNews Model**

In [4]:
# Load the pre-trained GoogleNews Word2Vec vectors from the binary file.
# `limit=500_000` caps how many word vectors are read from the file.
w2v_pre = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin",
    binary=True,
    limit=500_000
)
# Report the vocabulary size that was actually loaded.
print("Loaded:", len(w2v_pre.key_to_index), "words")

Loaded: 500000 words


In [5]:
def vectorize_sentence_pretrained(sentence, model, dim=300):
    """Average the embeddings of the in-vocabulary words of a sentence.

    Parameters
    ----------
    sentence : str
        Text that is tokenized with ``str.split()`` (whitespace).
    model : KeyedVectors-like
        Object exposing ``key_to_index`` for membership checks and
        list-based lookup via ``model[list_of_words]``.
    dim : int, default 300
        Length of the zero vector returned when no word is known.

    Returns
    -------
    np.ndarray
        Mean of the known words' vectors along axis 0, or ``np.zeros(dim)``
        (NumPy's default float dtype) when no word is in the vocabulary.
    """
    vocabulary = model.key_to_index
    known_tokens = [token for token in sentence.split() if token in vocabulary]
    if known_tokens:
        return np.mean(model[known_tokens], axis=0)
    # No token is covered by the embedding model: fall back to zeros.
    return np.zeros(dim)

# Register `progress_apply` on pandas objects so the apply below shows a
# tqdm progress bar.
tqdm.pandas()
# One averaged GoogleNews embedding per row of `lemma_text`, stacked
# row-wise into a single 2-D array.
X_pretrained = np.vstack(df['lemma_text'].progress_apply(
    lambda s: vectorize_sentence_pretrained(s, w2v_pre)
))

100%|██████████| 60000/60000 [00:18<00:00, 3305.89it/s]


Code cell ini memuat word embedding Word2Vec pretrained dari model Google News yang berisi vektor untuk kata-kata umum. Fungsi vectorize_sentence_pretrained kemudian mengubah setiap kalimat menjadi vektor dengan cara menghitung rata-rata dari vektor kata-kata yang dikenal dalam model. Proses ini diterapkan pada seluruh data teks.

**Word2Vec Small Sample Self Dictionary Model**

In [None]:
# Tokenize every lemmatized review on whitespace; the full token list is
# held in memory.
sentences = [text.split() for text in df['lemma_text']]

# Train Word2Vec on the small dataset itself.
# NOTE(review): this rebinds the shared name `model`, which the big-data
# section further down also assigns.
model = Word2Vec(
    sentences=sentences,
    vector_size=300,  # embedding dimensionality
    window=5,         # context window size
    min_count=2,      # ignore words with total frequency below 2
    workers=4,        # training threads
    sg=1              # 1 = skip-gram
)

# Persist the trained model to the working directory.
model.save("yelp_word2vec.model")

In [None]:
# Keep only the word vectors of whichever Word2Vec `model` is currently bound.
w2v = model.wv
def vectorize_sentence(sentence, model, dim=300):
    """Average the embeddings of the words of a sentence known to `model`.

    Parameters
    ----------
    sentence : str
        Text that is tokenized with ``str.split()`` (whitespace).
    model : KeyedVectors-like
        Object supporting ``word in model`` and list-based lookup via
        ``model[list_of_words]``.
    dim : int, default 300
        Length of the zero vector returned when no word is known.

    Returns
    -------
    np.ndarray
        Mean of the known words' vectors along axis 0, or ``np.zeros(dim)``
        (NumPy's default float dtype) when no word is known.
    """
    known_tokens = [token for token in sentence.split() if token in model]
    if known_tokens:
        return np.mean(model[known_tokens], axis=0)
    # Nothing in the sentence is covered by the embedding model.
    return np.zeros(dim)

# Register `progress_apply` on pandas objects (shows a tqdm progress bar).
tqdm.pandas()
# One averaged embedding per row of `lemma_text`, stacked into a 2-D array.
X_vectors = np.vstack(df['lemma_text'].progress_apply(lambda s: vectorize_sentence(s, w2v)))

100%|██████████| 60000/60000 [00:14<00:00, 4214.54it/s]


In [14]:
# Sanity check: (number of rows, embedding dimension).
X_vectors.shape

(60000, 300)

Code cell ini melatih model Word2Vec menggunakan dataset sendiri yang sudah diproses menjadi kumpulan kalimat.
Parameter vector_size=300 menentukan ukuran embedding, window=5 mengatur lebar konteks kata, min_count=2 mengabaikan kata yang muncul kurang dari dua kali, dan sg=1 berarti menggunakan skip-gram training. Fungsi vectorize_sentence mengubah setiap kalimat menjadi satu vektor dengan menghitung rata-rata dari vektor kata yang valid dalam model.

**Word2Vec Big Self Dictionary Model**

In [6]:
class SentenceGenerator:
    """Re-iterable stream of whitespace-tokenized sentences.

    Each call to ``iter()`` starts a fresh pass over the wrapped collection
    and yields ``text.split()`` for every item, so the token lists are
    produced one at a time rather than stored all at once.
    """

    def __init__(self, series):
        # Iterable of strings to tokenize lazily.
        self.series = series

    def __iter__(self):
        for document in self.series:
            tokens = document.split()
            yield tokens


# Stream tokenized sentences from the big dataset instead of building one
# large list. A re-iterable object is used because Word2Vec needs to pass
# over the corpus more than once.
sentences = SentenceGenerator(df_big_data['lemma_text'])

# Train Word2Vec on the big dataset.
# NOTE(review): this rebinds `sentences` and `model`, which the
# small-dataset section above also assigns.
model = Word2Vec(
    sentences=sentences,
    vector_size=300,  # embedding dimensionality
    window=5,         # context window size
    min_count=2,      # ignore words with total frequency below 2
    workers=12,       # training threads
    sg=1              # 1 = skip-gram
)

# Persist the trained model to the working directory.
model.save("yelp_word2vec_big_data.model")

In [7]:
# Reload the saved big-data Word2Vec model from disk (rebinds `model`).
model = Word2Vec.load("yelp_word2vec_big_data.model")

In [9]:
# Keep only the word vectors of whichever Word2Vec `model` is currently bound.
w2v = model.wv
def vectorize_sentence(sentence, model, dim=300):
    """Average the embeddings of the words of a sentence known to `model`.

    Parameters
    ----------
    sentence : str
        Text that is tokenized with ``str.split()`` (whitespace).
    model : KeyedVectors-like
        Object supporting ``word in model`` and list-based lookup via
        ``model[list_of_words]``.
    dim : int, default 300
        Length of the zero vector returned when no word is known.

    Returns
    -------
    np.ndarray
        Mean of the known words' vectors along axis 0, or ``np.zeros(dim)``
        (NumPy's default float dtype) when no word is known.
    """
    known_tokens = [token for token in sentence.split() if token in model]
    if known_tokens:
        return np.mean(model[known_tokens], axis=0)
    # Nothing in the sentence is covered by the embedding model.
    return np.zeros(dim)

# Register `progress_apply` on pandas objects (shows a tqdm progress bar).
tqdm.pandas()
# One averaged embedding per row of `lemma_text`, stacked into a 2-D array.
# NOTE(review): reuses the name `X_vectors`, replacing any value assigned by
# the small-dictionary section.
X_vectors = np.vstack(df['lemma_text'].progress_apply(lambda s: vectorize_sentence(s, w2v)))

100%|██████████| 60000/60000 [00:11<00:00, 5370.46it/s]


**TF IDF**

In [14]:
def build_tfidf_features(
    X_train,
    X_test,
    max_features=5000,
    ngram_range=(1, 2),
    sublinear_tf=True,
    stop_words='english',
):
    """Fit a TF-IDF vectorizer on the training texts and transform both splits.

    The vectorizer is fitted on `X_train` only; `X_test` is transformed with
    the already-fitted vocabulary and weights.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Training and test documents.
    max_features, ngram_range, sublinear_tf, stop_words
        Forwarded unchanged to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, train_matrix, test_matrix)``.
    """
    vectorizer = TfidfVectorizer(
        stop_words=stop_words,
        sublinear_tf=sublinear_tf,
        ngram_range=ngram_range,
        max_features=max_features,
    )

    # Learn the vocabulary and IDF weights from the training split only.
    train_matrix = vectorizer.fit_transform(X_train)
    test_matrix = vectorizer.transform(X_test)

    return vectorizer, train_matrix, test_matrix

Cell ini mendefinisikan pembuatan fitur TF-IDF dari data teks pelatihan dan pengujian. Fungsi build_tfidf_features membuat vectorizer dengan parameter seperti max_features=5000 untuk membatasi jumlah fitur, ngram_range=(1,2) agar menggunakan unigram dan bigram, sublinear_tf=True untuk menyeimbangkan bobot kata yang sering muncul, serta stop_words='english' untuk menghapus stopwords. Selanjutnya, fit_transform() digunakan untuk melatih vectorizer dan mengubah data train, sedangkan transform() digunakan untuk mengubah data test. Hasil akhirnya mengembalikan vectorizer beserta representasi TF-IDF dari keduanya.

## SPLIT DATASET

**Word2Vec GoogleNews Model**

In [6]:

# 80/20 train/test split of the GoogleNews features with a fixed seed.
y = df['sentiment']
X_train_pre, X_test_pre, y_train_pre, y_test_pre = train_test_split(
    X_pretrained, y, test_size=0.2, random_state=42
)

**Word2Vec Small Self dictionary**

In [15]:
# 80/20 train/test split of whatever `X_vectors` currently holds.
# NOTE(review): `X_train`/`X_test`/`y_train`/`y_test` are also assigned by
# other sections; which features they hold depends on the last cell run.
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors, y, test_size=0.2, random_state=42
)

**Word2Vec Big Self dictionary**

In [10]:
# 80/20 train/test split of whatever `X_vectors` currently holds.
# NOTE(review): same variable names as the small-dictionary split; running
# this cell replaces those values.
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors, y, test_size=0.2, random_state=42
)

**TF IDF**

In [13]:
# Split the raw lemmatized text (not vectors); TF-IDF is fitted later on the
# training part only.
X = df['lemma_text']
y = df['sentiment']

X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X, y, test_size=0.2, random_state=42
)


**Glove**

In [10]:
# Text and labels used as input for the GloVe features.
X = df['lemma_text']
y = df['sentiment']

In [11]:
def sentence_to_glove_vector(sentence, glove_kv):
    """Average the GloVe vectors of the in-vocabulary words of a sentence.

    Parameters
    ----------
    sentence : str
        Text that is tokenized with ``str.split()`` (whitespace).
    glove_kv : KeyedVectors-like
        Object supporting ``word in glove_kv``, ``glove_kv[word]`` and a
        ``vector_size`` attribute.

    Returns
    -------
    np.ndarray
        Element-wise mean of the known words' vectors, or
        ``np.zeros(glove_kv.vector_size)`` when no word is known.
    """
    in_vocab = [token for token in sentence.split() if token in glove_kv]
    if in_vocab:
        stacked = [glove_kv[token] for token in in_vocab]
        return np.mean(stacked, axis=0)
    # No token has a GloVe vector: fall back to zeros of the embedding size.
    return np.zeros(glove_kv.vector_size)

# One averaged GloVe vector per text in `X`, with a tqdm progress bar.
X_vectors_glove = np.array([sentence_to_glove_vector(text, glove_kv) for text in tqdm(X)])

100%|██████████| 60000/60000 [00:11<00:00, 5011.59it/s]


In [12]:
# 80/20 train/test split of the GloVe features with a fixed seed.
# NOTE(review): this rebinds the generic `y_train`/`y_test` names that the
# Word2Vec self-dictionary splits also assign.
X_train_glove, X_test_glove, y_train, y_test = train_test_split(
    X_vectors_glove, y, test_size=0.2, random_state=42
)

Code cell ini mengubah setiap kalimat menjadi representasi vektor menggunakan model GloVe. Fungsi sentence_to_glove_vector memeriksa setiap kata dalam kalimat dan hanya mengambil kata yang ada di dalam glove_kv, lalu menghitung rata-rata vektor kata-kata tersebut. Jika tidak ada kata yang cocok, fungsi akan mengembalikan vektor nol dengan ukuran sesuai dimensi embedding.

## Training

**Word2Vec GoogleNews Dictionary**

In [8]:
# Linear SVM on the GoogleNews Word2Vec features.
svm_pretrained = LinearSVC(C=1.0, max_iter=3000)
svm_pretrained.fit(X_train_pre, y_train_pre)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


**Word2Vec Small Self Dictionary**

In [16]:
# Linear SVM on whatever `X_train`/`y_train` currently hold.
# NOTE(review): presumably the small self-trained Word2Vec features, going by
# the section heading; this depends on which split cell ran last.
svm_model = LinearSVC(C=1.0, max_iter=3000)
svm_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


**Word2Vec Big Self Dictionary**

In [11]:
# Linear SVM on whatever `X_train`/`y_train` currently hold.
# NOTE(review): rebinds `svm_model`, the same name used by the
# small-dictionary section; presumably trained on the big-dictionary
# features, but that depends on which split cell ran last.
svm_model = LinearSVC(C=1.0, max_iter=3000)
svm_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


**TF IDF**

In [15]:
# Build TF-IDF features with the function's default settings; the vectorizer
# is fitted on the training texts only.
tfidf_vectorizer, X_train_tfidf, X_test_tfidf = build_tfidf_features(X_train_text, X_test_text)

# Linear SVM on the TF-IDF features.
svm_tfidf = LinearSVC(C=1.0, max_iter=3000)
svm_tfidf.fit(X_train_tfidf, y_train_text)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


**Glove**

In [13]:
# Linear SVM on the averaged GloVe features.
# NOTE(review): uses the generic `y_train`, which several split cells assign.
svm_glove = LinearSVC(C=1.0, max_iter=3000)
svm_glove.fit(X_train_glove, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


## Test and Evaluation

**Word2Vec GoogleNews Evaluation**

In [9]:
# Predict on the held-out GoogleNews features.
y_pred_pre = svm_pretrained.predict(X_test_pre)

# Overall accuracy, then per-class precision/recall/F1.
print("Pre-trained Word2Vec + SVM Accuracy:", accuracy_score(y_test_pre, y_pred_pre))
print(classification_report(y_test_pre, y_pred_pre))

Pre-trained Word2Vec + SVM Accuracy: 0.715
              precision    recall  f1-score   support

           0       0.74      0.80      0.77      4023
           1       0.63      0.57      0.60      3963
           2       0.77      0.78      0.77      4014

    accuracy                           0.71     12000
   macro avg       0.71      0.71      0.71     12000
weighted avg       0.71      0.71      0.71     12000



**Word2Vec Small Self Dictionary Evaluation**

In [17]:
# Predict with whichever `svm_model` and `X_test` are currently bound.
# NOTE(review): both names are shared with the big-dictionary section.
y_pred = svm_model.predict(X_test)

# Overall accuracy, then per-class precision/recall/F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7344166666666667
              precision    recall  f1-score   support

           0       0.76      0.80      0.78      4023
           1       0.64      0.60      0.62      3963
           2       0.79      0.80      0.79      4014

    accuracy                           0.73     12000
   macro avg       0.73      0.73      0.73     12000
weighted avg       0.73      0.73      0.73     12000



**Word2Vec Big Self Dictionary Evaluation**

In [12]:
# Predict with whichever `svm_model` and `X_test` are currently bound.
# NOTE(review): both names are shared with the small-dictionary section.
y_pred = svm_model.predict(X_test)

# Overall accuracy, then per-class precision/recall/F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.74775
              precision    recall  f1-score   support

           0       0.77      0.82      0.79      4023
           1       0.66      0.61      0.63      3963
           2       0.80      0.82      0.81      4014

    accuracy                           0.75     12000
   macro avg       0.74      0.75      0.75     12000
weighted avg       0.74      0.75      0.75     12000



**TF IDF**

In [16]:
# Predict on the held-out TF-IDF features.
y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)

# Overall accuracy, then per-class precision/recall/F1.
print("Accuracy:", accuracy_score(y_test_text, y_pred_tfidf))
print(classification_report(y_test_text, y_pred_tfidf))

Accuracy: 0.73875
              precision    recall  f1-score   support

           0       0.76      0.79      0.78      4023
           1       0.65      0.60      0.63      3963
           2       0.79      0.82      0.81      4014

    accuracy                           0.74     12000
   macro avg       0.74      0.74      0.74     12000
weighted avg       0.74      0.74      0.74     12000



**Glove**

In [14]:
# Predict on the held-out GloVe features.
y_pred_glove = svm_glove.predict(X_test_glove)

# Overall accuracy, then per-class precision/recall/F1.
# NOTE(review): compares against the generic `y_test`, which several split
# cells assign.
print("GloVe + SVM Accuracy:", accuracy_score(y_test, y_pred_glove))
print(classification_report(y_test, y_pred_glove))

GloVe + SVM Accuracy: 0.6965
              precision    recall  f1-score   support

           0       0.72      0.78      0.75      4023
           1       0.61      0.55      0.58      3963
           2       0.75      0.76      0.76      4014

    accuracy                           0.70     12000
   macro avg       0.69      0.70      0.69     12000
weighted avg       0.69      0.70      0.69     12000



## Saving models

**Word2Vec GoogleNews Dictionary**

In [12]:
# Serialize the GoogleNews-feature SVM to the working directory.
joblib.dump(svm_pretrained, "svm_pretrained_word2vec_yelp.pkl")

['svm_pretrained_word2vec_yelp.pkl']

**Word2Vec Small Self Dictionary**

In [19]:
# Save whichever Word2Vec `model` and `svm_model` are currently bound.
# NOTE(review): the big-data section rebinds both names. If that section ran
# more recently, these files would hold the big-data objects under the
# small-model file names; verify the execution order before running.
model.save("yelp_word2vec.model")
joblib.dump(svm_model, "svm_yelp_model.pkl")

['svm_yelp_model.pkl']

**Word2Vec Big Self Dictionary**

In [10]:
# Serialize whichever `svm_model` is currently bound.
# NOTE(review): presumably the big-dictionary SVM, going by the file name;
# the name is shared with the small-dictionary section.
joblib.dump(svm_model, "svm_big_yelp_model.pkl")

['svm_big_yelp_model.pkl']

**TF IDF**

In [17]:
# Serialize the TF-IDF SVM. The fitted `tfidf_vectorizer` is not saved in
# this cell.
joblib.dump(svm_tfidf, "svm_tfidf_model.pkl")

['svm_tfidf_model.pkl']

**Glove**

In [15]:
# Serialize the GloVe-feature SVM to the working directory.
joblib.dump(svm_glove, "svm_glove_model.pkl")

['svm_glove_model.pkl']