## IMPORT LIBRARY

In [22]:
import pandas as pd
import numpy as np
import joblib
from tqdm import tqdm
from gensim.models import Word2Vec
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer

## LOADING DATASET

In [4]:
# Load the preprocessed dataset; the path is relative to the notebook's
# working directory.
df = pd.read_parquet('./dataset/svm_data_preprocessed.parquet')

In [5]:
# Non-null count per column (a quick check for missing values).
df.count()

clean_text    60000
lemma_text    60000
pos_tags      60000
sentiment     60000
dtype: int64

In [6]:
# Preview the first five rows.
df.head()

Unnamed: 0,clean_text,lemma_text,pos_tags,sentiment
0,if you decide to eat here just be aware it is ...,if you decide to eat here just be aware it be ...,"[[if, IN], [you, PRP], [decide, VBP], [to, TO]...",1
1,a couple friends and i stopped by for some lat...,a couple friend and i stop by for some late ni...,"[[a, DT], [couple, JJ], [friends, NNS], [and, ...",1
2,sometimes this food is very very good unfortu...,sometimes this food be very very good unfortun...,"[[sometimes, RB], [this, DT], [food, NN], [is,...",1
3,after trying a few ramen places with crazy var...,after try a few ramen place with crazy variety...,"[[after, IN], [trying, VBG], [a, DT], [few, JJ...",1
4,great food terrible customer service ive been ...,great food terrible customer service ive be th...,"[[great, JJ], [food, NN], [terrible, JJ], [cus...",1


## Model's Architecture

**Word2Vec Self Dictionary Model**

In [11]:
# Split each lemmatized text on whitespace: Word2Vec expects an iterable of
# token lists, one list per sentence/document.
sentences = [text.split() for text in df['lemma_text']]

# NOTE(review): the embeddings are fit on the whole of df, i.e. before the
# train/test split performed later in the notebook, so held-out text also
# shapes the word vectors — confirm this is intended.
model = Word2Vec(
    sentences=sentences,
    vector_size=300,     # dimensionality of each word vector
    window=5,
    min_count=2,         # ignore words that occur fewer than 2 times
    workers=4,
    sg=1,                # 1 = skip-gram, 0 = CBOW
    seed=42              # fixed seed so re-runs start from the same initial state;
                         # gensim only guarantees identical vectors with
                         # workers=1 and a fixed PYTHONHASHSEED as well
)

# Persist the trained model so it can be reloaded without retraining.
model.save("yelp_word2vec.model")

In [12]:
# Keep a handle on the trained word vectors (word -> vector lookup).
w2v = model.wv
# Sanity check: look up one word's vector; this raises KeyError if "great"
# is not in the trained vocabulary.
vector = w2v["great"]

In [13]:
def vectorize_sentence(sentence, model, dim=300):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    sentence : str
        Text whose tokens are separated by whitespace.
    model : mapping-like
        Object supporting ``word in model`` and ``model[list_of_words]``,
        e.g. the ``wv`` attribute of a trained ``Word2Vec`` model.
    dim : int, default 300
        Length of the zero vector returned when no token of ``sentence`` is
        known to ``model``. Only used when ``model`` does not expose a
        ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        1-D array: the element-wise mean of the known tokens' vectors, or an
        all-zero vector when there are none.
    """
    # Take the fallback length from the model itself when it advertises one,
    # so the zero vector always has the same length as the averaged vectors
    # (a mismatch would make a later np.vstack over the results fail).
    dim = getattr(model, "vector_size", dim)

    tokens = sentence.split()
    known_tokens = [token for token in tokens if token in model]
    if not known_tokens:
        return np.zeros(dim)
    # model[list] yields one row per token; average over the token axis.
    return np.mean(model[known_tokens], axis=0)

# Register tqdm's progress_apply on pandas objects so the row-wise
# vectorization below shows a progress bar.
tqdm.pandas()
# One vector per row of lemma_text, stacked into a single 2-D array
# (rows = documents, columns = embedding dimensions).
X_vectors = np.vstack(df['lemma_text'].progress_apply(lambda s: vectorize_sentence(s, w2v)))

100%|██████████| 60000/60000 [00:14<00:00, 4214.54it/s]


In [14]:
# Expect (number of rows in df, embedding dimension).
X_vectors.shape

(60000, 300)

**TF-IDF**

In [24]:
def build_tfidf_features(X_train, X_test,
                         max_features=5000,
                         ngram_range=(1, 2),
                         sublinear_tf=True,
                         stop_words='english'):
    """Fit a TF-IDF vectorizer on the training texts and transform both splits.

    The vectorizer is fitted on ``X_train`` only; ``X_test`` is transformed
    with the already-fitted vectorizer.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Training and test documents.
    max_features, ngram_range, sublinear_tf, stop_words
        Forwarded unchanged to ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(fitted_vectorizer, train_matrix, test_matrix)``.
    """
    vectorizer_options = {
        'max_features': max_features,
        'ngram_range': ngram_range,
        'sublinear_tf': sublinear_tf,
        'stop_words': stop_words,
    }
    fitted_vectorizer = TfidfVectorizer(**vectorizer_options)

    # Learn vocabulary and IDF weights from the training texts only.
    train_matrix = fitted_vectorizer.fit_transform(X_train)
    # Reuse the fitted vocabulary/IDF for the test texts.
    test_matrix = fitted_vectorizer.transform(X_test)

    return fitted_vectorizer, train_matrix, test_matrix

## SPLIT DATASET

**Word2Vec**

In [15]:
# Labels for the Word2Vec pipeline.
y = df['sentiment']
# 80/20 split of the averaged-embedding features; random_state is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_vectors, y, test_size=0.2, random_state=42
)

**TF-IDF**

In [27]:
# Raw lemmatized text is the input here; TF-IDF features are built from it
# after splitting.
X = df['lemma_text']
y = df['sentiment']

# 80/20 split with a fixed random_state; the *_text suffix keeps these
# variables separate from the Word2Vec split.
X_train_text, X_test_text, y_train_text, y_test_text = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## Training

**Word2Vec Self Dictionary**

In [16]:
# Linear SVM trained on the averaged Word2Vec sentence vectors.
svm_model = LinearSVC(C=1.0, max_iter=3000)
# fit() returns the estimator, so the cell displays its parameters.
svm_model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


**TF-IDF**

In [28]:
# Build TF-IDF features with the helper's default settings; the vectorizer is
# fitted on the training texts only.
tfidf_vectorizer, X_train_tfidf, X_test_tfidf = build_tfidf_features(X_train_text, X_test_text)

# Same classifier settings as the Word2Vec model, so the two feature sets are
# compared under identical hyperparameters.
svm_tfidf = LinearSVC(C=1.0, max_iter=3000)
svm_tfidf.fit(X_train_tfidf, y_train_text)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


## Test and Evaluation

**Word2Vec Self Dictionary Evaluation**

In [17]:
# Predict on the held-out Word2Vec features.
y_pred = svm_model.predict(X_test)

# Overall accuracy plus per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7344166666666667
              precision    recall  f1-score   support

           0       0.76      0.80      0.78      4023
           1       0.64      0.60      0.62      3963
           2       0.79      0.80      0.79      4014

    accuracy                           0.73     12000
   macro avg       0.73      0.73      0.73     12000
weighted avg       0.73      0.73      0.73     12000



**TF-IDF Evaluation**

In [30]:
# Predict on the held-out TF-IDF features.
y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)

# Overall accuracy plus per-class precision / recall / F1.
print("Accuracy:", accuracy_score(y_test_text, y_pred_tfidf))
print(classification_report(y_test_text, y_pred_tfidf))

Accuracy: 0.73875
              precision    recall  f1-score   support

           0       0.76      0.79      0.78      4023
           1       0.65      0.60      0.63      3963
           2       0.79      0.82      0.81      4014

    accuracy                           0.74     12000
   macro avg       0.74      0.74      0.74     12000
weighted avg       0.74      0.74      0.74     12000



## Saving models

**Word2Vec Self Dictionary**

In [19]:
# Re-saves the Word2Vec model to the same path already written right after
# training.
model.save("yelp_word2vec.model")
# Persist the SVM trained on Word2Vec features; joblib.dump returns the list
# of files written, which the cell displays.
joblib.dump(svm_model, "svm_yelp_model.pkl")

['svm_yelp_model.pkl']