#### Import libraries

In [23]:
import os
import pandas as pd
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import numpy as np
import pickle
import joblib
import scipy.sparse

#### Read the data and show its shape

In [14]:
# Load the three pre-cleaned splits; they share one directory and a
# "<split>_clean.csv" naming scheme.
processed_dir = "D:/Projects/Sentiment_Analysis/data/processed"
train_df, dev_df, test_df = (
    pd.read_csv(f"{processed_dir}/{split}_clean.csv")
    for split in ("train", "dev", "test")
)

# Report (rows, columns) of every split.
for label, frame in (("Train:", train_df), ("Dev:", dev_df), ("Test:", test_df)):
    print(label, frame.shape)

# Preview the first rows of the training split.
train_df.head()

Train: (11426, 3)
Dev: (3166, 3)
Test: (3166, 3)


Unnamed: 0,sentence,sentiment,topic
0,slide giao trinh day du,2,1
1,nhiet tinh giang day gan gui voi sinh vien,2,0
2,di hoc day du full diem chuyen can,0,1
3,chua ap dung cong thong va cac thiet bi ho tro...,0,0
4,thay giang bai co nhieu bai tap vi du tren lop,2,0


#### Preparing Word2Vec training data

In [15]:
# Make sure every sentence is a string.
# pandas reads an empty CSV field back as NaN, and a bare astype(str) would turn
# that into the literal text "nan" (i.e. a bogus token). Blank missing values
# first so such rows tokenize to an empty list instead.
train_df['sentence'] = train_df['sentence'].fillna("").astype(str)
dev_df['sentence'] = dev_df['sentence'].fillna("").astype(str)
test_df['sentence'] = test_df['sentence'].fillna("").astype(str)

# Tokenize on whitespace: one list of tokens per sentence, for each split.
sentences = [text.split() for text in train_df['sentence']]
dev_sentences = [text.split() for text in dev_df['sentence']]
test_sentences = [text.split() for text in test_df['sentence']]

#### Fit the Word2Vec model

In [16]:
# Train the embedding on the training split only (dev/test stay unseen).
# Reproducibility: per the gensim documentation, a fixed seed is only fully
# deterministic with a single worker thread, because multi-threaded training
# introduces ordering jitter. Raise `workers` again if speed matters more than
# getting identical vectors on every run.
w2v_model = Word2Vec(
    sentences=sentences,   # training data
    vector_size=100,        # number of dimensions of the vector (usually 100–300)
    window=5,               # context size (5 words around)
    min_count=2,            # ignore words that appear < 2 times
    workers=1,              # single thread so the run is repeatable
    sg=1,                   # skip-gram (1) or CBOW (0)
    seed=42                 # explicit RNG seed
)

# Save the model (create the target directory first so a fresh checkout works)
os.makedirs("D:/Projects/Sentiment_Analysis/models", exist_ok=True)
w2v_model.save("D:/Projects/Sentiment_Analysis/models/word2vec.model")

#### Convert each sentence to a mean vector

In [17]:
# Function to calculate the average vector of a sentence
def get_sentence_vector(tokens, model):
    """Return the average word vector of a tokenized sentence.

    Parameters
    ----------
    tokens : iterable of str
        The words of one sentence.
    model : object
        Trained embedding model exposing ``model.wv`` (supports ``word in wv``
        and ``wv[word]``) and ``model.vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of the
        vectors of all in-vocabulary tokens, or all zeros when the sentence is
        empty or contains no in-vocabulary token.
    """
    # Out-of-vocabulary tokens (e.g. dropped by min_count) are skipped.
    valid_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(valid_vectors) == 0:
        # gensim stores word vectors as float32; use the same dtype for the
        # fallback so one empty sentence does not upcast the whole stacked
        # feature matrix to float64.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(valid_vectors, axis=0)

# Create a vector matrix for all data
# Embed every split with the trained model: one row per sentence, one column
# per embedding dimension. The same recipe is applied to train, dev and test.
X_train_w2v, X_dev_w2v, X_test_w2v = (
    np.array([get_sentence_vector(words, w2v_model) for words in split])
    for split in (sentences, dev_sentences, test_sentences)
)

#### Save embeddings and labels to file

In [25]:
# Output locations; create them up front so saving works on a fresh checkout.
label_dir = "D:/Projects/Sentiment_Analysis/models/Word2Vec"
feature_dir = "D:/Projects/Sentiment_Analysis/data/features/Word2Vec"
os.makedirs(label_dir, exist_ok=True)
os.makedirs(feature_dir, exist_ok=True)

# Save the sentiment labels of each split
joblib.dump(train_df["sentiment"].values, f"{label_dir}/y_train.pkl")
joblib.dump(dev_df["sentiment"].values, f"{label_dir}/y_dev.pkl")
joblib.dump(test_df["sentiment"].values, f"{label_dir}/y_test.pkl")

# Save the Word2Vec sentence-embedding matrices in compressed form.
# The arrays are passed positionally, so each archive stores its matrix under
# numpy's default key "arr_0".
np.savez_compressed(f"{feature_dir}/X_train_w2v.npz", X_train_w2v)
np.savez_compressed(f"{feature_dir}/X_dev_w2v.npz", X_dev_w2v)
np.savez_compressed(f"{feature_dir}/X_test_w2v.npz", X_test_w2v)

# Report what was actually written (the previous message named a file that is
# never created and relied on invalid backslash escapes).
print(f"✅ Saved labels to {label_dir} and Word2Vec features to {feature_dir}")
print("X_train:", X_train_w2v.shape)

✅ D:\Projects\Sentiment_Analysis\models\Word2Vec/word2vec_features.pkl
X_train: (11426, 100)
