In [9]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

# --- NLTK resources ---
# Download the 'punkt' package, then register the project-local nltk_data
# folder as an additional NLTK data search location.
NLTK_DATA_DIR = r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\nltk_data"
nltk.download('punkt')
nltk.data.path.append(NLTK_DATA_DIR)

# --- Input data ---
# Cleaned train / test CSV files from the project's datasets folder.
TRAIN_CSV_PATH = r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\cleaned_train.csv"
TEST_CSV_PATH = r"C:\Users\jorda\Documents\Education\Machine Learning\Final\MLFinalCode\datasets\cleaned_test.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)
test_df = pd.read_csv(TEST_CSV_PATH)

# Tokenize training data for Word2Vec.
# Rows whose 'text_final' value is missing are dropped before tokenizing.
tokenized_train = train_df['text_final'].dropna().apply(word_tokenize).tolist()

# Train Word2Vec on the training set only.
# NOTE: gensim's `seed` yields a reproducible model only when training runs on
# a single worker thread; with several workers, OS thread scheduling changes
# the order in which examples are processed. workers=1 makes seed=42 effective.
# (On Python 3, PYTHONHASHSEED must also be fixed for full run-to-run
# reproducibility.)
w2v_model = Word2Vec(
    sentences=tokenized_train,
    vector_size=100,  # dimensionality of the word vectors
    window=5,         # maximum distance between current and predicted word
    min_count=2,      # ignore words with total frequency below 2
    workers=1,        # single thread so that the seed is honoured
    sg=1,             # 1 = skip-gram
    seed=42,
)

# Save Word2Vec model
w2v_model.save("word2vec_model.model")

# Fit TF-IDF on the training corpus; missing texts are replaced by "".
train_corpus = train_df['text_final'].fillna("")
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(train_corpus)

# Build a term -> IDF weight lookup from the fitted vectorizer.
idf_weights = {
    term: weight
    for term, weight in zip(tfidf.get_feature_names_out(), tfidf.idf_)
}

# Define embedding function
def get_weighted_w2v(text, model, idf_dict):
    """Return the IDF-weighted average Word2Vec embedding of *text*.

    The text is tokenized with ``word_tokenize``. Each token that is present
    in both ``model.wv`` and ``idf_dict`` contributes its word vector scaled
    by its IDF weight; the sum is divided by the total weight used. Tokens
    missing from either lookup are skipped.

    Args:
        text: Document to embed. Non-string values (e.g. ``None`` or NaN) and
            blank strings are treated as empty documents.
        model: Trained model exposing ``wv`` (token -> vector lookup) and
            ``vector_size``.
        idf_dict: Mapping from token to IDF weight.

    Returns:
        The weighted average vector, or a zero vector of length
        ``model.vector_size`` when no token could be embedded.
    """
    # Guard clause: nothing to tokenize, so skip the tokenizer entirely.
    # This also keeps NaN/None cells from raising inside word_tokenize.
    if not isinstance(text, str) or not text.strip():
        return np.zeros(model.vector_size)

    keyed_vectors = model.wv
    weighted_vecs = []
    weight_sum = 0.0
    for word in word_tokenize(text):
        weight = idf_dict.get(word)  # single lookup instead of `in` + index
        if weight is None or word not in keyed_vectors:
            continue
        weighted_vecs.append(keyed_vectors[word] * weight)
        weight_sum += weight

    # A zero total weight would make the division below produce NaN/inf.
    if not weighted_vecs or weight_sum == 0:
        return np.zeros(model.vector_size)
    return np.sum(weighted_vecs, axis=0) / weight_sum

# Embed every document of the train and test sets
def _embed_all(corpus):
    """Return an array holding one IDF-weighted Word2Vec vector per document."""
    vectors = []
    for document in corpus:
        vectors.append(get_weighted_w2v(document, w2v_model, idf_weights))
    return np.array(vectors)


test_corpus = test_df['text_final'].fillna("")
train_features = _embed_all(train_corpus)
test_features = _embed_all(test_corpus)

# Save the feature matrices
for out_path, matrix in (
    ("w2v_tfidf_train_features.npy", train_features),
    ("w2v_tfidf_test_features.npy", test_features),
):
    np.save(out_path, matrix)

# Save the labels. The test set has no target, so the entire test dataframe
# is saved instead.
train_labels = train_df[['target']]
train_labels.to_csv("w2v_tfidf_train_labels.csv", index=False)
test_df.to_csv("w2v_tfidf_test_labels.csv", index=False)

[nltk_data] Downloading package punkt to C:\Users\jorda/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
