In [8]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
import pickle

In [9]:
def load_tokenized_chunk(file_path):
    """Read one pickled chunk from ``file_path`` and return the unpickled object.

    Note: unpickling can execute arbitrary code, so only point this at files
    written by a trusted source.
    """
    handle = open(file_path, 'rb')
    try:
        return pickle.load(handle)
    finally:
        # Always release the file handle, even if unpickling fails.
        handle.close()

# Function to transform text data into word vectors
def document_vector(doc, model):
    """Average the embeddings of the in-vocabulary tokens of one document.

    Parameters
    ----------
    doc : iterable
        Tokens of a single document; tokens missing from
        ``model.wv.key_to_index`` are ignored.
    model : object
        Trained embedding model exposing ``wv.key_to_index``, ``wv[...]``
        lookup, ``wv.vectors`` and ``vector_size`` (e.g. a gensim Word2Vec).

    Returns
    -------
    numpy.ndarray
        1-D array of length ``model.vector_size``: the element-wise mean of
        the known tokens' vectors, or all zeros when no token is known.
    """
    vocabulary = model.wv.key_to_index
    # Remove out-of-vocabulary words
    known_words = [word for word in doc if word in vocabulary]
    if not known_words:
        # Build the fallback in the embedding dtype: a default float64 zero
        # vector would mix dtypes across documents and upcast the whole
        # feature matrix once the vectors are stacked.
        return np.zeros(model.vector_size, dtype=model.wv.vectors.dtype)
    return np.mean(model.wv[known_words], axis=0)

# Load the trained Word2Vec model
model = Word2Vec.load("models/word2vec.model")

# List of all tokenized chunk files
all_files = [f'pickled/tokenized_chunk_{i}.pkl' for i in range(20)]

# Collect the transformed chunks and concatenate once after the loop:
# calling pd.concat on a growing frame inside the loop re-copies every
# previously accumulated row on each iteration.
vectorized_chunks = []

# Process each tokenized chunk file
for file in all_files:
    # Reuse the loader defined above; the file is closed before the
    # (comparatively slow) vectorization starts.
    tokenized_chunk = load_tokenized_chunk(file)

    # Transform each document in the chunk into a vector
    vectors = tokenized_chunk['tokens'].apply(lambda x: document_vector(x, model))

    # Drop the raw text columns and attach the vectors as the last column,
    # producing a new frame instead of mutating the loaded one.
    vectorized_chunks.append(
        tokenized_chunk.drop(columns=['tokens', 'body']).assign(vector=vectors)
    )
    print(f'Processed {file}')

# Single concatenation; ignore_index gives the result a fresh 0..n-1 index.
new_dataset = pd.concat(vectorized_chunks, ignore_index=True)

Processed pickled/tokenized_chunk_0.pkl
Processed pickled/tokenized_chunk_1.pkl
Processed pickled/tokenized_chunk_2.pkl
Processed pickled/tokenized_chunk_3.pkl
Processed pickled/tokenized_chunk_4.pkl
Processed pickled/tokenized_chunk_5.pkl
Processed pickled/tokenized_chunk_6.pkl
Processed pickled/tokenized_chunk_7.pkl
Processed pickled/tokenized_chunk_8.pkl
Processed pickled/tokenized_chunk_9.pkl
Processed pickled/tokenized_chunk_10.pkl
Processed pickled/tokenized_chunk_11.pkl
Processed pickled/tokenized_chunk_12.pkl
Processed pickled/tokenized_chunk_13.pkl
Processed pickled/tokenized_chunk_14.pkl
Processed pickled/tokenized_chunk_15.pkl
Processed pickled/tokenized_chunk_16.pkl
Processed pickled/tokenized_chunk_17.pkl
Processed pickled/tokenized_chunk_18.pkl
Processed pickled/tokenized_chunk_19.pkl


FileNotFoundError: [Errno 2] No such file or directory: 'pickled/tokenized_chunk_20.pkl'

In [10]:
# Persist the vectorized dataset to disk as a pickle.
pd.to_pickle(new_dataset, 'pickled/word2vec_dataset.pkl')

In [11]:
# Split row positions (not the rows themselves) 70/30, stratified on the
# 'mbti' column, with a fixed seed so the split is reproducible.
train_indices, test_indices = train_test_split(
    range(len(new_dataset)),
    test_size=0.3,
    stratify=new_dataset['mbti'],
    random_state=42,
)

In [12]:
# Write each index list to its own pickle file.
for indices_path, split_indices in (
    ('pickled/train_indices.pkl', train_indices),
    ('pickled/test_indices.pkl', test_indices),
):
    with open(indices_path, 'wb') as f:
        pickle.dump(split_indices, f)