In [2]:
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
from tqdm import tqdm
import re

def preprocess_text(text):
    """Normalise one raw document into a list of lowercase tokens.

    The value is converted with ``str()``, lowercased, stripped of every
    character that is neither a word character (``\\w`` -- letters, digits,
    underscore) nor whitespace, and finally split on whitespace.

    Args:
        text: The raw document. Non-string values are stringified first.
            Missing values (``None`` or a float NaN) are treated as an
            empty document.

    Returns:
        list[str]: The tokens in order of appearance; ``[]`` for empty or
        missing input.
    """
    # Without this guard str(None) / str(nan) would yield the bogus tokens
    # "none" / "nan", which would then be trained on as if they were words.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return re.sub(r'[^\w\s]', '', str(text).lower()).split()

def train_model(chunk):
    """Continue training the module-level ``model`` on one chunk of rows.

    Every value of ``chunk['text']`` is tokenised with ``preprocess_text``
    and the resulting sentences are passed to ``model.train`` for 5 epochs.

    Args:
        chunk: Mapping/table whose ``'text'`` entry iterates over the raw
            documents of this chunk (a DataFrame slice in this file).

    Returns:
        None. The global ``model`` is updated in place.

    NOTE(review): this function is submitted to a ThreadPoolExecutor
    elsewhere in this file, so several ``model.train`` calls may run at the
    same time on the same model -- confirm gensim tolerates that.
    """
    sentences = [preprocess_text(text) for text in chunk['text']]
    if not sentences:
        # An empty chunk (e.g. a trailing slice past the end of the frame)
        # has nothing to learn from; skip the no-op training pass.
        return
    model.train(sentences, total_examples=len(sentences), epochs=5)

def process_chunk(i, df, chunk_size):
    """Train on the ``i``-th block of ``chunk_size`` consecutive rows of ``df``.

    Args:
        i: Zero-based chunk number.
        df: Frame to slice positionally with ``.iloc``.
        chunk_size: Number of rows per chunk.

    Returns:
        None. The selected rows are handed to ``train_model``.
    """
    offset = chunk_size * i
    # Positional slice [offset, offset + chunk_size); iloc clips at the end of df.
    train_model(df.iloc[offset:offset + chunk_size])

# Vocabulary list read from disk as strings.
vocab = np.loadtxt('model_vocab.txt', dtype=str).tolist()

# Stems of the parquet files under 'Wikipedia Data/' that are trained on, in order.
docs = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'number', 'other']

skip_gram = 1  # 0 for CBOW, 1 for skip-gram

print('Initializing model')
# The whole vocabulary is passed to the constructor as one single "sentence".
# NOTE(review): gensim's default min_count may discard words that occur only
# a few times in that sentence -- confirm the vocab file yields the intended
# vocabulary.
model = Word2Vec([vocab], vector_size=100, window=5, sg=skip_gram, epochs=1)

chunk_size = 1000  # Adjust as needed

for doc in tqdm(docs):
    df = pd.read_parquet(f'Wikipedia Data/{doc}.parquet')
    # Ceiling division. The previous `len(df) // chunk_size + 1` produced one
    # extra, empty chunk whenever len(df) was an exact multiple of chunk_size
    # (including an empty frame).
    num_chunks = (len(df) + chunk_size - 1) // chunk_size
    # NOTE(review): every task trains the same shared `model` from its own
    # thread -- confirm concurrent train() calls are acceptable here.
    with ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_chunk, i, df, chunk_size) for i in range(num_chunks)]
        for future in futures:
            # result() blocks until the task finishes and re-raises any
            # exception from the worker instead of silently dropping it.
            future.result()

# The output file name records which training algorithm was used.
if skip_gram == 0:
    model.save("wikipedia_cbow.model")
else:
    model.save("wikipedia_sg.model")



Initializing model


100%|██████████| 28/28 [14:55:34<00:00, 1919.08s/it]   


In [11]:
# Spot-check the trained model: list the words it ranks as most similar to a query word
word = "poop"
similar_words = model.wv.most_similar(word)

print(f"Similar words to '{word}':")
# Each entry is a (word, similarity) pair.
for neighbour, score in similar_words:
    print(f"{neighbour}: {score}")

Similar words to 'poop':
slop: 0.6632512807846069
spanking: 0.6175498366355896
bucket: 0.6167871952056885
goddamn: 0.615112841129303
giggling: 0.605552613735199
barnyard: 0.6014346480369568
fart: 0.5992071032524109
momma: 0.5944929122924805
cribs: 0.5913742184638977
crock: 0.5903614163398743
