In [1]:
import pandas as pd
import numpy as np
import pickle

In [None]:
# Read cleaned corpus
# Both files are opened in binary mode and deserialised with pickle, even
# though they carry a .txt extension.
# NOTE(review): pickle can execute arbitrary code while loading; only unpickle
# files that were produced locally by a trusted process.
with open('vocab_counts.txt', 'rb') as vocab_fh:
    vocab = pickle.load(vocab_fh)

with open('corpus_clean.txt', 'rb') as corpus_fh:
    corpus = pickle.load(corpus_fh)

# Quick sanity summary: name, container type and number of entries.
for label, loaded in (('vocab', vocab), ('corpus', corpus)):
    print(label, type(loaded), len(loaded))

In [1]:
# Import the tokenizer with an absolute import. A notebook cell runs as a
# top-level script without a parent package, so the relative form
# (`from .Tokenizer import ...`) raises
# "ImportError: attempted relative import with no known parent package".
# NOTE(review): assumes Tokenizer.py is importable from the notebook's working
# directory (or is otherwise on sys.path) — confirm.
from Tokenizer import TwitchTokenizer

# Single shared tokenizer instance used by the cells below.
tokenizer = TwitchTokenizer()

# Tokenizing the corpus

The code below is formatted for multiple corpora, allowing for easy training of multiple models using different corpora.

In [44]:
import pandas as pd

path = r"PATH/TO/CORPUS/FILES/"
corpus_files = ['corpus_raw.txt'] # List all corpus files of interest
corpi = []

# Each corpus file is read as one message per line and stored as a
# single-column DataFrame (column label 0), so `corpus[0]` and `len(corpus)`
# keep working for the cells that consume `corpi`.
#
# The lines are read directly instead of via
# `pd.read_csv(..., delimiter='\n')`: recent pandas versions reject a line
# terminator as the separator, and CSV quote handling can alter or merge
# messages that contain double quotes.
for corpus_file in corpus_files:
    # 'utf-8-sig' decodes plain UTF-8 as well and drops a leading BOM if present.
    with open(path + corpus_file, 'r', encoding='utf-8-sig') as corpus_fh:
        messages = [line.rstrip('\n') for line in corpus_fh]
    # Blank lines are dropped, matching read_csv's default skip_blank_lines=True.
    messages = [message for message in messages if message]
    corpi.append(pd.DataFrame({0: messages}, dtype=str))

In [None]:
# Example tokenizer output
# Run one sample chat message through the tokenizer and show the result.
sample_message = 'POG that was 5head!!! HOLY sheeeEEEeEeEeSH wooooow'
ex = tokenizer.tokenize(sample_message)
print(ex)

In [None]:
from tqdm import tqdm

tok_corps = [] # Tokenized corpuses

# NOTE(review): the loop variable `corpus` rebinds the name that the
# pickle-loading cell also assigns; after this cell it refers to the last
# DataFrame in `corpi`.
for corpus in corpi: # Tokenize each corpus
    # Tokenize every entry of column 0 (cast to str), with a progress bar.
    tok_corp = [tokenizer.tokenize(str(msg)) for msg in tqdm(corpus[0])]
    tok_corps.append(tok_corp)

    # Sanity check: one token list per row of the source frame.
    if len(tok_corp) != len(corpus):
        print('Tokenized:', len(tok_corp))
        print('Corpus:',len(corpus))
    else:
        print('Tokenize success')

In [4]:
import csv
import os

# Save using CSV Writer to avoid errors
# One CSV row per tokenized message (each token is one field).
# The file is written to the same location the loading cell reads from
# ("Corpus/tokenized_raw_corpus.csv"), so a fresh Restart & Run All round-trips
# the data instead of loading a missing or stale file.
out_path = 'Corpus/tokenized_raw_corpus.csv'
os.makedirs(os.path.dirname(out_path), exist_ok=True)  # create the folder on first run

# newline='' lets the csv module control line endings, as its docs recommend.
with open(out_path, 'w', newline='', encoding='utf-8-sig') as f:
    csv.writer(f).writerows(tok_corp)

In [2]:
import csv
# Load
# Read the tokenized corpus back: every CSV row becomes one list of tokens.
with open("Corpus/tokenized_raw_corpus.csv", mode="r", encoding="utf-8-sig") as f:
    tok_corp = list(csv.reader(f))

# Creating the model

In [None]:
import gensim

models = []

# To train one model per corpus, wrap the lines below in:
# for tok_corp in tok_corps:

# Train a Word2Vec model on the tokens.
# Passing the corpus to the constructor builds the vocabulary AND trains the
# model, so the epoch count is given here and no separate `model.train(...)`
# call follows. Calling `train` again afterwards would run additional passes
# on top of the constructor's own training.
model = gensim.models.Word2Vec(
    sentences=tok_corp,
    vector_size=500,  # dimensionality of the word vectors
    window=5,         # context window size
    min_count=10,     # ignore tokens seen fewer than 10 times
    workers=6,        # training threads
    epochs=20,        # passes over the corpus
)

models.append(model)

In [42]:
# Name of the model; the save cell uses it as the sub-folder under 'models/'.
# Replace the placeholder before saving.
m_name = 'Name Your Model Here'

In [43]:
import os

# Persist the trained model as models/<m_name>/model.
# The target folder is created first; without it the save fails on a fresh
# checkout because the directory does not exist yet.
os.makedirs('models/'+m_name, exist_ok=True)
model.save('models/'+m_name+'/model')

In [None]:
tok_corp[0] # Display the first tokenized line as an example (shown as the cell's output)

# Create the metadata for TensorFlow's Embedding Projector

In [11]:
import gensim
folder = 'models/'
models = ['List of model names']

# For every named model, write two TSV files next to the saved model:
#   !embeddings_<name>.tsv : one tab-separated vector per line
#   !metadata_<name>.tsv   : one vocabulary word per line
# Line i of the embeddings file is the vector of the word on line i of the
# metadata file, because both are written in the same loop iteration.
for m in models:
    # Load model
    path = folder+m
    model = gensim.models.Word2Vec.load(path+'/model')

    # Create metadata based on vocab seen in models
    # NOTE(review): 'utf-8-sig' writes a byte-order mark at the start of each
    # file — verify that the Embedding Projector tolerates it on the first line.
    with open(path+'/!embeddings_'+m+'.tsv', 'w', encoding='utf-8-sig') as tensors:
        with open(path+'/!metadata_'+m+'.tsv', 'w', encoding='utf-8-sig') as metadata:
            # index_to_key is already an in-memory list, so it is iterated
            # directly; manual batching adds nothing here.
            for word in model.wv.index_to_key:
                metadata.write(word + '\n')
                tensors.write('\t'.join(map(str, model.wv[word])) + '\n')