# RunW2VecTraining

This notebook cleans the raw note text and then trains word-level and character n-gram (char-gram) models, which are used by the QueryExpander functionality (resolving both misspellings and synonyms).

In [1]:
%run includes/imports.py
%load_ext autoreload
%autoreload 2
from includes.w2vec import W2VecModel
from includes.stringop import StringOp
import io

In [2]:
# Load only the TEXT column of the notes table, forcing it to be read as str.
data = pd.read_csv(
    'data/NOTEEVENTS.csv',
    usecols=['TEXT'],
    dtype={'TEXT': str},
)

In [2]:
def write_clean_index(file, maxindex=-1):
    """Tokenise the notes in the global ``data`` frame and dump them to ``file`` as JSON.

    Every ``TEXT`` value is passed through ``StringOp.clean`` and the result is
    iterated as a sequence of strings (presumably one per sentence -- TODO
    confirm against ``StringOp``). Strings containing the literal marker
    ``'FILLER'`` are skipped. For the rest, digit runs and the characters
    ``.``, ``,`` and ``:`` are removed, the string is split on single spaces,
    tokens are lower-cased and empty tokens are dropped.

    Args:
        file: Path of the JSON file to write (overwritten if it exists).
        maxindex: Positional index of the last row of ``data`` to process.
            The default ``-1`` processes every row.

    Returns:
        list[list[str]]: One token list per kept sentence; the same list that
        was serialised to ``file``.
    """
    sentences = []
    # Pull the column out once; data.iloc[i]['TEXT'] materialises a whole row
    # Series on every iteration.
    texts = data['TEXT']
    for i, text in enumerate(tqdm(texts)):
        # Test the limit before cleaning so rows past maxindex are never processed.
        if maxindex != -1 and i > maxindex:
            break
        for s in StringOp.clean(text):
            if 'FILLER' in s:
                continue
            s = re.sub(r'[0-9]+', '', s)
            words = s.replace('.', '').replace(',', '').replace(':', '').split(' ')
            # A sentence with no surviving tokens is still appended (as an empty list).
            sentences.append([w.lower() for w in words if len(w) > 0])

    # Context manager closes the handle even if serialisation fails.
    with open(file, 'w') as f:
        json.dump(sentences, f)
    return sentences

def read_clean_index(file):
    """Load a JSON sentence index such as the one written by ``write_clean_index``.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded JSON document. For a file produced by
        ``write_clean_index`` this is a list of token lists.

    Raises:
        OSError: If ``file`` cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Binary mode on purpose: json.load accepts a binary file and decodes the
    # bytes itself, so no intermediate in-memory copy of the file is needed.
    # The context manager closes the handle even if parsing fails.
    with open(file, 'rb') as f:
        return json.load(f)

In [4]:
# Clean the notes in `data` and persist the token lists as JSON.
# The recorded run below took roughly 54 minutes for the full table.
result = write_clean_index(
    'index/clean.json',
)

100%|██████████| 2083180/2083180 [54:29<00:00, 637.18it/s] 


In [3]:
# Reload the previously written sentence index from disk; slow for the full file.
result = read_clean_index('index/clean.json')

2689960


In [4]:
# Two separate model wrappers: one for the char-gram run, one for the word run.
model_char, model_word = W2VecModel(), W2VecModel()

In [5]:
# Train with use_char_gram=True on the cleaned sentences.
# NOTE(review): `file` is presumably where the trained model is stored -- confirm in W2VecModel.train_model.
model_char.train_model(
    sentences=result,
    use_char_gram=True,
    size=300,
    window=3,
    min_count=5,
    workers=4,
    max_vocab_size=100000,
    char_n_min=3,
    char_n_max=5,
    file='index/model_char.w2v',
)

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end


In [6]:
# Train with use_char_gram=False on the same cleaned sentences.
# NOTE(review): char_n_min / char_n_max are passed here as well; whether they are used
# when use_char_gram is False needs confirming in W2VecModel.train_model.
model_word.train_model(
    sentences=result,
    use_char_gram=False,
    size=300,
    window=3,
    min_count=5,
    workers=4,
    max_vocab_size=100000,
    char_n_min=3,
    char_n_max=5,
    file='index/model_word.w2v',
)

Epoch #0 start
Epoch #0 end
Epoch #1 start
Epoch #1 end
Epoch #2 start
Epoch #2 end
Epoch #3 start
Epoch #3 end
Epoch #4 start
Epoch #4 end
