## Train FastText Models

This notebook trains various character embeddings using FastText on Windows 10 UIF data. 

In [35]:
import os
import time
from itertools import product

import fasttext
import numpy as np
import pandas as pd
from gensim.models.wrappers import FastText

## Hyperparameters

    input             # training file path (required)
    model             # unsupervised fasttext model {cbow, skipgram} [skipgram]
    lr                # learning rate [0.05]
    dim               # size of word vectors [100]
    ws                # size of the context window [5]
    epoch             # number of epochs [5]
    minCount          # minimal number of word occurrences [5]
    minn              # min length of char ngram [3]
    maxn              # max length of char ngram [6]
    neg               # number of negatives sampled [5]
    wordNgrams        # max length of word ngram [1]
    loss              # loss function {ns, hs, softmax, ova} [ns]
    bucket            # number of buckets [2000000]
    thread            # number of threads [number of cpus]
    lrUpdateRate      # change the rate of updates for the learning rate [100]
    t                 # sampling threshold [0.0001]
    verbose           # verbose [2]


In [36]:
# Hyperparameter search grid: every key is a FastText training option and
# every value is the list of candidate settings to try for that option.
params = dict(
    model=["cbow", "skipgram"],
    lr=[0.0025],
    dim=[300, 600],
    ws=[5, 10],
    epoch=[5],
    minCount=[10],
    minn=[5],
    maxn=[5],
    neg=[5],
    thread=[6],
)

# Lazy cartesian product of the candidate lists (tuples of raw values, in the
# grid's key order). This is a one-shot iterator.
combinations = product(*map(params.get, params))

# The same cartesian product, materialised as one {option: value} dict per run.
param_combs = [dict(zip(params, values)) for values in product(*params.values())]

In [37]:
# training function
def train_fasttext(params, input_data):
    """Train one unsupervised FastText model and print how long it took.

    Args:
        params: mapping that provides the keys 'dim', 'model', 'epoch', 'lr',
            'maxn', 'minCount', 'minn', 'neg', 'thread' and 'ws'. Each value is
            forwarded as the same-named keyword argument to
            ``fasttext.train_unsupervised``; any other keys are ignored.
        input_data: passed unchanged as the first positional argument to
            ``fasttext.train_unsupervised``.

    Returns:
        The object returned by ``fasttext.train_unsupervised``.

    Raises:
        KeyError: if ``params`` is a plain dict missing one of the keys above.
    """
    print('\nTraining FastText on {}...'.format(input_data))
    print('Training parameters: \n{}'.format(str(params)))

    started_at = time.time()
    # Only these options are handed to the trainer.
    forwarded_keys = ('dim', 'model', 'epoch', 'lr', 'maxn',
                      'minCount', 'minn', 'neg', 'thread', 'ws')
    train_kwargs = {key: params[key] for key in forwarded_keys}
    trained_model = fasttext.train_unsupervised(input_data, **train_kwargs)
    elapsed = time.time() - started_at

    print('Training complete. Elapsed time: {0:.2f}s'.format(elapsed))
    return trained_model

In [38]:
# train fasttext
input_data = 'data/uif_data_clean.tsv'

# Make sure the output directory exists before any training starts, so a
# long training run cannot be lost to a failed save caused by a missing folder.
os.makedirs('models', exist_ok=True)

# Train and save one model per hyperparameter combination.
# The loop variable is deliberately NOT called `params`: that name holds the
# search-grid dict, and rebinding it here would break a re-run of the cell
# that builds `param_combs` from it.
# NOTE: the file name encodes only model, dim and ws. If any other
# hyperparameter gets more than one candidate value, runs will overwrite each
# other's files.
for combo in param_combs:
    model = train_fasttext(combo, input_data)
    save_path = 'models/fasttext_{}_{}_{}.bin'.format(combo['model'], combo['dim'], combo['ws'])
    model.save_model(save_path)
    print('Model saved at {}'.format(save_path))


Training FastText on data/uif_data_clean.tsv...
Training parameters: 
{'dim': 300, 'neg': 5, 'lr': 0.0025, 'maxn': 5, 'minCount': 10, 'model': 'cbow', 'epoch': 5, 'ws': 5, 'thread': 6, 'minn': 5}
Training complete. Elapsed time: 623.07s
Model saved at models/fasttext_cbow_300_5.bin

Training FastText on data/uif_data_clean.tsv...
Training parameters: 
{'dim': 300, 'neg': 5, 'lr': 0.0025, 'maxn': 5, 'minCount': 10, 'model': 'cbow', 'epoch': 5, 'ws': 10, 'thread': 6, 'minn': 5}
Training complete. Elapsed time: 802.58s
Model saved at models/fasttext_cbow_300_10.bin

Training FastText on data/uif_data_clean.tsv...
Training parameters: 
{'dim': 300, 'neg': 5, 'lr': 0.0025, 'maxn': 5, 'minCount': 10, 'model': 'skipgram', 'epoch': 5, 'ws': 5, 'thread': 6, 'minn': 5}
Training complete. Elapsed time: 1748.02s
Model saved at models/fasttext_skipgram_300_5.binModel saved at models/fasttext_cbow_600_5.bin

Training FastText on data/uif_data_clean.tsv...
Training parameters: 
{'dim': 600, 'neg': 5