In [1]:
import fastText
import pandas as pd
import numpy as np
import nltk
from pymystem3 import Mystem
from pathlib import Path
from sklearn.metrics import accuracy_score

In [2]:
def predict_ft(model, texts):
    """Return the top-1 predicted label of each text as an integer array.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(text)`` whose result is indexable as
        ``result[0][0]`` to obtain the best label string (this is how the
        function reads a fastText supervised model's output).
    texts : iterable of str
        One document per element. Trailing line terminators are removed
        before prediction, so lines coming straight from ``readlines()``
        are accepted.

    Returns
    -------
    numpy.ndarray
        Integer array with one entry per input text.

    Raises
    ------
    ValueError
        If a label (after removing the ``__label__`` prefix) is not an
        integer literal.
    """
    prefix = '__label__'
    labels = []
    for text in texts:
        # fastText's predict() processes one line at a time and raises on
        # strings that contain '\n', so drop the terminator left by readlines().
        top_label = model.predict(text.rstrip('\r\n'))[0][0]
        # Remove the prefix explicitly: str.strip('__label__') would strip any
        # of the characters '_', 'l', 'a', 'b', 'e' from BOTH ends instead.
        if top_label.startswith(prefix):
            top_label = top_label[len(prefix):]
        labels.append(top_label)
    return np.array(labels).astype(int)

In [9]:
def cross_val_score(data_path, data_type,
                    scorer, model_params):
    """Train and score a fastText classifier on every pre-split fold in a directory.

    The number of folds is inferred as ``(number of *.txt files) // 3``; the
    count must be a multiple of three. For each fold ``k`` the function trains
    on ``train_{k}.txt``, predicts every line of ``test_{k}.txt`` and scores
    the predictions against the labels file belonging to that test file.

    Parameters
    ----------
    data_path : str or Path
        Directory containing the fold files.
    data_type : str
        Not used by this function; kept so existing callers keep working.
    scorer : callable
        Called as ``scorer(true_labels, predicted_labels)`` for each fold.
    model_params : dict
        Keyword arguments forwarded to ``fastText.train_supervised``.

    Returns
    -------
    float
        Mean of the per-fold scores.

    Raises
    ------
    ValueError
        If the directory contains no ``*.txt`` files.
    AssertionError
        If the number of ``*.txt`` files is not a multiple of three.
    """
    data_dir = Path(data_path)
    n_files = len(list(data_dir.glob('*.txt')))
    if n_files == 0:
        # Without this guard the loop below is skipped and np.mean([]) is nan.
        raise ValueError(f'no *.txt fold files found in {data_dir}')
    assert n_files % 3 == 0, (
        f'expected the number of .txt files in {data_dir} to be a multiple '
        f'of 3, got {n_files}'
    )
    n_splits = n_files // 3

    scores = []

    for fold in range(n_splits):
        train_path = data_dir / f'train_{fold}.txt'
        test_path = data_dir / f'test_{fold}.txt'
        # Path(test_path, '.label') would point *inside* the test file, which
        # can never exist. NOTE(review): assumes the labels live next to the
        # test file as 'test_{fold}.txt.label' -- confirm against the script
        # that writes the folds.
        label_path = test_path.with_name(test_path.name + '.label')

        # fastText expects UTF-8 text; strip the line terminator because
        # predict() refuses strings that contain '\n'.
        with open(test_path, encoding='utf-8') as f:
            test_texts = [line.rstrip('\n') for line in f]

        # Hand the scorer a flat 1-D array rather than a one-column DataFrame.
        test_labels = pd.read_csv(label_path, header=None,
                                  names=['label'])['label'].values

        model = fastText.train_supervised(train_path.as_posix(), **model_params)

        preds = predict_ft(model, test_texts)
        scores.append(scorer(test_labels, preds))
    return np.mean(scores)

In [20]:
def random_search(train_dir, test_dir, data_type,
                  param_grid, scorer,  n_trials=10, seed=None):
    """Randomly sample hyper-parameter combinations and keep the best one.

    Each trial draws one value per key of ``param_grid`` and evaluates the
    combination with ``cross_val_score``. Combinations that were already
    evaluated are skipped, so fewer than ``n_trials`` evaluations may run.

    Parameters
    ----------
    train_dir : str or Path
        Directory with the fold files; forwarded to ``cross_val_score``.
    test_dir : str or Path
        Not used; kept so existing callers keep working.
    data_type : str
        Forwarded to ``cross_val_score``.
    param_grid : dict
        Maps a parameter name to a sequence of candidate values.
    scorer : callable
        Forwarded to ``cross_val_score``.
    n_trials : int, default 10
        Number of random draws.
    seed : int or None, default None
        When given, draws come from ``np.random.RandomState(seed)`` so the
        search is reproducible; otherwise the global NumPy generator is used.

    Returns
    -------
    tuple
        ``(best_score, best_params, scores)``. ``scores`` maps
        ``tuple(params.items())`` to the cross-validation score. Only scores
        strictly greater than 0 replace the initial best, so ``best_params``
        is ``{}`` when no trial scores above 0.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    best_score = 0
    best_params = {}
    scores = {}
    for _ in range(n_trials):
        model_params = {}
        for name, candidates in param_grid.items():
            value = rng.choice(candidates)
            # choice() returns NumPy scalars; convert to plain Python values
            # so the parameters are ordinary ints/floats/strs downstream.
            model_params[name] = value.item() if isinstance(value, np.generic) else value

        # Keep the character n-gram range valid when both bounds are sampled.
        if 'minn' in model_params and 'maxn' in model_params:
            model_params['maxn'] = max(model_params['maxn'], model_params['minn'])

        key = tuple(model_params.items())
        if key in scores:
            continue

        # cross_val_score takes (data_path, data_type, scorer, model_params);
        # passing test_dir as well raised a TypeError.
        score = cross_val_score(train_dir, data_type, scorer, model_params)
        scores[key] = score

        if score > best_score:
            best_score = score
            best_params = model_params

    return best_score, best_params, scores

In [11]:
# Search space for the first random search: each key is a fastText
# train_supervised argument, each value the candidates to sample from.
param_grid = dict(
    minCount=range(1, 6),
    wordNgrams=range(1, 4),
    minn=[2],
    maxn=[5],
    epoch=[5, 10, 15],
    thread=[6],
    dim=[50, 100, 200],
    lr=np.linspace(0.05, 0.2, 200),
    lrUpdateRate=np.arange(100, 1000, 100),
)

In [12]:
# First search over the lemmatized folds, using the default number of trials.
best_score, best_params, scores = random_search(
    train_dir='../data/ft/lemmatized/kfolds',
    test_dir='../data/json/kfolds/',
    data_type='lemmatized',
    param_grid=param_grid,
    scorer=accuracy_score,
)

In [13]:
# Show the highest mean cross-validation score returned by random_search.
best_score

0.8728872657040551

In [14]:
# Show the parameter combination that produced best_score.
best_params

{'minCount': 1,
 'wordNgrams': 2,
 'minn': 2,
 'maxn': 5,
 'epoch': 15,
 'thread': 6,
 'dim': 200,
 'lr': 0.17512562814070354,
 'lrUpdateRate': 400}

In [None]:
# Search space for the second random search. minn and maxn are now sampled
# as well, with candidates 0-2 and 0-4 respectively.
param_grid = dict(
    minCount=range(1, 6),
    wordNgrams=range(1, 4),
    minn=range(3),
    maxn=range(5),
    epoch=[15, 20],
    thread=[6],
    dim=[100, 150, 200, 250],
    lr=np.linspace(0.1, 0.2, 50),
    lrUpdateRate=100 * np.arange(1, 10),
)
# Second search over the lemmatized folds, this time with 50 trials.
best_score_2, best_params_2, scores_2 = random_search(
    train_dir='../data/ft/lemmatized/kfolds',
    test_dir='../data/json/kfolds/',
    data_type='lemmatized',
    param_grid=param_grid,
    scorer=accuracy_score,
    n_trials=50,
)