# FastText 


In [1]:
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


### Dataset and split

In [2]:
dataset = load_dataset("imdb")
train_dataset = dataset["train"].train_test_split(
    stratify_by_column="label", test_size=0.2, seed=42
)
test_df = dataset["test"]
train_df = train_dataset["train"]
valid_df = train_dataset["test"]

Found cached dataset imdb (/home/djulo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)
100%|████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 399.86it/s]
Loading cached split indices for dataset at /home/djulo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-5f37fd0866e4f89f.arrow and /home/djulo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-dd5732a0e6ac784c.arrow


### 1. Turn the dataset into a dataset compatible with Fastext


In [3]:
from string import punctuation
import re

def preprocess(dataset: dict) -> dict:
    '''Take a dataset, return the dataset preprocessed'''
    
    # Convert the text to lowercase and remove <br>
    dataset['text'] = dataset['text'].lower().replace('<br /><br />', '')
    
    # Replace punctuation with spaces
    dataset['text'] = re.sub('['+punctuation+']', ' ', dataset['text'])
    
    # Replace multiple with a single space
    dataset['text'] = " ".join(dataset['text'].split())
    
    return dataset

# Apply the preprocessing function to the train dataset
train_df = train_df.map(preprocess)

# Apply the preprocessing function to the test dataset
test_df = test_df.map(preprocess)

# Apply the preprocessing function to the validation dataset
valid_df = valid_df.map(preprocess)

# Create a training file compatible with fastext
with open('training_data.txt', 'w') as file:
    # Loop through each string in the list
    for data in train_df:
        # Write the string to the file and append a newline character
        file.write('__label__' + str(data['label']) + ' ' + data['text'] + '\n')

# Create a test file compatible with fastext
with open('test_data.txt', 'w') as file:
    # Loop through each string in the list
    for data in test_df:
        # Write the string to the file and append a newline character
        file.write('__label__' + str(data['label']) + ' ' + data['text'] + '\n')
        
# Create a valid file compatible with fastext
with open('valid_data.txt', 'w') as file:
    # Loop through each string in the list
    for data in valid_df:
        # Write the string to the file and append a newline character
        file.write('__label__' + str(data['label']) + ' ' + data['text'] + '\n')
     

Loading cached processed dataset at /home/djulo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-1fd51be787e9806b.arrow
Loading cached processed dataset at /home/djulo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-40be9da7ef59e51c.arrow
Loading cached processed dataset at /home/djulo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0/cache-64045981f0c2816e.arrow


### 2. Train a FastText classifier

In [4]:
import fasttext

# Train a FastText model
fastText_model = fasttext.train_supervised('training_data.txt')

Read 4M words
Number of words:  69701
Number of labels: 2
Progress: 100.0% words/sec/thread: 2663497 lr:  0.000000 avg.loss:  0.432575 ETA:   0h 0m 0s


In [5]:
fastText_model.test('validation_data.txt')

(25000, 0.8706, 0.8706)

The tuple tells us:
1. The number of sample: 25000
2. The precision at one: 0.87908
3. the recall at one: 0.87908

### 3. Use of the hyperparameters search functionality of FastText

The data in the training_data.txt is already shuffled. We can verify it by checking the first lines of the file.

In [6]:
model = fasttext.train_supervised('training_data.txt', autotuneValidationFile='validation_data.txt')

Progress: 100.0% Trials:    9 Best score:  0.895080 ETA:   0h 0m 0s
Training again with best arguments
Read 4M words
Number of words:  69701
Number of labels: 2
Progress: 100.0% words/sec/thread:  909374 lr:  0.000000 avg.loss:  0.057352 ETA:   0h 0m 0s


In [11]:
model.test('validation_data.txt')

(25000, 0.89492, 0.89492)

### 4. differences between the default model and the attributes found with hyperparameters search

As we have tuned hyperparameters using the autotuneValidationFile option, we will only compare the hyper parameters attributes of the 2 models.

In [24]:
print('Default model:')
print("Value of the learning rate:", fastText_model.lr)
print("Dimensionality of word vectors:", fastText_model.dim)
print("Number of epochs:", fastText_model.epoch)
print("Max length of word ngram :", fastText_model.wordNgrams)
print("Min length of char ngram:", fastText_model.minn)
print("Max length of char ngram:", fastText_model.maxn)
print("Loss function {ns, hs, softmax}:", fastText_model.loss)

print()

print('Hyperparameters search model: ')
print("Value of the learning rate:", model.lr)
print("Dimensionality of word vectors:", model.dim)
print("Number of epochs:", model.epoch)
print("max length of word ngram:", model.wordNgrams)
print("Min length of char ngram:", model.minn)
print("Max length of char ngram:", model.maxn)
print("Loss function {ns, hs, softmax}:", model.loss)

Default model:
Value of the learning rate: 0.1
Dimensionality of word vectors: 100
Number of epochs: 5
Max length of word ngram : 1
Min length of char ngram: 0
Max length of char ngram: 0
Loss function {ns, hs, softmax}: loss_name.softmax

Hyperparameters search model: 
Value of the learning rate: 0.27162332716385296
Dimensionality of word vectors: 160
Number of epochs: 37
max length of word ngram: 2
Min length of char ngram: 0
Max length of char ngram: 0
Loss function {ns, hs, softmax}: loss_name.softmax


The 2 models use the same loss function but have several attributes that differ, indeed, the value of the learning rate, of the dimensionality of word vectors and of the number of epochs is higher in the hyperparameters search model.

default model:
1. Value of the learning rate: 0.1
2. Dimensionality of word vectors: 100
3. Number of epochs: 5


hyperparameters search model: 
1. Value of the learning rate: 0.27162332716385296
2. Dimensionality of word vectors: 160
3. Number of epochs: 37

### Bonus

The minn and maxn hyperparameters in FastText determine the minimum and maximum character n-gram lengths used during training.
If after hyperparameter search on your data, the values of minn and maxn are set to 0, it likely indicates that FastText is not using any character n-grams during training. This can happen if the hyperparameter search determined that the best performing model does not require character n-grams for the specific language and data you are working with.

In particular, for languages that do not have a rich morphology, such as English, it is common to find that the optimal values for minn and maxn are set to 0 during hyperparameter search. This is because English words are relatively short and do not require character n-grams to capture meaningful word representations.