# FastText

[Documentation](https://fasttext.cc/docs/en/support.html)  
[Article](https://arxiv.org/abs/1607.01759)

### Recap


### Summary


  
### Additional Resources


In [99]:
import sys
sys.path.append('../src')
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import fasttext as ft
from pathlib import Path
from os import remove
import numpy as np
from sklearn.model_selection import train_test_split

In [98]:
# Load the processed dataset and separate it by the precomputed `role` column.
df = pd.read_parquet('../data/processed/imdb_dataset.parquet')
# Take explicit copies: the splits get a column reassigned further down, and
# assigning into a filtered slice of `df` triggers pandas' SettingWithCopyWarning.
train_df = df[df['role']=='train'].copy()
test_df = df[df['role']=='test'].copy()
# The full frame is no longer needed once both splits exist.
del df

# FastText Default

This library has a few particularities.  
It needs the data in a specific format: every category must carry the prefix `__label__`, and the data must be stored in a text file.  

Thus, we first prepare the dataset. When we are ready to fit the model, we write the file to a temporary location so that the model can read it from there, and once the model is trained we delete the file.

In [57]:
def fastext_preprocess(df:pd.DataFrame, label_colname:str):
    """Prefix every label with ``__label__`` so the frame can be fed to fastText.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the label column. It is not modified; a copy is returned.
    label_colname : str
        Name of the column that contains the class labels.

    Returns
    -------
    tuple
        ``(prefixed_df, label_mapping)``. ``prefixed_df`` is a copy of ``df``
        whose label column holds the prefixed string labels. ``label_mapping``
        is a dict from each original label (as a string) to its prefixed form.
    """
    prefix = "__label__"
    # Labels are always handled as strings, for comparison later on.
    labels = df[label_colname].astype(str)

    # Labels that already carry the prefix are left untouched, so calling this
    # function twice on the same data does not yield "__label____label__x".
    label_mapping = {
        label: label if label.startswith(prefix) else prefix + label
        for label in labels.unique()
    }

    # Work on a copy so the caller's frame (possibly a slice) is never mutated.
    out = df.copy()
    out[label_colname] = labels.map(label_mapping)
    return out, label_mapping

In [58]:
# Add the fastText label prefix to the training labels; `mapping` is the
# second value returned by fastext_preprocess.
train_df, mapping = fastext_preprocess(df=train_df, label_colname='sentiment')

In [23]:
# fastText trains from a file on disk: write "<label>\t<review>" lines to a
# temporary file, train on it, and always delete the file afterwards.
fasttext_train_path = Path('../data/interim/temp_fasttext.csv')
train_df[['sentiment', 'review']].to_csv(fasttext_train_path, header=False, index=False, sep='\t')
try:
    model = ft.train_supervised(str(fasttext_train_path))
finally:
    # Clean up even when training raises, so no stale temp file is left behind.
    remove(fasttext_train_path)

In [44]:
# Collect the review texts into a plain Python list and classify them
test_df_list = test_df['review'].tolist()
predictions = model.predict(test_df_list)

In [68]:
# Give the test labels the same prefix as the training labels so they can be
# compared with the predicted labels; the returned mapping is not needed here.
test_df, _ = fastext_preprocess(df=test_df, label_colname='sentiment')

In [78]:
# Keep only the first element of the predict output and collapse it into a
# flat 1-D array. NOTE: this rebinds `predictions`, so run the cell only once
# per predict call.
predictions = np.ravel(predictions[0])

In [79]:
# Accuracy of the default model's predictions against the test labels
accuracy_score(y_true=test_df['sentiment'], y_pred=predictions)

0.8965723395775209

# FastText with Autotune

In [100]:
# Hold out 10% of the training data as the validation set used by autotune.
# A fixed random_state makes the split (and the score below) reproducible.
# NOTE: this rebinds train_df, so re-running the cell shrinks it again.
train_df, valid_df = train_test_split(train_df, test_size=.1, random_state=42)

In [103]:
# Prefix the labels of both splits in the fastText label format
train_df, _ = fastext_preprocess(df=train_df, label_colname='sentiment')
valid_df, _ = fastext_preprocess(df=valid_df, label_colname='sentiment')

# Write each split as tab-separated "<label>\t<review>" lines, no header or index
for split_df, split_path in (
    (train_df, '../data/interim/temp_fasttext.csv'),
    (valid_df, '../data/interim/temp_fasttext_v.csv'),
):
    split_df[['sentiment', 'review']].to_csv(Path(split_path), header=None, index=None, sep='\t')

In [105]:
# Train again, letting fastText tune its hyperparameters against the
# validation file for autotuneDuration=120.
model = ft.train_supervised(
    input=str(Path('../data/interim/temp_fasttext.csv')),
    autotuneValidationFile='../data/interim/temp_fasttext_v.csv',
    autotuneDuration=120,
)

In [106]:
# Training is done: delete both temporary fastText input files
for temp_file in ('../data/interim/temp_fasttext.csv', '../data/interim/temp_fasttext_v.csv'):
    remove(Path(temp_file))

In [107]:
# Classify the same test reviews with the autotuned model (top label only)
predictions = model.predict(test_df_list, k=1)

In [108]:
# Keep only the first element of the predict output and collapse it into a
# flat 1-D array. NOTE: this rebinds `predictions`, so run the cell only once
# per predict call.
predictions = np.ravel(predictions[0])

In [113]:
# Prefix the test labels, then score the autotuned model's predictions
test_df, _ = fastext_preprocess(df=test_df, label_colname='sentiment')
accuracy_score(y_true=test_df['sentiment'], y_pred=predictions)

0.9042447190115583