# Text Classification with fastText

In [1]:
try:
    from google.colab import drive
    drive.mount('/content/gdrive/')
    
    PROJECT_PATH = "/content/gdrive/MyDrive/WID3002 Natural Language Processing"
    import sys
    sys.path.append(f"{PROJECT_PATH}")
    
    !pip install fasttext
except:
    PROJECT_PATH = "."

data_path = f"{PROJECT_PATH}/data/atis_intents_ori.csv"
train_data_path = f"{PROJECT_PATH}/data/atis_intents_train.csv"
test_data_path = f"{PROJECT_PATH}/data/atis_intents_test.csv"
fast_train_data_path = f"{PROJECT_PATH}/data/fasttext/atis_intents_train_fast.csv"
fast_test_data_path = f"{PROJECT_PATH}/data/fasttext/atis_intents_test_fast.csv"
remove_stop_train_data_path = f"{PROJECT_PATH}/data/fasttext/atis_intents_train_fast_stop_remove.csv"
remove_stop_test_data_path = f"{PROJECT_PATH}/data/fasttext/atis_intents_test_fast_stop_remove.csv"
save_model_path = f"{PROJECT_PATH}/models/fasttext_text_classification_model.bin"
    
    
import fasttext
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from utils import generate_fasttext_file, load_data


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


## Load Dataset and Train model

In [4]:
# Baseline: train on the first TRAIN_SIZE rows of the original dataset and
# evaluate on whatever rows remain.
TRAIN_SIZE = 4000

df = load_data(data_path)
train_data, test_data = df[:TRAIN_SIZE], df[TRAIN_SIZE:]

# Export each split to its fastText input file.
for out_path, split in (
    (fast_train_data_path, train_data),
    (fast_test_data_path, test_data),
):
    generate_fasttext_file(out_path, split)

# Fit a supervised classifier for 50 epochs and persist it to disk.
model = fasttext.train_supervised(input=fast_train_data_path, epoch=50)
model.save_model(save_model_path)

# Evaluate on the held-out file; the three values returned by model.test are
# printed in order under the labels below.
result = model.test(fast_test_data_path)
print(f"Entry tested: {result[0]}")
print(f"Precision: {result[1]}")
print(f"Recall: {result[2]}")

Entry tested: 377
Precision: 0.946949602122016
Recall: 0.946949602122016


## Remove stop words and train the model again

In [4]:
# Retrain on questions with English stop words stripped out.
# A set gives constant-time membership checks; `stop_words` itself is a list,
# which would be scanned once per token.
stop_word_set = set(stop_words)

a_train = load_data(train_data_path)
a_train['train_without_stopwords'] = a_train['question'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_word_set)
)
a_test = load_data(test_data_path)

# Number of leading characters dropped from each category before it becomes
# a fastText label (presumably the "atis_" prefix -- TODO confirm against the CSV).
LABEL_PREFIX_LEN = 5

# Write one "__label__<category> <text>" line per training row. Iterating the
# two columns directly avoids building a row Series twice per line via .iloc.
with open(remove_stop_train_data_path, "w", encoding="utf-8") as f:
    for category, question in zip(a_train["category"], a_train["train_without_stopwords"]):
        f.write("__label__" + str(category)[LABEL_PREFIX_LEN:] + " " + question + "\n")


# Write the test data to file.
# NOTE(review): a_test is passed through unchanged, so unless
# generate_fasttext_file strips stop words itself, the model is trained on
# filtered text but evaluated on unfiltered text -- confirm this is intended.
generate_fasttext_file(remove_stop_test_data_path, a_test)

# Train the model
model = fasttext.train_supervised(input=remove_stop_train_data_path, epoch=50)
# Save the model
# NOTE(review): save_model_path is the same file the baseline model was saved
# to earlier in the notebook, so this call overwrites that model.
model.save_model(save_model_path)

# Test the model
result = model.test(remove_stop_test_data_path)
print(f"Entry tested: {result[0]}")
print(f"Precision: {result[1]}")
print(f"Recall: {result[2]}")

Entry tested: 733
Precision: 0.9699863574351978
Recall: 0.9699863574351978


## Trying different prompts

In [6]:
# Classify an unseen query with the most recently trained model.
# k=-1 with threshold=0.5 asks for every label scoring at least 0.5.
text = "round trip fares from pittsburgh to orlando"
y_pred = model.predict(text, k=-1, threshold=0.5)
labels, probs = y_pred

# Show each returned label next to its score.
for label, prob in zip(labels, probs):
    print(f'{label} : {prob}')

__label__airfare : 1.0000091791152954


In [7]:
# Classify a second, longer unseen query with the same model.
# k=-1 with threshold=0.5 asks for every label scoring at least 0.5.
text = " i want to fly from orlando at 9 am and arrive in pittsburgh at 1110 in the morning"
y_pred = model.predict(text, k=-1, threshold=0.5)
labels, probs = y_pred

# Show each returned label next to its score.
for label, prob in zip(labels, probs):
    print(f'{label} : {prob}')

__label__flight : 0.9892359375953674
