In [None]:
import numpy as np  
import pandas as pd
from pathlib import Path
from fastai.text.all import *

In [None]:
### Load training and testing data

In [None]:
# Read the pre-split spam dataset from CSV files in the working directory.
df_train = pd.read_csv('spam_202302101016_training.csv')
df_test = pd.read_csv('spam_202302101016_testing.csv')

# We can train using the whole dataset and use valid_pct to generate validation data instead.
# ignore_index=True renumbers the rows so the combined frame has no duplicate index labels
# (each CSV is read with its own default 0..n-1 index).
# NOTE(review): df_test is folded into the training data here, so any later predictions on
# df_test are made on rows the model may have been trained on — confirm this is intended.
df = pd.concat([df_train, df_test], ignore_index=True)

# Preview the training rows; this must be the last expression of the cell to be rendered.
df_train.head()

In [None]:
### Part 1: Train a next-word language model to use as our base

In [None]:
# Language-model data loaders built from the combined frame: 10% of the rows are
# held out for validation and batches hold 64 items. `is_lm=True` requests
# language-model loaders rather than classification loaders.
dls_lm = TextDataLoaders.from_df(
    df,
    is_lm=True,
    text_col='text',
    label_col='label',
    valid_pct=0.10,
    bs=64,
)

In [None]:
# Build the language-model learner on the AWD_LSTM architecture from the
# language-model loaders, tracking accuracy during training.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    metrics=accuracy,
    drop_mult=0.3,
)

In [None]:
# Run the learning-rate finder to help choose a learning rate for training.
# NOTE(review): the result is not captured; the fit call that follows uses a
# hard-coded 2e-3, so check that value against the finder's output.
learn.lr_find()

In [None]:
### Note: This part can be really slow - you can stop after 2 epochs instead of 5
# One-cycle training of the language model: 5 epochs, peak learning rate 2e-3.
learn.fit_one_cycle(n_epoch=5, lr_max=2e-3)


In [None]:
# Save the language model's encoder under the name 'language_model' so the
# classifier in Part 2 can load it with load_encoder('language_model').
learn.save_encoder('language_model')

In [None]:
### Part 2: Training for categorisation (spam or ham)

In [None]:
# Classification data loaders over the same frame and columns as the language
# model (10% validation split, batches of 64). The language model's vocabulary
# is passed in, presumably so token ids match the saved encoder — confirm
# against the fastai docs.
dls_clas = TextDataLoaders.from_df(
    df,
    text_col='text',
    label_col='label',
    text_vocab=dls_lm.vocab,
    valid_pct=0.10,
    bs=64,
)

In [None]:
# Build the spam/ham classifier learner on the AWD_LSTM architecture, tracking
# accuracy. This rebinds `learn`, replacing the language-model learner.
learn = text_classifier_learner(
    dls_clas,
    AWD_LSTM,
    metrics=accuracy,
    drop_mult=0.5,
)

In [None]:
# Load our saved encoder ('language_model', written by save_encoder in Part 1)
# into the new categorisation model. The return value is assigned back to
# `learn`, which also keeps the cell from displaying it.
learn = learn.load_encoder('language_model')

In [None]:
# Run the learning-rate finder to help choose a learning rate for the classifier.
# NOTE(review): the result is not captured; the fit calls that follow use
# hard-coded rates (2e-3, then 3e-3), so check them against the finder's output.
learn.lr_find()

In [None]:
### Note: This part can be slow
# First classifier training stage: 2 epochs of one-cycle training, peak learning rate 2e-3.
learn.fit_one_cycle(n_epoch=2, lr_max=2e-3)

In [None]:
### Note: This part can be slow (more training)
# Second classifier training stage. freeze_to(-2) changes which layer groups are
# trainable (presumably leaving only the last two groups unfrozen — confirm
# against the fastai docs), then 3 more epochs run at a peak learning rate of 3e-3.
learn.freeze_to(-2)
learn.fit_one_cycle(n_epoch=3, lr_max=3e-3)

In [None]:
# Re-freeze the learner now that training is finished.
# NOTE(review): no further training follows in this notebook, so this looks
# optional before export — confirm whether it is needed.
learn.freeze()

In [None]:
### Exporting the model

In [None]:
# Export the trained learner to 'models/awd_lstm_fully_trained_export' so it can
# be reloaded with load_learner using the same path.
learn.export('models/awd_lstm_fully_trained_export')

In [None]:
### Loading the model

In [None]:
# Reload the exported learner from disk, rebinding `learn` to the loaded copy.
# NOTE(review): fastai's load_learner is pickle-based as far as I know — only
# load export files from a trusted source.
learn = load_learner('models/awd_lstm_fully_trained_export')

In [None]:
# Single prediction on a conversational English message (presumably a ham example).
learn.predict('It was snow day in Tokyo. I\'m always at outside during work so that was awful. Do you like snow? Speaking of snow, I remembered "informer" though...🤔')

In [None]:
# Single prediction on a Chinese promotional message containing links (presumably a spam example).
learn.predict('打扰大家了，打个广告！S9是一个可帮助社区代币销毁的平台 平台资金雄厚，持有国际牌照！ 注意事项：目前只开放海外用户，请开启境外VPN访问！ 官方网址: https://s9.com TG群: https://t.me/S9Coin_cn https://s9app.net/ZcTzak.png')

In [None]:
### Single predictions are shown above; next, reload the test set for batch predictions

In [None]:
# Re-read the test CSV (the same file loaded at the top of the notebook) so the
# inference cells below have `df_test` available.
df_test = pd.read_csv('spam_202302101016_testing.csv')

In [None]:
### Batch processing
# Build a test DataLoader from the test frame via the learner's own loaders,
# then run predictions over the whole set in one call.
dl_test = learn.dls.test_dl(df_test)
preds = learn.get_preds(dl=dl_test)

In [None]:
# Display the raw result of get_preds for the test set.
# NOTE(review): get_preds presumably returns a (predictions, targets) pair rather
# than predictions alone — confirm before indexing into `preds`.
preds