# Configuration Section - Important

In [None]:
colab_flag = True  # Set to True when the notebook runs on Google Colab (enables the Colab setup cell)
abc_finetuned_flag = False  # Set to True to skip the phase-2 training and load the saved "abc_finetuned" weights instead


# Importing Modules

## General Libraries

In [None]:
import pandas as pd
import numpy as np
import pickle

## For Colab

In [None]:
if colab_flag:
  
  !pip install -Uqq fastbook   
  from fastbook import * 
  from google.colab import drive 
  drive.mount('/content/drive')
  %cd /content/drive/My\ Drive/Colab\ Notebooks/

  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  else:
    print(gpu_info)

[K     |████████████████████████████████| 720 kB 7.2 MB/s 
[K     |████████████████████████████████| 1.2 MB 43.2 MB/s 
[K     |████████████████████████████████| 189 kB 78.3 MB/s 
[K     |████████████████████████████████| 46 kB 2.9 MB/s 
[K     |████████████████████████████████| 56 kB 3.5 MB/s 
[K     |████████████████████████████████| 51 kB 359 kB/s 
[?25hMounted at /content/drive
/content/drive/My Drive/Colab Notebooks
Sun Dec 12 11:33:48 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off | 

## Deep Learning

In [None]:
from fastai.text.all import *

# Building the LSTM model with a transfer learning approach

## Data loader object for the ABC headlines

In [None]:
# Load the ABC headlines CSV into a DataFrame.
# NOTE(review): the path is relative, so this relies on the working directory
# set by the `%cd` in the Colab setup cell — confirm when running elsewhere.

abc_df = pd.read_csv("Data/abc/abcnews_date_text.csv")

In [None]:
# Base directory handed to the ABC DataLoaders through `path=path`.
path = Path("/content/drive/My Drive/Colab Notebooks/Data/abc")

In [None]:
# Function to create a L object with all the headlines

def get_abc(df, text_col=1):
    """Collect the headline texts of `df` into a fastai `L`.

    Args:
        df: DataFrame holding the headlines.
        text_col: Positional index of the column that contains the text.
            Defaults to 1, the column the original implementation read.

    Returns:
        An `L` with one entry per row of `df`, in row order (empty when
        `df` has no rows).
    """
    # Extract the whole column in one vectorised step; looking cells up one
    # at a time with `df.iloc[row, col]` is extremely slow on large frames.
    return L(df.iloc[:, text_col].tolist())


In [None]:
# Language-model DataLoaders over the headlines returned by `get_abc`
# (80% train / 20% validation).
# The splitter is seeded so the partition is identical after every kernel
# restart: validation metrics stay comparable between runs and weights that
# are reloaded through `abc_finetuned_flag` are evaluated on the same split
# they were trained against.
# NOTE(review): weights saved before this seed was introduced were trained on
# an unseeded split — retrain once so the saved model matches this partition.
abc_dls = DataBlock(blocks=TextBlock.from_df(text_cols=0, is_lm=True),
                    get_items=get_abc,
                    splitter=RandomSplitter(0.2, seed=42)).dataloaders(abc_df, path=path, bs=128, seq_len=80)

In [None]:
# Display up to three samples from the language-model DataLoaders as a sanity check.
abc_dls.show_batch(max_n=3)

Unnamed: 0,text,text_
0,xxbos 340 handguns surrendered in act xxbos interview brett ratten xxbos greens urge labor to back move to challenge tpv xxbos memorial marks a minutes mayhem xxbos bridge crashes prove baffling xxbos china takes four golds on day four of games xxbos xxunk ram rahim singh is an indian spiritual guru xxbos son of wallace challenge winner announced xxbos ndia wants more power to control costs xxbos federal govt to help davenport aboriginal community xxbos share market wall st dow,340 handguns surrendered in act xxbos interview brett ratten xxbos greens urge labor to back move to challenge tpv xxbos memorial marks a minutes mayhem xxbos bridge crashes prove baffling xxbos china takes four golds on day four of games xxbos xxunk ram rahim singh is an indian spiritual guru xxbos son of wallace challenge winner announced xxbos ndia wants more power to control costs xxbos federal govt to help davenport aboriginal community xxbos share market wall st dow jones
1,entertainment xxbos sa barley growers vote to keep single desk xxbos crab pot plunder on the rise fishers xxbos reds hand brumbies preseason pasting xxbos no confirmation on company tax cut plans xxbos lyon gets behind koschitzke xxbos outback queensland hotspot discovering new plant species xxbos russia blames chechens for moscow bombing xxbos emotional holmes wins ironman title xxbos call for earth hour support xxbos xenophon concerned by lack of health funding xxbos police stand by injured pope patrol officer,xxbos sa barley growers vote to keep single desk xxbos crab pot plunder on the rise fishers xxbos reds hand brumbies preseason pasting xxbos no confirmation on company tax cut plans xxbos lyon gets behind koschitzke xxbos outback queensland hotspot discovering new plant species xxbos russia blames chechens for moscow bombing xxbos emotional holmes wins ironman title xxbos call for earth hour support xxbos xenophon concerned by lack of health funding xxbos police stand by injured pope patrol officer xxbos
2,sa lincoln wines 0602 xxbos cox plate memorable moments xxbos denis xxunk struggles to xxunk banana xxbos driver faces manslaughter charge xxbos mining company eyes peterborough infrastructure xxbos man charged after crash outside police station xxbos transurban boosts profit xxbos carroll ruled out of final replay xxbos doctor not guilty of indecently dealing with xxbos parts of nsw declared disaster zones xxbos governments urged to boost remote health spending xxbos uk election exit polls projected on broadcasting house in london,lincoln wines 0602 xxbos cox plate memorable moments xxbos denis xxunk struggles to xxunk banana xxbos driver faces manslaughter charge xxbos mining company eyes peterborough infrastructure xxbos man charged after crash outside police station xxbos transurban boosts profit xxbos carroll ruled out of final replay xxbos doctor not guilty of indecently dealing with xxbos parts of nsw declared disaster zones xxbos governments urged to boost remote health spending xxbos uk election exit polls projected on broadcasting house in london xxbos


## Phase 1 - Implementing the general language model pre-trained with Wikipedia articles

In [None]:
# Language-model learner on the ABC DataLoaders with the AWD_LSTM architecture,
# reporting accuracy and perplexity; `.to_fp16()` enables mixed-precision training.
learn = language_model_learner(abc_dls, AWD_LSTM, drop_mult=0.3, metrics=[accuracy, Perplexity()]).to_fp16()


## Phase 2 - Fine tuning the general model with the ABC headlines corpus

### First tuning - only the embeddings

In [None]:
# The pretrained model is frozen by default, so only the embeddings are fine-tuned at first.
# The cell does nothing when abc_finetuned_flag is set (saved weights are loaded further down instead).

if not abc_finetuned_flag:

  learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,5.608009,5.44979,0.237,232.709183,17:22


### Unfreezing the model and fine-tuning



In [None]:
if abc_finetuned_flag:
  # Reuse the weights stored by an earlier run under the name "abc_finetuned".
  learn.load("abc_finetuned")
else:
  # Make the whole model trainable, train for 5 epochs and store the result
  # so later sessions can skip this step.
  learn.unfreeze()
  learn.fit_one_cycle(5, 2e-3)
  learn.save("abc_finetuned")

epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.88123,5.02208,0.266117,151.726562,17:48
1,4.69963,4.987836,0.271229,146.618774,17:46


### Saving the encoder

In [None]:
# Store the encoder of the freshly fine-tuned language model as "abc_encoder"
# for the classifier in phase 3. Skipped when abc_finetuned_flag is set.
if not abc_finetuned_flag:

  learn.save_encoder("abc_encoder")

### Generating a news headline

In [None]:
# Ask the language model for `n_words` words, starting from an empty prompt.
text = ""
n_words = 10
prediction = learn.predict(text, n_words, temperature=0.75)

In [None]:
# Show the generated headline.
print(prediction)

tourism group calls for highway upgrade retail sales surge


## Phase 3 - Fine tuning the classifier model

### Creating the Classifier DataLoaders

In [None]:
# Root folder of the financial dataset, read through its 'train' and 'test' subfolders.
# Note: this rebinds `path`, which previously pointed at the ABC data folder.
path = Path("/content/drive/My Drive/Colab Notebooks/Data/financial")

In [None]:
# Describe the classification data first, then build the DataLoaders from it.
# Texts come from the 'train' and 'test' folders, each labelled with the name
# of its parent folder; files below 'test' form the validation set. The
# vocabulary of the ABC language model is reused.
clas_block = DataBlock(
    blocks=(TextBlock.from_folder(path, vocab=abc_dls.vocab), CategoryBlock),
    get_items=partial(get_text_files, folders=['train', 'test']),
    get_y=parent_label,
    splitter=GrandparentSplitter(valid_name='test'),
)
dls_clas = clas_block.dataloaders(path, path=path, bs=10, seq_len=72)

In [None]:
# Display up to three labelled samples from the classifier DataLoaders.
dls_clas.show_batch(max_n=3)

Unnamed: 0,text,category
0,xxbos xxmaj estonian telecoms company xxmaj elisa 's customer numbers cross 400 xxunk xxrep 3 0 xxup xxunk xxunk xxmaj oct 22 xxunk xxup xxunk xxunk xxmaj the xxmaj estonian telecommunications company xxmaj elisa won xxunk 50 xxunk xxrep 3 0 new clients in the nine months of this year xxunk bringing the number to xxunk by the end of xxmaj september xxunk the company said .,positive
1,xxbos xxmaj selects xxmaj third xxmaj party xxmaj logistics xxmaj leader xxmaj xxunk for best xxunk in xxunk class xxmaj technology xxunk xxmaj procurement xxmaj capabilities ; xxmaj leading xxmaj specialty xxmaj packaging xxmaj manufacturer xxmaj employs xxmaj complete xxunk xxmaj solution for xxmaj supply xxmaj chain xxmaj visibility and xxmaj transportation xxmaj efficiency to xxmaj increase xxmaj service xxmaj levels and on xxunk time xxmaj deliveries,positive
2,xxbos narrows to xxup xxunk xxunk m xxunk ' 09 29 xxmaj october 2009 xxunk xxmaj finnish software and hardware developer xxmaj xxunk xxmaj xxunk xxup xxunk : xxup xxunk xxunk or xxup eb xxunk said today that its net loss narrowed to xxup xxunk xxunk m for the first nine months of 2009 from xxup xxunk xxunk m for the same period a year ago .,positive


### Creating the classifier model

In [None]:
# Text classifier on the financial DataLoaders with the AWD_LSTM architecture, tracking accuracy.
# Note: this rebinds `learn`, which previously held the language-model learner.
learn = text_classifier_learner(dls_clas, AWD_LSTM, drop_mult=0.5, metrics=accuracy).to_fp16()

In [None]:
# Load the encoder that was fine-tuned on the ABC headlines in phase 2.
# The language-model learner saved it under Data/abc/models, whereas this
# learner resolves bare names inside Data/financial/models, so the file is
# addressed by its full location (load_encoder appends the ".pth" extension).
abc_encoder_file = Path("/content/drive/My Drive/Colab Notebooks/Data/abc")/"models"/"abc_encoder"
learn.load_encoder(abc_encoder_file)

<fastai.text.learner.TextLearner at 0x7f5cb5c9ba50>

### Fine-Tuning the Classifier

In [None]:
# First pass: one epoch at learning rate 2e-2, run right after loading the encoder.
learn.fit_one_cycle(1, 2e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.661311,0.595786,0.672241,00:02


In [None]:
# Let us progressively unfreeze the model. First the last two layers.
# The slice sets a learning-rate range: the lower bound is the upper bound divided by 2.6**4.
learn.freeze_to(-2)
learn.fit_one_cycle(1, slice(1e-2/(2.6**4),1e-2))

epoch,train_loss,valid_loss,accuracy,time
0,0.648787,0.584434,0.67893,00:02


In [None]:
# Unfreeze one more layer and train another epoch with a smaller learning-rate range.
learn.freeze_to(-3)
learn.fit_one_cycle(1, slice(5e-3/(2.6**4),5e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.562845,0.518606,0.749164,00:03


In [None]:
# Finally unfreeze the whole model and train it for 8 epochs with the smallest learning-rate range.
learn.unfreeze()
learn.fit_one_cycle(8, slice(1e-3/(2.6**4),1e-3))

epoch,train_loss,valid_loss,accuracy,time
0,0.444942,0.509684,0.745819,00:03
1,0.409537,0.441189,0.799331,00:03
2,0.371048,0.429041,0.804348,00:03
3,0.292841,0.424266,0.807692,00:03
4,0.245344,0.402682,0.826087,00:03
5,0.20806,0.390004,0.832776,00:03
6,0.176498,0.397791,0.829431,00:03
7,0.170939,0.405531,0.829431,00:03


### Exporting the model

In [None]:
# Save the trained classifier under the name "abc_financial_inferer".
# NOTE(review): the section is titled "Exporting", but this calls `learn.save`,
# not `learn.export` — confirm which artefact is needed for inference.
learn.save("abc_financial_inferer")

Path('/content/drive/My Drive/Colab Notebooks/Data/financial/models/financial_inferer.pth')