# Configuration Section - Important

In [1]:
# Runtime switch read by the setup cell: True selects the Google Colab branch, False the local branch.
colab_flag = False  # Set it to True if the notebook is run on Colab
# Name of the corpus used to build the language model; it also appears in the names of saved artifacts.
source = "fin_lab_large"  # abc | fin_unlab | fin_lab_bal | fin_lab_large | fin_lab_imp3


# Importing Modules

## General Libraries

In [2]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm

## For Colab

In [3]:
# Environment setup: defines `root` (the project base directory used to build data paths)
# for either Google Colab or the local machine, depending on `colab_flag`.
if colab_flag:
  
  # Colab only: install fastbook, mount Google Drive and move into the notebooks folder.
  !pip install -Uqq fastbook   
  from fastbook import * 
  from google.colab import drive 
  drive.mount('/content/drive')
  %cd /content/drive/My\ Drive/Colab\ Notebooks/

  # Same folder as the `%cd` target above, kept as a string prefix for building paths.
  root = "/content/drive/My Drive/Colab Notebooks/"

  # Capture the output lines of `nvidia-smi`; if the text contains 'failed',
  # report that no GPU is attached, otherwise print the captured GPU report.
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  else:
    print(gpu_info)
  
else:

  # Local run: hard-coded project folder on the author's machine (trailing slash required,
  # because later cells concatenate sub-paths directly onto `root`).
  root = "C:/Users/user/ML/Ironhack/GitHub/Final_Project/"

## Deep Learning

In [4]:
from fastai.text.all import *

# Functions

In [5]:
def get_text(df, text_col=0):
    """Collect every value of one DataFrame column into an `L` object.

    Args:
        df: pandas DataFrame holding the text records.
        text_col: Zero-based position of the column to read (default 0).

    Returns:
        An `L` with one item per row of `df`, in row order.
    """
    # One positional column slice replaces a separate `iloc` scalar lookup per row.
    # For text (object dtype) columns the resulting items are the same Python strings.
    return L(df.iloc[:, text_col].tolist())

# Building the Language Model (LM) with a transfer learning approach

## Data loader object for the LM

In [6]:
# Build the language-model DataLoaders (`dls_lm`) for the corpus selected by `source`.
# All data locations are anchored on `root` so the cell does not depend on the working directory.

if source in ["fin_lab_bal", "fin_lab_large", "fin_lab_imp3"]:  # The text data is stored in folders

  # "fin_lab_<name>" maps to the folder ".../Data/financial/labelled/<name>"
  path = Path(root + f"Data/financial/labelled/{source[len('fin_lab_'):]}")

  get_text_path = partial(get_text_files, folders=['train', 'test'])

  dls_lm = DataBlock(blocks=TextBlock.from_folder(path, is_lm=True),
                      get_items=get_text_path, splitter=RandomSplitter(0.2)).dataloaders(path, path=path, bs=128, seq_len=80)

elif source in ["abc", "fin_unlab"]:  # The text data is stored in a single CSV file

  df = pd.read_csv(root + f"Data/csv/{source}.csv")
  path = Path(root + "Data/temp")

  dls_lm = DataBlock(blocks=TextBlock.from_df(text_cols=0, is_lm=True),
                     get_items=get_text, splitter=RandomSplitter(0.2)).dataloaders(df, path=path, bs=128, seq_len=80)

else:

  # Fail here with a clear message instead of a NameError on `dls_lm` in a later cell.
  raise ValueError(
      f"Unknown source {source!r}: expected one of "
      "abc | fin_unlab | fin_lab_bal | fin_lab_large | fin_lab_imp3"
  )

Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `n_workers` has to be changed to 0 to avoid getting stuck


In [7]:
# Saving the language model dls
#
# The target path is anchored on `root`, like the data paths, so the file lands in the
# project's Data/dls folder regardless of the current working directory.
# The `with` block closes the file on exit, so no explicit close() is needed.

with open(root + f"Data/dls/dls_lm_{source}", "wb") as f:

    pickle.dump(dls_lm, f)

In [8]:
# Sanity check: display a sample of one language-model batch, capped at max_n=3 rows.
dls_lm.show_batch(max_n=3)

Unnamed: 0,text,text_
0,"xxbos xxmaj the period 's sales dropped to xxup eur 30.6 million from xxup eur xxunk million , according to the interim report , released today . xxbos xxmaj the group 's operating loss was xxup eur 0.8 mn , down from a profit of xxup eur 2.5 mn in 2004 . xxbos xxmaj neste xxmaj oil xxmaj corp . has signed long - term xxunk contracts with xxmaj xxunk xxmaj oy and xxmaj xxunk xxmaj xxunk xxmaj oy ,","xxmaj the period 's sales dropped to xxup eur 30.6 million from xxup eur xxunk million , according to the interim report , released today . xxbos xxmaj the group 's operating loss was xxup eur 0.8 mn , down from a profit of xxup eur 2.5 mn in 2004 . xxbos xxmaj neste xxmaj oil xxmaj corp . has signed long - term xxunk contracts with xxmaj xxunk xxmaj oy and xxmaj xxunk xxmaj xxunk xxmaj oy , both"
1,"mn , up from xxup eur xxunk mn in 2008 . xxbos xxmaj finnish automation solutions developer xxmaj cencorp xxmaj corporation ( xxup omx xxmaj helsinki : xxup xxunk ) said on xxmaj friday ( 27 xxmaj june ) that it has completed employee negotiations xxunk a reorganisation of its operations . xxbos xxmaj operating profit rose to xxup eur 4.7 mn from xxup eur 3.6 mn . xxbos xxmaj in the xxmaj baltic countries , development of operations and",", up from xxup eur xxunk mn in 2008 . xxbos xxmaj finnish automation solutions developer xxmaj cencorp xxmaj corporation ( xxup omx xxmaj helsinki : xxup xxunk ) said on xxmaj friday ( 27 xxmaj june ) that it has completed employee negotiations xxunk a reorganisation of its operations . xxbos xxmaj operating profit rose to xxup eur 4.7 mn from xxup eur 3.6 mn . xxbos xxmaj in the xxmaj baltic countries , development of operations and reorganisation"
2,"xxmaj it is the most xxunk xxunk to use if xxunk 're holding the phone with one hand . xxbos xxmaj that would be an increase from estimated sales of xxunk million last year . xxbos ` ` xxmaj our customers now have the xxunk to make xxunk for all the services they xxunk at one location , '' said xxmaj xxunk xxmaj xxunk , in - xxunk of xxmaj finnair 's xxmaj internet ( sales ) . xxbos xxup","it is the most xxunk xxunk to use if xxunk 're holding the phone with one hand . xxbos xxmaj that would be an increase from estimated sales of xxunk million last year . xxbos ` ` xxmaj our customers now have the xxunk to make xxunk for all the services they xxunk at one location , '' said xxmaj xxunk xxmaj xxunk , in - xxunk of xxmaj finnair 's xxmaj internet ( sales ) . xxbos xxup helsinki"


## Phase 1 - Implementing the general language model pre-trained with Wikipedia articles

In [9]:
# Create the AWD_LSTM language-model learner on `dls_lm`, tracking accuracy and perplexity,
# then switch it to mixed-precision (fp16) training.
learn = language_model_learner(
    dls_lm,
    AWD_LSTM,
    drop_mult=0.3,
    metrics=[accuracy, Perplexity()],
)
learn = learn.to_fp16()




## Phase 2 - Fine tuning the general model with the target corpus selected by `source`

### First tuning - only the embeddings

In [10]:
# The pretrained model is frozen by default, so only the embeddings are fine-tuned at first.
# One epoch is run here, with 2e-2 passed as the learning-rate argument.

learn.fit_one_cycle(1, 2e-2)



epoch,train_loss,valid_loss,accuracy,perplexity,time
0,4.43547,3.897859,0.295455,49.296795,00:37


### Unfreezing the model and fine-tuning



In [11]:
# Name under which the fully fine-tuned language model is stored.
finetuned = f"{source}_finetuned"

# Unfreeze the model and train for 5 epochs with 2e-3 as the learning-rate argument.
learn.unfreeze()
learn.fit_one_cycle(5, 2e-3)

# Persist the fine-tuned model; as the cell's last expression, the saved path is displayed.
learn.save(finetuned)




epoch,train_loss,valid_loss,accuracy,perplexity,time
0,3.64828,3.276807,0.377002,26.491041,00:56
1,3.30007,2.992637,0.411191,19.938194,00:56
2,3.053743,2.903553,0.425089,18.238842,00:55
3,2.862699,2.863342,0.426567,17.519981,00:55
4,2.726235,2.85288,0.428671,17.337646,00:55


Path('C:/Users/user/ML/Ironhack/GitHub/Final_Project/Data/financial/labelled/large/models/fin_lab_large_finetuned.pth')

### Saving the encoder

In [12]:
# Save only the encoder of the fine-tuned language model, under the name "<source>_encoder".
encoder = f"{source}_encoder"
learn.save_encoder(encoder)


### Generating a heading

In [None]:
# Generate text with the fine-tuned language model.
text = ""  # prompt handed to the model; empty here, so generation starts with no seed text
n_words = 30  # second argument to learn.predict; presumably the number of words to generate
prediction = learn.predict(text, n_words, temperature=0.9)

In [14]:
# Show the text generated in the previous cell.
print(prediction)

Passenger traffic in India fell by 2.2 % to EUR 3.6 mn in the third quarter of 2009 from EUR 1.7 mn . The
