<a href="https://colab.research.google.com/github/NicolasMauge/learning_projects/blob/master/pytorch_language_model/Language_Model_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set the environment

## Installation of PyTorch and the needed repositories

In [0]:
# Fetch the Colab helper scripts, source the init script (its output reports installing
# the PyTorch, PyDrive and kaggle packages), then fetch the project code.
!git clone https://github.com/NicolasMauge/utils_google_colab
!. utils_google_colab/colab_init.sh
!git clone https://github.com/NicolasMauge/learning_projects

Cloning into 'utils_google_colab'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (5/5), done.[K
remote: Total 72 (delta 1), reused 6 (delta 1), pack-reused 66[K
Unpacking objects: 100% (72/72), done.
Install of the PyTorch packages
tcmalloc: large alloc 1073750016 bytes == 0x58772000 @  0x7fb49b21a2a4 0x591a07 0x5b5d56 0x502e9a 0x506859 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x502209 0x502f3d 0x506859 0x504c28 0x502540 0x502f3d 0x507641 0x504c28 0x502540 0x502f3d 0x507641
Install of the PyDrive package
Install of the kaggle package
Cloning into 'learning_projects'...
remote: Enumerating objects: 138, done.[K
remote: Counting objects: 100% (138/138), done.[K
remote: Compressing objects: 100% (104/104), done.[K
remote: Total 206 (delta 54), reused 96 (delta 29), pack-reused 68[K
Receiving objects: 100% (206/206

In [0]:
# Force the local clone to match origin/master, discarding any local changes.
!cd learning_projects && git fetch --all && git reset --hard origin/master

Fetching origin
HEAD is now at 4564b05 index -> self.index


In [0]:
import torch, torch.nn.functional as F
import torch.backends.cudnn as cudnn
import numpy as np

import tqdm

import sys
sys.path.append('utils_google_colab')
sys.path.append('learning_projects/pytorch_language_model')
from colab_utils import download
from colab_utils import upload
import get_data
from utils.utils import load_vocabulary
from models.models import get_language_model

## Download the text corpus and the dictionary itos

`wiki_text.csv` and `vocab_itos.pkl` were produced by the `wikipedia_extract` repository.

In [0]:
from pathlib import Path

corpus_path = Path("wiki_text.csv")
vocab_path = Path("vocab_itos.pkl")

# `split` stays False unless the corpus had to be downloaded in this run;
# it is read later to decide whether the train/valid/test split must be (re)built.
split = False
if not corpus_path.is_file():
    download(corpus_path.name)
    split = True

# The vocabulary file is fetched independently of the corpus.
if not vocab_path.is_file():
    download(vocab_path.name)

Download 16%.
Download 32%.
Download 49%.
Download 65%.
Download 82%.
Download 98%.
Download 100%.
Download 100%.


In [0]:
# Load the vocabulary from disk and report how many entries it holds.
vocab_file = "vocab_itos.pkl"
vocabulary = load_vocabulary(vocab_file)
n_words = len(vocabulary)
print(f"Vocabulary: {n_words} words")

Vocabulary: 80002 words


# Set model

The model defined in `models.py` (code adapted from fastai) is an AWD-LSTM.

In [0]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

emb_size = 400   # embedding dimension
nhid = 1150      # hidden units per recurrent layer
# NOTE(review): the trailing positional arguments (3, 1) are presumably the number of
# layers and the padding-token index, as in fastai's signature — confirm in models.py.
model = get_language_model(n_words, emb_size, nhid, 3, 1)
if device == 'cuda':
    # DataParallel expects the wrapped module's parameters to already live on the GPU,
    # so move the model explicitly (a no-op if it is already there) before wrapping.
    model = torch.nn.DataParallel(model.cuda())
    # Let cuDNN auto-tune its kernels for the input shapes it encounters.
    cudnn.benchmark = True

In [0]:
# Plain SGD over all model parameters, with L2 weight decay.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=0.5,
    weight_decay=3e-4,
)

In [0]:
# Show the GPU model and its current memory usage.
!nvidia-smi

Sun Nov 25 07:33:55 2018       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 396.44                 Driver Version: 396.44                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    69W / 149W |    553MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
+-------

In [0]:
def save_model(filename, to_google_drive=False):
    """Write the global ``model``'s state dict to ``filename``.

    Args:
        filename: path of the checkpoint file to write.
        to_google_drive: when true, the written file is also passed to ``upload``.
    """
    weights = model.state_dict()
    torch.save(weights, filename)
    if not to_google_drive:
        return
    upload(filename)

def load_model(filename, from_google_drive=False):
    """Load a checkpoint written by ``save_model`` into the global ``model``.

    Args:
        filename: path of the checkpoint file to read.
        from_google_drive: when true, ``download(filename)`` is called first.

    The checkpoint tensors are mapped onto the current ``device`` so that a file
    saved on a GPU can still be loaded on a CPU-only runtime. The ``module.`` key
    prefix added by ``torch.nn.DataParallel`` is added or stripped as needed so a
    checkpoint saved from a wrapped model loads into an unwrapped one and vice versa.
    """
    if from_google_drive:
        download(filename)

    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    state_dict = torch.load(filename, map_location=device)

    prefix = "module."
    model_is_wrapped = isinstance(model, torch.nn.DataParallel)
    keys_are_prefixed = all(key.startswith(prefix) for key in state_dict)
    if keys_are_prefixed and not model_is_wrapped:
        state_dict = {key[len(prefix):]: value for key, value in state_dict.items()}
    elif model_is_wrapped and not keys_are_prefixed:
        state_dict = {prefix + key: value for key, value in state_dict.items()}

    model.load_state_dict(state_dict)

# Load data

## Let's split the data into train / valid

In [0]:
# Run this the first time only: `split` is True only when wiki_text.csv was downloaded
# earlier in this session.
# NOTE(review): if wiki_text.csv is already on disk but the split files are missing,
# nothing is split here — confirm this is intended.
if split:
    get_data.get_data.split("wiki_text.csv")

In [0]:
#!ls -l

## Load the data

In [0]:
# On Colab the batch count is kept small because larger values (> 10) fail with a GPU
# out-of-memory error.
# NOTE(review): this comment originally said n_batch is set to 8, but the value passed
# below is 10 — confirm which one is intended.
# The first positional argument (70) is presumably the sequence length (see
# data_class.seq_len in the training loop) — TODO confirm against get_data.py.
filenames = {"train":"wiki_text_train.csv", "test":"wiki_text_test.csv", "valid":"wiki_text_valid.csv"}
data_class = get_data.get_data(70, filenames, n_batch=10, phase="train")

## Some tests

In [0]:
# Called before iterating over data_class in the next cell.
# NOTE(review): presumably (re)builds the batches — confirm in get_data.py.
data_class.set_batch()

In [0]:
# Pull the first two (input, target) pairs from the dataset iterator for inspection.
iter_d = iter(data_class)
val1, t1 = next(iter_d)
val2, t2 = next(iter_d)

In [0]:
def f(x):
    """Return the vocabulary entry stored at index ``x``."""
    word = vocabulary[x]
    return word

# Element-wise index -> word lookup, applied to the first input/target pair.
to_words = np.vectorize(f)
print(f"input: {to_words(val1)}\n")
print(f"target: {to_words(t1)}")

In [0]:
# One optimisation step on the first sampled batch, as a sanity check.
# The step runs on whichever device is in use (it used to be skipped entirely on CPU);
# only the tensor transfer is conditional, mirroring the training loop.
criterion = torch.nn.CrossEntropyLoss()
optimizer.zero_grad()

input_train, target_train = val1, t1
if device == 'cuda':
    input_train = input_train.cuda()
    target_train = target_train.cuda()

# The model returns three values; only the first is used for the loss.
decoded, raw_outputs, outputs = model(input_train)

# Inspect the model output and the targets it is compared against.
print(decoded)
print(decoded.shape)

print(target_train)
print(target_train.shape)

loss = criterion(decoded, target_train)
loss.backward()

optimizer.step()

# Training

In [0]:
# Resume from a previously saved checkpoint, downloading it first.
# NOTE(review): the training loop below restarts its checkpoint counter at 0, so new
# checkpoints are named model_last1.pth, model_last2.pth, ... rather than continuing from 15.
load_model("model_last15.pth", from_google_drive=True)

Download 35%.
Download 70%.
Download 100%.


In [0]:
# Called before the training loop iterates over data_class.
# NOTE(review): presumably (re)builds the batches — confirm in get_data.py.
data_class.set_batch()

In [0]:
# Train for one pass over `data_class`, saving a checkpoint periodically.
last_index = 0   # last 100000-wide bucket of data_class.index that triggered a checkpoint
list_loss = []   # per-batch training loss history
num = 0          # checkpoint counter, used in the checkpoint filename
with tqdm.tqdm(total=data_class.data.shape[0], position=0) as pbar:
    for input_train, target_train in data_class:
        model.zero_grad()
        if device == 'cuda':
            input_train = input_train.cuda()
            target_train = target_train.cuda()

        # The model returns three values; only `decoded` feeds the loss.
        decoded, raw_outputs, outputs = model(input_train)
        loss = F.cross_entropy(decoded, target_train)
        loss.backward()
        # Store a plain float so the history does not keep tensors alive.
        list_loss.append(float(loss))

        optimizer.step()

        pbar.update(data_class.seq_len)
        # Show the running mean as soon as a full window of 100 losses is available.
        if len(list_loss) >= 100:
            mean_loss = np.mean(list_loss[-100:])
            pbar.set_description(f"Loss: {mean_loss}")

        # Checkpoint (and upload) each time data_class.index enters a new 100000-wide bucket.
        bucket = data_class.index // 100000
        if bucket > last_index:
            last_index = bucket
            num += 1
            save_model("model_last" + str(num) + ".pth", to_google_drive=True)
        

# Notes

The model does not converge quickly. Ideas to try:
- retrain the model with no punctuation, no accents and a smaller vocabulary
- do transfer learning from this new model to add back the punctuation, the accents and the full 80,000-word vocabulary
(difficulty: creating a mapping between the embeddings of the model without punctuation, etc. and the new model, duplicating weights for the different words with accents, etc.)