# Install everything needed for training

In [None]:
# Python dependencies: Transformers (plus its DeepSpeed extra) and mpi4py.
!pip3 install transformers
!pip install mpi4py
!pip install transformers[deepspeed]
# System package libaio-dev (presumably needed for DeepSpeed's async I/O used by
# NVMe offload - confirm). -y answers apt's confirmation prompt so the cell also
# completes when run non-interactively (e.g. "Run All").
!apt-get install -y libaio-dev

In [None]:
# Install the Weights & Biases client and authenticate it; the training script
# reports to it via report_to="wandb".
# NOTE(review): `wandb login` presumably asks for an API key interactively
# unless one is already configured - confirm before running unattended.
!pip install wandb
!wandb login

# DeepSpeed config, useful for any model

Read more about what each parameter means [here](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training).

In [None]:
%%writefile ds_config_gpt_j.json
{
  "train_batch_size": 15,
  "fp16": {
    "enabled": true,
    "min_loss_scale": 1,
    "opt_level": "O3"
  },
  "zero_optimization": {
    "stage": 3,
    "offload_param": {
      "device": "nvme",
      "nvme_path": "/home/deepschneider/deepspeed",
      "buffer_count": 5,
      "buffer_size": 1e8,
      "max_in_cpu": 1e9
    },
    "offload_optimizer": {
      "device": "nvme",
      "nvme_path": "/home/deepschneider/deepspeed",
      "buffer_count": 4,
      "pipeline_read": false,
      "pipeline_write": false,
      "pin_memory": true
    },
    "allgather_partitions": true,
    "allgather_bucket_size": 5e8,
    "contiguous_gradients": true,
    "overlap_comm": true,
    "aio": {
      "block_size": 1048576,
      "queue_depth": 8,
      "thread_count": 1,
      "single_submit": false,
      "overlap_events": true
    }
  },
  "optimizer": {
    "type": "AdamW",
    "params": {
      "lr": 5e-05,
      "betas": [
        0.9,
        0.999
      ],
      "eps": 1e-08
    }
  },
  "scheduler": {
    "type": "WarmupLR",
    "params": {
      "warmup_min_lr": 0,
      "warmup_max_lr": 5e-05,
      "warmup_num_steps": 100
    }
  }
}

In [None]:
#@markdown godbless 2ch data
# Download the 2ch posts dataset from Google Drive with gdown.
# NOTE(review): the preprocessing cell reads '/content/2ch.csv', so this is
# presumably expected to save a file named 2ch.csv into /content - confirm.
!gdown https://drive.google.com/uc?id=1sYdutKVcXgcg-lTtanv1WkGQ4cTYNz0d

In [None]:
#@markdown simple data preprocessing
import re
import json
from sklearn.model_selection import train_test_split
import pandas as pd



from io import StringIO
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    """HTML parser that throws the markup away and keeps only the text.

    Every text fragment reported by the parser is appended to an in-memory
    buffer; ``get_data`` returns the buffer contents.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()
        # Buffer that accumulates the text fragments in document order.
        self.text = StringIO()
        # Character references (e.g. "&amp;") are decoded before handle_data.
        self.convert_charrefs = True
        # Plain attribute; nothing in the visible code reads it.
        self.strict = False

    def handle_data(self, d):
        """Store one fragment of text content found between tags."""
        self.text.write(d)

    def get_data(self):
        """Return everything collected so far as one string."""
        return self.text.getvalue()

def strip_tags(html):
    """Return ``html`` with all markup removed, keeping only its text.

    Args:
        html: HTML fragment as a string.

    Returns:
        The text content of ``html`` as collected by ``MLStripper``
        (character references decoded).
    """
    s = MLStripper()
    s.feed(html)
    # close() makes the parser process whatever it is still buffering.
    # Without it, trailing text that contains an unterminated "&"
    # (e.g. "AT&T") is held back waiting for more input and never reaches
    # handle_data, so it would be silently missing from the result.
    s.close()
    return s.get_data()


# Load the 'post' column of the downloaded CSV as a pandas Series.
# NOTE(review): the absolute path assumes the file lives in /content
# (presumably Colab's working directory) - confirm when running elsewhere.
texts = pd.read_csv('/content/2ch.csv')['post']

def build_text_files(datas, dest_path):
    """Clean the usable posts in ``datas`` and write them to ``dest_path``.

    Entries that are not strings (for example missing values) or that are
    10 characters or shorter are skipped. Each remaining post has its HTML
    stripped and a few leftover fragments removed; the cleaned posts are
    written back to back, with no separator, as one UTF-8 text file.

    Args:
        datas: Iterable of raw posts.
        dest_path: Path of the text file to create (overwritten if present).
    """
    # Fragments removed after tag stripping, applied in this order.
    # NOTE(review): removing the bare substring 'br' also deletes those two
    # letters from ordinary words; presumably it targets leftover <br> tags -
    # confirm before changing, since it alters the training text.
    leftovers = (
        'br',
        '<span class="spoiler">',
        '</strong>',
        '<strong>',
        '<span class="unkfunc">&g',
    )

    cleaned = []
    for text in datas:
        if not isinstance(text, str) or len(text) <= 10:
            continue
        post = strip_tags(text)
        for fragment in leftovers:
            post = post.replace(fragment, '')
        cleaned.append(post)

    # The context manager guarantees the file is flushed and closed; the
    # explicit encoding matches the UTF-8 the training script's TextDataset
    # is expected to read. join() avoids quadratic string concatenation.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write(''.join(cleaned))

# Hold out 15% of the posts for evaluation. The fixed random_state makes the
# split (and therefore both text files) identical on every run.
train, test = train_test_split(texts, test_size=0.15, random_state=42)

build_text_files(train,'train_dataset.txt')
build_text_files(test,'test_dataset.txt')

# Row counts of the split itself, i.e. before build_text_files drops
# non-string or very short posts.
print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

In [None]:
%%writefile train.py

import os

# Environment that a distributed launcher would normally provide, hard-coded
# here for one process: rank 0 (the main GPU) in a world of size 1.
# MASTER_PORT has to be open when training with DDP.
os.environ.update({
    'MASTER_ADDR': 'localhost',
    'MASTER_PORT': '9994',
    'RANK': "0",
    'LOCAL_RANK': "0",  # for ddp
    'WORLD_SIZE': "1",
    # Original note on this setting: "uncomment for large files".
    "TOKENIZERS_PARALLELISM": "false",
})

import pandas as pd
import torch
from torch.utils.data import Dataset, random_split
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
from transformers import TextDataset,DataCollatorForLanguageModeling

# Seed torch's global RNG.
torch.manual_seed(42)

# Load the ruGPT-3 large tokenizer and set its BOS / EOS / PAD special tokens.
tokenizer = AutoTokenizer.from_pretrained("sberbank-ai/rugpt3large_based_on_gpt2", 
                                          bos_token='<|startoftext|>', eos_token='<|endoftext|>', pad_token='<|pad|>')
# Load the pretrained causal-LM weights and move the whole model to the GPU.
# NOTE(review): the model is created before TrainingArguments(deepspeed=...)
# and moved to CUDA eagerly; with ZeRO stage 3 + offload configured this may
# bypass partitioned loading - confirm against the HF DeepSpeed integration docs.
model = AutoModelForCausalLM.from_pretrained("sberbank-ai/rugpt3large_based_on_gpt2").cuda()


# The special tokens passed above may have grown the vocabulary, so resize the
# embedding matrix to the tokenizer's current size.
model.resize_token_embeddings(len(tokenizer))








# Text files to train and evaluate on; these are the same file names the
# notebook's preprocessing cell passes to build_text_files.
train_path = 'train_dataset.txt'
test_path = 'test_dataset.txt'

def load_dataset(train_path, test_path, tokenizer, block_size=128):
    """Build the train/test datasets and the collator for causal-LM training.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the evaluation text file.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: ``block_size`` passed to each ``TextDataset``. Defaults
            to 128, the value that was previously hard-coded for both files.

    Returns:
        A ``(train_dataset, test_dataset, data_collator)`` tuple.
    """
    # Both datasets are built the same way; only the source file differs.
    train_dataset, test_dataset = (
        TextDataset(tokenizer=tokenizer, file_path=path, block_size=block_size)
        for path in (train_path, test_path)
    )

    # mlm=False: no masked-language-model objective (the model is a causal LM).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator



# Build the datasets and the collator from the two text files.
train_dataset,test_dataset,data_collator = load_dataset(train_path,test_path,tokenizer)

# The batch size (15), warmup steps (100) and fp16 flag below have matching
# entries in ds_config_gpt_j.json (train_batch_size, warmup_num_steps,
# fp16.enabled); keep both places in sync when changing them.
# NOTE(review): the JSON optimizer params set no weight_decay - confirm that
# weight_decay=0.01 given here is actually applied under DeepSpeed.
training_args = TrainingArguments(output_dir='deepspeed',
                                  num_train_epochs=5,
                                  logging_steps=300,
                                  save_steps=3000,
                                  per_device_train_batch_size=15,
                                  per_device_eval_batch_size=15,
                                  warmup_steps=100,
                                  weight_decay=0.01,
                                  fp16=True,
                                  report_to="wandb",
                                  deepspeed='ds_config_gpt_j.json')
trainer = Trainer(model=model, args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=test_dataset)
trainer.train()

# Checkpoints are only written every `save_steps` steps, so the weights from
# the end of training would otherwise be lost unless the last step happens to
# land on a multiple of 3000. Save the final model explicitly.
trainer.save_model()
# The tokenizer was customised with BOS/EOS/PAD tokens and the embeddings were
# resized to match it, so store it next to the model for later reloading.
tokenizer.save_pretrained(training_args.output_dir)

In [None]:
# Run the training script written by the %%writefile cell above as a single
# Python process (train.py sets WORLD_SIZE=1 itself).
!python3 train.py