In [None]:
#VQ-GAN 1024
# Selects which pretrained VQGAN is loaded further down: the name is substituted
# into chk_points/vqgan_imagenet_{Model}.yaml and chk_points/vqgan_imagenet_{Model}.ckpt.
# NOTE(review): the tokenizer cell registers exactly 1024 image tokens, so picking a
# 16384 variant presumably requires changing `video_tokens_cnt` as well - confirm.
Model = "f16_1024" #param ["f16_1024", "f16_16384", "f16_16384_hf"]
import cv2
import torch
import yaml
import torch
from PIL import Image

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import clear_output
from omegaconf import OmegaConf
from taming.models.vqgan import VQModel

def load_config(config_path, display=False):
  """Load an OmegaConf configuration from ``config_path`` and return it.

  When ``display`` is truthy, the configuration is also printed as YAML.
  """
  cfg = OmegaConf.load(config_path)
  if not display:
    return cfg
  as_plain = OmegaConf.to_container(cfg)
  print(yaml.dump(as_plain))
  return cfg

def load_vqgan(config, ckpt_path=None):
  """Build a ``VQModel`` from ``config.model.params`` and return it in eval mode.

  Args:
    config: configuration object exposing ``config.model.params`` (keyword
      arguments for ``VQModel``).
    ckpt_path: optional checkpoint file. Its ``"state_dict"`` entry is loaded
      non-strictly; any missing or unexpected keys are reported on stdout
      instead of being silently dropped.

  Returns:
    The model after ``.eval()``.
  """
  model = VQModel(**config.model.params)
  if ckpt_path is not None:
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    sd = torch.load(ckpt_path, map_location="cpu")["state_dict"]
    missing, unexpected = model.load_state_dict(sd, strict=False)
    if missing or unexpected:
      # strict=False hides mismatches; surface them so a wrong checkpoint
      # or config does not go unnoticed.
      print(f"load_vqgan: {len(missing)} missing and {len(unexpected)} unexpected keys in {ckpt_path}")
  return model.eval()

def preprocess_vqgan(x, roll=True):
  """Apply ``2*x - 1`` (maps [0, 1] onto [-1, 1]) and wrap the result in a torch tensor.

  With ``roll`` set, axis 3 is moved to position 1 before the conversion
  (for a 4-D array this reorders axes 0,1,2,3 into 0,3,1,2).
  """
  scaled = 2.*x - 1.
  arranged = np.moveaxis(scaled, 3, 1) if roll else scaled
  return torch.Tensor(arranged)

def preprocess(x, permt=True):
  """Clip values to [-1, 1] and rescale them to [0, 1].

  With ``permt`` set, ``x`` is first permuted with ``(0, 2, 3, 1)`` and
  converted to a NumPy array via ``.numpy()``; otherwise ``x`` is used as given.
  """
  arr = x.permute(0,2,3,1).numpy() if permt else x
  clipped = np.clip(arr, -1., 1.)
  return (clipped + 1.)/2.

def custom_to_pil(x):
  """Turn an array with values in [-1, 1] into an RGB PIL image.

  Values are clipped to [-1, 1], rescaled to [0, 1], multiplied by 255 and
  cast to ``uint8`` before ``Image.fromarray``; non-RGB results are converted.
  """
  unit = (np.clip(x, -1., 1.) + 1.)/2.
  pixels = (255*unit).astype(np.uint8)
  img = Image.fromarray(pixels)
  return img if img.mode == "RGB" else img.convert("RGB")

# Load the config and weights of the VQGAN selected by `Model` and move the
# model to the GPU (this line requires a CUDA device).
vq_conf = load_config(f"chk_points/vqgan_imagenet_{Model}.yaml", display=False)
vq_model = load_vqgan(vq_conf, ckpt_path=f"chk_points/vqgan_imagenet_{Model}.ckpt").to('cuda')

# The string literal below is a disabled encode/decode sanity check kept for
# reference; it is never executed. It refers to a `DS` object that is not
# defined anywhere in the visible notebook.
'''
sz = []

plt.figure(figsize=(20, 40))
img_rec = []
for i in range(1):
  #quant_states, indices = V_encoder.encode(DS.obs[i+2][0])
  x = preprocess_vqgan(DS.obs[i+2])
  with torch.no_grad():
    z, _, [_, _, ind] = vq_model.encode(x.to('cuda'))
    b,c,h,w = z.shape
    nz = vq_model.quantize.get_codebook_entry(ind, (b,h,w,c))
    rec = vq_model.decode(nz).detach().cpu()
    sz.append(h*w)
  #print(rec.shape)
  img_rec.append(preprocess(rec))

for i in range(1):
  for j in range(1):
    plt.subplot(10, 6, i*6+j+1)
    plt.axis("off")
    plt.imshow(DS.obs[j+2][i])
    plt.title(f'origin {DS.obs[j+2][i].shape}')
  for j in range(1):
    plt.subplot(10, 6, i*6+j+4)
    plt.axis("off")
    plt.imshow(img_rec[j][i])
    plt.title(f'sintetic {sz[j]} token')

plt.show();
'''

In [None]:
import time
#time.sleep(3600*7)#6:30

In [None]:
# Target size passed to cv2.resize for every image/frame before VQGAN encoding.
# 144 = 16*9; presumably chosen so that an f16 model yields a 9x9 token grid - TODO confirm.
imsize = [16*9, 16*9]

In [None]:
import os

In [None]:
# `Path` is imported here because this cell runs before the cell that imports
# it; without this import a fresh-kernel "Run All" fails with NameError.
from pathlib import Path

# Entries of ./data/imgs ordered newest-first by modification time.
lst = sorted(Path("./data/imgs").iterdir(), key=os.path.getmtime)
lst.reverse()
lst[:10]

In [None]:
import codecs
from pathlib import Path

# Replace every `<<image file name>>` placeholder in the description file with
# the VQGAN token string of that image (`<id><id>...`).
path = './data/imgs_descs.txt'
#path = './data/image_annotations_plans.txt'#################
# codecs.open performs no newline translation, so '\r\n' endings are kept and
# can still be matched by the '<END>\r\n' replacement in a later cell.
with codecs.open(f'{path}', 'r', 'utf8', errors='ignore') as f:
    text = f.read()

p = Path("./data/imgs")
i = 0
for img_name in lst:#p.rglob("*"):
    i += 1
    # Cheap probabilistic early exit: roughly once per 1000 iterations, stop
    # when no '<<' placeholder is left (scanning `text` every time is costly).
    if np.random.rand()<0.001:
        if not ('<<' in text):
            break
    # Use the bare file name; the previous 'data\imgs\\' prefix stripping only
    # worked with Windows path separators and relied on an invalid escape.
    img_name_short = Path(img_name).name
    if img_name_short in text:
        # read and decode the picture
        img_bgr = cv2.imread(str(img_name))
        if img_bgr is None:
            # cv2.imread returns None for unreadable/non-image entries.
            print('skip unreadable image', img_name)
            continue
        img_orig = np.array(cv2.resize(img_bgr, imsize), dtype=np.float32)/255.
        # OpenCV loads BGR; reverse the channel axis to get RGB.
        img_orig = np.ascontiguousarray(img_orig[:, :, ::-1])
        img = preprocess_vqgan(np.stack(1*[img_orig]), True)

        # Inference only: avoid building an autograd graph on the GPU.
        with torch.no_grad():
            z, _, [_, _, ind] = vq_model.encode(img.to('cuda'))
        ind.squeeze_()
        b,c,h,w = z.shape
        token_string = '<' + '><'.join( [str(el) for el in list(ind.detach().cpu().numpy())] ) + '>'

        # Occasionally show the original next to its reconstruction; the decode
        # is only needed for this preview, so it is done only here.
        if np.random.rand()<0.002:
            with torch.no_grad():
                nz = vq_model.quantize.get_codebook_entry(ind, (b,h,w,c))
                rec = vq_model.decode(nz).detach().cpu()
            print('i', i)
            plt.imshow(img_orig)
            plt.show()
            plt.imshow(preprocess(rec)[0])
            plt.show()

        text = text.replace(f'<<{img_name_short}>>', token_string)

#text

In [None]:
# Put the <OUT> separator after each prompt keyword, start a new <IN> after
# every '<END>\r\n', and open the whole text with <IN>.
for marker, replacement in (
    ('description:', 'description:<OUT>'),
    ('forecast vars:', 'forecast vars:<OUT>'),
    ('forecast img:', 'forecast img:<OUT>'),
    ('plan:', 'plan:<OUT>'),
    ('<END>\r\n', '<END>\r\n<IN>'),
):
    text = text.replace(marker, replacement)
text = '<IN>' + text
#text

In [None]:
# Add the video dataset
import tqdm
from subprocess import Popen, PIPE
import torchvision.transforms as T
from base64 import b64encode
from IPython.display import HTML
from PIL import Image, ImageDraw

# Label vocabularies. `Semantic` is indexed with values of `t` in the video loop
# below; `Actions` is not referenced in the visible part of the notebook
# (presumably it names the entries of `a` - TODO confirm).
Actions = ['No','Fwd','Bck','Rgt','Lft','Rsf','Lsf']
Semantic = ['-','Barrel','Picture','Boxes','Vine box','Market','Gate','Door']
# The file holds four objects; the third one is discarded. `v` is later indexed
# as v[sequence, step] (frames) and `t` as t[sequence, step] (index into `Semantic`).
# NOTE(review): torch.load unpickles the file - only load trusted data.
# NOTE: assigning to `_` shadows IPython's "last output" variable.
a,t,_,v = torch.load("./data/test_data.pt")

In [None]:
# Display the first object loaded from ./data/test_data.pt.
a

In [None]:
# Display the second object loaded from ./data/test_data.pt (used below as indices into `Semantic`).
t

In [None]:
# Sample every `step`-th frame of each sequence, encode it with the VQGAN and
# append one "<IN>tokens description<OUT> label<END>" record per frame to `text`.
step = 12
SHOW_PREVIEW = False  # set True to plot each sampled frame and its reconstruction
video_records = []
for seq_num in range(v.shape[0]):
    print(seq_num, 'from', v.shape[0])
    for pointer_in_seq in range(0, v.shape[1] - step, step):

        img_orig = v[seq_num, pointer_in_seq]
        # Move axis 0 to the end (channels-first -> channels-last for cv2).
        img_orig = np.rollaxis(img_orig.numpy(),0,3)
        # No /255 scaling here - assumes `v` already holds values in [0, 1]; TODO confirm.
        img_orig = np.array(cv2.resize(img_orig, imsize), dtype=np.float32)
        img = preprocess_vqgan(np.stack(1*[img_orig]), True)
        # Inference only: avoid building an autograd graph on the GPU.
        with torch.no_grad():
            z, _, [_, _, ind] = vq_model.encode(img.to('cuda'))
        ind.squeeze_()
        token_string = '<' + '><'.join( [str(el) for el in list(ind.detach().cpu().numpy())] ) + '>'

        item = Semantic[t[seq_num, pointer_in_seq]]
        if SHOW_PREVIEW:
            # The reconstruction is only needed for the preview, so decode here
            # instead of once per frame.
            b,c,h,w = z.shape
            with torch.no_grad():
                nz = vq_model.quantize.get_codebook_entry(ind, (b,h,w,c))
                rec = vq_model.decode(nz).detach().cpu()
            plt.imshow(img_orig)
            plt.show()
            plt.imshow(preprocess(rec)[0])
            plt.show()
        # NOTE(review): image records use 'description:<OUT>' (with a colon) while
        # this one uses 'description<OUT>' - confirm the difference is intended.
        video_records.append(f'<IN>{token_string} description<OUT> {item}<END>\r\n')

# Append all records at once instead of growing the string inside the loop.
text += ''.join(video_records)

In [None]:
# Write the accumulated `text` to data/image_annotations_plans.txt as UTF-8,
# replacing any existing file content (mode 'w').
with codecs.open('data/image_annotations_plans.txt', 'w', 'utf8') as f:
    f.write(text)

In [None]:
# Annotated images: normalise markers and line endings, then start the corpus
# file data/all_txt.txt from scratch with the result.
path = 'data/image_annotations_plans.txt'
with codecs.open(path, 'r', 'utf8') as f:
    texts = f.read()
# Replacements are applied strictly in this order.
for old, new in (
    ('<s>', '<END>'), ('\r\n', '\n'), ('</s>', ''),
    #('<IN>', ''),
    ('>f', '> f'), ('>d', '> d'), ('>p', '> p'), ('>s', '> s'), ('>k', '> k'),
    ('  ', ' '), ('\n', '\n<IN>'), ('<IN><IN>', '<IN>'),
):
    texts = texts.replace(old, new)
print('texts', len(texts))
with codecs.open('data/all_txt.txt', 'w', 'utf8') as f:
    f.write(texts)
    
    
def process_book(path, drop_spaces=True, out_path='data/all_txt.txt'):
    """Normalise one text file and append it to the training corpus.

    The file is read as UTF-8 (undecodable bytes are ignored), ``<s>`` becomes
    ``<END>``, ``\\r\\n`` becomes ``\\n`` and ``</s>`` is removed.

    Args:
        path: source text file.
        drop_spaces: despite the name, this controls newlines: when true every
            ``\\n`` is replaced by a tab, so the whole file becomes a single
            line, and one ``\\n`` is put in front of it (the dataset expects
            that layout). When false the text is appended as is.
        out_path: corpus file to append to; defaults to ``data/all_txt.txt``.

    Returns:
        None. Prints the source path and the resulting text length.
    """
    with codecs.open(str(path), 'r', 'utf8', errors='ignore') as f:
        texts = f.read()
    texts = texts.replace('<s>', '<END>').replace('\r\n', '\n').replace('</s>', '')
    if drop_spaces:
        # that is how our dataset is laid out: one document per line
        texts = '\n' + texts.replace('\n', '\t')
    print(path, 'texts', len(texts))
    with codecs.open(out_path, 'a', 'utf8') as f:
        f.write(texts)
# Corpus sources appended to data/all_txt.txt, in this order. Each entry lists
# the `drop_spaces` values to run for that file, one call per value:
# (True, False) appends the file twice (single-line form, then original form).
book_passes = [
    ('data/wiki_data.txt', (True, False)),  # wiki
    ('data/toy_text_doom_tasks.txt', (False,)),  # /toy_text_doom_tasks
    ('data/formal_logic_textbook.txt', (True, False)),  # logic
    ('data/hpmor.txt', (True, False)),  # hpmor
    ("data/Book 1 - The Philosopher's Stone.txt", (True, False)),
    ('data/Book 2 - The Chamber of Secrets.txt', (True, False)),
    ('data/Book 4 - The Goblet of Fire.txt', (True, False)),
    ('data/Map and Territory.txt', (True, False)),  # rationality.txt
    ('data/doom fanfics.txt', (True, False)),
    ('data/treasure island.txt', (True, False)),
    ('data/robinson crusoe.txt', (True, False)),
    ('data/Sherlock Holmes.txt', (True,)),
    ('data/scrum.txt', (True, False)),
    ('data/military stories.txt', (True, False)),
    ('data/military stories 2.txt', (True, False)),
    ('data/doom wiki.txt', (True, False)),
    ('data/homm.txt', (True, False)),
    ('data/military materials.txt', (True, False)),
    ('data/military materials 2.txt', (True, False)),
    ('data/anatomy.txt', (True, False)),
    ('data/churchill.txt', (True, False)),
    ('data/summary_data.txt', (False,)),
    ('data/dialogues_text.txt', (False,)),
    ('data/chat_data.txt', (False,)),
]
for path, passes in book_passes:
    for collapse in passes:
        process_book(path, drop_spaces=collapse)

In [None]:
# Annotated images with memory: normalise markers and line endings, then
# append the result to the corpus file.
path = 'data/imgs_descs_memory.txt'
with codecs.open(path, 'r', 'utf8') as f:
    texts = f.read()
for old, new in (('<s>', '<END>'), ('\r\n', '\n'), ('</s>', '')):
    texts = texts.replace(old, new)
print('texts', len(texts))
with codecs.open('data/all_txt.txt', 'a', 'utf8') as f:
    f.write(texts)

In [None]:
#cut text
import codecs
# Maximum number of characters kept per line (line ending not counted).
thresh = 100000000
with codecs.open('data/all_txt.txt', 'r', 'utf8', errors='ignore') as f:
    lines = f.readlines()


def _cut_line(line, limit):
    """Truncate the content of ``line`` to ``limit`` characters, keeping its line ending."""
    body = line.rstrip('\r\n')
    return body[:limit] + line[len(body):]


# readlines() keeps each line's terminator, so the pieces are concatenated
# directly; joining them with '\n' again would double every newline.
lines = ''.join(_cut_line(line, thresh) for line in lines)

with codecs.open('data/all_txt_cut.txt', 'w', 'utf8') as f:
    f.write(lines)

In [None]:
# Raises ZeroDivisionError on purpose - presumably a barrier so that "Run All"
# stops after dataset preparation and does not continue into training.
1/0

In [None]:
# Here we load GPT-2 and try to fine-tune it on the cross-domain dataset
import torch
from transformers import GPT2Tokenizer
import codecs
import numpy as np

from trl import AutoModelForCausalLMWithValueHead
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# Name or path handed to `from_pretrained` in the next cell.
# NOTE(review): the save cell further down writes to the same "gpt2_finetuned"
# directory, so this presumably resumes from a previous run - confirm that the
# directory exists on a fresh checkout.
model_name = "gpt2_finetuned"

In [None]:
# 1. load a pretrained model
# The causal LM and its tokenizer are both loaded from `model_name`.
# The commented lines are alternative loaders (TRL value-head model, GPT2Tokenizer)
# that are currently not used.

model = AutoModelForCausalLM.from_pretrained(model_name)
#model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)
#model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)

#tokenizer = GPT2Tokenizer.from_pretrained(model_name)
#tokenizer.pad_token = tokenizer.eos_token
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Train this model. Nehc has an example of how to do it.
# Tokenizer modification:
# add a few tokens of our own markup.

# The whole image-token vocabulary has to go in here.
video_tokens_cnt = 1024
video_tokens = [f'<{idx}>' for idx in range(video_tokens_cnt)]

markup_tokens = ['<IN>','<OUT>','<END>','|PAD|']
special_tokens_dict = {'additional_special_tokens': video_tokens + markup_tokens}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# The embedding matrix must cover the newly added token ids.
model.resize_token_embeddings(len(tokenizer))
print(tokenizer.all_special_tokens)
tokenizer.pad_token = '|PAD|'

In [None]:
# Dataset loading
# In theory this is an outdated dataset format, but in practice
# it gives an even GPU-memory load and allows
# a larger batch size to be used.

'''

Предполагается, что датасет выглядит как-то так: 

<IN>Современное прочтение СТЭ не отрицает наличие ЭС и ее влияние на видообразование, но делает акцент на "мутационном факторе" изменчивости. В любом случае, благодарю за ваш ответ по существу. 🤝
<OUT>Все примеры мутационной изменчивости не столь очевидны (особенно с учетом отсутствия прямого соответствия ген-признак).  Всегда есть вероятность априорного существования соответствующего морфоза.
<END>
<IN>Просто тема сама рифмуется с соразмерным госфинансированием...
<OUT>ну что вам так финансирование далось... всем фундаментальным проектам не хватает финансирования, точнее, с финансированием происходит полный рандом. это нормально. вот будет конкретный прикладной проект — тогда и надо будет выбивать для него финансирование
<END>

В простом тесктовом файле...

'''

# Disabled variant (`if 0`): build the corpus from three personal text files
# instead of the annotated images.
if 0:
    texts = ''
    paths = ['data/thougths_my.txt', 'data/public_my.txt', 'data/dialogues_my.txt']
    for path in paths:
        with codecs.open(f'{path}', 'r', 'utf8') as f:
            texts += ''.join(f.readlines())
    texts = texts.replace('<s>', '<END>').replace('\r\n', '\n').replace('</s>', '')
    # there will be no IN / OUT markers
    with codecs.open('data/all_txt.txt', 'w', 'utf8') as f:
        f.write(texts)


# annotated images
# NOTE(review): mode 'w' truncates data/all_txt.txt, so whatever earlier cells
# wrote to that file is discarded here - confirm this is intended.
path = 'data/image_annotations_plans.txt' 
with codecs.open(f'{path}', 'r', 'utf8') as f:
    texts = ''.join(f.readlines())
texts = texts.replace('<s>', '<END>').replace('\r\n', '\n').replace('</s>', '')
with codecs.open('data/all_txt.txt', 'w', 'utf8') as f:
    f.write(texts)
    

In [None]:
# wiki: normalise markers and line endings, then append to the corpus file.
path = 'data/wiki_data.txt'
with codecs.open(path, 'r', 'utf8') as f:
    texts = f.read()
for old, new in (('<s>', '<END>'), ('\r\n', '\n'), ('</s>', '')):
    texts = texts.replace(old, new)

with codecs.open('data/all_txt.txt', 'a', 'utf8') as f:
    f.write(texts)

In [None]:
from transformers import TextDataset,DataCollatorForLanguageModeling

# Tokenise data/all_txt.txt with the extended tokenizer, using block_size=128.
# NOTE(review): TextDataset is marked deprecated in recent transformers
# releases - confirm it is still available in the pinned version.
train_dataset = TextDataset(
          tokenizer=tokenizer,
          file_path='data/all_txt.txt',
          block_size=128)

# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# training
from transformers import Trainer, TrainingArguments
import torch

# if it does not fit into memory, reduce **per_device_train_batch_size**
# With gradient accumulation this gives 19 * 20 = 380 samples per optimizer step per device.
# NOTE(review): eval_steps is set, but no evaluation strategy or eval_dataset is
# passed below, so it presumably has no effect - confirm.
training_args = TrainingArguments(
    output_dir="./models/gpt_bot", #The output directory
    #evaluation_strategy="epoch",
    overwrite_output_dir=True, #overwrite the content of the output directory
    num_train_epochs=4, # number of training epochs
    per_device_train_batch_size= 19, # batch size for training
    gradient_accumulation_steps=20,
    #per_device_eval_batch_size=50,  # batch size for evaluation
    eval_steps = 200, # Number of update steps between two evaluations.
    save_steps=1000, # after # steps model is saved 
    warmup_steps=500,# number of warmup steps for learning rate scheduler
    prediction_loss_only=True,
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    #train_dataset=tokenized_dataset['train'],
    train_dataset=train_dataset,
    #eval_dataset=test_dataset,
    #compute_metrics=compute_metrics,
)

# Release cached GPU memory held by PyTorch's allocator before training starts.
torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning loop configured above (long-running).
trainer.train()

In [None]:
# the model can be saved and later used in RL
if 1:
    # save it locally
    # NOTE(review): "gpt2_finetuned" is also the directory `model_name` loads
    # from, so this overwrites the starting checkpoint - confirm intended.
    model.save_pretrained("gpt2_finetuned")
    tokenizer.save_pretrained('gpt2_finetuned')

    # load the model from the Hub
    #from transformers import AutoModelForCausalLM

    #model = AutoModelForCausalLM.from_pretrained("my-fine-tuned-model-ppo")

In [None]:
# ids of the special tokens { vertical-output: true }
# Each id is the first element of the tokenizer's encoding of the token text.
T_OUT, T_END, T_PAD, T_124 = (
    tokenizer.encode(tok)[0] for tok in ('<OUT>', '<END>', '|PAD|', '<124>')
)

T_OUT, T_END, T_PAD, T_124

In [None]:
# Smoke test: wrap a plain-text prompt in <IN>...<OUT> and print the model's reply.
text = "Здравствуй железяка разумная! Как у тебя сегодня дела?"
model.cuda()

text = f"<IN>{text}\n<OUT>"
inpt = tokenizer.encode(text, return_tensors="pt")
inpt= inpt.cuda()

out = model.generate(inpt,  max_length=len(inpt[0])+300, do_sample=True, top_k=5, top_p=0.95, temperature=1, eos_token_id=T_END, pad_token_id=T_PAD)

# The reply is everything after the last <OUT>, cut at the first <END> if one
# was generated. Dropping the final token unconditionally would lose a real
# token whenever generation stops at max_length without emitting <END>.
out_tokens = torch.where(out[0]==T_OUT)
last_repl = out[0][out_tokens[0][-1]+1:]
end_pos = torch.where(last_repl==T_END)[0]
if len(end_pos) > 0:
    last_repl = last_repl[:int(end_pos[0])]
repl = tokenizer.decode(last_repl)

print(repl)

In [None]:
def answer(text):
    """Generate a continuation for ``text`` and print the prompt and the reply.

    Args:
        text: full prompt string, normally already wrapped as ``<IN>...<OUT>``.

    Returns:
        None. The prompt and the decoded reply are printed.

    The reply is the part of the generated sequence after the last ``<OUT>``
    token (or after the prompt when no ``<OUT>`` is present), cut at the first
    ``<END>`` token if the model produced one.
    """
    # Follow the model's device instead of assuming it was moved to CUDA earlier.
    inpt = tokenizer.encode(text, return_tensors="pt").to(model.device)
    print(text)

    out = model.generate(inpt,  max_length=len(inpt[0])+300, do_sample=True, top_k=5, top_p=0.95, temperature=1, eos_token_id=T_END, pad_token_id=T_PAD)

    generated = out[0]
    out_positions = torch.where(generated==T_OUT)[0]
    if len(out_positions) > 0:
        start = int(out_positions[-1]) + 1
    else:
        # No <OUT> anywhere: fall back to the tokens that follow the prompt.
        start = inpt.shape[1]
    reply = generated[start:]
    # Stop at <END> when present; otherwise keep everything, so that the last
    # real token is not lost when generation ends at max_length.
    end_positions = torch.where(reply==T_END)[0]
    if len(end_positions) > 0:
        reply = reply[:int(end_positions[0])]
    repl = tokenizer.decode(reply)

    print(repl)

In [None]:
#a11
# Two prompts built from the same image-token sequence ("a11" presumably names
# the source image - TODO confirm): first ask for a description, then for a
# "save hp" plan. Each prompt is wrapped as <IN>...<OUT> before generation.
text = '<916><61><922><922><922><922><922><922><55><906><1022><830><376><830><830><830><830><61><771><402><438><751><830><925><925><830><61><580><519><920><462><830><925><925><925><61><120><519><255><327><61><925><328><55><483><657><255><49><328><830><925><147><120><287><328><830><328><61><830><706><957><29><222><925><55><55><55><328><851><590><596><164><376><61><328><328><328><801><925><422><255> description:'
text = f"<IN>{text}<OUT>"
answer(text)
text = '<916><61><922><922><922><922><922><922><55><906><1022><830><376><830><830><830><830><61><771><402><438><751><830><925><925><830><61><580><519><920><462><830><925><925><925><61><120><519><255><327><61><925><328><55><483><657><255><49><328><830><925><147><120><287><328><830><328><61><830><706><957><29><222><925><55><55><55><328><851><590><596><164><376><61><328><328><328><801><925><422><255> save hp plan:'
text = f"<IN>{text}<OUT>"
answer(text)

In [None]:
#images (10).jpg
# Ask the model to describe the image whose token sequence is given below
# (per the comment above, presumably encoded from "images (10).jpg").
text = '<597><813><813><110><813><813><556><714><937><778><985><173><891><893><61><873><261><96><761><189><481><230><215><140><661><192><444><548><897><548><720><333><828><884><171><192><508><966><577><805><611><965><661><414><966><638><255><406><979><254><966><92><244><439><404><811><187><906><164><297><313><330><514><925><979><681><671><758><608><36><49><1019><468><140><574><325><287><217><714><562><432> description:'
text = f"<IN>{text}<OUT>"
answer(text)

In [None]:
#images (2).jpg
# Ask the model to describe the image whose token sequence is given below
# (per the comment above, presumably encoded from "images (2).jpg").
text = '<685><714><719><632><910><590><299><894><473><634><812><334><607><382><250><526><422><862><710><774><780><552><437><20><894><746><597><323><1006><422><406><813><370><414><854><473><330><941><681><323><813><870><845><432><7><642><599><57><717><845><269><473><714><96><29><596><870><477><283><477><974><884><215><881><596><698><333><517><460><893><813><222><402><110><714><996><407><718><172><189><150> description:'
text = f"<IN>{text}<OUT>"
answer(text)