<a href="https://colab.research.google.com/github/NajlaaNawaii/Multilingual-Stable-Diffusion-Towards-more-Inclusive-Text-To-Image-Synthesis/blob/main/Adapter_layer_Training_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets transformers diffusers==0.2.4 -q


In [None]:
import os
from PIL import Image, ImageDraw
import cv2
import numpy as np
from IPython.display import HTML
from base64 import b64encode

import torch
from torch import autocast
from torch.nn import functional as F
from diffusers import StableDiffusionPipeline, AutoencoderKL
from diffusers import UNet2DConditionModel, PNDMScheduler, LMSDiscreteScheduler
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
#from transformers import CLIPTextModel, CLIPTokenizer
from tqdm.auto import tqdm
from huggingface_hub import notebook_login
from google.colab import output
# upload external file before import
from google.colab import files
import helper

# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on a runtime without CUDA instead of failing at the first `.to(device)` call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from datasets import load_dataset
# Fetch a single parquet shard of LAION-2B-en from the Hugging Face Hub.
# Only its 'TEXT' column is used further down.
dataset = load_dataset(
    "laion/laion2B-en",
    data_files="part-00000-5114fd87-297e-42b0-9d11-50f1df323dfa-c000.snappy.parquet",
)


In [None]:
dataset = dataset['train'][0:15500]

In [None]:
Text_Dataset = dataset['TEXT']

In [None]:
train_dataset=Text_Dataset[0:12500]

In [None]:
val_dataset=Text_Dataset[12500:15500]

<h1>Steps</h1>


<h3>1. Get the text models</h3>

In [None]:
!pip install multilingual-clip
!pip install git+https://github.com/openai/CLIP.git


In [None]:
from multilingual_clip import pt_multilingual_clip
import transformers
from transformers import CLIPTextModel, CLIPTokenizer

In [None]:
from transformers import CLIPTextModel, CLIPTokenizer


# --- Original CLIP (ViT-L/14) text encoder: provides the regression targets ---
tokenizer_p14 = CLIPTokenizer.from_pretrained('openai/clip-vit-large-patch14')
text_encoder_p14 = CLIPTextModel.from_pretrained('openai/clip-vit-large-patch14').to(device)

# --- Multilingual CLIP text encoder: provides the adapter inputs ---
model_name = 'M-CLIP/LABSE-Vit-L-14'
text_model_Multi = pt_multilingual_clip.MultilingualCLIP.from_pretrained(model_name).to(device)
text_tokenizer_Multi = transformers.AutoTokenizer.from_pretrained(model_name)







<h3>2. Get the dataset</h3>

<h2>Data shape: (sentences, tokens, embedding dimension)</h2>

In [None]:
from torch.utils.data import Dataset, DataLoader


class Data(Dataset):
    """Paired text embeddings used to fit the adaptation layer.

    Every sentence is encoded twice: by the multilingual text model
    (``self.x``, the adapter input) and by the original CLIP text encoder
    (``self.y``, the regression target). All embeddings are computed eagerly in
    ``__init__`` under ``torch.no_grad()`` and stored as two tensors whose first
    dimension indexes the sentences.

    Relies on the notebook globals ``tokenizer_p14``, ``text_encoder_p14``,
    ``text_model_Multi``, ``text_tokenizer_Multi`` and ``device``.

    Args:
        df: Sliceable sequence of sentences (e.g. a ``list`` of ``str``).
        chunk_size: Number of sentences encoded per forward pass. The last
            chunk may be shorter; earlier versions silently dropped it.

    Raises:
        ValueError: If ``df`` is empty or ``chunk_size`` is not positive.
    """

    def __init__(self, df, chunk_size=1000):
        n = len(df)
        if n == 0:
            raise ValueError("Data needs at least one sentence to encode")

        x_chunks, y_chunks = [], []
        with torch.no_grad():
            for start, stop in self._chunk_bounds(n, chunk_size):
                # Release cached GPU blocks left over from the previous chunk.
                torch.cuda.empty_cache()
                sentences = df[start:stop]
                text_input_p14 = tokenizer_p14(
                    sentences, padding='max_length',
                    max_length=tokenizer_p14.model_max_length,
                    truncation=True, return_tensors='pt')
                # [0] is the first element of the encoder output
                # (presumably `last_hidden_state` -- confirm).
                y_chunks.append(text_encoder_p14(text_input_p14.input_ids.to(device))[0])
                x_chunks.append(text_model_Multi(sentences, text_tokenizer_Multi, device))
            # Concatenate once instead of growing the tensors chunk by chunk.
            self.x = torch.cat(x_chunks)
            self.y = torch.cat(y_chunks)

    @staticmethod
    def _chunk_bounds(n, chunk_size):
        """Return ``(start, stop)`` pairs tiling ``range(n)``, including a short final chunk."""
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        return [(start, min(start + chunk_size, n)) for start in range(0, n, chunk_size)]

    def __getitem__(self, index):
        """Return the ``(multilingual embedding, CLIP embedding)`` pair at ``index``."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Number of encoded sentences."""
        return len(self.y)

In [None]:
def train(model, criterion, train_loader, val_loader, optimizer, checkpoint, epochs=100):
    """Fit ``model`` and record the loss of every training and validation batch.

    Args:
        model: Module to optimise; called as ``model(x)``.
        criterion: Callable ``criterion(prediction, y)`` returning a scalar loss tensor.
        train_loader: Iterable of ``(x, y)`` training batches.
        val_loader: Iterable of ``(x, y)`` validation batches.
        optimizer: Optimiser already bound to ``model``'s parameters.
        checkpoint: Path that ``model.state_dict()`` is written to every 10th
            epoch (starting with epoch 0) and once more after the last epoch.
        epochs: Number of passes over ``train_loader``.

    Returns:
        dict with two lists of floats: ``'training_loss'`` (one entry per
        training batch) and ``'validation_accuracy'`` (one entry per validation
        batch). Despite its name the latter holds the validation *loss*; the key
        is kept because callers index the result with it.
    """
    useful_stuff = {'training_loss':[], 'validation_accuracy':[]}

    for epoch in range(epochs):
        model.train()
        for x, y in train_loader:
            optimizer.zero_grad()
            prediction = model(x)
            loss = criterion(prediction, y)
            loss.backward()
            optimizer.step()
            useful_stuff['training_loss'].append(loss.item())

        # Evaluation needs no gradients: skipping the autograd graph saves
        # memory and time and cannot leak into the next optimisation step.
        model.eval()
        with torch.no_grad():
            for x, y in val_loader:
                loss = criterion(model(x), y)
                useful_stuff['validation_accuracy'].append(loss.item())

        if epoch%10 == 0:
            print("epoch ", epoch, ":")
            # Losses of the most recent batches, truncated to four decimals.
            print("val: ", int(useful_stuff['validation_accuracy'][-1]*10000)/10000.0, "tr: ", int(useful_stuff['training_loss'][-1]*10000)/10000.0)
            torch.save(model.state_dict(), checkpoint)

    # Persist the final weights too, unless the last epoch was just saved above.
    if epochs > 0 and (epochs - 1) % 10 != 0:
        torch.save(model.state_dict(), checkpoint)

    return useful_stuff

In [None]:
# Encode both splits into (multilingual embedding, CLIP embedding) pairs.
# The sentences are sliced from `Text_Dataset` rather than read from
# `train_dataset` / `val_dataset`, because those names are rebound here:
# `Data(train_dataset)` would fail on a second run of this cell, when
# `train_dataset` is already a `Data` object.
train_dataset = Data(Text_Dataset[0:12500])
val_dataset = Data(Text_Dataset[12500:15500])



odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])
odict_keys(['last_hidden_state', 'pooler_output'])


<h3>3. Create the model class</h3>

In [None]:
import torch
import torch.nn as nn

class AdaptationLayer(nn.Module):
  """Five-layer MLP that maps embeddings of width ``input_dim`` to ``output_dim``.

  Layer widths: ``input_dim -> 2*output_dim -> 2*output_dim -> output_dim ->
  output_dim -> output_dim``. The first four linear layers are followed by
  ``BatchNorm1d`` and ReLU; the last one is a plain linear projection.

  Args:
    input_dim: Size of the last dimension of the input.
    output_dim: Size of the last dimension of the output.
    seq_len: Size of dimension 1 of the input. ``BatchNorm1d`` treats
      dimension 1 as its feature axis, so this must match the input. Defaults
      to 77, the value previously hard-coded.

  Each layer's input width now follows the previous layer's output width. The
  earlier version derived some of them from ``input_dim`` and therefore only
  worked when ``input_dim == output_dim``; for that case all parameter shapes
  are unchanged, so existing checkpoints still load.
  """

  def __init__(self, input_dim, output_dim, seq_len=77):
    super(AdaptationLayer, self).__init__()
    hidden_dim = output_dim*2

    self.fc1 = nn.Linear(input_dim, hidden_dim)
    torch.nn.init.kaiming_uniform_(self.fc1.weight, nonlinearity='relu')
    self.bn1 = nn.BatchNorm1d(seq_len)

    self.fc2 = nn.Linear(hidden_dim, hidden_dim)
    torch.nn.init.kaiming_uniform_(self.fc2.weight, nonlinearity='relu')
    self.bn2 = nn.BatchNorm1d(seq_len)

    self.fc3 = nn.Linear(hidden_dim, output_dim)
    torch.nn.init.kaiming_uniform_(self.fc3.weight, nonlinearity='relu')
    self.bn3 = nn.BatchNorm1d(seq_len)

    self.fc4 = nn.Linear(output_dim, output_dim)
    torch.nn.init.kaiming_uniform_(self.fc4.weight, nonlinearity='relu')
    self.bn4 = nn.BatchNorm1d(seq_len)

    # Output projection: default initialisation, no normalisation or activation.
    self.fc5 = nn.Linear(output_dim, output_dim)

  def forward(self, x):
    """Apply the adapter to ``x`` and return a tensor with last dimension ``output_dim``.

    ``x`` is expected to be 3-D with ``x.shape[1] == seq_len`` and
    ``x.shape[2] == input_dim``.
    """
    # NOTE(review): dim=1 L2-normalises along dimension 1 (the `seq_len` axis),
    # not along the embedding axis. Left unchanged so previously trained
    # checkpoints keep their behaviour -- confirm this is intended.
    x = nn.functional.normalize(x, p=2.0, dim=1, eps=1e-12, out=None)
    x = torch.relu(self.bn1(self.fc1(x)))
    x = torch.relu(self.bn2(self.fc2(x)))
    x = torch.relu(self.bn3(self.fc3(x)))
    x = torch.relu(self.bn4(self.fc4(x)))

    return self.fc5(x)

In [None]:
# Mini-batch iterators over the pre-computed embedding pairs.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=True)

# Adapter between two 768-wide embedding spaces, placed on the training device.
model = AdaptationLayer(768, 768).to(device)
# To resume from an earlier checkpoint instead of training from scratch:
#state_dict = torch.load('checkpoint_9.pth')
#model.load_state_dict(state_dict)

# Mean-squared error between adapted and target embeddings, optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)

training_results = train(
    model, criterion, train_loader, val_loader, optimizer,
    checkpoint='new.pth', epochs=200,
)

epoch  0 :
val:  0.8086 tr:  0.8036
epoch  10 :
val:  0.674 tr:  0.5941
epoch  20 :
val:  0.6924 tr:  0.5278


KeyboardInterrupt: ignored

In [None]:
import matplotlib.pyplot as plt

# `train` appends one value per training batch, so the x-axis counts batches.
fig_train, ax_train = plt.subplots()
ax_train.plot(training_results['training_loss'])
ax_train.set(title='Training loss', xlabel='Training batch', ylabel='MSE loss');

In [None]:
# The 'validation_accuracy' entry holds the per-batch validation *loss*
# (`train` appends `criterion(...)` values to it), so it is labelled as a loss.
fig_val, ax_val = plt.subplots()
ax_val.plot(training_results['validation_accuracy'])
ax_val.set(title='Validation loss', xlabel='Validation batch', ylabel='MSE loss');