# Imports

In [1]:
import transformers
from transformers import CLIPConfig, CLIPModel, CLIPProcessor, CLIPImageProcessor, CLIPTokenizerFast
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim.lr_scheduler as lr_scheduler
import numpy as np
import random
import math
import scipy.io as sio
import nibabel as nib
from pathlib import Path
from gensim.models import Word2Vec
import re

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# Load word and fMRI data

### Load 3d fMRI data

In [3]:
NUM_SUBJS = 8
subjects_fmri = [] #stores all 8 subject fmri np arrays

fMRI_folder = Path('./doi_10.5061_dryad.gt413__v1')
assert fMRI_folder.exists(), f"Foldder: {fMRI_folder} does not exist."

for subj_id in range(8):
#     fmri_file_name = str(subj_id) + '_masked_2d.npy'
#     fmri = np.load(fMRI_folder / fmri_file_name)  
    fmri_file_name = str(subj_id) + '_smooth_nifti_4d.nii'
    fmri = nib.load(fMRI_folder / fmri_file_name)
    fmri = np.array(fmri.dataobj)
    assert isinstance(fmri, np.ndarray), f"Imported fmri_scan for subject {subj_id} is not of type numpy.ndarray"
    assert(fmri.ndim) == 4, f"Imported fmri_scan for subject {subj_id} is not 4 dimensional"
    subjects_fmri.append(fmri)

# Load words

In [4]:
feature_matrix = np.zeros((5176,195)) #stores the feature vectors as a row for each word
feature_names = [] #stores the names of all features in order
feature_types = {} #stores the types of features and all the names of the features for each type

features = sio.loadmat(fMRI_folder / 'story_features.mat')
feature_count = 0
for feature_type in features['features'][0]:
    feature_types[feature_type[0][0]] = []
    if isinstance(feature_type[1][0], str):
        feature_types[feature_type[0][0]].append(feature_type[1][0])
        feature_names.append(feature_type[1][0])
    else:
        for feature in feature_type[1][0]:
            feature_types[feature_type[0][0]].append(feature[0])
            feature_names.append(feature[0])
    feature_matrix[:, feature_count:feature_count+feature_type[2].shape[1]] = feature_type[2] #adds the (5176xN) feature values to the feature matrix for the current feature group
    feature_count += feature_type[2].shape[1]

In [5]:
words_info = [] #stores tuples of (word, time, features) sorted by time appeared

mat_file = fMRI_folder / 'subject_1.mat' #only looks at the first subject file, somewhere it said all the timings were the same so this should be safe
mat_contents = sio.loadmat(mat_file)
for count, row in enumerate(mat_contents['words'][0]):
    word_value = row[0][0][0][0]
    time = row[1][0][0]
    word_tuple = (word_value, time, feature_matrix[count,:])
    words_info.append(word_tuple)

In [6]:
chapter_nine_text = ""
for row in mat_contents['words'][0]:
    chapter_nine_text += row[0][0][0][0] + " "
print(chapter_nine_text)

Harry had never believed he would meet a boy he hated more than Dudley, but that was before he met Draco Malfoy. Still, first-year Gryffindors only had Potions with the Slytherins, so they didn't have to put up with Malfoy much. Or at least, they didn't until they spotted a notice pinned up in the Gryffindor common room that made them all groan. Flying lessons would be starting on Thursday -- and Gryffindor and Slytherin would be learning together. + "Typical," said Harry darkly. "Just what I always wanted. To make a fool of myself on a broomstick in front of Malfoy." + He had been looking forward to learning to fly more than anything else. "You don't know that you'll make a fool of yourself," said Ron reasonably. "Anyway, I know Malfoy's always going on about how good he is at Quidditch, but I bet that's all talk." + Malfoy certainly did talk about flying a lot. He complained loudly about first years never getting on the House Quidditch teams and told long, boastful stories that alway

### Align fMRI scans with sets of 4 words

In [7]:
subjects_samples = [[] for i in range(NUM_SUBJS)] #stores lists of all the samples for each subject

word_count = 0
while word_count < len(words_info) - 8:
    #gets the 4 input words, and the 4 consecutive words while verifying they were read in sequence
    scan_words = []
    start_time = words_info[word_count][1]
    in_sequence = True #tracks if the words are in sequence or not
    for i in range(8):
        word_info = words_info[word_count + i]
        if word_info[1] != start_time + 0.5*i:
            #if some of the words are not in sequence, skip forward 1 word after innter loop
            in_sequence = False
        scan_words.append(word_info[0])
    if not in_sequence:
        word_count +=1
        continue
    fmri_time = start_time + 2 #effect of reading words is assumed to start 2 seconds after and end 8 seconds after
    fmri_index = fmri_time//2 #since a scan happens every two seconds, the index is the time divided by 2
    if not isinstance(fmri_index, np.int32):
        #if the first word is not aligned with the fmri scan (i.e. its not the first word in a TR)
        word_count += 1
        continue
    for count, subject in enumerate(subjects_fmri):
        #adds tuple of (fmri_scan, four words)
        subjects_samples[count].append((subject[:,:,:,fmri_index+2], scan_words[0:4]))
    print("Created sample:")
    print("\tScan time:", str(start_time))
    print("\tInput words:", str(scan_words[0:4]))
    #if successful, skip forward to the next set of 4 words
    word_count += 4

print("Total number of samples:", str(len(subjects_samples[0])))

Created sample:
	Scan time: 20
	Input words: ['Harry', 'had', 'never', 'believed']
Created sample:
	Scan time: 22
	Input words: ['he', 'would', 'meet', 'a']
Created sample:
	Scan time: 24
	Input words: ['boy', 'he', 'hated', 'more']
Created sample:
	Scan time: 26
	Input words: ['than', 'Dudley,', 'but', 'that']
Created sample:
	Scan time: 28
	Input words: ['was', 'before', 'he', 'met']
Created sample:
	Scan time: 30
	Input words: ['Draco', 'Malfoy.', 'Still,', 'first-year']
Created sample:
	Scan time: 32
	Input words: ['Gryffindors', 'only', 'had', 'Potions']
Created sample:
	Scan time: 34
	Input words: ['with', 'the', 'Slytherins,', 'so']
Created sample:
	Scan time: 36
	Input words: ['they', "didn't", 'have', 'to']
Created sample:
	Scan time: 38
	Input words: ['put', 'up', 'with', 'Malfoy']
Created sample:
	Scan time: 40
	Input words: ['much.', 'Or', 'at', 'least,']
Created sample:
	Scan time: 42
	Input words: ['they', "didn't", 'until', 'they']
Created sample:
	Scan time: 44
	Input w

	Scan time: 1464
	Input words: ["wizard's", 'duel', 'before,', 'I']
Created sample:
	Scan time: 1466
	Input words: ['suppose?"', '+', '"Of', 'course']
Created sample:
	Scan time: 1468
	Input words: ['he', 'has,"', 'said', 'Ron,']
Created sample:
	Scan time: 1470
	Input words: ['wheeling', 'around.', '"I\'m', 'his']
Created sample:
	Scan time: 1472
	Input words: ['second,', "who's", 'yours?"', '+']
Created sample:
	Scan time: 1474
	Input words: ['Malfoy', 'looked', 'at', 'Crabbe']
Created sample:
	Scan time: 1476
	Input words: ['and', 'Goyle,', 'sizing', 'them']
Created sample:
	Scan time: 1478
	Input words: ['up.', '+', '"Crabbe,"', 'he']
Created sample:
	Scan time: 1480
	Input words: ['said.', '"Midnight', 'all', 'right?']
Created sample:
	Scan time: 1482
	Input words: ["We'll", 'meet', 'you', 'in']
Created sample:
	Scan time: 1484
	Input words: ['the', 'trophy', 'room;', "that's"]
Created sample:
	Scan time: 1486
	Input words: ['always', 'unlocked."', '+', 'When']
Created sample:
	Sc

	Scan time: 1684
	Input words: ['all', 'the', 'armchairs', 'into']
Created sample:
	Scan time: 1686
	Input words: ['hunched', 'black', 'shadows.', 'They']
Created sample:
	Scan time: 1688
	Input words: ['had', 'almost', 'reached', 'the']
Created sample:
	Scan time: 1690
	Input words: ['portrait', 'hole', 'when', 'a']
Created sample:
	Scan time: 1692
	Input words: ['voice', 'spoke', 'from', 'the']
Created sample:
	Scan time: 1694
	Input words: ['chair', 'nearest', 'them,', '"I']
Created sample:
	Scan time: 1696
	Input words: ["can't", 'believe', "you're", 'going']
Created sample:
	Scan time: 1698
	Input words: ['to', 'do', 'this,', 'Harry."']
Created sample:
	Scan time: 1700
	Input words: ['+', 'A', 'lamp', 'flickered']
Created sample:
	Scan time: 1702
	Input words: ['on.', 'It', 'was', 'Hermione']
Created sample:
	Scan time: 1704
	Input words: ['Granger,', 'wearing', 'a', 'pink']
Created sample:
	Scan time: 1706
	Input words: ['bathrobe', 'and', 'a', 'frown.']
Created sample:
	Scan tim

# OpenAI CLIP

### Tokenizer

In [8]:
import gzip
import html
import os
from functools import lru_cache

import ftfy
import regex as re


@lru_cache()
def default_bpe():
    return os.path.join(os.path.dirname(os.path.abspath("./doi_10.5061_dryad.gt413__v1")), "bpe_simple_vocab_16e6.txt.gz")


@lru_cache()
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a signficant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))


def get_pairs(word):
    """Return set of symbol pairs in a word.
    Word is represented as tuple of symbols (symbols being variable-length strings).
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def basic_clean(text):
    text = ftfy.fix_text(text)
    text = html.unescape(html.unescape(text))
    return text.strip()


def whitespace_clean(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


class SimpleTokenizer(object):
    def __init__(self, bpe_path: str = default_bpe()):
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except:
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text

### Tokenize

In [9]:
import hashlib
import os
import urllib
import warnings
from typing import Any, Union, List
from pkg_resources import packaging

_tokenizer = SimpleTokenizer()

def tokenize(texts: Union[str, List[str]], context_length: int = 16, truncate: bool = False) -> Union[torch.IntTensor, torch.LongTensor]:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize

    context_length : int
        The context length to use; all CLIP models use 77 as the context length

    truncate: bool
        Whether to truncate the text in case its encoding is longer than the context length

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length].
    We return LongTensor when torch version is <1.8.0, since older index_select requires indices to be long.
    """
    if isinstance(texts, str):
        texts = [texts]

    sot_token = _tokenizer.encoder["<|startoftext|>"]
    eot_token = _tokenizer.encoder["<|endoftext|>"]
    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
    if packaging.version.parse(torch.__version__) < packaging.version.parse("1.8.0"):
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
    else:
        result = torch.zeros(len(all_tokens), context_length, dtype=torch.int)

    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            if truncate:
                tokens = tokens[:context_length]
                tokens[-1] = eot_token
            else:
                raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
        result[i, :len(tokens)] = torch.tensor(tokens)

    return result

### Model

In [10]:
from collections import OrderedDict
from typing import Tuple, Union

import numpy as np
import torch
import torch.nn.functional as F
from torch import nn


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()

        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
        self.conv1 = nn.Conv3d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm3d(planes)
        self.relu1 = nn.ReLU(inplace=True)

        self.conv2 = nn.Conv3d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm3d(planes)
        self.relu2 = nn.ReLU(inplace=True)

        self.avgpool = nn.AvgPool3d(stride) if stride > 1 else nn.Identity()

        self.conv3 = nn.Conv3d(planes, planes * self.expansion, 1, bias=False)
        self.bn3 = nn.BatchNorm3d(planes * self.expansion)
        self.relu3 = nn.ReLU(inplace=True)

        self.downsample = None
        self.stride = stride

        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
            self.downsample = nn.Sequential(OrderedDict([
                ("-1", nn.AvgPool3d(stride)),
                ("0", nn.Conv3d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
                ("1", nn.BatchNorm3d(planes * self.expansion))
            ]))

    def forward(self, x: torch.Tensor):
        identity = x

        out = self.relu1(self.bn1(self.conv1(x)))
        out = self.relu2(self.bn2(self.conv2(out)))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu3(out)
        return out


# class AttentionPool2d(nn.Module):
#     def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
#         super().__init__()
#         self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
#         self.k_proj = nn.Linear(embed_dim, embed_dim)
#         self.q_proj = nn.Linear(embed_dim, embed_dim)
#         self.v_proj = nn.Linear(embed_dim, embed_dim)
#         self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
#         self.num_heads = num_heads

#     def forward(self, x):
#         x = x.flatten(start_dim=2).permute(2, 0, 1)  # NCHW -> (HW)NC
#         x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
#         x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
#         x, _ = F.multi_head_attention_forward(
#             query=x[:1], key=x, value=x,
#             embed_dim_to_check=x.shape[-1],
#             num_heads=self.num_heads,
#             q_proj_weight=self.q_proj.weight,
#             k_proj_weight=self.k_proj.weight,
#             v_proj_weight=self.v_proj.weight,
#             in_proj_weight=None,
#             in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
#             bias_k=None,
#             bias_v=None,
#             add_zero_attn=False,
#             dropout_p=0,
#             out_proj_weight=self.c_proj.weight,
#             out_proj_bias=self.c_proj.bias,
#             use_separate_proj_weight=True,
#             training=self.training,
#             need_weights=False
#         )
#         return x.squeeze(0)


class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # the 3-layer stem
        self.conv1 = nn.Conv3d(1, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm3d(width // 2)
        self.relu1 = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv3d(width // 2, width // 2, kernel_size=2, padding=1, bias=False)
        self.bn2 = nn.BatchNorm3d(width // 2)
        self.relu2 = nn.ReLU(inplace=True)
        self.conv3 = nn.Conv3d(width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm3d(width)
        self.relu3 = nn.ReLU(inplace=True)
        self.avgpool = nn.AvgPool3d(2)
        self.linear = nn.Linear(2048, 1024)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        embed_dim = width * 32  # the ResNet feature dimension
#         self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        layers = [Bottleneck(self._inplanes, planes, stride)]

        self._inplanes = planes * Bottleneck.expansion
        for _ in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        def stem(x):
            x = self.relu1(self.bn1(self.conv1(x)))
            x = self.relu2(self.bn2(self.conv2(x)))
            x = self.relu3(self.bn3(self.conv3(x)))
            x = self.avgpool(x)
            return x

        x = x.type(self.conv1.weight.dtype)
        x = stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
#         x = self.avgpool(x) #changed final attentionpool to avgpool
        #x = self.attnpool(x)
        x = x.view(-1,2048) #will have to change this if different layer/kernel sizes used
        x = self.linear(x)
        return x


class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16."""

    def forward(self, x: torch.Tensor):
        orig_type = x.dtype
        ret = super().forward(x.type(torch.float32))
        return ret.type(orig_type)


class QuickGELU(nn.Module):
    def forward(self, x: torch.Tensor):
        return x * torch.sigmoid(1.702 * x)


class ResidualAttentionBlock(nn.Module):
    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        self.mlp = nn.Sequential(OrderedDict([
            ("c_fc", nn.Linear(d_model, d_model * 4)),
            ("gelu", QuickGELU()),
            ("c_proj", nn.Linear(d_model * 4, d_model))
        ]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

    def forward(self, x: torch.Tensor):
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x


class Transformer(nn.Module):
    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
        super().__init__()
        self.width = width
        self.layers = layers
        self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)])

    def forward(self, x: torch.Tensor):
        return self.resblocks(x)


class CLIP(nn.Module):
    def __init__(self,
                 embed_dim: int,
                 # vision
                 image_resolution: int,
                 vision_layers: Union[Tuple[int, int, int, int], int],
                 vision_width: int,
                 # text
                 context_length: int,
                 vocab_size: int,
                 transformer_width: int,
                 transformer_heads: int,
                 transformer_layers: int
                 ):
        super().__init__()

        self.context_length = context_length

        #initializes resnet (removed option for vision transformer)
        vision_heads = vision_width * 32 // 64
        self.visual = ModifiedResNet(
            layers=vision_layers,
            output_dim=embed_dim,
            heads=vision_heads,
            input_resolution=image_resolution,
            width=vision_width
        )

        #initializes text transformer
        self.transformer = Transformer(
            width=transformer_width,
            layers=transformer_layers,
            heads=transformer_heads,
            attn_mask=self.build_attention_mask()
        )

        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(vocab_size, transformer_width)
        self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
        self.ln_final = LayerNorm(transformer_width)

        self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
        #self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
        self.logit_scale = torch.ones([]) * np.log(1 / 0.07)

        self.initialize_parameters()

    def initialize_parameters(self):
        nn.init.normal_(self.token_embedding.weight, std=0.02)
        nn.init.normal_(self.positional_embedding, std=0.01)

        if isinstance(self.visual, ModifiedResNet):
#             if self.visual.attnpool is not None:
#                 std = self.visual.attnpool.c_proj.in_features ** -0.5
#                 nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
#                 nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
#                 nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
#                 nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)

            for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
                for name, param in resnet_block.named_parameters():
                    if name.endswith("bn3.weight"):
                        nn.init.zeros_(param)

        proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
        attn_std = self.transformer.width ** -0.5
        fc_std = (2 * self.transformer.width) ** -0.5
        for block in self.transformer.resblocks:
            nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
            nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
            nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
            nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)

        if self.text_projection is not None:
            nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)

    def build_attention_mask(self):
        # lazily create causal attention mask, with full attention between the vision tokens
        # pytorch uses additive attention mask; fill with -inf
        mask = torch.empty(self.context_length, self.context_length)
        mask.fill_(float("-inf"))
        mask.triu_(1)  # zero out the lower diagonal
        return mask

    @property
    def dtype(self):
        return self.visual.conv1.weight.dtype

    def encode_image(self, image):
        return self.visual(image.type(self.dtype))

    def encode_text(self, text):
        x = self.token_embedding(text).type(self.dtype)  # [batch_size, n_ctx, d_model]

        x = x + self.positional_embedding.type(self.dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x).type(self.dtype)

        # x.shape = [batch_size, n_ctx, transformer.width]
        # take features from the eot embedding (eot_token is the highest number in each sequence)
        x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection

        return x

    def forward(self, image, text):
        image_features = self.encode_image(image)
        text_features = self.encode_text(text)

        # normalized features
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        text_features = text_features / text_features.norm(dim=1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_image = logit_scale * image_features @ text_features.t()
        logits_per_text = logits_per_image.t()

        # shape = [global_batch_size, global_batch_size]
        return logits_per_image, logits_per_text


def convert_weights(model: nn.Module):
    """Convert applicable model parameters to fp16"""

    def _convert_weights_to_fp16(l):
        if isinstance(l, (nn.Conv1d, nn.Conv3d, nn.Linear)):
            l.weight.data = l.weight.data.half()
            if l.bias is not None:
                l.bias.data = l.bias.data.half()

        if isinstance(l, nn.MultiheadAttention):
            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
                tensor = getattr(l, attr)
                if tensor is not None:
                    tensor.data = tensor.data.half()

        for name in ["text_projection", "proj"]:
            if hasattr(l, name):
                attr = getattr(l, name)
                if attr is not None:
                    attr.data = attr.data.half()

    model.apply(_convert_weights_to_fp16)


def build_model(state_dict: dict):
    vit = "visual.proj" in state_dict

    if vit:
        vision_width = state_dict["visual.conv1.weight"].shape[0]
        vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
        image_resolution = vision_patch_size * grid_size
    else:
        counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
        vision_layers = tuple(counts)
        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
        output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
        vision_patch_size = None
        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
        image_resolution = output_width * 32

    embed_dim = state_dict["text_projection"].shape[1]
    context_length = state_dict["positional_embedding"].shape[0]
    vocab_size = state_dict["token_embedding.weight"].shape[0]
    transformer_width = state_dict["ln_final.weight"].shape[0]
    transformer_heads = transformer_width // 64
    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))

    model = CLIP(
        embed_dim,
        image_resolution, vision_layers, vision_width, vision_patch_size,
        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
    )

    for key in ["input_resolution", "context_length", "vocab_size"]:
        if key in state_dict:
            del state_dict[key]

    convert_weights(model)
    model.load_state_dict(state_dict)
    return model.eval()

In [11]:
_MODELS = {
    "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
    "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
    "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
    "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt",
    "RN50x64": "https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt",
    "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
    "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt",
    "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt",
    "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt",
}

In [12]:
import torch
from PIL import Image
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
from tqdm import tqdm

def _download(url: str, root: str):
    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)

    expected_sha256 = url.split("/")[-2]
    download_target = os.path.join(root, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    if os.path.isfile(download_target):
        if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256:
            return download_target
        else:
            warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256:
        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not not match")

    return download_target

In [13]:
name="RN50"
if name in _MODELS:
    model_path = _download(_MODELS[name], os.path.expanduser("~/.cache/clip"))
else:
    raise RuntimeError(f"Model {name} not found; available models = {available_models()}")

In [14]:
jit=True
with open(model_path, 'rb') as opened_file:
    try:
        # loading JIT archive
        model = torch.jit.load(opened_file, map_location=device if jit else "cpu").eval()
        state_dict = model.state_dict()
    except RuntimeError:
        # loading saved state dict
        if jit:
            warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
            jit = False
        state_dict = torch.load(opened_file, map_location="cpu")

In [15]:
#vision
counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
vision_layers = tuple(counts)
print(vision_layers)
vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
print(output_width)
assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
image_resolution = output_width * 32

#transformer
embed_dim = state_dict["text_projection"].shape[1]
context_length = state_dict["positional_embedding"].shape[0]
vocab_size = state_dict["token_embedding.weight"].shape[0]
transformer_width = state_dict["ln_final.weight"].shape[0]
transformer_heads = transformer_width // 64
transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))

(3, 4, 6, 3)
7


In [16]:
# model = CLIP(
#     embed_dim=state_dict["text_projection"].shape[1],
#     image_resolution=None, 
#     vision_layers=[3, 4, 6, 3], 
#     vision_width=64,
#     context_length=state_dict["positional_embedding"].shape[0], 
#     vocab_size=state_dict["token_embedding.weight"].shape[0], 
#     transformer_width=state_dict["ln_final.weight"].shape[0], 
#     transformer_heads=state_dict["ln_final.weight"].shape[0]//64, 
#     transformer_layers=len(set(k.split(".")[2] for k in state_dict if k.startswith("transformer.resblocks")))
# )

model = CLIP(
    embed_dim,
    image_resolution, vision_layers, vision_width,
    context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
)

In [17]:
def unison_shuffled_copies(a, b):
    assert len(a) == len(b)
    p = np.random.permutation(len(a))
    return a[p], b[p]

In [18]:
#trains the clip model from scratch
def train_clip(model, text_samples, image_samples, batch_size=10, num_epochs=100, lr=1e-3, temp=0.07, clip_value=0.01):
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=0.2)
    loss_fn = nn.CrossEntropyLoss()
    for epoch in range(num_epochs):
        epoch_loss = 0
        image_epoch_correct = 0
        text_epoch_correct = 0
        epoch_total = 0
        text_samples, image_samples = unison_shuffled_copies(text_samples, image_samples)
        for batch in range(math.floor(image_samples.shape[0]/batch_size)):
            optimizer.zero_grad()
            
            #gets embeddings for text and image batches
            start_idx = batch*batch_size
            end_idx = (batch+1)*batch_size
            text_batch, image_batch = text_samples[start_idx:end_idx], image_samples[start_idx:end_idx]
            
            logits_per_image, logits_per_text = model(torch.unsqueeze(image_batch, dim=1), text_batch)
#             print(logits_per_image.shape)
#             print(logits_per_image)
            
            #symmetric loss function
            labels = torch.arange(batch_size).to(device)
            loss_text = loss_fn(logits_per_text, labels)
            loss_image = loss_fn(logits_per_image, labels)
            loss = (loss_text + loss_image)/2
            print("\tBatch:", batch, "/", math.floor(image_samples.shape[0]/batch_size), ", Loss:", loss)
            #print(loss)
            loss.backward()
#             torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            epoch_loss += loss.item()
            optimizer.step()
            
            #compute accuracy
            image_winners = torch.argmax(logits_per_image, dim=0)
            text_winners = torch.argmax(logits_per_text, dim=0)
            #print(winners)
            #print(labels)
            image_corrects = (image_winners == labels)
            text_corrects = (text_winners == labels)
            total_image_correct = image_corrects.sum().float().item()
            total_text_correct = text_corrects.sum().float().item()
            image_epoch_correct += total_image_correct
            text_epoch_correct += total_text_correct
            epoch_total += batch_size
#             for param in model.parameters():
#                 print(torch.absolute(param.grad.data).sum())
# #                 print(param.grad.data)
        if epoch % 1 == 0:
            print("Epoch:", epoch, "Training Loss:", epoch_loss, "Training Image Accuracy:", image_epoch_correct/epoch_total, "Training Text Accuracy:", text_epoch_correct/epoch_total)

### Train using real fMRI and text data

In [19]:
print(tokenize(["Harry potter was a", "wizard and he always"]))
print(tokenize(["Harry potter was a", "wizard and he always"]).shape)

tensor([[49406,  3600,  9026,   739,   320, 49407,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0],
        [49406, 14295,   537,   797,  1466, 49407,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]], dtype=torch.int32)
torch.Size([2, 16])


In [20]:
def split_samples(samples):
    images = torch.zeros([len(samples)] + list(samples[0][0].shape))
    text = torch.zeros((len(samples), 16), dtype=int)
    for idx, sample in enumerate(samples):
        images[idx] = torch.tensor(sample[0])
        text[idx] = tokenize([" ".join(sample[1])])
    images = (images - images.min())/(images.max() - images.min())
    return images.to(device), text.to(device)

In [21]:
train_split = 1.0

train_samples = subjects_samples[0][:int(len(subjects_samples[0])*train_split)]
train_images, train_text = split_samples(train_samples)
print("Train:")
print(train_text.shape)
print(train_images.shape)
if torch.isnan(torch.sum(train_text)) or torch.isinf(torch.sum(train_text)):
    print('invalid input detected in text.')
if torch.isnan(torch.sum(train_images)) or torch.isinf(torch.sum(train_images)):
    print('invalid input detected in images.')

# test_samples = subjects_samples[0][int(len(subjects_samples[0])*train_split):]
# test_images, test_text = split_samples(test_samples)
# print("Test:")
# print(len(test_text))
# print(test_images.shape)

Train:
torch.Size([1287, 16])
torch.Size([1287, 53, 60, 50])


In [22]:
clip_model = CLIP(
    embed_dim,
    image_resolution, (2,2,2,2), 64,
    16, vocab_size, transformer_width, transformer_heads, transformer_layers
).to(device)

In [23]:
train_clip(clip_model, train_text, train_images, batch_size=64, lr=1e-4, num_epochs=1000)

	Batch: 0 / 20 , Loss: tensor(4.2428, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(4.8290, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(5.0569, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(4.3817, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(4.3878, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(4.3962, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(4.2058, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(4.1997, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(4.2369, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(4.1993, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(4.1703, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(4.1949, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(4.1689, device='cud

	Batch: 18 / 20 , Loss: tensor(3.6282, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(3.7421, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 4 Training Loss: 70.51367282867432 Training Image Accuracy: 0.08984375 Training Text Accuracy: 0.1015625
	Batch: 0 / 20 , Loss: tensor(3.2883, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(3.2049, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(3.0736, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(3.2897, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(3.2158, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(2.8975, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(3.5463, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(3.2500, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(3.4272, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , 

	Batch: 15 / 20 , Loss: tensor(2.5790, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(2.6592, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(2.5348, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(2.7632, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(2.4674, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 9 Training Loss: 49.16825079917908 Training Image Accuracy: 0.24296875 Training Text Accuracy: 0.2359375
	Batch: 0 / 20 , Loss: tensor(2.4104, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(2.2399, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(1.9997, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(2.2964, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(2.0848, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(2.1674, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20

	Batch: 11 / 20 , Loss: tensor(2.1459, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(1.9501, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(1.7364, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(2.0579, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(2.2723, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(1.9649, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(2.0297, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(2.3118, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(2.1846, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 14 Training Loss: 38.319405913352966 Training Image Accuracy: 0.384375 Training Text Accuracy: 0.365625
	Batch: 0 / 20 , Loss: tensor(1.7763, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(1.8607, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 /

	Batch: 7 / 20 , Loss: tensor(1.2134, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(1.3582, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(1.2782, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(1.6144, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(1.3405, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(1.5439, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(1.5858, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(1.4502, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(1.4587, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(1.2325, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(1.3273, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(1.3179, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(1.2430, devi

	Batch: 4 / 20 , Loss: tensor(1.1861, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(1.1778, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(1.4470, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(1.0056, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(1.0717, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(1.3200, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(1.1498, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(1.1836, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(1.2227, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(1.1817, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(1.4455, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(1.4802, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(1.3530, device=

	Batch: 1 / 20 , Loss: tensor(0.8078, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.7587, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(1.0690, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.9960, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(1.0126, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(1.0865, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.9548, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.8718, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.8449, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.9833, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.9510, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(1.0295, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(1.1207, device='cu

	Batch: 19 / 20 , Loss: tensor(0.7405, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 33 Training Loss: 15.71764761209488 Training Image Accuracy: 0.7625 Training Text Accuracy: 0.746875
	Batch: 0 / 20 , Loss: tensor(0.8899, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.7626, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.8611, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.6433, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.7435, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.8529, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.8077, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.6995, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(1.0030, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.8122, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss

	Batch: 16 / 20 , Loss: tensor(0.6828, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.8015, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.7177, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.7273, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 38 Training Loss: 14.575884878635406 Training Image Accuracy: 0.76328125 Training Text Accuracy: 0.75234375
	Batch: 0 / 20 , Loss: tensor(0.6724, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.5179, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.6026, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.5356, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.5601, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.7366, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.7298, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 

	Batch: 12 / 20 , Loss: tensor(1.0553, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.8068, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.5197, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.5145, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.6627, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.5489, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.7789, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.6529, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 43 Training Loss: 13.341532766819 Training Image Accuracy: 0.790625 Training Text Accuracy: 0.7984375
	Batch: 0 / 20 , Loss: tensor(0.7305, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.6038, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.8590, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20

	Batch: 8 / 20 , Loss: tensor(0.5456, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.6979, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.5178, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.7355, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.5087, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.6327, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.4559, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.8427, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.5495, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.5732, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.7686, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(1.0814, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 48 Training Loss: 12.87343072891235

	Batch: 4 / 20 , Loss: tensor(0.4793, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.5311, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.4414, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.5432, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.4318, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.4277, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.5708, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.5699, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.5035, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.4290, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.4084, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.5188, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.4244, device=

	Batch: 0 / 20 , Loss: tensor(0.3479, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.6190, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.6431, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.5543, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.7587, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.4080, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(1.1957, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.4242, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.5577, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.6471, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.4968, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.7272, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.6573, device='cud

	Batch: 18 / 20 , Loss: tensor(0.4930, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.5187, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 62 Training Loss: 9.20815885066986 Training Image Accuracy: 0.8984375 Training Text Accuracy: 0.88203125
	Batch: 0 / 20 , Loss: tensor(0.3436, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.5900, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.4393, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.4251, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3858, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3271, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3808, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.5034, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3909, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , 

	Batch: 15 / 20 , Loss: tensor(0.4250, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.4815, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.6706, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.7262, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.7147, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 67 Training Loss: 11.09088408946991 Training Image Accuracy: 0.828125 Training Text Accuracy: 0.8375
	Batch: 0 / 20 , Loss: tensor(0.9321, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.4023, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.6905, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.6188, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.6863, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.4895, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , L

	Batch: 12 / 20 , Loss: tensor(0.4386, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3444, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.3713, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3402, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.4937, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.4246, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.4752, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.4644, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 72 Training Loss: 8.337689012289047 Training Image Accuracy: 0.90703125 Training Text Accuracy: 0.91171875
	Batch: 0 / 20 , Loss: tensor(0.4692, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.4649, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.4327, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3

	Batch: 8 / 20 , Loss: tensor(0.8441, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.6660, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.6211, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.4670, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.5332, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.5482, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.5865, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.5119, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.5817, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.4697, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.7093, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.6737, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 77 Training Loss: 10.80568262934684

	Batch: 4 / 20 , Loss: tensor(0.3789, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3102, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3717, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3897, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3868, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.5874, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.4999, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3489, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3322, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3629, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.4234, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3757, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3578, device=

	Batch: 1 / 20 , Loss: tensor(0.5130, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.4493, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.6246, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.5298, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.4645, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.9162, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.6270, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.6023, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3950, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(1.0152, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.5333, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.8528, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.5671, device='cu

	Batch: 19 / 20 , Loss: tensor(0.3924, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 91 Training Loss: 8.915916502475739 Training Image Accuracy: 0.88359375 Training Text Accuracy: 0.8875
	Batch: 0 / 20 , Loss: tensor(0.4028, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3238, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.3166, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.3834, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3705, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2833, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3522, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.4083, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3001, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.4021, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Lo

	Batch: 16 / 20 , Loss: tensor(0.3524, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.4729, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.4347, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2308, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 96 Training Loss: 7.555694550275803 Training Image Accuracy: 0.9140625 Training Text Accuracy: 0.90390625
	Batch: 0 / 20 , Loss: tensor(0.2482, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3661, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.3428, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.3344, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.4363, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3797, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.5666, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20

	Batch: 13 / 20 , Loss: tensor(0.3952, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.4174, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.5739, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3423, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3832, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.4006, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.4157, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 101 Training Loss: 7.715092211961746 Training Image Accuracy: 0.9171875 Training Text Accuracy: 0.9125
	Batch: 0 / 20 , Loss: tensor(0.4106, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3868, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.4497, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2208, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20

	Batch: 9 / 20 , Loss: tensor(0.3061, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.3622, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.4328, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.4305, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3829, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.4089, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.6191, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.4237, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3878, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3075, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3094, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 106 Training Loss: 7.891047716140747 Training Image Accuracy: 0.90390625 Training Text Accuracy: 0.90390625
	Batch

	Batch: 5 / 20 , Loss: tensor(0.4295, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.4305, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.5928, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.4031, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.6750, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.4599, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.5898, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.5369, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.5004, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.5896, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.6827, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.5035, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.5270, device

	Batch: 1 / 20 , Loss: tensor(0.2558, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.3686, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2783, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3108, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3590, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3554, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3041, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2251, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.4266, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.3034, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3616, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3454, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3854, device='cu

	Batch: 19 / 20 , Loss: tensor(0.4171, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 120 Training Loss: 8.040475487709045 Training Image Accuracy: 0.9078125 Training Text Accuracy: 0.903125
	Batch: 0 / 20 , Loss: tensor(0.2798, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2900, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2945, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.5427, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3190, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3699, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2962, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3914, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3478, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.4065, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , 

	Batch: 15 / 20 , Loss: tensor(0.4708, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2976, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3633, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3644, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3122, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 125 Training Loss: 7.0454427897930145 Training Image Accuracy: 0.92734375 Training Text Accuracy: 0.91875
	Batch: 0 / 20 , Loss: tensor(0.3705, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2963, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.3586, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2879, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3138, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2712, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 2

	Batch: 11 / 20 , Loss: tensor(0.5640, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3838, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2912, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.4162, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3401, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3295, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3149, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3252, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3242, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 130 Training Loss: 6.764251708984375 Training Image Accuracy: 0.9390625 Training Text Accuracy: 0.9421875
	Batch: 0 / 20 , Loss: tensor(0.4204, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3508, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2

	Batch: 7 / 20 , Loss: tensor(0.3465, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.4955, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3672, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.3738, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.4820, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.4159, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.4078, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2622, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.4216, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.4517, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.4365, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.4245, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.4421, devi

	Batch: 3 / 20 , Loss: tensor(0.2985, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.4008, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2952, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3952, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3318, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3606, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3280, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.4226, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2768, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.4041, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3139, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.3856, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3026, device='

	Batch: 0 / 20 , Loss: tensor(0.3102, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3186, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.4643, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2306, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3132, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3052, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3313, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2781, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3551, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2839, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.4160, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3590, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2767, device='cud

	Batch: 18 / 20 , Loss: tensor(0.2252, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3471, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 149 Training Loss: 5.53597891330719 Training Image Accuracy: 0.959375 Training Text Accuracy: 0.9578125
	Batch: 0 / 20 , Loss: tensor(0.2417, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2076, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2141, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2474, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2318, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2046, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2794, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2683, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2532, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , L

	Batch: 14 / 20 , Loss: tensor(0.4906, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.4050, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.4405, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.4262, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.4362, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.4105, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 154 Training Loss: 7.729954078793526 Training Image Accuracy: 0.91875 Training Text Accuracy: 0.9046875
	Batch: 0 / 20 , Loss: tensor(0.3780, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2674, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.4295, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.4266, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3699, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20

	Batch: 10 / 20 , Loss: tensor(0.4001, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3554, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3515, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3878, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.3587, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3660, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.4488, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3876, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3062, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2852, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 159 Training Loss: 6.9561241418123245 Training Image Accuracy: 0.93125 Training Text Accuracy: 0.9234375
	Batch: 0 / 20 , Loss: tensor(0.3381, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1

	Batch: 6 / 20 , Loss: tensor(0.3463, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2611, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3265, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2855, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.2516, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2628, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.4384, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3417, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.3189, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2869, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3427, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.4232, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3551, devic

	Batch: 2 / 20 , Loss: tensor(0.2646, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2783, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2666, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3255, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2021, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3192, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2164, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3347, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.3273, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2780, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2370, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2566, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.3073, device='c

Epoch: 173 Training Loss: 7.675330609083176 Training Image Accuracy: 0.9078125 Training Text Accuracy: 0.91484375
	Batch: 0 / 20 , Loss: tensor(0.5082, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3481, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.3382, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.3166, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.4026, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.4051, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3557, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.4382, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.4291, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.4157, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.4318, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 

	Batch: 17 / 20 , Loss: tensor(0.3410, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3278, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3781, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 178 Training Loss: 6.507686540484428 Training Image Accuracy: 0.94296875 Training Text Accuracy: 0.93515625
	Batch: 0 / 20 , Loss: tensor(0.2176, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2590, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.5668, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.3023, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3612, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.4128, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.4482, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.4402, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 2

	Batch: 13 / 20 , Loss: tensor(0.3364, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.4031, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3499, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3545, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2945, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3695, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3233, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 183 Training Loss: 7.196863412857056 Training Image Accuracy: 0.9265625 Training Text Accuracy: 0.91953125
	Batch: 0 / 20 , Loss: tensor(0.2754, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3444, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.4140, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.3567, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 

	Batch: 9 / 20 , Loss: tensor(0.2529, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.2728, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3346, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3546, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3235, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2233, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2866, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3157, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2888, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.2547, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2715, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 188 Training Loss: 5.670689582824707 Training Image Accuracy: 0.965625 Training Text Accuracy: 0.95078125
	Batch: 

	Batch: 5 / 20 , Loss: tensor(0.2959, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3148, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2846, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2765, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3365, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.3930, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3038, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3081, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2657, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.3221, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2734, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.5953, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3148, device

	Batch: 1 / 20 , Loss: tensor(0.4029, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.3610, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.3596, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2970, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.4025, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.4144, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.4069, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3578, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3235, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.5214, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3223, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2780, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2385, device='cu

	Batch: 19 / 20 , Loss: tensor(0.2437, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 202 Training Loss: 5.110813558101654 Training Image Accuracy: 0.96328125 Training Text Accuracy: 0.96328125
	Batch: 0 / 20 , Loss: tensor(0.1868, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2708, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2665, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.3001, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2356, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3061, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2852, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.4043, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2890, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2475, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20

	Batch: 15 / 20 , Loss: tensor(0.2348, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2335, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.1820, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3589, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2282, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 207 Training Loss: 5.017581596970558 Training Image Accuracy: 0.97109375 Training Text Accuracy: 0.96640625
	Batch: 0 / 20 , Loss: tensor(0.2070, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2836, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2735, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2312, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2108, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2462, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 /

	Batch: 11 / 20 , Loss: tensor(2.2443, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(2.7195, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(4.4300, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(4.5685, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(8.9288, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(4.1640, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(4.9892, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(4.8825, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(4.2856, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 212 Training Loss: 48.27712117135525 Training Image Accuracy: 0.48046875 Training Text Accuracy: 0.5609375
	Batch: 0 / 20 , Loss: tensor(4.2515, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(4.2163, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 

	Batch: 7 / 20 , Loss: tensor(3.1361, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(2.6825, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(2.9352, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(2.8329, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(2.7800, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(2.8672, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(2.7010, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(2.6497, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(2.5956, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(2.6394, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(2.7475, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(2.3136, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(2.7683, devi

	Batch: 3 / 20 , Loss: tensor(0.7265, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.7609, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.6874, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.8236, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.8538, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.6659, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.7035, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.5991, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.7081, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.7666, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.4934, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.8858, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.6498, device='

	Batch: 0 / 20 , Loss: tensor(0.3834, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3762, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.3283, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.4399, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.4190, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3329, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3269, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3504, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3893, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.4684, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.3589, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.5074, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3584, device='cud

	Batch: 18 / 20 , Loss: tensor(0.1809, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3145, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 231 Training Loss: 5.92060612142086 Training Image Accuracy: 0.9609375 Training Text Accuracy: 0.95625
	Batch: 0 / 20 , Loss: tensor(0.3147, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.4127, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2191, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2829, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3127, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2567, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2951, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2155, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2643, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Lo

	Batch: 14 / 20 , Loss: tensor(0.2721, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.1698, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2261, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2800, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.2615, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3202, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 236 Training Loss: 5.243027314543724 Training Image Accuracy: 0.9671875 Training Text Accuracy: 0.96484375
	Batch: 0 / 20 , Loss: tensor(0.3316, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.1637, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2435, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2962, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3294, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 /

	Batch: 10 / 20 , Loss: tensor(0.2424, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2847, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3232, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2273, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2700, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3045, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2564, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3094, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3841, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2502, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 241 Training Loss: 5.899934649467468 Training Image Accuracy: 0.95703125 Training Text Accuracy: 0.95390625
	Batch: 0 / 20 , Loss: tensor(0.2888, device='cuda:0', grad_fn=<DivBackward0>)
	Batch

	Batch: 6 / 20 , Loss: tensor(0.3617, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2069, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3016, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2489, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.2843, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2964, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2686, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2720, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.3700, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2674, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.4285, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2661, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3790, devic

	Batch: 2 / 20 , Loss: tensor(4.5205, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(4.0244, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(4.3827, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(4.0368, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(4.1107, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(4.0662, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(3.9199, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(3.9780, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(3.8589, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(3.7102, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(3.6509, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(3.5507, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(3.9971, device='c

Epoch: 255 Training Loss: 24.582185685634613 Training Image Accuracy: 0.59453125 Training Text Accuracy: 0.61640625
	Batch: 0 / 20 , Loss: tensor(0.8492, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.8226, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.9986, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.8602, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.8712, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(1.0859, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.8964, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.8025, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.8601, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(1.0936, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.7744, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 2

	Batch: 16 / 20 , Loss: tensor(0.3277, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3536, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3837, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3407, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 260 Training Loss: 6.6929143369197845 Training Image Accuracy: 0.93515625 Training Text Accuracy: 0.94375
	Batch: 0 / 20 , Loss: tensor(0.2880, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.4421, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.4121, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2772, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3624, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2985, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3379, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20

	Batch: 12 / 20 , Loss: tensor(0.3200, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.1872, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.1792, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2429, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2341, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2815, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.2408, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2429, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 265 Training Loss: 4.917585492134094 Training Image Accuracy: 0.97421875 Training Text Accuracy: 0.97265625
	Batch: 0 / 20 , Loss: tensor(0.1280, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.1894, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2254, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 

	Batch: 8 / 20 , Loss: tensor(0.2064, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2165, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.2982, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2717, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.4086, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2682, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2864, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.4720, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2662, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2406, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.2702, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2932, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 270 Training Loss: 5.41007728874683

	Batch: 4 / 20 , Loss: tensor(0.3416, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3086, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2883, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2479, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3213, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3404, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.1863, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2877, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2267, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2490, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2805, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2171, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3140, device=

	Batch: 1 / 20 , Loss: tensor(0.3125, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2745, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2252, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2338, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2588, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2587, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2246, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2964, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2712, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.2495, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2763, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2848, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2696, device='cu

	Batch: 19 / 20 , Loss: tensor(0.4641, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 284 Training Loss: 6.6953441351652145 Training Image Accuracy: 0.94765625 Training Text Accuracy: 0.934375
	Batch: 0 / 20 , Loss: tensor(0.3502, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2950, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2388, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2821, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3347, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2910, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2756, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2691, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3156, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2134, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 

	Batch: 15 / 20 , Loss: tensor(0.2802, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3389, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.4845, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3836, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3556, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 289 Training Loss: 6.413113087415695 Training Image Accuracy: 0.9421875 Training Text Accuracy: 0.93515625
	Batch: 0 / 20 , Loss: tensor(0.3834, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.4079, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.3489, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2734, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.5467, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3192, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 

	Batch: 12 / 20 , Loss: tensor(0.2464, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3740, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2687, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2602, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2160, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3052, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.2917, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.4575, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 294 Training Loss: 5.8891371339559555 Training Image Accuracy: 0.95234375 Training Text Accuracy: 0.9515625
	Batch: 0 / 20 , Loss: tensor(0.1701, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3341, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2986, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 

	Batch: 8 / 20 , Loss: tensor(0.4106, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2638, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.2099, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2608, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3192, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2223, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2459, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2599, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2981, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2671, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3404, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.4398, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 299 Training Loss: 5.47373470664024

	Batch: 4 / 20 , Loss: tensor(0.2207, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2192, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2898, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2006, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2388, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3159, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.3280, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2599, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2531, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2308, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2467, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3328, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3239, device=

	Batch: 1 / 20 , Loss: tensor(3.8755, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(3.8560, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(3.7990, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(3.8276, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(3.7310, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(3.8039, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(3.7336, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(3.8167, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(3.6773, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(3.7335, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(3.6951, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(3.7102, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(3.7079, device='cu

	Batch: 19 / 20 , Loss: tensor(0.8527, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 313 Training Loss: 27.56222265958786 Training Image Accuracy: 0.54453125 Training Text Accuracy: 0.55859375
	Batch: 0 / 20 , Loss: tensor(0.9883, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(1.1795, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(1.1070, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.9903, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(1.0511, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.9568, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.7670, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(1.2513, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.7969, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.7900, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20

	Batch: 15 / 20 , Loss: tensor(0.3151, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3289, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3211, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.4555, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2769, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 318 Training Loss: 7.2614395916461945 Training Image Accuracy: 0.92734375 Training Text Accuracy: 0.9265625
	Batch: 0 / 20 , Loss: tensor(0.2590, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2475, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2968, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.3166, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2827, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2793, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 /

	Batch: 11 / 20 , Loss: tensor(0.2306, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2472, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3358, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2968, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3090, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2412, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2817, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3127, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3746, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 323 Training Loss: 5.530177861452103 Training Image Accuracy: 0.9578125 Training Text Accuracy: 0.95625
	Batch: 0 / 20 , Loss: tensor(0.2656, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2401, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 /

	Batch: 8 / 20 , Loss: tensor(0.2793, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2658, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.2284, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2907, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2320, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2202, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.3935, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2347, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2751, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2586, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.2961, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2314, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 328 Training Loss: 5.10814841091632

	Batch: 4 / 20 , Loss: tensor(0.3101, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3011, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2109, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3001, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3055, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2343, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.2416, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3371, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2670, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2351, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.4065, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2597, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3016, device=

	Batch: 1 / 20 , Loss: tensor(0.3050, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2609, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.4419, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3519, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2141, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2226, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2888, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3562, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2683, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.2781, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.1655, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2566, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2382, device='cu

	Batch: 19 / 20 , Loss: tensor(0.2748, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 342 Training Loss: 5.850981384515762 Training Image Accuracy: 0.94921875 Training Text Accuracy: 0.94765625
	Batch: 0 / 20 , Loss: tensor(0.2979, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3127, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2823, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2660, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2877, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2731, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3567, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3221, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3068, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.1802, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20

	Batch: 15 / 20 , Loss: tensor(0.3260, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3479, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2299, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.2891, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3807, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 347 Training Loss: 5.758801087737083 Training Image Accuracy: 0.95859375 Training Text Accuracy: 0.95234375
	Batch: 0 / 20 , Loss: tensor(0.3294, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3063, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2630, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.1808, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2743, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2877, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 /

	Batch: 11 / 20 , Loss: tensor(0.2649, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3255, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2996, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.3788, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2792, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3147, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3555, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.2713, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2685, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 352 Training Loss: 5.590696960687637 Training Image Accuracy: 0.9625 Training Text Accuracy: 0.95703125
	Batch: 0 / 20 , Loss: tensor(0.3224, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2215, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 /

	Batch: 7 / 20 , Loss: tensor(0.3738, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3044, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.4589, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.2874, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2194, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3797, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3061, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.4407, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2730, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3278, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3542, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.4208, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2424, devi

	Batch: 3 / 20 , Loss: tensor(0.3167, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2983, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3857, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3361, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3023, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.3122, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.5215, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.5734, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3192, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3389, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3423, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2729, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3626, device='

	Batch: 1 / 20 , Loss: tensor(3.6954, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(3.6370, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(3.6144, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(3.5573, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(3.6161, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(3.5553, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(3.5603, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(3.4967, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(3.5380, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(3.5679, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(3.6807, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(3.6050, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(3.5231, device='cu

	Batch: 19 / 20 , Loss: tensor(2.1982, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 371 Training Loss: 37.58425962924957 Training Image Accuracy: 0.3890625 Training Text Accuracy: 0.41640625
	Batch: 0 / 20 , Loss: tensor(1.8844, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(1.6478, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(1.2962, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(1.6054, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(1.9526, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(1.7711, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(1.9097, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(1.8955, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(1.7179, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(1.7078, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 

	Batch: 15 / 20 , Loss: tensor(0.7370, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.7298, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.5355, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.6099, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.6971, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 376 Training Loss: 14.782808423042297 Training Image Accuracy: 0.77265625 Training Text Accuracy: 0.7765625
	Batch: 0 / 20 , Loss: tensor(0.6792, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.6385, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.5965, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.6474, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.5532, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.5793, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 /

	Batch: 11 / 20 , Loss: tensor(0.3270, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3904, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.4676, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.4338, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3265, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.3476, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3924, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3371, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.4857, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 381 Training Loss: 7.646836161613464 Training Image Accuracy: 0.9296875 Training Text Accuracy: 0.921875
	Batch: 0 / 20 , Loss: tensor(0.3400, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.3886, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 

	Batch: 7 / 20 , Loss: tensor(0.2034, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2397, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3164, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.3539, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.2553, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2847, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2321, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2370, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.3599, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2659, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3557, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3590, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.2808, devi

	Batch: 3 / 20 , Loss: tensor(0.3252, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2552, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2698, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3580, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3396, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2562, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3044, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.3640, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3494, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.3452, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2725, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2944, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2046, device='

	Batch: 1 / 20 , Loss: tensor(0.2585, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.3252, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.4046, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.3018, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2493, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.3005, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.2601, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2844, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.3053, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20 , Loss: tensor(0.3002, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 11 / 20 , Loss: tensor(0.3830, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.4131, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.3517, device='cu

	Batch: 19 / 20 , Loss: tensor(0.2683, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 400 Training Loss: 5.845359832048416 Training Image Accuracy: 0.95234375 Training Text Accuracy: 0.94296875
	Batch: 0 / 20 , Loss: tensor(0.4609, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2294, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2151, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.3024, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2596, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.3806, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 / 20 , Loss: tensor(0.2375, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 7 / 20 , Loss: tensor(0.3502, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 8 / 20 , Loss: tensor(0.2596, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 9 / 20 , Loss: tensor(0.2665, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 10 / 20

	Batch: 15 / 20 , Loss: tensor(0.3539, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2552, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.3081, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.2863, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.4613, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 405 Training Loss: 6.1618459820747375 Training Image Accuracy: 0.94140625 Training Text Accuracy: 0.94453125
	Batch: 0 / 20 , Loss: tensor(0.3205, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2686, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 2 / 20 , Loss: tensor(0.2837, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 3 / 20 , Loss: tensor(0.2816, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 4 / 20 , Loss: tensor(0.2704, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 5 / 20 , Loss: tensor(0.2619, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 6 

	Batch: 11 / 20 , Loss: tensor(0.1904, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 12 / 20 , Loss: tensor(0.2786, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 13 / 20 , Loss: tensor(0.2513, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 14 / 20 , Loss: tensor(0.2809, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 15 / 20 , Loss: tensor(0.2472, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 16 / 20 , Loss: tensor(0.2910, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 17 / 20 , Loss: tensor(0.2756, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 18 / 20 , Loss: tensor(0.3141, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 19 / 20 , Loss: tensor(0.3859, device='cuda:0', grad_fn=<DivBackward0>)
Epoch: 410 Training Loss: 5.435997620224953 Training Image Accuracy: 0.96484375 Training Text Accuracy: 0.95390625
	Batch: 0 / 20 , Loss: tensor(0.3262, device='cuda:0', grad_fn=<DivBackward0>)
	Batch: 1 / 20 , Loss: tensor(0.2601, device='cuda:0', grad_fn=<DivBackward0>)
	Batch:

KeyboardInterrupt: 

In [None]:
# for alpha in (1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9):
#     print("Alpha:", alpha)
#     clip_model = CLIP(
#         embed_dim,
#         image_resolution, (2,2,2,2), 64,
#         16, vocab_size, transformer_width, transformer_heads, transformer_layers
#     ).to(device)
#     train_clip(clip_model, train_text, train_images, batch_size=64, lr=alpha, num_epochs=10)