### Imports

In [9]:
import pandas as pd
import numpy as np
from textblob import TextBlob
import spacy
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, log_loss
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FunctionTransformer
from spacy.lang.es.stop_words import STOP_WORDS
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder

In [7]:
# Cargar datos
df = pd.read_csv('/content/drive/MyDrive/NLP/TP/Airline_Reviews.csv')

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Preprocesamiento y limpieza

In [12]:
python -m spacy download es_core_news_sm
import spacy
from spacy.lang.es.stop_words import STOP_WORDS

SyntaxError: invalid syntax (<ipython-input-12-bd10379a4e19>, line 1)

In [11]:
# Preprocesamiento

# Cargar el modelo de lenguaje español de spaCy
nlp = spacy.load('es_core_news_sm')

# Lista de stopwords en español
stopwords = STOP_WORDS

# Función de preprocesamiento
def preprocess(text, remove_stopwords=True):
    doc = nlp(text.lower())
    tokens = [token.lemma_ for token in doc if token.is_alpha]
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stopwords]
    return ' '.join(tokens)

# Aplicar preprocesamiento a la columna de reseñas
df['cleaned_review'] = df['Review'].apply(lambda x: preprocess(x, remove_stopwords=True))

# Comprobar y eliminar filas con textos vacíos después del preprocesamiento
df = df[df['cleaned_review'].str.strip().astype(bool)]

OSError: [E050] Can't find model 'es_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
# Análisis de Sentimiento con TextBlob
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

df['sentiment'] = df['cleaned_review'].apply(get_sentiment)
df['sentiment_class'] = pd.cut(df['sentiment'], bins=[-1, -0.05, 0.05, 1], labels=['Malo', 'Promedio', 'Bueno'])

# Eliminar filas con valores nulos en 'sentiment_class'
df = df.dropna(subset=['sentiment_class'])

In [None]:
# # Análisis Exploratorio y Visualizaciones
# positive_reviews = ' '.join(df[df['sentiment_class'] == 'Bueno']['cleaned_review'])
# negative_reviews = ' '.join(df[df['sentiment_class'] == 'Malo']['cleaned_review'])

# wordcloud = WordCloud(width=800, height=400).generate(positive_reviews)
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()

# wordcloud = WordCloud(width=800, height=400).generate(negative_reviews)
# plt.figure(figsize=(10, 5))
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis('off')
# plt.show()

### Sentiment Analysis

#### BoW + Regresión Logística

In [None]:
# Asegurarse de que las etiquetas categóricas estén en formato categórico
df['sentiment_class'] = df['sentiment_class'].astype('category')
df['sentiment_class_code'] = df['sentiment_class'].cat.codes

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review'], df['sentiment_class_code'], test_size=0.2, random_state=42)

# Convertir series de pandas a arrays de numpy (para asegurar que no sean de solo lectura)
X_train = np.array(X_train.copy())
X_test = np.array(X_test.copy())
y_train = np.array(y_train, dtype=np.int32, copy=True)
y_test = np.array(y_test, dtype=np.int32, copy=True)

pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', LogisticRegression())
])

# Definir los parámetros
parameters = {
    'vect__max_features': [5000, 10000, 20000],
    'vect__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [0.1, 1, 10]
}

# Realizar la búsqueda en malla
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Imprimir los mejores parámetros y el mejor puntaje
print(grid_search.best_params_)
print(grid_search.best_score_)

# Evaluación del mejor modelo
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)

# Evaluación usando entropía cruzada
cross_entropy = log_loss(y_test, y_prob)
print(f"Cross Entropy Loss: {cross_entropy}")

# Evaluación usando métricas estándar
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'clf__C': 0.1, 'vect__max_features': 20000, 'vect__ngram_range': (1, 1)}
0.7718498141481118
Cross Entropy Loss: 0.5261437336256444
              precision    recall  f1-score   support

           0       0.77      0.80      0.78      1313
           1       0.54      0.41      0.47       980
           2       0.85      0.92      0.88      2333

    accuracy                           0.78      4626
   macro avg       0.72      0.71      0.71      4626
weighted avg       0.76      0.78      0.77      4626



#### TF-IDF + Regresión Logística

In [None]:
# Asegurarse de que las etiquetas categóricas estén en formato categórico
df['sentiment_class'] = df['sentiment_class'].astype('category')

# Convertir etiquetas categóricas a códigos numéricos
df['sentiment_class_code'] = df['sentiment_class'].cat.codes

# Dividir en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review'], df['sentiment_class_code'], test_size=0.2, random_state=42)

# Convertir series de pandas a arrays de numpy (para asegurar que no sean de solo lectura)
X_train = np.array(X_train.copy())
X_test = np.array(X_test.copy())
y_train = np.array(y_train, dtype=np.int32, copy=True)
y_test = np.array(y_test, dtype=np.int32, copy=True)

# Definir el pipeline con preprocesamiento
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression())
])

# Parámetros para GridSearchCV
parameters = {
    'tfidf__max_features': [5000, 10000, 20000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__C': [0.1, 1, 10]
}

# Realizar GridSearchCV
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Imprimir los mejores parámetros y la mejor puntuación
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Evaluación del mejor modelo
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)

# Evaluación usando entropía cruzada
cross_entropy = log_loss(y_test, y_prob)
print(f"Cross Entropy Loss: {cross_entropy}")

# Evaluación usando métricas estándar
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Best Parameters: {'clf__C': 10, 'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 1)}
Best Score: 0.7629313333868859
Cross Entropy Loss: 0.5264325366470037
              precision    recall  f1-score   support

           0       0.76      0.77      0.77      1313
           1       0.50      0.43      0.47       980
           2       0.86      0.90      0.88      2333

    accuracy                           0.77      4626
   macro avg       0.71      0.70      0.70      4626
weighted avg       0.76      0.77      0.76      4626



#### Red Neuronal Recurrente (RNN)

In [None]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.utils.multiclass import unique_labels
from tensorflow.keras.models import clone_model

def create_model(num_layers=1, units=128, dropout_rate=0.2, learning_rate=0.001, frozen_layers=0):
    model = Sequential()
    model.add(Embedding(input_dim=20000, output_dim=128, input_length=100))
    for _ in range(num_layers):
        model.add(LSTM(units=units, dropout=dropout_rate, recurrent_dropout=dropout_rate, trainable=(not frozen_layers)))
    model.add(Dense(units=3, activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

class KerasClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model):
        self.model = model
        self.classes_ = None

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.classes_ = unique_labels(y)
        self.model.fit(X, y, epochs=5, batch_size=32, verbose=0)
        return self

    def predict(self, X):
        check_is_fitted(self)
        X = check_array(X)
        return self.model.predict_classes(X)

    def predict_proba(self, X):
        check_is_fitted(self)
        X = check_array(X)
        return self.model.predict_proba(X)

# Utiliza la función create_model para crear tu modelo
model = create_model()

# Envuelve tu modelo de Keras en el wrapper personalizado
wrapped_model = KerasClassifierWrapper(model)

# Define el parámetro grid
param_grid = {
    'num_layers': [1, 2],
    'units': [64, 128],
    'dropout_rate': [0.2, 0.3],
    'learning_rate': [0.001, 0.0001],
    'epochs': [5, 10],
    'batch_size': [32, 64],
    'frozen_layers': [0, 1]
}

# Realiza la búsqueda de cuadrícula
grid = GridSearchCV(estimator=wrapped_model, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1)
grid_result = grid.fit(X_train, y_train)

# Imprime los mejores resultados
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 3 folds for each of 128 candidates, totalling 384 fits




ValueError: Cannot have number of splits n_splits=3 greater than the number of samples: n_samples=2.

In [None]:
from sklearn.model_selection import GridSearchCV, LeaveOneOut
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

# Función para crear el modelo
def create_model(num_layers=1, units=128, dropout_rate=0.2, learning_rate=0.001, frozen_layers=0):
    model = Sequential()
    model.add(Embedding(input_dim=20000, output_dim=128, input_length=100))
    for _ in range(num_layers):
        model.add(LSTM(units=units, dropout=dropout_rate, recurrent_dropout=dropout_rate, trainable=(not frozen_layers)))
    model.add(Dense(units=3, activation='softmax'))
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Wrap the Keras model so it can be used by scikit-learn
model = create_model()

from sklearn.model_selection import LeaveOneOut
import itertools

# Define the parameter grid
param_grid = {
    'num_layers': [1, 2],
    'units': [64, 128],
    'dropout_rate': [0.2, 0.3],
    'learning_rate': [0.001, 0.0001],
    'epochs': [5, 10],
    'batch_size': [32, 64],
    'frozen_layers': [0, 1]
}

# Create a LeaveOneOut object
loo = LeaveOneOut()

best_score = 0
best_params = None

# Iterate over all parameter combinations
for params in itertools.product(*param_grid.values()):
    num_layers, units, dropout_rate, learning_rate, epochs, batch_size, frozen_layers = params
    model = create_model(num_layers=num_layers, units=units, dropout_rate=dropout_rate, learning_rate=learning_rate, frozen_layers=frozen_layers)
    scores = []

    # Perform Leave-One-Out Cross Validation
    for train_index, test_index in loo.split(X_train):
        X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]

        model.fit(X_train_fold, y_train_fold, epochs=epochs, batch_size=batch_size, verbose=0)
        _, accuracy = model.evaluate(X_test_fold, y_test_fold, verbose=0)
        scores.append(accuracy)

    # Compute average score
    avg_score = np.mean(scores)

    # Update best score and best params if current score is better
    if avg_score > best_score:
        best_score = avg_score
        best_params = params

# Print best score and best parameters
print("Best score:", best_score)
print("Best params:", best_params)


KeyError: "None of [Index([0], dtype='int32')] are in the [index]"

#### Modelo Basado en Transformador (BERT)

In [None]:
# Importar las bibliotecas necesarias
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import Dataset

# Configuración del dispositivo
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Dividir los datos en conjuntos de entrenamiento y prueba
train_texts, test_texts, train_labels, test_labels = train_test_split(df['cleaned_review'], df['sentiment_class'], test_size=0.2, random_state=42)

# Cargar el tokenizador y el modelo preentrenado de BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=3).to(device)

default_label = len(label_map)  # Asigna un valor por defecto al último valor del mapeo
label_map = {
    "Bueno": 0,
    "Promedio": 1,
    "Malo": 2,
}

train_labels_mapped = [label_map Fa.get(label, default_label) for label in train_labels.tolist()]
# Crear el Dataset de Hugging
test_labels_mapped = [label_map.get(label, default_label) for label in test_labels.tolist()]
ce con las etiquetas mapeadas
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels_mapped
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels_mapped
})


# Definir los argumentos de entrenamiento
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=0.5,  # Reducir el número de épocas
    per_device_train_batch_size=2,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=15000,  # Guardar el modelo menos frecuentemente
)

# Crear el DataCollator para manejar el padding dinámico
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Crear el Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Entrenar el modelo
trainer.train()

# Evaluar el modelo
predictions = trainer.predict(test_dataset)
preds = torch.argmax(predictions.predictions, dim=1)
accuracy = accuracy_score(test_labels, preds)
report = classification_report(test_labels, preds)

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 163/9251 [32:07<29:50:48, 11.82s/it]
                                                   
  0%|          | 10/4626 [01:15<9:44:42,  7.60s/it]

{'loss': 1.1783, 'grad_norm': 8.972036361694336, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


                                                   
  0%|          | 20/4626 [02:32<9:46:24,  7.64s/it]

{'loss': 1.178, 'grad_norm': 273.867919921875, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


                                                   
  1%|          | 30/4626 [03:48<9:39:12,  7.56s/it]

{'loss': 1.1762, 'grad_norm': 8.874467849731445, 'learning_rate': 3e-06, 'epoch': 0.0}


                                                   
  1%|          | 40/4626 [05:03<9:35:20,  7.53s/it]

{'loss': 1.1138, 'grad_norm': 12.782450675964355, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


                                                   
  1%|          | 50/4626 [06:20<9:41:49,  7.63s/it]

{'loss': 1.0513, 'grad_norm': 7.308981895446777, 'learning_rate': 5e-06, 'epoch': 0.01}


                                                   
  1%|▏         | 60/4626 [07:35<9:33:14,  7.53s/it]

{'loss': 1.0828, 'grad_norm': 276.2522888183594, 'learning_rate': 6e-06, 'epoch': 0.01}


                                                   
  2%|▏         | 70/4626 [08:51<9:30:40,  7.52s/it]

{'loss': 0.7817, 'grad_norm': 10.139519691467285, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.01}


                                                   
  2%|▏         | 80/4626 [10:07<9:40:39,  7.66s/it]

{'loss': 1.2151, 'grad_norm': 10.772778511047363, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.01}


                                                   
  2%|▏         | 90/4626 [11:23<9:32:22,  7.57s/it]

{'loss': 1.0887, 'grad_norm': 10.24316692352295, 'learning_rate': 9e-06, 'epoch': 0.01}


                                                    
  2%|▏         | 100/4626 [12:39<9:28:35,  7.54s/it]

{'loss': 1.053, 'grad_norm': 9.478252410888672, 'learning_rate': 1e-05, 'epoch': 0.01}


                                                    
  2%|▏         | 110/4626 [13:54<9:28:29,  7.55s/it]

{'loss': 1.0821, 'grad_norm': 192.3200225830078, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.01}


                                                    
  3%|▎         | 120/4626 [15:11<9:29:14,  7.58s/it]

{'loss': 1.062, 'grad_norm': 11.510499000549316, 'learning_rate': 1.2e-05, 'epoch': 0.01}


                                                    
  3%|▎         | 130/4626 [16:26<9:20:59,  7.49s/it]

{'loss': 1.3315, 'grad_norm': 6.001075744628906, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.01}


                                                    
  3%|▎         | 140/4626 [17:42<9:25:34,  7.56s/it]

{'loss': 1.1011, 'grad_norm': 12.234224319458008, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.02}


                                                    
  3%|▎         | 150/4626 [18:58<9:36:30,  7.73s/it]

{'loss': 1.0184, 'grad_norm': 21.43937110900879, 'learning_rate': 1.5e-05, 'epoch': 0.02}


                                                    
  3%|▎         | 160/4626 [20:14<9:26:11,  7.61s/it]

{'loss': 1.0074, 'grad_norm': 6.1920270919799805, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.02}


                                                    
  4%|▎         | 170/4626 [21:30<9:24:49,  7.61s/it]

{'loss': 1.0531, 'grad_norm': 7.29820442199707, 'learning_rate': 1.7000000000000003e-05, 'epoch': 0.02}


                                                    
  4%|▍         | 180/4626 [22:45<9:18:55,  7.54s/it]

{'loss': 1.1088, 'grad_norm': 7.418067932128906, 'learning_rate': 1.8e-05, 'epoch': 0.02}


                                                    
  4%|▍         | 190/4626 [24:01<9:20:43,  7.58s/it]

{'loss': 1.0209, 'grad_norm': 9.352797508239746, 'learning_rate': 1.9e-05, 'epoch': 0.02}


                                                    
  4%|▍         | 200/4626 [25:17<9:18:50,  7.58s/it]

{'loss': 0.9229, 'grad_norm': 4.575834274291992, 'learning_rate': 2e-05, 'epoch': 0.02}


                                                    
  5%|▍         | 210/4626 [26:33<9:17:59,  7.58s/it]

{'loss': 1.2784, 'grad_norm': 7.1370062828063965, 'learning_rate': 2.1e-05, 'epoch': 0.02}


                                                    
  5%|▍         | 220/4626 [27:49<9:18:27,  7.60s/it]

{'loss': 0.9562, 'grad_norm': 7.4313764572143555, 'learning_rate': 2.2000000000000003e-05, 'epoch': 0.02}


                                                    
  5%|▍         | 230/4626 [29:05<9:24:35,  7.71s/it]

{'loss': 1.0579, 'grad_norm': 7.401463508605957, 'learning_rate': 2.3000000000000003e-05, 'epoch': 0.02}


                                                    
  5%|▌         | 240/4626 [30:21<9:19:02,  7.65s/it]

{'loss': 0.973, 'grad_norm': 6.054625988006592, 'learning_rate': 2.4e-05, 'epoch': 0.03}


                                                    
  5%|▌         | 250/4626 [31:37<9:13:10,  7.58s/it]

{'loss': 1.1318, 'grad_norm': 7.102802753448486, 'learning_rate': 2.5e-05, 'epoch': 0.03}


                                                    
  6%|▌         | 260/4626 [32:52<9:05:30,  7.50s/it]

{'loss': 0.9451, 'grad_norm': 12.63166618347168, 'learning_rate': 2.6000000000000002e-05, 'epoch': 0.03}


                                                    
  6%|▌         | 270/4626 [34:09<9:14:23,  7.64s/it]

{'loss': 1.175, 'grad_norm': 5.573825359344482, 'learning_rate': 2.7000000000000002e-05, 'epoch': 0.03}


                                                    
  6%|▌         | 280/4626 [35:25<9:15:30,  7.67s/it]

{'loss': 0.8663, 'grad_norm': 5.950603008270264, 'learning_rate': 2.8000000000000003e-05, 'epoch': 0.03}


                                                    
  6%|▋         | 290/4626 [36:41<9:06:24,  7.56s/it]

{'loss': 0.9517, 'grad_norm': 5.729671955108643, 'learning_rate': 2.9e-05, 'epoch': 0.03}


                                                    
  6%|▋         | 300/4626 [37:58<9:12:25,  7.66s/it]

{'loss': 0.7383, 'grad_norm': 16.831714630126953, 'learning_rate': 3e-05, 'epoch': 0.03}


                                                    
  7%|▋         | 310/4626 [39:13<8:57:59,  7.48s/it]

{'loss': 0.9015, 'grad_norm': 15.01671028137207, 'learning_rate': 3.1e-05, 'epoch': 0.03}


                                                    
  7%|▋         | 320/4626 [40:28<9:00:21,  7.53s/it]

{'loss': 1.1492, 'grad_norm': 49.577274322509766, 'learning_rate': 3.2000000000000005e-05, 'epoch': 0.03}


                                                    
  7%|▋         | 330/4626 [41:44<9:08:16,  7.66s/it]

{'loss': 0.9172, 'grad_norm': 3.99371600151062, 'learning_rate': 3.3e-05, 'epoch': 0.04}


                                                    
  7%|▋         | 340/4626 [43:02<9:13:27,  7.75s/it]

{'loss': 0.9581, 'grad_norm': 22.317829132080078, 'learning_rate': 3.4000000000000007e-05, 'epoch': 0.04}


                                                    
  8%|▊         | 350/4626 [44:21<9:25:15,  7.93s/it]

{'loss': 0.8499, 'grad_norm': 4.7289581298828125, 'learning_rate': 3.5e-05, 'epoch': 0.04}


                                                    
  8%|▊         | 360/4626 [45:40<9:20:02,  7.88s/it]

{'loss': 1.1494, 'grad_norm': 5.000403881072998, 'learning_rate': 3.6e-05, 'epoch': 0.04}


                                                    
  8%|▊         | 370/4626 [46:58<9:14:56,  7.82s/it]

{'loss': 0.9272, 'grad_norm': 16.49075698852539, 'learning_rate': 3.7e-05, 'epoch': 0.04}


                                                    
  8%|▊         | 380/4626 [48:17<9:22:28,  7.95s/it]

{'loss': 0.9768, 'grad_norm': 12.523836135864258, 'learning_rate': 3.8e-05, 'epoch': 0.04}


                                                    
  8%|▊         | 390/4626 [49:35<9:10:35,  7.80s/it]

{'loss': 1.4516, 'grad_norm': 38.164215087890625, 'learning_rate': 3.9000000000000006e-05, 'epoch': 0.04}


                                                    
  9%|▊         | 400/4626 [50:53<9:09:40,  7.80s/it]

{'loss': 0.5742, 'grad_norm': 0.21015076339244843, 'learning_rate': 4e-05, 'epoch': 0.04}


                                                    
  9%|▉         | 410/4626 [52:11<9:03:49,  7.74s/it]

{'loss': 1.8074, 'grad_norm': 5.942378044128418, 'learning_rate': 4.1e-05, 'epoch': 0.04}


                                                    
  9%|▉         | 420/4626 [53:29<9:01:50,  7.73s/it]

{'loss': 1.0515, 'grad_norm': 4.244840621948242, 'learning_rate': 4.2e-05, 'epoch': 0.05}


                                                    
  9%|▉         | 430/4626 [54:47<9:07:24,  7.83s/it]

{'loss': 0.8668, 'grad_norm': 4.28302526473999, 'learning_rate': 4.3e-05, 'epoch': 0.05}


                                                    
 10%|▉         | 440/4626 [56:06<9:12:32,  7.92s/it]

{'loss': 0.9413, 'grad_norm': 111.23666381835938, 'learning_rate': 4.4000000000000006e-05, 'epoch': 0.05}


                                                    
 10%|▉         | 450/4626 [57:24<9:06:13,  7.85s/it]

{'loss': 0.8365, 'grad_norm': 16.338794708251953, 'learning_rate': 4.5e-05, 'epoch': 0.05}


                                                    
 10%|▉         | 460/4626 [58:42<9:03:57,  7.83s/it]

{'loss': 0.8138, 'grad_norm': 8.649962425231934, 'learning_rate': 4.600000000000001e-05, 'epoch': 0.05}


                                                      
 10%|█         | 470/4626 [1:00:01<9:04:12,  7.86s/it]

{'loss': 1.5839, 'grad_norm': 1.3428159952163696, 'learning_rate': 4.7e-05, 'epoch': 0.05}


                                                      
 10%|█         | 480/4626 [1:01:17<8:41:03,  7.54s/it]

{'loss': 1.1256, 'grad_norm': 19.292667388916016, 'learning_rate': 4.8e-05, 'epoch': 0.05}


                                                      
 11%|█         | 490/4626 [1:02:32<8:40:10,  7.55s/it]

{'loss': 1.3971, 'grad_norm': 10.982258796691895, 'learning_rate': 4.9e-05, 'epoch': 0.05}


                                                      
 11%|█         | 500/4626 [1:03:48<8:36:02,  7.50s/it]

{'loss': 1.0687, 'grad_norm': 16.942121505737305, 'learning_rate': 5e-05, 'epoch': 0.05}


                                                      
 11%|█         | 510/4626 [1:05:04<8:50:55,  7.74s/it]

{'loss': 1.1589, 'grad_norm': 9.776317596435547, 'learning_rate': 4.987881725642269e-05, 'epoch': 0.06}


                                                      
 11%|█         | 520/4626 [1:06:20<8:39:59,  7.60s/it]

{'loss': 0.9924, 'grad_norm': 15.075815200805664, 'learning_rate': 4.9757634512845374e-05, 'epoch': 0.06}


                                                      
 11%|█▏        | 530/4626 [1:07:37<8:49:11,  7.75s/it]

{'loss': 1.1858, 'grad_norm': 8.937904357910156, 'learning_rate': 4.963645176926806e-05, 'epoch': 0.06}


                                                      
 12%|█▏        | 540/4626 [1:08:55<8:54:56,  7.86s/it]

{'loss': 1.1313, 'grad_norm': 8.606093406677246, 'learning_rate': 4.951526902569074e-05, 'epoch': 0.06}


                                                      
 12%|█▏        | 550/4626 [1:10:13<8:45:45,  7.74s/it]

{'loss': 0.7638, 'grad_norm': 9.674701690673828, 'learning_rate': 4.939408628211343e-05, 'epoch': 0.06}


                                                      
 12%|█▏        | 560/4626 [1:11:32<9:04:51,  8.04s/it]

{'loss': 0.7454, 'grad_norm': 6.934145450592041, 'learning_rate': 4.9272903538536116e-05, 'epoch': 0.06}


                                                      
 12%|█▏        | 570/4626 [1:12:50<8:45:10,  7.77s/it]

{'loss': 0.6306, 'grad_norm': 13.127276420593262, 'learning_rate': 4.91517207949588e-05, 'epoch': 0.06}


                                                      
 13%|█▎        | 580/4626 [1:14:07<8:43:08,  7.76s/it]

{'loss': 0.6549, 'grad_norm': 0.48859700560569763, 'learning_rate': 4.903053805138149e-05, 'epoch': 0.06}


                                                      
 13%|█▎        | 590/4626 [1:15:25<8:40:04,  7.73s/it]

{'loss': 2.0454, 'grad_norm': 6.927049160003662, 'learning_rate': 4.890935530780417e-05, 'epoch': 0.06}


                                                      
 13%|█▎        | 600/4626 [1:16:42<8:36:56,  7.70s/it]

{'loss': 0.7031, 'grad_norm': 13.281806945800781, 'learning_rate': 4.878817256422686e-05, 'epoch': 0.06}


                                                      
 13%|█▎        | 610/4626 [1:18:00<8:39:56,  7.77s/it]

{'loss': 0.9188, 'grad_norm': 6.299983978271484, 'learning_rate': 4.866698982064954e-05, 'epoch': 0.07}


                                                      
 13%|█▎        | 620/4626 [1:19:17<8:38:09,  7.76s/it]

{'loss': 0.8305, 'grad_norm': 14.15206241607666, 'learning_rate': 4.8545807077072224e-05, 'epoch': 0.07}


                                                      
 14%|█▎        | 630/4626 [1:20:34<8:28:58,  7.64s/it]

{'loss': 0.8938, 'grad_norm': 5.8297438621521, 'learning_rate': 4.842462433349491e-05, 'epoch': 0.07}


                                                         
 14%|█▍        | 640/4626 [1:32:28<112:18:06, 101.43s/it]

{'loss': 0.5865, 'grad_norm': 5.787783145904541, 'learning_rate': 4.8303441589917595e-05, 'epoch': 0.07}


                                                         
 14%|█▍        | 650/4626 [1:33:46<11:35:12, 10.49s/it]

{'loss': 1.3208, 'grad_norm': 0.26890072226524353, 'learning_rate': 4.8182258846340284e-05, 'epoch': 0.07}


                                                       
 14%|█▍        | 660/4626 [1:35:05<8:44:39,  7.94s/it]

{'loss': 1.3224, 'grad_norm': 13.79137897491455, 'learning_rate': 4.806107610276297e-05, 'epoch': 0.07}


                                                      
 14%|█▍        | 670/4626 [1:36:22<8:28:09,  7.71s/it]

{'loss': 1.2256, 'grad_norm': 8.725366592407227, 'learning_rate': 4.7939893359185655e-05, 'epoch': 0.07}


                                                      
 15%|█▍        | 680/4626 [1:37:40<8:30:44,  7.77s/it]

{'loss': 1.0044, 'grad_norm': 17.748517990112305, 'learning_rate': 4.7818710615608344e-05, 'epoch': 0.07}


                                                      
 15%|█▍        | 690/4626 [1:38:57<8:23:41,  7.68s/it]

{'loss': 1.1049, 'grad_norm': 9.676599502563477, 'learning_rate': 4.769752787203102e-05, 'epoch': 0.07}


                                                      
 15%|█▌        | 700/4626 [1:40:15<8:33:32,  7.85s/it]

{'loss': 0.9744, 'grad_norm': 9.595849990844727, 'learning_rate': 4.757634512845371e-05, 'epoch': 0.08}


                                                      
 15%|█▌        | 710/4626 [1:41:34<8:33:26,  7.87s/it]

{'loss': 1.006, 'grad_norm': 9.91978645324707, 'learning_rate': 4.74551623848764e-05, 'epoch': 0.08}


                                                      
 16%|█▌        | 720/4626 [1:42:52<8:32:58,  7.88s/it]

{'loss': 0.9057, 'grad_norm': 14.834243774414062, 'learning_rate': 4.733397964129908e-05, 'epoch': 0.08}


                                                      
 16%|█▌        | 730/4626 [1:44:12<8:33:38,  7.91s/it]

{'loss': 0.7247, 'grad_norm': 0.46465474367141724, 'learning_rate': 4.721279689772177e-05, 'epoch': 0.08}


                                                      
 16%|█▌        | 740/4626 [1:45:31<8:34:32,  7.94s/it]

{'loss': 0.7953, 'grad_norm': 0.21811307966709137, 'learning_rate': 4.709161415414445e-05, 'epoch': 0.08}


                                                      
 16%|█▌        | 750/4626 [1:46:50<8:32:17,  7.93s/it]

{'loss': 1.4812, 'grad_norm': 11.960599899291992, 'learning_rate': 4.697043141056714e-05, 'epoch': 0.08}


                                                      
 16%|█▋        | 760/4626 [1:48:10<8:35:53,  8.01s/it]

{'loss': 1.2067, 'grad_norm': 8.236360549926758, 'learning_rate': 4.684924866698982e-05, 'epoch': 0.08}


                                                      
 17%|█▋        | 770/4626 [1:49:30<8:30:46,  7.95s/it]

{'loss': 0.6806, 'grad_norm': 2.0447323322296143, 'learning_rate': 4.6728065923412505e-05, 'epoch': 0.08}


                                                      
 17%|█▋        | 780/4626 [1:50:50<8:33:00,  8.00s/it]

{'loss': 1.0525, 'grad_norm': 8.532641410827637, 'learning_rate': 4.6606883179835194e-05, 'epoch': 0.08}


                                                      
 17%|█▋        | 790/4626 [1:52:09<8:24:51,  7.90s/it]

{'loss': 0.9217, 'grad_norm': 12.024447441101074, 'learning_rate': 4.6485700436257876e-05, 'epoch': 0.09}


                                                      
 17%|█▋        | 800/4626 [1:53:28<8:25:44,  7.93s/it]

{'loss': 0.9513, 'grad_norm': 13.317875862121582, 'learning_rate': 4.6364517692680565e-05, 'epoch': 0.09}


                                                      
 18%|█▊        | 810/4626 [1:54:48<8:25:58,  7.96s/it]

{'loss': 1.6637, 'grad_norm': 131.6868896484375, 'learning_rate': 4.624333494910325e-05, 'epoch': 0.09}


                                                      
 18%|█▊        | 820/4626 [1:56:08<8:23:55,  7.94s/it]

{'loss': 1.1674, 'grad_norm': 15.676673889160156, 'learning_rate': 4.612215220552594e-05, 'epoch': 0.09}


                                                      
 18%|█▊        | 830/4626 [1:57:27<8:26:11,  8.00s/it]

{'loss': 1.0547, 'grad_norm': 8.18075942993164, 'learning_rate': 4.6000969461948626e-05, 'epoch': 0.09}


                                                      
 18%|█▊        | 840/4626 [1:58:47<8:24:16,  7.99s/it]

{'loss': 1.1797, 'grad_norm': 10.840412139892578, 'learning_rate': 4.58797867183713e-05, 'epoch': 0.09}


                                                      
 18%|█▊        | 850/4626 [2:00:06<8:18:11,  7.92s/it]

{'loss': 0.7062, 'grad_norm': 17.18253517150879, 'learning_rate': 4.575860397479399e-05, 'epoch': 0.09}


                                                      
 19%|█▊        | 860/4626 [2:01:26<8:20:10,  7.97s/it]

{'loss': 1.1782, 'grad_norm': 7.438745975494385, 'learning_rate': 4.563742123121667e-05, 'epoch': 0.09}


                                                      
 19%|█▉        | 870/4626 [2:02:48<8:33:38,  8.21s/it]

{'loss': 1.0332, 'grad_norm': 7.782792568206787, 'learning_rate': 4.551623848763936e-05, 'epoch': 0.09}


                                                      
 19%|█▉        | 880/4626 [2:04:10<8:33:10,  8.22s/it]

{'loss': 0.8661, 'grad_norm': 7.945191860198975, 'learning_rate': 4.539505574406205e-05, 'epoch': 0.1}


                                                      
 19%|█▉        | 890/4626 [2:05:31<8:20:38,  8.04s/it]

{'loss': 1.0302, 'grad_norm': 9.602741241455078, 'learning_rate': 4.527387300048473e-05, 'epoch': 0.1}


                                                      
 19%|█▉        | 900/4626 [2:06:52<8:20:24,  8.06s/it]

{'loss': 0.8698, 'grad_norm': 6.521312236785889, 'learning_rate': 4.515269025690742e-05, 'epoch': 0.1}


                                                      
 20%|█▉        | 910/4626 [2:08:13<8:27:55,  8.20s/it]

{'loss': 1.0634, 'grad_norm': 8.188277244567871, 'learning_rate': 4.5031507513330104e-05, 'epoch': 0.1}


                                                      
 20%|█▉        | 920/4626 [2:09:32<8:10:16,  7.94s/it]

{'loss': 0.8243, 'grad_norm': 6.395430088043213, 'learning_rate': 4.491032476975279e-05, 'epoch': 0.1}


                                                      
 20%|██        | 930/4626 [2:10:52<8:06:41,  7.90s/it]

{'loss': 0.829, 'grad_norm': 10.90233325958252, 'learning_rate': 4.4789142026175476e-05, 'epoch': 0.1}


                                                      
 20%|██        | 940/4626 [2:12:10<8:04:27,  7.89s/it]

{'loss': 0.6577, 'grad_norm': 89.92393493652344, 'learning_rate': 4.466795928259816e-05, 'epoch': 0.1}


                                                      
 21%|██        | 950/4626 [2:13:29<8:11:16,  8.02s/it]

{'loss': 0.8221, 'grad_norm': 12.422714233398438, 'learning_rate': 4.454677653902085e-05, 'epoch': 0.1}


                                                      
 21%|██        | 960/4626 [2:14:50<8:10:22,  8.03s/it]

{'loss': 1.541, 'grad_norm': 1.3150070905685425, 'learning_rate': 4.442559379544353e-05, 'epoch': 0.1}


                                                      
 21%|██        | 970/4626 [2:16:09<7:56:49,  7.83s/it]

{'loss': 0.8238, 'grad_norm': 2.4827349185943604, 'learning_rate': 4.430441105186622e-05, 'epoch': 0.1}


                                                      
 21%|██        | 980/4626 [2:17:26<7:45:43,  7.66s/it]

{'loss': 1.205, 'grad_norm': 43.96434783935547, 'learning_rate': 4.418322830828891e-05, 'epoch': 0.11}


                                                      
 21%|██▏       | 990/4626 [2:18:42<7:41:56,  7.62s/it]

{'loss': 1.2678, 'grad_norm': 9.799456596374512, 'learning_rate': 4.406204556471158e-05, 'epoch': 0.11}


                                                       
 22%|██▏       | 1000/4626 [2:19:58<7:39:20,  7.60s/it]

{'loss': 0.9014, 'grad_norm': 15.988316535949707, 'learning_rate': 4.394086282113427e-05, 'epoch': 0.11}


                                                       
 22%|██▏       | 1010/4626 [2:21:15<7:39:28,  7.62s/it]

{'loss': 1.1137, 'grad_norm': 6.191070556640625, 'learning_rate': 4.3819680077556954e-05, 'epoch': 0.11}


                                                       
 22%|██▏       | 1020/4626 [2:22:33<7:50:35,  7.83s/it]

{'loss': 1.145, 'grad_norm': 9.081730842590332, 'learning_rate': 4.369849733397964e-05, 'epoch': 0.11}


                                                       
 22%|██▏       | 1030/4626 [2:23:51<7:50:52,  7.86s/it]

{'loss': 0.9606, 'grad_norm': 4.757795333862305, 'learning_rate': 4.357731459040233e-05, 'epoch': 0.11}


                                                       
 22%|██▏       | 1040/4626 [2:25:10<7:50:48,  7.88s/it]

{'loss': 1.0516, 'grad_norm': 13.451766967773438, 'learning_rate': 4.3456131846825015e-05, 'epoch': 0.11}


                                                       
 23%|██▎       | 1050/4626 [2:26:29<7:48:17,  7.86s/it]

{'loss': 0.9263, 'grad_norm': 1.802112340927124, 'learning_rate': 4.3334949103247704e-05, 'epoch': 0.11}


                                                       
 23%|██▎       | 1060/4626 [2:27:47<7:44:45,  7.82s/it]

{'loss': 0.8291, 'grad_norm': 1.1559914350509644, 'learning_rate': 4.321376635967038e-05, 'epoch': 0.11}


                                                       
 23%|██▎       | 1070/4626 [2:29:06<7:45:32,  7.86s/it]

{'loss': 1.4016, 'grad_norm': 14.150042533874512, 'learning_rate': 4.309258361609307e-05, 'epoch': 0.12}


                                                       
 23%|██▎       | 1080/4626 [2:30:25<7:49:52,  7.95s/it]

{'loss': 1.2841, 'grad_norm': 7.762014865875244, 'learning_rate': 4.297140087251576e-05, 'epoch': 0.12}


                                                       
 24%|██▎       | 1090/4626 [2:31:44<7:39:58,  7.80s/it]

{'loss': 1.1692, 'grad_norm': 12.523224830627441, 'learning_rate': 4.285021812893844e-05, 'epoch': 0.12}


                                                       
 24%|██▍       | 1100/4626 [2:33:02<7:41:06,  7.85s/it]

{'loss': 1.024, 'grad_norm': 12.225740432739258, 'learning_rate': 4.272903538536113e-05, 'epoch': 0.12}


                                                       
 24%|██▍       | 1110/4626 [2:34:21<7:43:03,  7.90s/it]

{'loss': 1.2007, 'grad_norm': 11.069928169250488, 'learning_rate': 4.260785264178381e-05, 'epoch': 0.12}


                                                       
 24%|██▍       | 1120/4626 [2:35:39<7:39:21,  7.86s/it]

{'loss': 1.1824, 'grad_norm': 9.202807426452637, 'learning_rate': 4.24866698982065e-05, 'epoch': 0.12}


                                                       
 24%|██▍       | 1130/4626 [2:36:58<7:32:16,  7.76s/it]

{'loss': 1.1404, 'grad_norm': 9.796546936035156, 'learning_rate': 4.236548715462918e-05, 'epoch': 0.12}


                                                       
 25%|██▍       | 1140/4626 [2:38:15<7:25:41,  7.67s/it]

{'loss': 0.8666, 'grad_norm': 4.502579689025879, 'learning_rate': 4.2244304411051864e-05, 'epoch': 0.12}


                                                       
 25%|██▍       | 1150/4626 [2:39:32<7:25:29,  7.69s/it]

{'loss': 1.471, 'grad_norm': 11.052311897277832, 'learning_rate': 4.2123121667474553e-05, 'epoch': 0.12}


                                                       
 25%|██▌       | 1160/4626 [2:40:50<7:26:00,  7.72s/it]

{'loss': 1.1738, 'grad_norm': 23.743440628051758, 'learning_rate': 4.2001938923897236e-05, 'epoch': 0.13}


                                                       
 25%|██▌       | 1170/4626 [2:42:07<7:25:04,  7.73s/it]

{'loss': 1.031, 'grad_norm': 22.01018714904785, 'learning_rate': 4.1880756180319925e-05, 'epoch': 0.13}


                                                       
 26%|██▌       | 1180/4626 [2:43:24<7:22:45,  7.71s/it]

{'loss': 1.1138, 'grad_norm': 15.866643905639648, 'learning_rate': 4.1759573436742614e-05, 'epoch': 0.13}


                                                       
 26%|██▌       | 1190/4626 [2:44:41<7:26:06,  7.79s/it]

{'loss': 1.0159, 'grad_norm': 11.200494766235352, 'learning_rate': 4.1638390693165296e-05, 'epoch': 0.13}


                                                       
 26%|██▌       | 1200/4626 [2:45:59<7:24:17,  7.78s/it]

{'loss': 0.9473, 'grad_norm': 9.500378608703613, 'learning_rate': 4.1517207949587985e-05, 'epoch': 0.13}


                                                       
 26%|██▌       | 1210/4626 [2:47:17<7:19:42,  7.72s/it]

{'loss': 1.1888, 'grad_norm': 11.025219917297363, 'learning_rate': 4.139602520601066e-05, 'epoch': 0.13}


                                                       
 26%|██▋       | 1220/4626 [2:48:35<7:26:08,  7.86s/it]

{'loss': 0.6975, 'grad_norm': 5.545356750488281, 'learning_rate': 4.127484246243335e-05, 'epoch': 0.13}


                                                       
 27%|██▋       | 1230/4626 [2:49:52<7:16:18,  7.71s/it]

{'loss': 1.2492, 'grad_norm': 5.79081916809082, 'learning_rate': 4.115365971885604e-05, 'epoch': 0.13}


                                                       
 27%|██▋       | 1240/4626 [2:51:10<7:18:01,  7.76s/it]

{'loss': 1.0044, 'grad_norm': 7.455893039703369, 'learning_rate': 4.103247697527872e-05, 'epoch': 0.13}


                                                       
 27%|██▋       | 1250/4626 [2:52:28<7:19:42,  7.81s/it]

{'loss': 1.2231, 'grad_norm': 17.35140037536621, 'learning_rate': 4.091129423170141e-05, 'epoch': 0.14}


                                                       
 27%|██▋       | 1260/4626 [2:53:45<7:15:44,  7.77s/it]

{'loss': 0.9798, 'grad_norm': 9.670408248901367, 'learning_rate': 4.079011148812409e-05, 'epoch': 0.14}


                                                       
 27%|██▋       | 1270/4626 [2:55:01<6:58:10,  7.48s/it]

{'loss': 1.0986, 'grad_norm': 8.500797271728516, 'learning_rate': 4.066892874454678e-05, 'epoch': 0.14}


                                                         
 28%|██▊       | 1280/4626 [3:01:42<70:41:32, 76.06s/it]

{'loss': 0.8436, 'grad_norm': 13.397397994995117, 'learning_rate': 4.0547746000969464e-05, 'epoch': 0.14}


                                                        
 28%|██▊       | 1290/4626 [3:02:59<8:56:44,  9.65s/it]

{'loss': 0.9636, 'grad_norm': 5.64932107925415, 'learning_rate': 4.0426563257392146e-05, 'epoch': 0.14}


                                                       
 28%|██▊       | 1300/4626 [3:04:17<7:15:38,  7.86s/it]

{'loss': 1.1953, 'grad_norm': 6.705053329467773, 'learning_rate': 4.0305380513814835e-05, 'epoch': 0.14}


                                                       
 28%|██▊       | 1310/4626 [3:05:34<7:08:32,  7.75s/it]

{'loss': 0.9847, 'grad_norm': 18.108341217041016, 'learning_rate': 4.018419777023752e-05, 'epoch': 0.14}


                                                       
 29%|██▊       | 1320/4626 [3:06:51<7:05:44,  7.73s/it]

{'loss': 1.1893, 'grad_norm': 17.013174057006836, 'learning_rate': 4.0063015026660206e-05, 'epoch': 0.14}


                                                       
 29%|██▉       | 1330/4626 [3:08:09<7:06:34,  7.77s/it]

{'loss': 1.0088, 'grad_norm': 8.410881042480469, 'learning_rate': 3.994183228308289e-05, 'epoch': 0.14}


                                                       
 29%|██▉       | 1340/4626 [3:09:26<7:04:00,  7.74s/it]

{'loss': 1.1439, 'grad_norm': 6.50054407119751, 'learning_rate': 3.982064953950558e-05, 'epoch': 0.14}


                                                       
 29%|██▉       | 1350/4626 [3:10:44<7:05:26,  7.79s/it]

{'loss': 1.1615, 'grad_norm': 12.281082153320312, 'learning_rate': 3.969946679592827e-05, 'epoch': 0.15}


                                                       
 29%|██▉       | 1360/4626 [3:12:02<7:00:27,  7.72s/it]

{'loss': 0.9943, 'grad_norm': 13.45701789855957, 'learning_rate': 3.957828405235094e-05, 'epoch': 0.15}


                                                             
 30%|██▉       | 1370/4626 [10:30:11<293:58:52, 325.04s/it]

{'loss': 1.1157, 'grad_norm': 20.67707633972168, 'learning_rate': 3.945710130877363e-05, 'epoch': 0.15}


                                                           
 30%|██▉       | 1380/4626 [10:31:28<15:01:55, 16.67s/it]

{'loss': 1.2362, 'grad_norm': 9.763714790344238, 'learning_rate': 3.933591856519632e-05, 'epoch': 0.15}


                                                         
 30%|███       | 1390/4626 [10:32:47<7:22:55,  8.21s/it]

{'loss': 0.9578, 'grad_norm': 11.661942481994629, 'learning_rate': 3.9214735821619e-05, 'epoch': 0.15}


                                                        
 30%|███       | 1400/4626 [10:34:09<7:15:12,  8.09s/it]

{'loss': 0.9116, 'grad_norm': 8.29368782043457, 'learning_rate': 3.909355307804169e-05, 'epoch': 0.15}


                                                        
 30%|███       | 1410/4626 [10:35:28<7:02:52,  7.89s/it]

{'loss': 1.2156, 'grad_norm': 20.1884765625, 'learning_rate': 3.8972370334464374e-05, 'epoch': 0.15}


                                                        
 31%|███       | 1420/4626 [10:36:46<7:02:03,  7.90s/it]

{'loss': 1.1288, 'grad_norm': 7.8325300216674805, 'learning_rate': 3.885118759088706e-05, 'epoch': 0.15}


                                                        
 31%|███       | 1430/4626 [10:38:05<7:00:19,  7.89s/it]

{'loss': 1.1327, 'grad_norm': 6.64664888381958, 'learning_rate': 3.8730004847309745e-05, 'epoch': 0.15}


                                                        
 31%|███       | 1440/4626 [10:39:24<6:55:12,  7.82s/it]

{'loss': 1.0721, 'grad_norm': 8.521710395812988, 'learning_rate': 3.860882210373243e-05, 'epoch': 0.16}


                                                        
 31%|███▏      | 1450/4626 [10:40:42<6:54:21,  7.83s/it]

{'loss': 0.9687, 'grad_norm': 11.054054260253906, 'learning_rate': 3.8487639360155117e-05, 'epoch': 0.16}


                                                        
 32%|███▏      | 1460/4626 [10:41:59<6:48:40,  7.74s/it]

{'loss': 1.1436, 'grad_norm': 9.336568832397461, 'learning_rate': 3.83664566165778e-05, 'epoch': 0.16}


                                                        
 32%|███▏      | 1470/4626 [10:43:17<6:47:31,  7.75s/it]

{'loss': 1.0048, 'grad_norm': 12.061640739440918, 'learning_rate': 3.824527387300049e-05, 'epoch': 0.16}


                                                        
 32%|███▏      | 1480/4626 [10:44:35<6:49:35,  7.81s/it]

{'loss': 1.3528, 'grad_norm': 11.087886810302734, 'learning_rate': 3.812409112942317e-05, 'epoch': 0.16}


                                                        
 32%|███▏      | 1490/4626 [10:45:53<6:45:43,  7.76s/it]

{'loss': 1.1044, 'grad_norm': 11.023117065429688, 'learning_rate': 3.800290838584586e-05, 'epoch': 0.16}


                                                        
 32%|███▏      | 1500/4626 [10:47:11<6:47:31,  7.82s/it]

{'loss': 1.1203, 'grad_norm': 17.47757339477539, 'learning_rate': 3.788172564226855e-05, 'epoch': 0.16}


                                                        
 33%|███▎      | 1510/4626 [10:48:29<6:45:58,  7.82s/it]

{'loss': 1.0465, 'grad_norm': 15.970434188842773, 'learning_rate': 3.7760542898691224e-05, 'epoch': 0.16}


                                                        
 33%|███▎      | 1520/4626 [10:49:47<6:45:35,  7.83s/it]

{'loss': 1.0776, 'grad_norm': 11.526310920715332, 'learning_rate': 3.763936015511391e-05, 'epoch': 0.16}


                                                        
 33%|███▎      | 1530/4626 [10:51:05<6:44:05,  7.83s/it]

{'loss': 1.1085, 'grad_norm': 24.951284408569336, 'learning_rate': 3.7518177411536595e-05, 'epoch': 0.17}


                                                        
 33%|███▎      | 1540/4626 [10:52:23<6:42:23,  7.82s/it]

{'loss': 1.0473, 'grad_norm': 8.589960098266602, 'learning_rate': 3.7396994667959284e-05, 'epoch': 0.17}


                                                        
 34%|███▎      | 1550/4626 [10:53:41<6:40:43,  7.82s/it]

{'loss': 0.9968, 'grad_norm': 8.42174243927002, 'learning_rate': 3.727581192438197e-05, 'epoch': 0.17}


                                                        
 34%|███▎      | 1560/4626 [10:55:00<6:41:45,  7.86s/it]

{'loss': 0.8309, 'grad_norm': 7.856040954589844, 'learning_rate': 3.7154629180804655e-05, 'epoch': 0.17}


                                                        
 34%|███▍      | 1570/4626 [10:56:18<6:38:03,  7.82s/it]

{'loss': 1.1341, 'grad_norm': 17.57099151611328, 'learning_rate': 3.7033446437227344e-05, 'epoch': 0.17}


                                                        
 34%|███▍      | 1580/4626 [10:57:36<6:34:28,  7.77s/it]

{'loss': 1.0691, 'grad_norm': 8.171350479125977, 'learning_rate': 3.691226369365002e-05, 'epoch': 0.17}


                                                        
 34%|███▍      | 1590/4626 [10:58:54<6:34:48,  7.80s/it]

{'loss': 1.1203, 'grad_norm': 11.610579490661621, 'learning_rate': 3.679108095007271e-05, 'epoch': 0.17}


                                                        
 35%|███▍      | 1600/4626 [11:00:12<6:30:13,  7.74s/it]

{'loss': 1.0058, 'grad_norm': 20.382795333862305, 'learning_rate': 3.66698982064954e-05, 'epoch': 0.17}


                                                        
 35%|███▍      | 1610/4626 [11:01:30<6:33:59,  7.84s/it]

{'loss': 0.8757, 'grad_norm': 10.411798477172852, 'learning_rate': 3.654871546291808e-05, 'epoch': 0.17}


                                                        
 35%|███▌      | 1620/4626 [11:02:48<6:32:48,  7.84s/it]

{'loss': 0.995, 'grad_norm': 9.340179443359375, 'learning_rate': 3.642753271934077e-05, 'epoch': 0.18}


                                                        
 35%|███▌      | 1630/4626 [11:04:06<6:33:17,  7.88s/it]

{'loss': 0.975, 'grad_norm': 8.64664363861084, 'learning_rate': 3.630634997576345e-05, 'epoch': 0.18}


                                                        
 35%|███▌      | 1640/4626 [11:05:25<6:28:56,  7.82s/it]

{'loss': 1.3551, 'grad_norm': 9.143139839172363, 'learning_rate': 3.618516723218614e-05, 'epoch': 0.18}


                                                        
 36%|███▌      | 1650/4626 [11:06:43<6:28:42,  7.84s/it]

{'loss': 1.1645, 'grad_norm': 11.269588470458984, 'learning_rate': 3.606398448860882e-05, 'epoch': 0.18}


                                                        
 36%|███▌      | 1660/4626 [11:08:01<6:27:24,  7.84s/it]

{'loss': 0.9289, 'grad_norm': 10.621381759643555, 'learning_rate': 3.5942801745031505e-05, 'epoch': 0.18}


                                                        
 36%|███▌      | 1670/4626 [11:09:20<6:26:44,  7.85s/it]

{'loss': 1.0086, 'grad_norm': 7.827033996582031, 'learning_rate': 3.5821619001454194e-05, 'epoch': 0.18}


                                                        
 36%|███▋      | 1680/4626 [11:10:39<6:28:12,  7.91s/it]

{'loss': 1.0588, 'grad_norm': 14.402323722839355, 'learning_rate': 3.5700436257876877e-05, 'epoch': 0.18}


                                                        
 37%|███▋      | 1690/4626 [11:11:57<6:26:58,  7.91s/it]

{'loss': 1.0245, 'grad_norm': 10.52235221862793, 'learning_rate': 3.5579253514299566e-05, 'epoch': 0.18}


                                                        
 37%|███▋      | 1700/4626 [11:13:15<6:23:03,  7.86s/it]

{'loss': 0.939, 'grad_norm': 13.669678688049316, 'learning_rate': 3.5458070770722255e-05, 'epoch': 0.18}


                                                        
 37%|███▋      | 1710/4626 [11:14:34<6:21:38,  7.85s/it]

{'loss': 0.8422, 'grad_norm': 7.867867469787598, 'learning_rate': 3.533688802714494e-05, 'epoch': 0.18}


                                                        
 37%|███▋      | 1720/4626 [11:15:52<6:20:03,  7.85s/it]

{'loss': 0.7745, 'grad_norm': 5.3830413818359375, 'learning_rate': 3.5215705283567626e-05, 'epoch': 0.19}


                                                        
 37%|███▋      | 1730/4626 [11:17:11<6:20:54,  7.89s/it]

{'loss': 1.0395, 'grad_norm': 5.222002983093262, 'learning_rate': 3.50945225399903e-05, 'epoch': 0.19}


                                                        
 38%|███▊      | 1740/4626 [11:18:29<6:16:17,  7.82s/it]

{'loss': 1.3708, 'grad_norm': 5.84749698638916, 'learning_rate': 3.497333979641299e-05, 'epoch': 0.19}


                                                        
 38%|███▊      | 1750/4626 [11:19:47<6:14:16,  7.81s/it]

{'loss': 1.3139, 'grad_norm': 6.9296369552612305, 'learning_rate': 3.485215705283568e-05, 'epoch': 0.19}


                                                        
 38%|███▊      | 1760/4626 [11:21:05<6:12:28,  7.80s/it]

{'loss': 1.1621, 'grad_norm': 10.19169807434082, 'learning_rate': 3.473097430925836e-05, 'epoch': 0.19}


                                                        
 38%|███▊      | 1770/4626 [11:22:24<6:13:13,  7.84s/it]

{'loss': 0.8586, 'grad_norm': 5.991764545440674, 'learning_rate': 3.460979156568105e-05, 'epoch': 0.19}


                                                        
 38%|███▊      | 1780/4626 [11:23:42<6:12:38,  7.86s/it]

{'loss': 1.0962, 'grad_norm': 11.638315200805664, 'learning_rate': 3.448860882210373e-05, 'epoch': 0.19}


                                                        
 39%|███▊      | 1790/4626 [11:25:00<6:11:19,  7.86s/it]

{'loss': 0.8446, 'grad_norm': 14.71629524230957, 'learning_rate': 3.436742607852642e-05, 'epoch': 0.19}


                                                        
 39%|███▉      | 1800/4626 [11:26:19<6:07:11,  7.80s/it]

{'loss': 1.2888, 'grad_norm': 13.91597843170166, 'learning_rate': 3.4246243334949105e-05, 'epoch': 0.19}


                                                        
 39%|███▉      | 1810/4626 [11:27:37<6:06:17,  7.80s/it]

{'loss': 1.1122, 'grad_norm': 12.281856536865234, 'learning_rate': 3.412506059137179e-05, 'epoch': 0.2}


                                                        
 39%|███▉      | 1820/4626 [11:28:55<6:06:16,  7.83s/it]

{'loss': 1.3396, 'grad_norm': 11.634056091308594, 'learning_rate': 3.4003877847794476e-05, 'epoch': 0.2}


                                                        
 40%|███▉      | 1830/4626 [11:30:14<6:05:13,  7.84s/it]

{'loss': 0.9347, 'grad_norm': 12.779587745666504, 'learning_rate': 3.388269510421716e-05, 'epoch': 0.2}


                                                        
 40%|███▉      | 1840/4626 [11:31:32<6:02:35,  7.81s/it]

{'loss': 0.9612, 'grad_norm': 10.546977996826172, 'learning_rate': 3.376151236063985e-05, 'epoch': 0.2}


                                                        
 40%|███▉      | 1850/4626 [11:32:51<6:01:12,  7.81s/it]

{'loss': 0.9205, 'grad_norm': 9.15870189666748, 'learning_rate': 3.364032961706253e-05, 'epoch': 0.2}


                                                        
 40%|████      | 1860/4626 [11:34:09<6:02:40,  7.87s/it]

{'loss': 1.0789, 'grad_norm': 10.980822563171387, 'learning_rate': 3.351914687348522e-05, 'epoch': 0.2}


                                                        
 40%|████      | 1870/4626 [11:35:27<6:00:43,  7.85s/it]

{'loss': 0.8695, 'grad_norm': 9.529138565063477, 'learning_rate': 3.339796412990791e-05, 'epoch': 0.2}


                                                        
 41%|████      | 1880/4626 [11:36:46<5:58:01,  7.82s/it]

{'loss': 1.0801, 'grad_norm': 8.03111743927002, 'learning_rate': 3.327678138633058e-05, 'epoch': 0.2}


                                                        
 41%|████      | 1890/4626 [11:38:04<5:55:08,  7.79s/it]

{'loss': 1.2494, 'grad_norm': 10.01775074005127, 'learning_rate': 3.315559864275327e-05, 'epoch': 0.2}


                                                        
 41%|████      | 1900/4626 [11:39:22<5:55:57,  7.83s/it]

{'loss': 1.2453, 'grad_norm': 20.94688606262207, 'learning_rate': 3.303441589917596e-05, 'epoch': 0.21}


                                                        
 41%|████▏     | 1910/4626 [11:40:41<5:56:41,  7.88s/it]

{'loss': 0.8704, 'grad_norm': 9.933085441589355, 'learning_rate': 3.2913233155598643e-05, 'epoch': 0.21}


                                                        
 42%|████▏     | 1920/4626 [11:41:59<5:56:02,  7.89s/it]

{'loss': 1.0984, 'grad_norm': 14.52611255645752, 'learning_rate': 3.279205041202133e-05, 'epoch': 0.21}


                                                        
 42%|████▏     | 1930/4626 [11:43:18<5:52:50,  7.85s/it]

{'loss': 1.089, 'grad_norm': 17.8295955657959, 'learning_rate': 3.2670867668444015e-05, 'epoch': 0.21}


                                                        
 42%|████▏     | 1940/4626 [11:44:36<5:49:57,  7.82s/it]

{'loss': 0.9234, 'grad_norm': 9.901483535766602, 'learning_rate': 3.2549684924866704e-05, 'epoch': 0.21}


                                                        
 42%|████▏     | 1950/4626 [11:45:54<5:49:15,  7.83s/it]

{'loss': 1.058, 'grad_norm': 8.543376922607422, 'learning_rate': 3.2428502181289386e-05, 'epoch': 0.21}


                                                        
 42%|████▏     | 1960/4626 [11:47:13<5:47:35,  7.82s/it]

{'loss': 1.0032, 'grad_norm': 7.966945648193359, 'learning_rate': 3.230731943771207e-05, 'epoch': 0.21}


                                                        
 43%|████▎     | 1970/4626 [11:48:31<5:47:07,  7.84s/it]

{'loss': 1.1131, 'grad_norm': 11.845447540283203, 'learning_rate': 3.218613669413476e-05, 'epoch': 0.21}


                                                        
 43%|████▎     | 1980/4626 [11:49:49<5:44:21,  7.81s/it]

{'loss': 1.1398, 'grad_norm': 16.853145599365234, 'learning_rate': 3.206495395055744e-05, 'epoch': 0.21}


                                                        
 43%|████▎     | 1990/4626 [11:51:07<5:44:00,  7.83s/it]

{'loss': 0.9697, 'grad_norm': 12.788467407226562, 'learning_rate': 3.194377120698013e-05, 'epoch': 0.22}


                                                        
 43%|████▎     | 2000/4626 [11:52:25<5:41:07,  7.79s/it]

{'loss': 0.9105, 'grad_norm': 5.583579063415527, 'learning_rate': 3.182258846340281e-05, 'epoch': 0.22}


                                                        
 43%|████▎     | 2010/4626 [11:53:43<5:40:21,  7.81s/it]

{'loss': 1.0757, 'grad_norm': 14.743757247924805, 'learning_rate': 3.17014057198255e-05, 'epoch': 0.22}


                                                        
 44%|████▎     | 2020/4626 [11:55:01<5:40:55,  7.85s/it]

{'loss': 0.961, 'grad_norm': 10.26385498046875, 'learning_rate': 3.158022297624819e-05, 'epoch': 0.22}


                                                        
 44%|████▍     | 2030/4626 [11:56:19<5:36:27,  7.78s/it]

{'loss': 1.2832, 'grad_norm': 10.94155502319336, 'learning_rate': 3.1459040232670865e-05, 'epoch': 0.22}


                                                        
 44%|████▍     | 2040/4626 [11:57:37<5:35:46,  7.79s/it]

{'loss': 1.0505, 'grad_norm': 17.44017219543457, 'learning_rate': 3.1337857489093554e-05, 'epoch': 0.22}


                                                        
 44%|████▍     | 2050/4626 [11:58:56<5:45:09,  8.04s/it]

{'loss': 1.0938, 'grad_norm': 12.830211639404297, 'learning_rate': 3.1216674745516236e-05, 'epoch': 0.22}


                                                        
 45%|████▍     | 2060/4626 [12:00:14<5:35:01,  7.83s/it]

{'loss': 1.026, 'grad_norm': 12.365126609802246, 'learning_rate': 3.1095492001938925e-05, 'epoch': 0.22}


                                                        
 45%|████▍     | 2070/4626 [12:01:32<5:33:24,  7.83s/it]

{'loss': 0.9022, 'grad_norm': 8.945682525634766, 'learning_rate': 3.0974309258361614e-05, 'epoch': 0.22}


                                                        
 45%|████▍     | 2080/4626 [12:02:50<5:29:36,  7.77s/it]

{'loss': 1.0562, 'grad_norm': 8.370729446411133, 'learning_rate': 3.0853126514784296e-05, 'epoch': 0.22}


                                                        
 45%|████▌     | 2090/4626 [12:04:08<5:30:01,  7.81s/it]

{'loss': 1.0635, 'grad_norm': 8.243417739868164, 'learning_rate': 3.0731943771206985e-05, 'epoch': 0.23}


                                                        
 45%|████▌     | 2100/4626 [12:05:27<5:29:38,  7.83s/it]

{'loss': 0.9671, 'grad_norm': 9.153733253479004, 'learning_rate': 3.061076102762967e-05, 'epoch': 0.23}


                                                        
 46%|████▌     | 2110/4626 [12:06:44<5:25:35,  7.76s/it]

{'loss': 1.1495, 'grad_norm': 13.815093994140625, 'learning_rate': 3.048957828405235e-05, 'epoch': 0.23}


                                                        
 46%|████▌     | 2120/4626 [12:08:02<5:24:16,  7.76s/it]

{'loss': 1.1887, 'grad_norm': 11.679290771484375, 'learning_rate': 3.0368395540475036e-05, 'epoch': 0.23}


                                                        
 46%|████▌     | 2130/4626 [12:09:20<5:23:52,  7.79s/it]

{'loss': 0.9817, 'grad_norm': 5.797985553741455, 'learning_rate': 3.024721279689772e-05, 'epoch': 0.23}


                                                        
 46%|████▋     | 2140/4626 [12:10:38<5:25:03,  7.85s/it]

{'loss': 0.9044, 'grad_norm': 5.585740566253662, 'learning_rate': 3.012603005332041e-05, 'epoch': 0.23}


                                                        
 46%|████▋     | 2150/4626 [12:11:56<5:22:42,  7.82s/it]

{'loss': 1.1598, 'grad_norm': 14.312020301818848, 'learning_rate': 3.0004847309743096e-05, 'epoch': 0.23}


                                                        
 47%|████▋     | 2160/4626 [12:13:15<5:21:50,  7.83s/it]

{'loss': 0.8709, 'grad_norm': 18.916614532470703, 'learning_rate': 2.988366456616578e-05, 'epoch': 0.23}


                                                        
 47%|████▋     | 2170/4626 [12:14:33<5:22:29,  7.88s/it]

{'loss': 1.1907, 'grad_norm': 9.593171119689941, 'learning_rate': 2.9762481822588467e-05, 'epoch': 0.23}


                                                        
 47%|████▋     | 2180/4626 [12:15:51<5:18:56,  7.82s/it]

{'loss': 1.0906, 'grad_norm': 5.392999172210693, 'learning_rate': 2.9641299079011146e-05, 'epoch': 0.24}


                                                        
 47%|████▋     | 2190/4626 [12:17:10<5:17:33,  7.82s/it]

{'loss': 1.1165, 'grad_norm': 11.29971694946289, 'learning_rate': 2.9520116335433835e-05, 'epoch': 0.24}


                                                        
 48%|████▊     | 2200/4626 [12:18:28<5:15:07,  7.79s/it]

{'loss': 1.1186, 'grad_norm': 4.211832046508789, 'learning_rate': 2.939893359185652e-05, 'epoch': 0.24}


                                                        
 48%|████▊     | 2210/4626 [12:19:46<5:19:12,  7.93s/it]

{'loss': 1.0302, 'grad_norm': 4.20809268951416, 'learning_rate': 2.9277750848279206e-05, 'epoch': 0.24}


                                                        
 48%|████▊     | 2220/4626 [12:21:05<5:14:56,  7.85s/it]

{'loss': 0.9059, 'grad_norm': 4.543336868286133, 'learning_rate': 2.9156568104701892e-05, 'epoch': 0.24}


                                                        
 48%|████▊     | 2230/4626 [12:22:24<5:12:39,  7.83s/it]

{'loss': 0.8443, 'grad_norm': 8.8342866897583, 'learning_rate': 2.9035385361124578e-05, 'epoch': 0.24}


                                                        
 48%|████▊     | 2240/4626 [12:23:42<5:10:43,  7.81s/it]

{'loss': 1.1427, 'grad_norm': 15.58261489868164, 'learning_rate': 2.8914202617547263e-05, 'epoch': 0.24}


                                                        
 49%|████▊     | 2250/4626 [12:25:00<5:09:16,  7.81s/it]

{'loss': 1.3988, 'grad_norm': 9.238319396972656, 'learning_rate': 2.879301987396995e-05, 'epoch': 0.24}


                                                        
 49%|████▉     | 2260/4626 [12:26:18<5:08:54,  7.83s/it]

{'loss': 1.1455, 'grad_norm': 4.064398765563965, 'learning_rate': 2.867183713039263e-05, 'epoch': 0.24}


                                                        
 49%|████▉     | 2270/4626 [12:27:36<5:08:01,  7.84s/it]

{'loss': 1.0428, 'grad_norm': 11.640456199645996, 'learning_rate': 2.8550654386815317e-05, 'epoch': 0.25}


                                                        
 49%|████▉     | 2280/4626 [12:28:55<5:09:06,  7.91s/it]

{'loss': 1.2803, 'grad_norm': 13.03378677368164, 'learning_rate': 2.8429471643238003e-05, 'epoch': 0.25}


                                                        
 50%|████▉     | 2290/4626 [12:30:14<5:06:13,  7.87s/it]

{'loss': 1.2085, 'grad_norm': 7.491578578948975, 'learning_rate': 2.830828889966069e-05, 'epoch': 0.25}


                                                        
 50%|████▉     | 2300/4626 [12:31:32<5:04:16,  7.85s/it]

{'loss': 1.0509, 'grad_norm': 11.403136253356934, 'learning_rate': 2.8187106156083377e-05, 'epoch': 0.25}


                                                        
 50%|████▉     | 2310/4626 [12:32:51<5:01:06,  7.80s/it]

{'loss': 1.0526, 'grad_norm': 5.0979743003845215, 'learning_rate': 2.8065923412506063e-05, 'epoch': 0.25}


                                                        
 50%|█████     | 2320/4626 [12:34:09<4:59:28,  7.79s/it]

{'loss': 0.9204, 'grad_norm': 4.175346374511719, 'learning_rate': 2.794474066892875e-05, 'epoch': 0.25}


                                                        
 50%|█████     | 2330/4626 [12:35:27<5:01:13,  7.87s/it]

{'loss': 1.0153, 'grad_norm': 9.520540237426758, 'learning_rate': 2.7823557925351428e-05, 'epoch': 0.25}


                                                        
 51%|█████     | 2340/4626 [12:36:46<4:57:49,  7.82s/it]

{'loss': 1.0704, 'grad_norm': 9.669951438903809, 'learning_rate': 2.7702375181774117e-05, 'epoch': 0.25}


                                                        
 51%|█████     | 2350/4626 [12:38:04<4:58:26,  7.87s/it]

{'loss': 1.0936, 'grad_norm': 17.33401107788086, 'learning_rate': 2.7581192438196802e-05, 'epoch': 0.25}


                                                        
 51%|█████     | 2360/4626 [12:39:25<5:05:51,  8.10s/it]

{'loss': 0.9056, 'grad_norm': 7.413363456726074, 'learning_rate': 2.7460009694619488e-05, 'epoch': 0.26}


                                                        
 51%|█████     | 2370/4626 [12:40:43<4:57:27,  7.91s/it]

{'loss': 1.0881, 'grad_norm': 17.528636932373047, 'learning_rate': 2.7338826951042174e-05, 'epoch': 0.26}


                                                        
 51%|█████▏    | 2380/4626 [12:42:05<5:09:26,  8.27s/it]

{'loss': 1.2667, 'grad_norm': 7.504953861236572, 'learning_rate': 2.721764420746486e-05, 'epoch': 0.26}


                                                        
 52%|█████▏    | 2390/4626 [12:43:22<4:48:25,  7.74s/it]

{'loss': 1.1476, 'grad_norm': 9.037967681884766, 'learning_rate': 2.7096461463887545e-05, 'epoch': 0.26}


                                                        
 52%|█████▏    | 2400/4626 [12:44:39<4:41:02,  7.58s/it]

{'loss': 1.0761, 'grad_norm': 10.657581329345703, 'learning_rate': 2.6975278720310227e-05, 'epoch': 0.26}


                                                        
 52%|█████▏    | 2410/4626 [12:45:55<4:43:41,  7.68s/it]

{'loss': 1.023, 'grad_norm': 14.803102493286133, 'learning_rate': 2.6854095976732913e-05, 'epoch': 0.26}


                                                        
 52%|█████▏    | 2420/4626 [12:47:12<4:43:21,  7.71s/it]

{'loss': 0.8795, 'grad_norm': 9.18317985534668, 'learning_rate': 2.67329132331556e-05, 'epoch': 0.26}


                                                        
 53%|█████▎    | 2430/4626 [12:48:30<4:42:41,  7.72s/it]

{'loss': 1.0568, 'grad_norm': 14.464031219482422, 'learning_rate': 2.6611730489578284e-05, 'epoch': 0.26}


                                                        
 53%|█████▎    | 2440/4626 [12:49:47<4:39:14,  7.66s/it]

{'loss': 1.393, 'grad_norm': 7.422569274902344, 'learning_rate': 2.649054774600097e-05, 'epoch': 0.26}


                                                        
 53%|█████▎    | 2450/4626 [12:51:04<4:39:50,  7.72s/it]

{'loss': 1.1562, 'grad_norm': 5.982183933258057, 'learning_rate': 2.6369365002423656e-05, 'epoch': 0.26}


                                                        
 53%|█████▎    | 2460/4626 [12:52:22<4:40:40,  7.77s/it]

{'loss': 1.1101, 'grad_norm': 12.97802734375, 'learning_rate': 2.6248182258846345e-05, 'epoch': 0.27}


                                                        
 53%|█████▎    | 2470/4626 [12:53:39<4:34:59,  7.65s/it]

{'loss': 1.0169, 'grad_norm': 5.038736820220947, 'learning_rate': 2.612699951526903e-05, 'epoch': 0.27}


                                                        
 54%|█████▎    | 2480/4626 [12:54:56<4:34:50,  7.68s/it]

{'loss': 0.9971, 'grad_norm': 5.967055320739746, 'learning_rate': 2.600581677169171e-05, 'epoch': 0.27}


                                                        
 54%|█████▍    | 2490/4626 [12:56:13<4:32:30,  7.65s/it]

{'loss': 1.1523, 'grad_norm': 5.2036614418029785, 'learning_rate': 2.5884634028114395e-05, 'epoch': 0.27}


                                                        
 54%|█████▍    | 2500/4626 [12:57:31<4:35:30,  7.78s/it]

{'loss': 1.0368, 'grad_norm': 4.971120357513428, 'learning_rate': 2.5763451284537084e-05, 'epoch': 0.27}


                                                        
 54%|█████▍    | 2510/4626 [12:58:49<4:35:03,  7.80s/it]

{'loss': 0.9639, 'grad_norm': 5.235806465148926, 'learning_rate': 2.564226854095977e-05, 'epoch': 0.27}


                                                        
 54%|█████▍    | 2520/4626 [13:00:07<4:33:13,  7.78s/it]

{'loss': 1.1293, 'grad_norm': 9.86626148223877, 'learning_rate': 2.5521085797382455e-05, 'epoch': 0.27}


                                                        
 55%|█████▍    | 2530/4626 [13:01:25<4:34:43,  7.86s/it]

{'loss': 0.9266, 'grad_norm': 8.253503799438477, 'learning_rate': 2.539990305380514e-05, 'epoch': 0.27}


                                                        
 55%|█████▍    | 2540/4626 [13:02:45<4:35:14,  7.92s/it]

{'loss': 1.143, 'grad_norm': 6.084354400634766, 'learning_rate': 2.5278720310227827e-05, 'epoch': 0.27}


                                                        
 55%|█████▌    | 2550/4626 [13:04:04<4:33:13,  7.90s/it]

{'loss': 0.9929, 'grad_norm': 10.229326248168945, 'learning_rate': 2.515753756665051e-05, 'epoch': 0.28}


                                                        
 55%|█████▌    | 2560/4626 [13:05:22<4:26:37,  7.74s/it]

{'loss': 0.9532, 'grad_norm': 7.522101879119873, 'learning_rate': 2.5036354823073194e-05, 'epoch': 0.28}


                                                        
 56%|█████▌    | 2570/4626 [13:06:39<4:29:50,  7.87s/it]

{'loss': 0.8686, 'grad_norm': 6.618906497955322, 'learning_rate': 2.491517207949588e-05, 'epoch': 0.28}


                                                        
 56%|█████▌    | 2580/4626 [13:07:57<4:21:35,  7.67s/it]

{'loss': 1.0032, 'grad_norm': 13.59411907196045, 'learning_rate': 2.4793989335918566e-05, 'epoch': 0.28}


                                                        
 56%|█████▌    | 2590/4626 [13:09:13<4:18:46,  7.63s/it]

{'loss': 0.7657, 'grad_norm': 6.001777648925781, 'learning_rate': 2.467280659234125e-05, 'epoch': 0.28}


                                                        
 56%|█████▌    | 2600/4626 [13:10:31<4:20:59,  7.73s/it]

{'loss': 1.1395, 'grad_norm': 6.178666114807129, 'learning_rate': 2.4551623848763937e-05, 'epoch': 0.28}


                                                        
 56%|█████▋    | 2610/4626 [13:11:48<4:16:00,  7.62s/it]

{'loss': 0.9838, 'grad_norm': 7.49594783782959, 'learning_rate': 2.4430441105186623e-05, 'epoch': 0.28}


                                                        
 57%|█████▋    | 2620/4626 [13:13:05<4:14:53,  7.62s/it]

{'loss': 1.1443, 'grad_norm': 7.107802867889404, 'learning_rate': 2.430925836160931e-05, 'epoch': 0.28}


                                                        
 57%|█████▋    | 2630/4626 [13:14:20<4:09:05,  7.49s/it]

{'loss': 1.0985, 'grad_norm': 6.117868900299072, 'learning_rate': 2.4188075618031994e-05, 'epoch': 0.28}


                                                        
 57%|█████▋    | 2640/4626 [13:15:35<4:13:34,  7.66s/it]

{'loss': 1.1195, 'grad_norm': 11.92158031463623, 'learning_rate': 2.406689287445468e-05, 'epoch': 0.29}


                                                        
 57%|█████▋    | 2650/4626 [13:16:50<4:03:44,  7.40s/it]

{'loss': 1.0497, 'grad_norm': 11.266247749328613, 'learning_rate': 2.3945710130877362e-05, 'epoch': 0.29}


                                                        
 58%|█████▊    | 2660/4626 [13:18:04<4:02:49,  7.41s/it]

{'loss': 1.0284, 'grad_norm': 8.041422843933105, 'learning_rate': 2.382452738730005e-05, 'epoch': 0.29}


                                                        
 58%|█████▊    | 2670/4626 [13:19:21<4:13:20,  7.77s/it]

{'loss': 0.9912, 'grad_norm': 9.94511604309082, 'learning_rate': 2.3703344643722737e-05, 'epoch': 0.29}


                                                        
 58%|█████▊    | 2680/4626 [13:20:37<4:07:40,  7.64s/it]

{'loss': 1.1538, 'grad_norm': 16.779008865356445, 'learning_rate': 2.358216190014542e-05, 'epoch': 0.29}


                                                        
 58%|█████▊    | 2690/4626 [13:21:54<4:05:02,  7.59s/it]

{'loss': 0.8643, 'grad_norm': 8.942322731018066, 'learning_rate': 2.3460979156568105e-05, 'epoch': 0.29}


                                                        
 58%|█████▊    | 2700/4626 [13:23:10<4:04:34,  7.62s/it]

{'loss': 1.0337, 'grad_norm': 5.133809566497803, 'learning_rate': 2.333979641299079e-05, 'epoch': 0.29}


                                                        
 59%|█████▊    | 2710/4626 [13:24:26<4:01:14,  7.55s/it]

{'loss': 0.9686, 'grad_norm': 14.02278995513916, 'learning_rate': 2.3218613669413476e-05, 'epoch': 0.29}


                                                        
 59%|█████▉    | 2720/4626 [13:25:44<4:07:50,  7.80s/it]

{'loss': 1.0321, 'grad_norm': 11.207067489624023, 'learning_rate': 2.309743092583616e-05, 'epoch': 0.29}


                                                        
 59%|█████▉    | 2730/4626 [13:27:02<4:08:54,  7.88s/it]

{'loss': 1.1264, 'grad_norm': 9.176467895507812, 'learning_rate': 2.2976248182258847e-05, 'epoch': 0.3}


                                                        
 59%|█████▉    | 2740/4626 [13:28:20<4:00:59,  7.67s/it]

{'loss': 1.08, 'grad_norm': 4.976774215698242, 'learning_rate': 2.2855065438681533e-05, 'epoch': 0.3}


                                                        
 59%|█████▉    | 2750/4626 [13:29:39<4:01:33,  7.73s/it]

{'loss': 1.0374, 'grad_norm': 4.803806781768799, 'learning_rate': 2.273388269510422e-05, 'epoch': 0.3}


                                                        
 60%|█████▉    | 2760/4626 [13:30:56<3:58:46,  7.68s/it]

{'loss': 1.0459, 'grad_norm': 11.002181053161621, 'learning_rate': 2.2612699951526904e-05, 'epoch': 0.3}


                                                        
 60%|█████▉    | 2770/4626 [13:32:15<4:08:16,  8.03s/it]

{'loss': 1.2373, 'grad_norm': 10.547489166259766, 'learning_rate': 2.249151720794959e-05, 'epoch': 0.3}


                                                        
 60%|██████    | 2780/4626 [13:33:34<4:01:54,  7.86s/it]

{'loss': 0.9881, 'grad_norm': 9.663579940795898, 'learning_rate': 2.2370334464372276e-05, 'epoch': 0.3}


                                                        
 60%|██████    | 2790/4626 [13:34:52<3:57:31,  7.76s/it]

{'loss': 0.8383, 'grad_norm': 11.494872093200684, 'learning_rate': 2.224915172079496e-05, 'epoch': 0.3}


                                                        
 61%|██████    | 2800/4626 [13:36:10<3:57:26,  7.80s/it]

{'loss': 1.0257, 'grad_norm': 8.965888977050781, 'learning_rate': 2.2127968977217644e-05, 'epoch': 0.3}


                                                        
 61%|██████    | 2810/4626 [13:37:28<3:53:11,  7.70s/it]

{'loss': 1.1107, 'grad_norm': 5.451632976531982, 'learning_rate': 2.200678623364033e-05, 'epoch': 0.3}


                                                        
 61%|██████    | 2820/4626 [13:38:45<3:51:08,  7.68s/it]

{'loss': 0.9519, 'grad_norm': 9.125125885009766, 'learning_rate': 2.1885603490063018e-05, 'epoch': 0.3}


                                                        
 61%|██████    | 2830/4626 [13:40:01<3:45:52,  7.55s/it]

{'loss': 1.068, 'grad_norm': 11.75377082824707, 'learning_rate': 2.17644207464857e-05, 'epoch': 0.31}


                                                        
 61%|██████▏   | 2840/4626 [13:41:17<3:46:16,  7.60s/it]

{'loss': 0.9485, 'grad_norm': 9.26883316040039, 'learning_rate': 2.1643238002908386e-05, 'epoch': 0.31}


                                                        
 62%|██████▏   | 2850/4626 [13:42:34<3:47:02,  7.67s/it]

{'loss': 1.1986, 'grad_norm': 13.359086990356445, 'learning_rate': 2.1522055259331072e-05, 'epoch': 0.31}


                                                        
 62%|██████▏   | 2860/4626 [13:43:51<3:43:30,  7.59s/it]

{'loss': 1.2513, 'grad_norm': 16.821903228759766, 'learning_rate': 2.1400872515753758e-05, 'epoch': 0.31}


                                                        
 62%|██████▏   | 2870/4626 [13:45:07<3:45:05,  7.69s/it]

{'loss': 0.8924, 'grad_norm': 6.948817729949951, 'learning_rate': 2.1279689772176443e-05, 'epoch': 0.31}


                                                        
 62%|██████▏   | 2880/4626 [13:46:24<3:43:02,  7.66s/it]

{'loss': 1.0138, 'grad_norm': 7.384677886962891, 'learning_rate': 2.115850702859913e-05, 'epoch': 0.31}


                                                        
 62%|██████▏   | 2890/4626 [13:47:41<3:39:14,  7.58s/it]

{'loss': 0.991, 'grad_norm': 7.424073696136475, 'learning_rate': 2.1037324285021815e-05, 'epoch': 0.31}


                                                        
 63%|██████▎   | 2900/4626 [13:48:57<3:42:07,  7.72s/it]

{'loss': 1.2244, 'grad_norm': 16.30069351196289, 'learning_rate': 2.09161415414445e-05, 'epoch': 0.31}


                                                        
 63%|██████▎   | 2910/4626 [13:50:14<3:36:55,  7.58s/it]

{'loss': 0.942, 'grad_norm': 7.547756195068359, 'learning_rate': 2.0794958797867182e-05, 'epoch': 0.31}


                                                        
 63%|██████▎   | 2920/4626 [13:51:30<3:39:02,  7.70s/it]

{'loss': 1.0531, 'grad_norm': 6.816349506378174, 'learning_rate': 2.067377605428987e-05, 'epoch': 0.32}


                                                        
 63%|██████▎   | 2930/4626 [13:52:47<3:38:23,  7.73s/it]

{'loss': 1.0428, 'grad_norm': 14.502829551696777, 'learning_rate': 2.0552593310712557e-05, 'epoch': 0.32}


                                                        
 64%|██████▎   | 2940/4626 [13:54:04<3:35:54,  7.68s/it]

{'loss': 0.9621, 'grad_norm': 6.083978652954102, 'learning_rate': 2.043141056713524e-05, 'epoch': 0.32}


                                                        
 64%|██████▍   | 2950/4626 [13:55:20<3:31:55,  7.59s/it]

{'loss': 1.2, 'grad_norm': 6.765629291534424, 'learning_rate': 2.0310227823557925e-05, 'epoch': 0.32}


                                                        
 64%|██████▍   | 2960/4626 [13:56:38<3:36:59,  7.82s/it]

{'loss': 1.0365, 'grad_norm': 6.684977054595947, 'learning_rate': 2.018904507998061e-05, 'epoch': 0.32}


                                                        
 64%|██████▍   | 2970/4626 [13:57:54<3:28:44,  7.56s/it]

{'loss': 0.8343, 'grad_norm': 8.322362899780273, 'learning_rate': 2.0067862336403296e-05, 'epoch': 0.32}


                                                        
 64%|██████▍   | 2980/4626 [13:59:09<3:23:06,  7.40s/it]

{'loss': 1.0777, 'grad_norm': 16.345500946044922, 'learning_rate': 1.9946679592825982e-05, 'epoch': 0.32}


                                                        
 65%|██████▍   | 2990/4626 [14:00:24<3:30:08,  7.71s/it]

{'loss': 1.06, 'grad_norm': 12.015225410461426, 'learning_rate': 1.9825496849248668e-05, 'epoch': 0.32}


                                                        
 65%|██████▍   | 3000/4626 [14:01:39<3:22:49,  7.48s/it]

{'loss': 1.046, 'grad_norm': 8.067842483520508, 'learning_rate': 1.9704314105671353e-05, 'epoch': 0.32}


                                                        
 65%|██████▌   | 3010/4626 [14:02:55<3:25:49,  7.64s/it]

{'loss': 0.9694, 'grad_norm': 6.264603614807129, 'learning_rate': 1.958313136209404e-05, 'epoch': 0.33}


                                                        
 65%|██████▌   | 3020/4626 [14:04:13<3:26:06,  7.70s/it]

{'loss': 1.2994, 'grad_norm': 18.096284866333008, 'learning_rate': 1.9461948618516725e-05, 'epoch': 0.33}


                                                        
 65%|██████▌   | 3030/4626 [14:05:29<3:21:41,  7.58s/it]

{'loss': 1.1135, 'grad_norm': 6.439767837524414, 'learning_rate': 1.934076587493941e-05, 'epoch': 0.33}


                                                        
 66%|██████▌   | 3040/4626 [14:06:45<3:18:27,  7.51s/it]

{'loss': 1.1573, 'grad_norm': 6.149333477020264, 'learning_rate': 1.9219583131362096e-05, 'epoch': 0.33}


                                                        
 66%|██████▌   | 3050/4626 [14:08:00<3:17:51,  7.53s/it]

{'loss': 0.9678, 'grad_norm': 10.756464958190918, 'learning_rate': 1.9098400387784782e-05, 'epoch': 0.33}


                                                        
 66%|██████▌   | 3060/4626 [14:09:15<3:15:21,  7.48s/it]

{'loss': 1.0432, 'grad_norm': 6.204366683959961, 'learning_rate': 1.8977217644207464e-05, 'epoch': 0.33}


                                                        
 66%|██████▋   | 3070/4626 [14:10:29<3:11:32,  7.39s/it]

{'loss': 1.0346, 'grad_norm': 7.137191295623779, 'learning_rate': 1.885603490063015e-05, 'epoch': 0.33}


                                                        
 67%|██████▋   | 3080/4626 [14:11:43<3:14:05,  7.53s/it]

{'loss': 1.1652, 'grad_norm': 7.3842597007751465, 'learning_rate': 1.873485215705284e-05, 'epoch': 0.33}


                                                        
 67%|██████▋   | 3090/4626 [14:12:59<3:11:19,  7.47s/it]

{'loss': 1.0846, 'grad_norm': 5.569022178649902, 'learning_rate': 1.861366941347552e-05, 'epoch': 0.33}


                                                        
 67%|██████▋   | 3100/4626 [14:14:14<3:11:33,  7.53s/it]

{'loss': 1.0803, 'grad_norm': 12.687531471252441, 'learning_rate': 1.8492486669898207e-05, 'epoch': 0.34}


                                                        
 67%|██████▋   | 3110/4626 [14:15:29<3:08:23,  7.46s/it]

{'loss': 1.0146, 'grad_norm': 9.962998390197754, 'learning_rate': 1.8371303926320892e-05, 'epoch': 0.34}


                                                        
 67%|██████▋   | 3120/4626 [14:16:44<3:06:29,  7.43s/it]

{'loss': 1.0788, 'grad_norm': 6.656383037567139, 'learning_rate': 1.8250121182743578e-05, 'epoch': 0.34}


                                                        
 68%|██████▊   | 3130/4626 [14:17:58<3:03:49,  7.37s/it]

{'loss': 0.9258, 'grad_norm': 7.609903812408447, 'learning_rate': 1.8128938439166264e-05, 'epoch': 0.34}


                                                        
 68%|██████▊   | 3140/4626 [14:19:12<3:03:45,  7.42s/it]

{'loss': 1.0158, 'grad_norm': 16.662193298339844, 'learning_rate': 1.800775569558895e-05, 'epoch': 0.34}


                                                        
 68%|██████▊   | 3150/4626 [14:20:26<3:01:57,  7.40s/it]

{'loss': 1.0555, 'grad_norm': 7.034337520599365, 'learning_rate': 1.7886572952011635e-05, 'epoch': 0.34}


                                                        
 68%|██████▊   | 3160/4626 [14:21:42<3:08:07,  7.70s/it]

{'loss': 1.0415, 'grad_norm': 9.559636116027832, 'learning_rate': 1.776539020843432e-05, 'epoch': 0.34}


                                                        
 69%|██████▊   | 3170/4626 [14:22:58<3:06:34,  7.69s/it]

{'loss': 1.1388, 'grad_norm': 11.364946365356445, 'learning_rate': 1.7644207464857003e-05, 'epoch': 0.34}


                                                        
 69%|██████▊   | 3180/4626 [14:24:16<3:07:51,  7.80s/it]

{'loss': 1.0104, 'grad_norm': 12.202136993408203, 'learning_rate': 1.7523024721279692e-05, 'epoch': 0.34}


                                                        
 69%|██████▉   | 3190/4626 [14:25:34<3:08:51,  7.89s/it]

{'loss': 1.0073, 'grad_norm': 16.437673568725586, 'learning_rate': 1.7401841977702378e-05, 'epoch': 0.34}


                                                        
 69%|██████▉   | 3200/4626 [14:26:57<3:11:18,  8.05s/it]

{'loss': 0.9932, 'grad_norm': 6.868264198303223, 'learning_rate': 1.728065923412506e-05, 'epoch': 0.35}


                                                        
 69%|██████▉   | 3210/4626 [14:28:17<3:06:40,  7.91s/it]

{'loss': 1.0579, 'grad_norm': 6.3380255699157715, 'learning_rate': 1.7159476490547746e-05, 'epoch': 0.35}


                                                        
 70%|██████▉   | 3220/4626 [14:29:36<3:06:17,  7.95s/it]

{'loss': 0.9593, 'grad_norm': 11.832599639892578, 'learning_rate': 1.703829374697043e-05, 'epoch': 0.35}


                                                        
 70%|██████▉   | 3230/4626 [14:30:56<3:02:44,  7.85s/it]

{'loss': 0.9871, 'grad_norm': 7.075822353363037, 'learning_rate': 1.6917111003393117e-05, 'epoch': 0.35}


                                                        
 70%|███████   | 3240/4626 [14:32:14<3:06:32,  8.08s/it]

{'loss': 1.1525, 'grad_norm': 6.414322853088379, 'learning_rate': 1.6795928259815803e-05, 'epoch': 0.35}


                                                        
 70%|███████   | 3250/4626 [14:33:34<3:02:55,  7.98s/it]

{'loss': 1.2513, 'grad_norm': 11.769217491149902, 'learning_rate': 1.6674745516238488e-05, 'epoch': 0.35}


                                                        
 70%|███████   | 3260/4626 [14:34:54<3:05:26,  8.15s/it]

{'loss': 1.1222, 'grad_norm': 6.618746280670166, 'learning_rate': 1.6553562772661174e-05, 'epoch': 0.35}


                                                        
 71%|███████   | 3270/4626 [14:36:15<3:04:36,  8.17s/it]

{'loss': 1.184, 'grad_norm': 6.2456278800964355, 'learning_rate': 1.643238002908386e-05, 'epoch': 0.35}


                                                        
 71%|███████   | 3280/4626 [14:37:36<3:01:21,  8.08s/it]

{'loss': 0.8982, 'grad_norm': 9.664854049682617, 'learning_rate': 1.6311197285506545e-05, 'epoch': 0.35}


                                                        
 71%|███████   | 3290/4626 [14:38:58<3:01:45,  8.16s/it]

{'loss': 0.9625, 'grad_norm': 9.171894073486328, 'learning_rate': 1.619001454192923e-05, 'epoch': 0.36}


                                                        
 71%|███████▏  | 3300/4626 [14:40:18<2:59:51,  8.14s/it]

{'loss': 0.9438, 'grad_norm': 12.807961463928223, 'learning_rate': 1.6068831798351917e-05, 'epoch': 0.36}


                                                        
 72%|███████▏  | 3310/4626 [14:41:40<3:00:51,  8.25s/it]

{'loss': 0.8726, 'grad_norm': 6.738429069519043, 'learning_rate': 1.5947649054774602e-05, 'epoch': 0.36}


                                                        
 72%|███████▏  | 3320/4626 [14:43:03<3:05:13,  8.51s/it]

{'loss': 1.0932, 'grad_norm': 6.555305480957031, 'learning_rate': 1.5826466311197284e-05, 'epoch': 0.36}


                                                        
 72%|███████▏  | 3330/4626 [14:44:25<2:53:53,  8.05s/it]

{'loss': 1.2902, 'grad_norm': 11.050431251525879, 'learning_rate': 1.570528356761997e-05, 'epoch': 0.36}


                                                        
 72%|███████▏  | 3340/4626 [14:45:45<2:53:34,  8.10s/it]

{'loss': 1.157, 'grad_norm': 5.7129926681518555, 'learning_rate': 1.558410082404266e-05, 'epoch': 0.36}


                                                        
 72%|███████▏  | 3350/4626 [14:47:06<2:52:49,  8.13s/it]

{'loss': 0.974, 'grad_norm': 6.874536037445068, 'learning_rate': 1.546291808046534e-05, 'epoch': 0.36}


                                                        
 73%|███████▎  | 3360/4626 [14:48:27<2:50:50,  8.10s/it]

{'loss': 1.1046, 'grad_norm': 5.466594696044922, 'learning_rate': 1.5341735336888027e-05, 'epoch': 0.36}


                                                        
 73%|███████▎  | 3370/4626 [14:49:48<2:50:20,  8.14s/it]

{'loss': 0.8774, 'grad_norm': 5.224320888519287, 'learning_rate': 1.5220552593310713e-05, 'epoch': 0.36}


                                                        
 73%|███████▎  | 3380/4626 [14:51:10<2:49:43,  8.17s/it]

{'loss': 0.9124, 'grad_norm': 13.376509666442871, 'learning_rate': 1.50993698497334e-05, 'epoch': 0.37}


                                                        
 73%|███████▎  | 3390/4626 [14:52:29<2:45:50,  8.05s/it]

{'loss': 1.1374, 'grad_norm': 9.307226181030273, 'learning_rate': 1.4978187106156082e-05, 'epoch': 0.37}


                                                        
 73%|███████▎  | 3400/4626 [14:53:50<2:46:05,  8.13s/it]

{'loss': 1.1227, 'grad_norm': 12.789205551147461, 'learning_rate': 1.485700436257877e-05, 'epoch': 0.37}


                                                        
 74%|███████▎  | 3410/4626 [14:55:12<2:46:34,  8.22s/it]

{'loss': 0.9947, 'grad_norm': 9.329752922058105, 'learning_rate': 1.4735821619001455e-05, 'epoch': 0.37}


                                                        
 74%|███████▍  | 3420/4626 [14:56:31<2:38:25,  7.88s/it]

{'loss': 1.1372, 'grad_norm': 4.262957572937012, 'learning_rate': 1.4614638875424141e-05, 'epoch': 0.37}


                                                        
 74%|███████▍  | 3430/4626 [14:57:52<2:41:57,  8.12s/it]

{'loss': 0.9544, 'grad_norm': 4.491634368896484, 'learning_rate': 1.4493456131846825e-05, 'epoch': 0.37}


                                                        
 74%|███████▍  | 3440/4626 [14:59:15<2:44:48,  8.34s/it]

{'loss': 0.796, 'grad_norm': 9.643993377685547, 'learning_rate': 1.437227338826951e-05, 'epoch': 0.37}


                                                        
 75%|███████▍  | 3450/4626 [15:00:35<2:38:05,  8.07s/it]

{'loss': 0.9811, 'grad_norm': 10.881569862365723, 'learning_rate': 1.4251090644692196e-05, 'epoch': 0.37}


                                                        
 75%|███████▍  | 3460/4626 [15:01:56<2:35:21,  7.99s/it]

{'loss': 0.9888, 'grad_norm': 5.120680332183838, 'learning_rate': 1.4129907901114884e-05, 'epoch': 0.37}


                                                        
 75%|███████▌  | 3470/4626 [15:03:14<2:30:00,  7.79s/it]

{'loss': 0.8917, 'grad_norm': 4.730879783630371, 'learning_rate': 1.4008725157537566e-05, 'epoch': 0.38}


                                                        
 75%|███████▌  | 3480/4626 [15:04:30<2:22:27,  7.46s/it]

{'loss': 1.1281, 'grad_norm': 8.992775917053223, 'learning_rate': 1.3887542413960253e-05, 'epoch': 0.38}


                                                        
 75%|███████▌  | 3490/4626 [15:05:44<2:20:52,  7.44s/it]

{'loss': 0.9087, 'grad_norm': 4.990135192871094, 'learning_rate': 1.3766359670382939e-05, 'epoch': 0.38}


                                                        
 76%|███████▌  | 3500/4626 [15:06:59<2:19:44,  7.45s/it]

{'loss': 1.1938, 'grad_norm': 8.072131156921387, 'learning_rate': 1.3645176926805623e-05, 'epoch': 0.38}


                                                        
 76%|███████▌  | 3510/4626 [15:08:13<2:18:35,  7.45s/it]

{'loss': 1.1207, 'grad_norm': 7.955018043518066, 'learning_rate': 1.3523994183228309e-05, 'epoch': 0.38}


                                                        
 76%|███████▌  | 3520/4626 [15:09:27<2:16:15,  7.39s/it]

{'loss': 1.0302, 'grad_norm': 8.933940887451172, 'learning_rate': 1.3402811439650994e-05, 'epoch': 0.38}


                                                        
 76%|███████▋  | 3530/4626 [15:10:41<2:14:36,  7.37s/it]

{'loss': 1.1143, 'grad_norm': 9.498982429504395, 'learning_rate': 1.328162869607368e-05, 'epoch': 0.38}


                                                        
 77%|███████▋  | 3540/4626 [15:11:55<2:14:48,  7.45s/it]

{'loss': 1.1291, 'grad_norm': 8.74307918548584, 'learning_rate': 1.3160445952496364e-05, 'epoch': 0.38}


                                                        
 77%|███████▋  | 3550/4626 [15:13:09<2:12:50,  7.41s/it]

{'loss': 1.1303, 'grad_norm': 8.17066764831543, 'learning_rate': 1.303926320891905e-05, 'epoch': 0.38}


                                                        
 77%|███████▋  | 3560/4626 [15:14:23<2:10:59,  7.37s/it]

{'loss': 1.0265, 'grad_norm': 6.914201736450195, 'learning_rate': 1.2918080465341737e-05, 'epoch': 0.38}


                                                        
 77%|███████▋  | 3570/4626 [15:15:37<2:09:04,  7.33s/it]

{'loss': 1.1644, 'grad_norm': 9.566732406616211, 'learning_rate': 1.2796897721764423e-05, 'epoch': 0.39}


                                                        
 77%|███████▋  | 3580/4626 [15:16:52<2:09:34,  7.43s/it]

{'loss': 1.1875, 'grad_norm': 5.019099235534668, 'learning_rate': 1.2675714978187107e-05, 'epoch': 0.39}


                                                        
 78%|███████▊  | 3590/4626 [15:18:07<2:08:35,  7.45s/it]

{'loss': 1.0717, 'grad_norm': 8.469128608703613, 'learning_rate': 1.2554532234609792e-05, 'epoch': 0.39}


                                                        
 78%|███████▊  | 3600/4626 [15:19:21<2:06:36,  7.40s/it]

{'loss': 1.085, 'grad_norm': 5.885849475860596, 'learning_rate': 1.2433349491032478e-05, 'epoch': 0.39}


                                                        
 78%|███████▊  | 3610/4626 [15:20:35<2:06:09,  7.45s/it]

{'loss': 1.0253, 'grad_norm': 4.656200408935547, 'learning_rate': 1.2312166747455164e-05, 'epoch': 0.39}


                                                        
 78%|███████▊  | 3620/4626 [15:21:49<2:04:29,  7.43s/it]

{'loss': 0.9681, 'grad_norm': 4.242521286010742, 'learning_rate': 1.2190984003877848e-05, 'epoch': 0.39}


                                                        
 78%|███████▊  | 3630/4626 [15:23:03<2:02:10,  7.36s/it]

{'loss': 1.073, 'grad_norm': 11.61385440826416, 'learning_rate': 1.2069801260300533e-05, 'epoch': 0.39}


                                                        
 79%|███████▊  | 3640/4626 [15:24:17<2:01:49,  7.41s/it]

{'loss': 1.0872, 'grad_norm': 8.098515510559082, 'learning_rate': 1.1948618516723219e-05, 'epoch': 0.39}


                                                        
 79%|███████▉  | 3650/4626 [15:25:32<2:01:45,  7.49s/it]

{'loss': 0.9523, 'grad_norm': 10.096136093139648, 'learning_rate': 1.1827435773145904e-05, 'epoch': 0.39}


                                                        
 79%|███████▉  | 3660/4626 [15:26:46<2:00:00,  7.45s/it]

{'loss': 1.2157, 'grad_norm': 8.326165199279785, 'learning_rate': 1.170625302956859e-05, 'epoch': 0.4}


                                                        
 79%|███████▉  | 3670/4626 [15:28:00<1:58:25,  7.43s/it]

{'loss': 0.913, 'grad_norm': 9.732887268066406, 'learning_rate': 1.1585070285991276e-05, 'epoch': 0.4}


                                                        
 80%|███████▉  | 3680/4626 [15:29:15<1:58:13,  7.50s/it]

{'loss': 0.9838, 'grad_norm': 8.874852180480957, 'learning_rate': 1.146388754241396e-05, 'epoch': 0.4}


                                                        
 80%|███████▉  | 3690/4626 [15:30:30<1:56:26,  7.46s/it]

{'loss': 1.0046, 'grad_norm': 13.835150718688965, 'learning_rate': 1.1342704798836647e-05, 'epoch': 0.4}


                                                        
 80%|███████▉  | 3700/4626 [15:31:45<1:54:19,  7.41s/it]

{'loss': 1.0579, 'grad_norm': 16.23939323425293, 'learning_rate': 1.1221522055259331e-05, 'epoch': 0.4}


                                                        
 80%|████████  | 3710/4626 [15:33:00<1:54:34,  7.51s/it]

{'loss': 0.9328, 'grad_norm': 4.118071556091309, 'learning_rate': 1.1100339311682017e-05, 'epoch': 0.4}


                                                        
 80%|████████  | 3720/4626 [15:34:15<1:53:00,  7.48s/it]

{'loss': 1.1194, 'grad_norm': 9.235114097595215, 'learning_rate': 1.0979156568104702e-05, 'epoch': 0.4}


                                                        
 81%|████████  | 3730/4626 [15:35:29<1:51:04,  7.44s/it]

{'loss': 1.0465, 'grad_norm': 7.225974082946777, 'learning_rate': 1.0857973824527388e-05, 'epoch': 0.4}


                                                        
 81%|████████  | 3740/4626 [15:36:44<1:49:49,  7.44s/it]

{'loss': 0.9932, 'grad_norm': 8.916173934936523, 'learning_rate': 1.0736791080950074e-05, 'epoch': 0.4}


                                                        
 81%|████████  | 3750/4626 [15:37:58<1:47:56,  7.39s/it]

{'loss': 0.9682, 'grad_norm': 4.818258762359619, 'learning_rate': 1.0615608337372758e-05, 'epoch': 0.41}


                                                        
 81%|████████▏ | 3760/4626 [15:39:12<1:47:39,  7.46s/it]

{'loss': 1.054, 'grad_norm': 7.611611366271973, 'learning_rate': 1.0494425593795443e-05, 'epoch': 0.41}


                                                        
 81%|████████▏ | 3770/4626 [15:40:27<1:46:32,  7.47s/it]

{'loss': 1.3293, 'grad_norm': 4.610212326049805, 'learning_rate': 1.0373242850218129e-05, 'epoch': 0.41}


                                                        
 82%|████████▏ | 3780/4626 [15:41:42<1:45:31,  7.48s/it]

{'loss': 1.1189, 'grad_norm': 4.230982303619385, 'learning_rate': 1.0252060106640815e-05, 'epoch': 0.41}


                                                        
 82%|████████▏ | 3790/4626 [15:42:57<1:43:48,  7.45s/it]

{'loss': 1.1483, 'grad_norm': 8.607665061950684, 'learning_rate': 1.01308773630635e-05, 'epoch': 0.41}


                                                        
 82%|████████▏ | 3800/4626 [15:44:11<1:41:36,  7.38s/it]

{'loss': 1.0394, 'grad_norm': 4.604429721832275, 'learning_rate': 1.0009694619486186e-05, 'epoch': 0.41}


                                                        
 82%|████████▏ | 3810/4626 [15:45:30<1:45:39,  7.77s/it]

{'loss': 1.1433, 'grad_norm': 4.596811771392822, 'learning_rate': 9.88851187590887e-06, 'epoch': 0.41}


                                                        
 83%|████████▎ | 3820/4626 [15:46:44<1:40:27,  7.48s/it]

{'loss': 0.852, 'grad_norm': 9.617094039916992, 'learning_rate': 9.767329132331557e-06, 'epoch': 0.41}


                                                        
 83%|████████▎ | 3830/4626 [15:47:58<1:38:43,  7.44s/it]

{'loss': 1.1296, 'grad_norm': 4.1446075439453125, 'learning_rate': 9.646146388754241e-06, 'epoch': 0.41}


                                                        
 83%|████████▎ | 3840/4626 [15:49:13<1:37:20,  7.43s/it]

{'loss': 0.943, 'grad_norm': 3.769333839416504, 'learning_rate': 9.524963645176927e-06, 'epoch': 0.42}


                                                        
 83%|████████▎ | 3850/4626 [15:50:27<1:36:09,  7.43s/it]

{'loss': 1.0724, 'grad_norm': 11.646713256835938, 'learning_rate': 9.403780901599613e-06, 'epoch': 0.42}


                                                        
 83%|████████▎ | 3860/4626 [15:51:42<1:34:57,  7.44s/it]

{'loss': 1.0304, 'grad_norm': 4.158473014831543, 'learning_rate': 9.282598158022298e-06, 'epoch': 0.42}


                                                        
 84%|████████▎ | 3870/4626 [15:52:56<1:33:04,  7.39s/it]

{'loss': 1.0435, 'grad_norm': 4.017221927642822, 'learning_rate': 9.161415414444984e-06, 'epoch': 0.42}


                                                        
 84%|████████▍ | 3880/4626 [15:54:10<1:32:50,  7.47s/it]

{'loss': 1.2162, 'grad_norm': 9.015952110290527, 'learning_rate': 9.040232670867668e-06, 'epoch': 0.42}


                                                        
 84%|████████▍ | 3890/4626 [15:55:25<1:31:54,  7.49s/it]

{'loss': 1.0295, 'grad_norm': 9.064664840698242, 'learning_rate': 8.919049927290354e-06, 'epoch': 0.42}


                                                        
 84%|████████▍ | 3900/4626 [15:56:40<1:31:24,  7.55s/it]

{'loss': 1.124, 'grad_norm': 7.415674209594727, 'learning_rate': 8.79786718371304e-06, 'epoch': 0.42}


                                                        
 85%|████████▍ | 3910/4626 [15:57:57<1:30:22,  7.57s/it]

{'loss': 0.826, 'grad_norm': 4.343564033508301, 'learning_rate': 8.676684440135725e-06, 'epoch': 0.42}


                                                        
 85%|████████▍ | 3920/4626 [15:59:12<1:28:14,  7.50s/it]

{'loss': 1.3008, 'grad_norm': 7.879161834716797, 'learning_rate': 8.55550169655841e-06, 'epoch': 0.42}


                                                        
 85%|████████▍ | 3930/4626 [16:00:27<1:27:06,  7.51s/it]

{'loss': 1.1438, 'grad_norm': 7.968414306640625, 'learning_rate': 8.434318952981096e-06, 'epoch': 0.42}


                                                        
 85%|████████▌ | 3940/4626 [16:01:45<1:32:23,  8.08s/it]

{'loss': 0.8905, 'grad_norm': 7.902964115142822, 'learning_rate': 8.31313620940378e-06, 'epoch': 0.43}


                                                        
 85%|████████▌ | 3950/4626 [16:03:02<1:25:12,  7.56s/it]

{'loss': 0.904, 'grad_norm': 7.392732620239258, 'learning_rate': 8.191953465826468e-06, 'epoch': 0.43}


                                                        
 86%|████████▌ | 3960/4626 [16:04:20<1:28:35,  7.98s/it]

{'loss': 1.0782, 'grad_norm': 7.574976444244385, 'learning_rate': 8.070770722249152e-06, 'epoch': 0.43}


                                                        
 86%|████████▌ | 3970/4626 [16:05:43<1:31:44,  8.39s/it]

{'loss': 1.1457, 'grad_norm': 9.520365715026855, 'learning_rate': 7.949587978671837e-06, 'epoch': 0.43}


                                                        
 86%|████████▌ | 3980/4626 [16:07:09<1:31:09,  8.47s/it]

{'loss': 1.0666, 'grad_norm': 14.849432945251465, 'learning_rate': 7.828405235094523e-06, 'epoch': 0.43}


                                                        
 86%|████████▋ | 3990/4626 [16:08:33<1:29:06,  8.41s/it]

{'loss': 1.1634, 'grad_norm': 4.391905307769775, 'learning_rate': 7.707222491517209e-06, 'epoch': 0.43}


                                                        
 86%|████████▋ | 4000/4626 [16:09:57<1:27:58,  8.43s/it]

{'loss': 1.0501, 'grad_norm': 4.070496082305908, 'learning_rate': 7.586039747939893e-06, 'epoch': 0.43}


                                                        
 87%|████████▋ | 4010/4626 [16:11:22<1:27:32,  8.53s/it]

{'loss': 1.1011, 'grad_norm': 4.217963218688965, 'learning_rate': 7.464857004362578e-06, 'epoch': 0.43}


                                                        
 87%|████████▋ | 4020/4626 [16:12:48<1:26:19,  8.55s/it]

{'loss': 0.9724, 'grad_norm': 3.985227108001709, 'learning_rate': 7.343674260785265e-06, 'epoch': 0.43}


                                                        
 87%|████████▋ | 4030/4626 [16:14:13<1:26:18,  8.69s/it]

{'loss': 1.0347, 'grad_norm': 14.551783561706543, 'learning_rate': 7.2224915172079495e-06, 'epoch': 0.44}


                                                        
 87%|████████▋ | 4040/4626 [16:15:36<1:21:24,  8.33s/it]

{'loss': 1.2758, 'grad_norm': 9.298091888427734, 'learning_rate': 7.101308773630635e-06, 'epoch': 0.44}


                                                        
 88%|████████▊ | 4050/4626 [16:17:01<1:21:39,  8.51s/it]

{'loss': 1.0408, 'grad_norm': 8.494921684265137, 'learning_rate': 6.98012603005332e-06, 'epoch': 0.44}


                                                        
 88%|████████▊ | 4060/4626 [16:18:25<1:18:08,  8.28s/it]

{'loss': 0.9513, 'grad_norm': 10.464649200439453, 'learning_rate': 6.8589432864760065e-06, 'epoch': 0.44}


                                                        
 88%|████████▊ | 4070/4626 [16:19:49<1:18:11,  8.44s/it]

{'loss': 1.0915, 'grad_norm': 4.296815872192383, 'learning_rate': 6.737760542898691e-06, 'epoch': 0.44}


                                                        
 88%|████████▊ | 4080/4626 [16:21:13<1:15:35,  8.31s/it]

{'loss': 1.1587, 'grad_norm': 12.505781173706055, 'learning_rate': 6.616577799321377e-06, 'epoch': 0.44}


                                                        
 88%|████████▊ | 4090/4626 [16:22:38<1:15:06,  8.41s/it]

{'loss': 1.0646, 'grad_norm': 12.11531925201416, 'learning_rate': 6.495395055744062e-06, 'epoch': 0.44}


                                                        
 89%|████████▊ | 4100/4626 [16:24:03<1:14:56,  8.55s/it]

{'loss': 0.8863, 'grad_norm': 4.549040794372559, 'learning_rate': 6.374212312166748e-06, 'epoch': 0.44}


                                                        
 89%|████████▉ | 4110/4626 [16:25:28<1:15:02,  8.73s/it]

{'loss': 1.1406, 'grad_norm': 6.416681289672852, 'learning_rate': 6.253029568589433e-06, 'epoch': 0.44}


                                                        
 89%|████████▉ | 4120/4626 [16:26:54<1:12:12,  8.56s/it]

{'loss': 0.9058, 'grad_norm': 6.817674160003662, 'learning_rate': 6.131846825012119e-06, 'epoch': 0.45}


                                                        
 89%|████████▉ | 4130/4626 [16:28:17<1:09:01,  8.35s/it]

{'loss': 1.199, 'grad_norm': 13.114566802978516, 'learning_rate': 6.0106640814348035e-06, 'epoch': 0.45}


                                                        
 89%|████████▉ | 4140/4626 [16:29:43<1:09:31,  8.58s/it]

{'loss': 0.986, 'grad_norm': 12.621912002563477, 'learning_rate': 5.889481337857489e-06, 'epoch': 0.45}


                                                        
 90%|████████▉ | 4150/4626 [16:31:08<1:06:31,  8.39s/it]

{'loss': 1.204, 'grad_norm': 6.2978668212890625, 'learning_rate': 5.768298594280175e-06, 'epoch': 0.45}


                                                        
 90%|████████▉ | 4160/4626 [16:32:30<1:03:01,  8.11s/it]

{'loss': 1.149, 'grad_norm': 13.398597717285156, 'learning_rate': 5.6471158507028605e-06, 'epoch': 0.45}


                                                        
 90%|█████████ | 4170/4626 [16:33:50<1:01:05,  8.04s/it]

{'loss': 0.9229, 'grad_norm': 6.008903980255127, 'learning_rate': 5.525933107125545e-06, 'epoch': 0.45}


                                                        
 90%|█████████ | 4180/4626 [16:35:10<59:13,  7.97s/it]

{'loss': 1.114, 'grad_norm': 6.087188243865967, 'learning_rate': 5.404750363548231e-06, 'epoch': 0.45}


                                                      
 91%|█████████ | 4190/4626 [16:36:28<57:34,  7.92s/it]

{'loss': 1.1481, 'grad_norm': 5.125988960266113, 'learning_rate': 5.283567619970917e-06, 'epoch': 0.45}


                                                      
 91%|█████████ | 4200/4626 [16:37:49<56:57,  8.02s/it]

{'loss': 0.9227, 'grad_norm': 4.404721260070801, 'learning_rate': 5.162384876393602e-06, 'epoch': 0.45}


                                                      
 91%|█████████ | 4210/4626 [16:39:09<55:41,  8.03s/it]

{'loss': 1.0789, 'grad_norm': 12.270506858825684, 'learning_rate': 5.041202132816287e-06, 'epoch': 0.46}


                                                      
 91%|█████████ | 4220/4626 [16:40:30<55:20,  8.18s/it]

{'loss': 1.0678, 'grad_norm': 6.362585067749023, 'learning_rate': 4.920019389238972e-06, 'epoch': 0.46}


                                                      
 91%|█████████▏| 4230/4626 [16:41:52<54:35,  8.27s/it]

{'loss': 1.0762, 'grad_norm': 13.000589370727539, 'learning_rate': 4.798836645661658e-06, 'epoch': 0.46}


                                                      
 92%|█████████▏| 4240/4626 [16:43:15<53:00,  8.24s/it]

{'loss': 0.9997, 'grad_norm': 8.661541938781738, 'learning_rate': 4.677653902084343e-06, 'epoch': 0.46}


                                                      
 92%|█████████▏| 4250/4626 [16:44:38<51:51,  8.27s/it]

{'loss': 1.1083, 'grad_norm': 4.316239356994629, 'learning_rate': 4.556471158507029e-06, 'epoch': 0.46}


                                                      
 92%|█████████▏| 4260/4626 [16:46:00<50:07,  8.22s/it]

{'loss': 0.9832, 'grad_norm': 6.632246494293213, 'learning_rate': 4.435288414929714e-06, 'epoch': 0.46}


                                                      
 92%|█████████▏| 4270/4626 [16:47:24<49:26,  8.33s/it]

{'loss': 1.0404, 'grad_norm': 11.928914070129395, 'learning_rate': 4.3141056713523994e-06, 'epoch': 0.46}


                                                      
 93%|█████████▎| 4280/4626 [16:48:47<48:00,  8.32s/it]

{'loss': 0.7993, 'grad_norm': 8.082335472106934, 'learning_rate': 4.192922927775085e-06, 'epoch': 0.46}


                                                      
 93%|█████████▎| 4290/4626 [16:50:10<46:12,  8.25s/it]

{'loss': 1.0004, 'grad_norm': 4.26029109954834, 'learning_rate': 4.071740184197771e-06, 'epoch': 0.46}


                                                      
 93%|█████████▎| 4300/4626 [16:51:35<46:06,  8.49s/it]

{'loss': 1.054, 'grad_norm': 9.616801261901855, 'learning_rate': 3.9505574406204556e-06, 'epoch': 0.46}


                                                      
 93%|█████████▎| 4310/4626 [16:52:57<43:43,  8.30s/it]

{'loss': 1.1211, 'grad_norm': 4.193675518035889, 'learning_rate': 3.829374697043141e-06, 'epoch': 0.47}


                                                      
 93%|█████████▎| 4320/4626 [16:54:21<42:45,  8.38s/it]

{'loss': 1.1825, 'grad_norm': 6.637325286865234, 'learning_rate': 3.708191953465827e-06, 'epoch': 0.47}


                                                      
 94%|█████████▎| 4330/4626 [16:55:45<41:44,  8.46s/it]

{'loss': 1.1283, 'grad_norm': 13.299017906188965, 'learning_rate': 3.587009209888512e-06, 'epoch': 0.47}


                                                      
 94%|█████████▍| 4340/4626 [16:57:09<40:32,  8.50s/it]

{'loss': 1.0726, 'grad_norm': 4.4534807205200195, 'learning_rate': 3.4658264663111978e-06, 'epoch': 0.47}


                                                      
 94%|█████████▍| 4350/4626 [16:58:36<40:13,  8.74s/it]

{'loss': 1.0783, 'grad_norm': 4.657155513763428, 'learning_rate': 3.3446437227338826e-06, 'epoch': 0.47}


                                                      
 94%|█████████▍| 4360/4626 [17:00:01<37:09,  8.38s/it]

{'loss': 1.058, 'grad_norm': 11.738633155822754, 'learning_rate': 3.223460979156568e-06, 'epoch': 0.47}


                                                      
 94%|█████████▍| 4370/4626 [17:01:24<35:00,  8.21s/it]

{'loss': 1.0304, 'grad_norm': 9.680414199829102, 'learning_rate': 3.102278235579254e-06, 'epoch': 0.47}


                                                      
 95%|█████████▍| 4380/4626 [17:02:48<34:14,  8.35s/it]

{'loss': 0.9132, 'grad_norm': 12.242466926574707, 'learning_rate': 2.9810954920019387e-06, 'epoch': 0.47}


                                                      
 95%|█████████▍| 4390/4626 [17:04:11<32:53,  8.36s/it]

{'loss': 1.0086, 'grad_norm': 8.03893756866455, 'learning_rate': 2.8599127484246244e-06, 'epoch': 0.47}


                                                      
 95%|█████████▌| 4400/4626 [17:05:34<30:52,  8.20s/it]

{'loss': 1.1013, 'grad_norm': 12.355254173278809, 'learning_rate': 2.7387300048473096e-06, 'epoch': 0.48}


                                                      
 95%|█████████▌| 4410/4626 [17:06:54<28:59,  8.05s/it]

{'loss': 0.9891, 'grad_norm': 8.314918518066406, 'learning_rate': 2.6175472612699953e-06, 'epoch': 0.48}


                                                      
 96%|█████████▌| 4420/4626 [17:08:14<26:43,  7.79s/it]

{'loss': 1.246, 'grad_norm': 4.272399425506592, 'learning_rate': 2.4963645176926805e-06, 'epoch': 0.48}


                                                      
 96%|█████████▌| 4430/4626 [17:09:30<24:50,  7.60s/it]

{'loss': 0.9178, 'grad_norm': 8.220647811889648, 'learning_rate': 2.375181774115366e-06, 'epoch': 0.48}


                                                      
 96%|█████████▌| 4440/4626 [17:10:48<25:07,  8.11s/it]

{'loss': 1.0139, 'grad_norm': 6.736879825592041, 'learning_rate': 2.2539990305380514e-06, 'epoch': 0.48}


                                                      
 96%|█████████▌| 4450/4626 [17:12:06<22:25,  7.65s/it]

{'loss': 1.0635, 'grad_norm': 8.865259170532227, 'learning_rate': 2.1328162869607367e-06, 'epoch': 0.48}


                                                      
 96%|█████████▋| 4460/4626 [17:13:23<21:00,  7.59s/it]

{'loss': 0.9249, 'grad_norm': 6.607445240020752, 'learning_rate': 2.0116335433834223e-06, 'epoch': 0.48}


                                                      
 97%|█████████▋| 4470/4626 [17:14:41<21:04,  8.10s/it]

{'loss': 0.9494, 'grad_norm': 8.708245277404785, 'learning_rate': 1.8904507998061076e-06, 'epoch': 0.48}


                                                      
 97%|█████████▋| 4480/4626 [17:16:06<20:37,  8.47s/it]

{'loss': 0.9074, 'grad_norm': 4.774657249450684, 'learning_rate': 1.769268056228793e-06, 'epoch': 0.48}


                                                      
 97%|█████████▋| 4490/4626 [17:17:29<18:47,  8.29s/it]

{'loss': 0.8435, 'grad_norm': 4.539141654968262, 'learning_rate': 1.6480853126514785e-06, 'epoch': 0.49}


                                                      
 97%|█████████▋| 4500/4626 [17:18:53<17:30,  8.33s/it]

{'loss': 1.1381, 'grad_norm': 6.937796592712402, 'learning_rate': 1.526902569074164e-06, 'epoch': 0.49}


                                                      
 97%|█████████▋| 4510/4626 [17:20:16<16:05,  8.32s/it]

{'loss': 1.0895, 'grad_norm': 6.633351802825928, 'learning_rate': 1.4057198254968494e-06, 'epoch': 0.49}


                                                      
 98%|█████████▊| 4520/4626 [17:21:40<14:54,  8.44s/it]

{'loss': 0.9874, 'grad_norm': 8.15105152130127, 'learning_rate': 1.2845370819195348e-06, 'epoch': 0.49}


                                                      
 98%|█████████▊| 4530/4626 [17:23:05<13:36,  8.50s/it]

{'loss': 1.0995, 'grad_norm': 4.336312770843506, 'learning_rate': 1.16335433834222e-06, 'epoch': 0.49}


                                                      
 98%|█████████▊| 4540/4626 [17:24:28<12:03,  8.42s/it]

{'loss': 0.9951, 'grad_norm': 7.94674015045166, 'learning_rate': 1.0421715947649055e-06, 'epoch': 0.49}


                                                      
 98%|█████████▊| 4550/4626 [17:25:52<10:34,  8.35s/it]

{'loss': 0.9262, 'grad_norm': 9.567802429199219, 'learning_rate': 9.20988851187591e-07, 'epoch': 0.49}


                                                      
 99%|█████████▊| 4560/4626 [17:27:16<09:11,  8.35s/it]

{'loss': 1.0689, 'grad_norm': 4.227486610412598, 'learning_rate': 7.998061076102764e-07, 'epoch': 0.49}


                                                      
 99%|█████████▉| 4570/4626 [17:28:40<07:50,  8.40s/it]

{'loss': 1.1371, 'grad_norm': 6.682175159454346, 'learning_rate': 6.786233640329618e-07, 'epoch': 0.49}


                                                      
 99%|█████████▉| 4580/4626 [17:29:59<05:54,  7.70s/it]

{'loss': 0.9272, 'grad_norm': 4.444616794586182, 'learning_rate': 5.574406204556471e-07, 'epoch': 0.5}


                                                      
 99%|█████████▉| 4590/4626 [17:31:15<04:33,  7.61s/it]

{'loss': 0.979, 'grad_norm': 8.275160789489746, 'learning_rate': 4.362578768783325e-07, 'epoch': 0.5}


                                                      
 99%|█████████▉| 4600/4626 [17:32:30<03:16,  7.55s/it]

{'loss': 1.227, 'grad_norm': 12.798565864562988, 'learning_rate': 3.1507513330101795e-07, 'epoch': 0.5}


                                                      
100%|█████████▉| 4610/4626 [17:33:46<02:01,  7.57s/it]

{'loss': 1.0184, 'grad_norm': 6.516831398010254, 'learning_rate': 1.9389238972370337e-07, 'epoch': 0.5}


                                                      
100%|█████████▉| 4620/4626 [17:35:01<00:45,  7.55s/it]

{'loss': 1.2287, 'grad_norm': 9.696372032165527, 'learning_rate': 7.270964614638875e-08, 'epoch': 0.5}


                                                      
100%|██████████| 4626/4626 [17:35:47<00:00, 13.69s/it]


{'train_runtime': 63347.474, 'train_samples_per_second': 0.146, 'train_steps_per_second': 0.073, 'train_loss': 1.0525455187266903, 'epoch': 0.5}


100%|██████████| 579/579 [1:14:27<00:00,  7.72s/it]


TypeError: argmax(): argument 'input' (position 1) must be Tensor, not numpy.ndarray

 32%|███▏      | 75/232 [19:39:16<41:08:37, 943.42s/it]
