In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree. The body is intentionally a no-op:
# the print that would list every file is commented out, so this loop only
# traverses the tree and leaves `dirname`, `filenames` and `filename` defined
# at module level if any file was found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass
        # Uncomment to list every input file:
        # print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from pathlib import Path

# Root of the local Algonauts checkout.
# NOTE(review): this is a hard-coded, machine-specific absolute path; it must
# be edited before the notebook can run anywhere else.
root_dir = Path("/home/mihirneal/Developer/algonauts/")
# Bare expression: the notebook displays whether the competitors directory
# exists under root_dir (sanity check only, the value is not stored).
os.path.exists(root_dir / "algonauts_2025.competitors")

True

In [3]:
import fnmatch
from typing import Any, Dict, List, Tuple
from torch import nn
from PIL import Image
import ast 

class HuggingFaceFeatureExtractor:
    """
    A feature extractor for Hugging Face (or any PyTorch) models that captures
    intermediate activations from any layer specified by exact name or glob pattern.

    Forward hooks are registered on construction and stay attached until
    `remove_hooks()` is called or the `with` block exits. Every forward pass
    overwrites the activation previously stored for a layer.

    Layers may return a tensor or a (nested) tuple / list / dict of tensors;
    with `detach=True` every element exposing `.detach()` is detached and
    anything else is stored unchanged.

    Example usage:
        from transformers import BertModel
        model = BertModel.from_pretrained("bert-base-uncased")
        # Specify layers (using glob patterns is supported)
        layers_to_extract = ["encoder.layer.*.output"]

        # Using the extractor as a context manager ensures hooks are removed automatically.
        with HuggingFaceFeatureExtractor(model, layers_to_extract, detach=True) as extractor:
            # Perform a forward pass as usual
            outputs = model(input_ids, attention_mask=mask)
            # Get a copy of the extracted features
            features = extractor.features
            # Now 'features' is a dict mapping layer names to their activation tensors.
    """

    def __init__(self, model: nn.Module, layers: List[str], detach: bool = True):
        """
        Parameters:
            model (nn.Module): Model whose sub-modules are hooked.
            layers (List[str]): Exact module names and/or fnmatch-style glob patterns.
            detach (bool): If True, stored activations are detached from the autograd graph.

        Raises:
            ValueError: If a name or pattern matches no module of `model`.
        """
        self.model = model
        self.detach = detach
        # Expand layer patterns into full module names (duplicates removed).
        self.layers = self._expand_layers(model, layers)
        self._features: Dict[str, Any] = {}
        self._handles: Dict[str, Any] = {}
        self._register_hooks()

    def _register_hooks(self):
        """Register exactly one forward hook on each specified layer."""
        for layer in self.layers:
            if layer in self._handles:
                # Never stack a second hook on the same module: its handle would
                # overwrite the first one and the first hook could not be removed.
                continue
            sub_module = self.model.get_submodule(layer)
            self._handles[layer] = sub_module.register_forward_hook(self._make_hook(layer))

    @classmethod
    def _detach_output(cls, output: Any) -> Any:
        """Detach `output`, recursing into tuples, lists and dicts."""
        if hasattr(output, "detach"):
            return output.detach()
        if isinstance(output, tuple):
            return tuple(cls._detach_output(item) for item in output)
        if isinstance(output, list):
            return [cls._detach_output(item) for item in output]
        if isinstance(output, dict):
            return {key: cls._detach_output(value) for key, value in output.items()}
        # Nothing to detach (None, scalars, ...): keep the object as it is.
        return output

    def _make_hook(self, layer_name: str):
        """Build the forward hook that stores the output of `layer_name`."""
        def hook(module: nn.Module, inputs: Tuple[Any, ...], output: Any):
            # Optionally detach to break the graph and save memory.
            self._features[layer_name] = self._detach_output(output) if self.detach else output
        return hook

    def clear(self):
        """Clear the stored features before a new forward pass."""
        self._features.clear()

    @property
    def features(self) -> Dict[str, Any]:
        """Return a shallow copy of the captured features (layer name -> activation)."""
        return dict(self._features)

    def __call__(self, *args, **kwargs) -> Any:
        """
        Run the model forward. This automatically clears previous features,
        then performs a forward pass, capturing intermediate activations.
        Returns the model's original output.
        """
        self.clear()
        return self.model(*args, **kwargs)

    def remove_hooks(self):
        """Remove all registered hooks."""
        for handle in self._handles.values():
            handle.remove()
        self._handles.clear()

    def __enter__(self):
        """Enter context: hooks are already registered."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Exit context: remove all hooks. Exceptions are not suppressed."""
        self.remove_hooks()

    @staticmethod
    def _expand_layers(model: nn.Module, layers: List[str]) -> List[str]:
        """
        Expand a list of layer names and/or glob patterns to all matching module names
        in the given model. Raises an error if a specified name or pattern doesn't match.

        The result keeps first-seen order and contains every module name at most
        once, even when several names/patterns match the same module.
        """
        all_layers = [name for name, _ in model.named_modules() if name]  # skip the root module ''
        all_layers_set = set(all_layers)
        expanded = []
        special_chars = set("*?[]")
        for layer in layers:
            if not any(char in layer for char in special_chars):
                if layer not in all_layers_set:
                    raise ValueError(f"Layer '{layer}' not found in the model.")
                expanded.append(layer)
            else:
                matches = fnmatch.filter(all_layers, layer)
                if not matches:
                    raise ValueError(f"No layers match the pattern '{layer}'.")
                expanded.extend(matches)
        # dict.fromkeys drops duplicates while preserving insertion order.
        return list(dict.fromkeys(expanded))

In [4]:
import pandas as pd
import torchaudio
import cv2
import torch
import torch.nn.functional as F
from typing import Tuple, List, Callable




def load_transcript(
    path: str
) -> pd.DataFrame:
    """
    Read a tab-separated transcript file into a DataFrame.

    Parameters:
        path (str): Location of the TSV transcript.

    Returns:
        pd.DataFrame: The parsed transcript table.

    Raises:
        RuntimeError: If the file cannot be read or parsed; the message
            contains the path and the underlying error text.
    """
    try:
        return pd.read_csv(path, sep='\t')
    except Exception as err:
        message = f"Error loading transcript from {path}: {err}"
        raise RuntimeError(message)


def load_audio(
    path: str,
    sampling_rate: int = 48000,
    stereo: bool = True
) -> Tuple[torch.Tensor, int]:
    """
    Loads an audio file using torchaudio, optionally converts multi-channel audio
    to mono, resamples it to the specified sampling_rate if needed, converts the
    waveform to half precision, and returns the waveform and sample rate.

    Parameters:
        path (str): Path to the audio file.
        sampling_rate (int): Desired sampling rate for the output waveform.
        stereo (bool): If False, multi-channel audio is averaged down to one channel.

    Returns:
        tuple: (waveform_fp16, sampling_rate) where waveform_fp16 is a float16
            tensor and sampling_rate is the requested rate (not the file's
            original rate).

    Raises:
        RuntimeError: If any step fails; the original exception is chained as
            `__cause__`.
    """
    try:
        # Select the 'ffmpeg' backend before loading.
        torchaudio.set_audio_backend("ffmpeg")
        waveform, orig_sr = torchaudio.load(path)

        # Convert to mono if stereo is False and the waveform has multiple channels
        if not stereo and waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample if original sample rate is different from the desired sampling rate
        if orig_sr != sampling_rate:
            resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=sampling_rate)
            waveform = resampler(waveform)

        # Convert the waveform to half precision (float16) only after mixing and
        # resampling, so those steps run at the loaded precision.
        return waveform.half(), sampling_rate
    except Exception as e:
        raise RuntimeError(f"Error loading audio from {path}: {e}") from e


def load_video(
    path: str,
    resolution: Tuple[int, int] = (224, 224),
    tensor_dtype: torch.dtype = torch.float16,
    verbose: bool = True,
) -> Tuple[torch.Tensor, float]:
    """
    Loads a video file, reads its frames, converts each frame from BGR to RGB,
    resizes it to `resolution` with bilinear interpolation, and returns all
    frames together with the frame rate reported by OpenCV.

    Parameters:
        path (str): Path to the video file (path-like objects are converted with str()).
        resolution (Tuple[int, int]): Output frame size, handed to
            torch.nn.functional.interpolate as `size`, i.e. (height, width).
        tensor_dtype (torch.dtype): dtype of the returned frame tensor.
        verbose (bool): Whether to print video metadata and read statistics.

    Returns:
        tuple: (frames, fps) where frames has shape
            [num_frames_read, 3, resolution[0], resolution[1]] and fps is the
            value of cv2.CAP_PROP_FPS. Only frames that were actually decoded
            are returned, which can be fewer than the container's reported
            frame count.

    Raises:
        IOError: If the video file cannot be opened.
    """
    cap = cv2.VideoCapture(str(path))

    if not cap.isOpened():
        raise IOError("Cannot open video file: {}".format(path))

    target_size = tuple(resolution)
    frames_read = 0
    try:
        # Frame rate and the frame count advertised by the container.
        fps = cap.get(cv2.CAP_PROP_FPS)
        num_frames_to_read = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        if verbose:
            print("Total number of frames in the video:", cap.get(cv2.CAP_PROP_FRAME_COUNT))
            print("Original Resolution:", (cap.get(cv2.CAP_PROP_FRAME_WIDTH), cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
            print("FPS:", fps)
            # Guard against containers that report an FPS of 0.
            print("Duration (seconds):", num_frames_to_read / fps if fps else float("nan"))
            print("Target Resolution:", resolution)

        frames = torch.zeros(num_frames_to_read, 3, *target_size, dtype=tensor_dtype)

        for i in range(num_frames_to_read):
            ret, frame = cap.read()

            if not ret:
                # The stream ended before the advertised frame count.
                break

            # OpenCV decodes to BGR; convert to RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Convert the frame (numpy array, H x W x C) to a [C, H, W] tensor.
            frame_tensor = torch.from_numpy(frame_rgb).permute(2, 0, 1)
            # Resize the frame to the target resolution (interpolate needs a batch dim).
            frame_tensor = torch.nn.functional.interpolate(
                frame_tensor.unsqueeze(0), size=target_size, mode='bilinear', align_corners=False
            )
            frames[i] = frame_tensor[0]
            frames_read += 1
    finally:
        # Always give the decoder back, even if reading or resizing failed.
        cap.release()

    # Drop the pre-allocated slots that were never filled.
    frames = frames[:frames_read]

    if verbose:
        print(f"Read {len(frames)} frames.")
        print(f"Frames shape: {frames.shape}")

    return frames, fps

In [5]:
from pathlib import Path
import h5py
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
import math


def _store_chunk_features(h5_file, datasets, output_features, end_index):
    """Write one chunk of per-TR features, appending only the TRs not yet stored."""
    for layer_name, tensor in output_features.items():
        # Convert the tensor to a numpy array (on CPU) before storing.
        tensor_np = tensor.cpu().numpy()  # shape [num_trs, feature_dim1, feature_dim2, ...]
        if layer_name not in datasets:
            # First chunk of the file: it starts at TR 0, so it is stored as is.
            datasets[layer_name] = h5_file.create_dataset(
                layer_name,
                data=tensor_np,
                maxshape=(None,) + tensor_np.shape[1:],
                dtype=np.float16,
                chunks=True,
            )
            continue

        ds = datasets[layer_name]
        rows_on_disk = ds.shape[0]
        # The chunk ends at TR `end_index` (inclusive). Because chunks overlap,
        # only its trailing rows belong to TRs that are not on disk yet.
        num_new_rows = end_index + 1 - rows_on_disk
        if num_new_rows <= 0:
            # Nothing new in this chunk (also avoids the `[-0:]` == "everything" trap).
            continue
        ds.resize(end_index + 1, axis=0)
        ds[rows_on_disk:end_index + 1] = tensor_np[-num_new_rows:]


def extract_features(
    parts: List[str],
    movies_base: str,
    transcripts_base: str,
    output_dir: str,
    extraction_fn: Callable,
    interval: float = 1.49,
    verbose: bool = True,
    modality: str = 'all',
    past_context_in_seconds: int = 30,
    splits_overlap: float = 0.5,
    ignore_done = None
):
    """
    Extracts features from the specified parts of the dataset using the provided extraction function
    and writes them to one HDF5 file per movie (one dataset per layer, one row per TR).

    Each movie is processed in overlapping chunks of consecutive TRs. For every chunk the
    extraction function is called once, and only the rows for TRs that have not been written
    yet are appended to the output datasets.

    Parameters:
        parts (List[str]): List of parts to extract features from. This is the subdirectory name under friends and movie10 folders.
        movies_base (str): Path to the base directory containing movie files.
        transcripts_base (str): Path to the base directory containing transcript files.
        output_dir (str): Path to the output directory where features will be saved.
        extraction_fn (function): Function that extracts features from the stimuli. It is called as
            extraction_fn(video_section, audio_section, transcript_section, verbose) with the values
            returned by extract_section(..., split_by_tr=True) (entries of modalities that are not
            requested are None / an empty list) and should return a dictionary mapping layer names
            to torch.Tensor whose first dimension is the number of TRs in the chunk.
        interval (float): Interval (in seconds) at which to extract features. Default is 1.49 seconds (the TR for the dataset).
        verbose (bool): Whether to print verbose output.
        modality (str): Modality to extract features from. One of 'all', 'video', 'audio', 'transcript'.
        past_context_in_seconds (int): Approximate chunk duration in seconds; the clip is divided into
            max(1, duration // past_context_in_seconds) base chunks.
        splits_overlap (float): Fraction in [0, 1) by which consecutive chunks overlap.
        ignore_done: Optional container of output file names ('<name>.h5') to skip. None skips nothing.

    Raises:
        FileNotFoundError: If movies_base or transcripts_base does not exist.
        ValueError: If modality is unknown or splits_overlap is outside [0, 1).
    """
    movies_base = Path(movies_base)
    transcripts_base = Path(transcripts_base)

    # Verify that the base directories exist.
    if not movies_base.exists():
        raise FileNotFoundError(f"Movies directory not found: {movies_base}")
    if not transcripts_base.exists():
        raise FileNotFoundError(f"Transcripts directory not found: {transcripts_base}")
    if modality not in ('all', 'video', 'audio', 'transcript'):
        raise ValueError(f"Unknown modality: {modality!r}")
    if not 0 <= splits_overlap < 1:
        raise ValueError(f"splits_overlap must be in [0, 1), got {splits_overlap}")

    # `ignore_done=None` means "nothing has been processed yet".
    done_files = () if ignore_done is None else ignore_done

    # Iterate through all directories under movies_base.
    for folder in movies_base.rglob('*'):
        if not (folder.is_dir() and folder.name in parts):
            continue
        rel_folder = folder.relative_to(movies_base)

        # Iterate through mkv files in the matched directory.
        for movie_file in folder.glob('*.mkv'):
            print(rel_folder)
            # Transcripts outside the "friends" tree carry a "movie10_" prefix.
            transcript_name = movie_file.with_suffix('.tsv').name
            if "friends" not in str(rel_folder):
                transcript_name = f"movie10_{transcript_name}"
            transcript_file = transcripts_base / rel_folder / transcript_name

            print(f"Movie:      {movie_file}")
            print(f"Transcript: {transcript_file}")

            if (movie_file.name.split('.')[0] + '.h5') in done_files:
                continue

            # Load video frames, audio waveform, and transcript.
            video, transcript, fps_video = None, None, None
            if modality in ('all', 'video'):
                video, fps_video = load_video(str(movie_file), verbose=verbose)
            # Audio is loaded for every modality: its length defines the clip duration.
            audio, sample_rate = load_audio(str(movie_file))
            if modality in ('all', 'transcript'):
                transcript = resample_transcript(load_transcript(transcript_file), interval)

            total_duration = audio.shape[1] / sample_rate
            num_intervals_tr = int(total_duration // interval)

            if verbose:
                print(f"Total duration: {total_duration:.2f} seconds")
                print(f"Number of intervals: {num_intervals_tr}")
                print(f"Sample rate: {sample_rate}")

            # Create the output directory if it doesn't exist.
            output_folder = Path(output_dir) / rel_folder
            output_folder.mkdir(parents=True, exist_ok=True)

            # Features of this movie go to <output_dir>/<rel_folder>/<movie>.h5.
            output_file = output_folder / movie_file.with_suffix('.h5').name

            if verbose:
                print(f"Output file: {output_file}")

            num_splits = max(1, int(int(total_duration) / past_context_in_seconds))
            print(f"Num splits: {num_splits}")

            # Number of TRs per chunk, and number of chunks needed so that chunks
            # starting every (1 - overlap) * chunk_len TRs reach the last TR.
            # For the default overlap of 0.5 this is 2 * num_splits - 1.
            fixed_distance_interval = math.ceil(num_intervals_tr / num_splits)
            total_iterations = math.ceil((num_splits - 1) / (1 - splits_overlap)) + 1

            # Create a HDF5 file to store the features.
            with h5py.File(output_file, 'w') as f:
                features_datasets = {}
                for i in tqdm(range(total_iterations)):
                    # First TR of this chunk.
                    index = math.ceil((i * fixed_distance_interval) - (i * splits_overlap * fixed_distance_interval))

                    if index >= num_intervals_tr:
                        break

                    # Clamp the chunk so it never runs past the last TR.
                    future_offset = min(fixed_distance_interval - 1,
                                        num_intervals_tr - index - 1)
                    end_index = index + future_offset  # last TR of the chunk (inclusive)

                    video_section, audio_section, transcript_section = extract_section(
                        video, audio, transcript, interval, index, sample_rate, modality, fps_video,
                        past_offset=0, future_offset=future_offset, split_by_tr=True
                    )

                    output_features = extraction_fn(video_section, audio_section, transcript_section, verbose)

                    # Every layer must return one row per TR of the chunk. The expected
                    # count comes from the chunk bounds so it also works without video.
                    expected_trs = future_offset + 1
                    for layer_name, tensor in output_features.items():
                        assert tensor.shape[0] == expected_trs, f"Error on layer: {layer_name}, the number of TRs of the output features should be the same as the number of TRs of the section. Got {tensor.shape[0]} and {expected_trs}"

                    _store_chunk_features(f, features_datasets, output_features, end_index)

def resample_transcript(transcript: pd.DataFrame, new_interval: float) -> pd.DataFrame:
    """
    Pre-aggregates transcript data into new time intervals.

    Parameters:
        transcript (pd.DataFrame): DataFrame with columns 'words_per_tr', 'onsets_per_tr', and
            'durations_per_tr'. Cells may be lists or their string representation; empty or
            missing (NaN) cells are treated as "no words".
        new_interval (float): Desired interval in seconds for grouping.

    Returns:
        pd.DataFrame: New DataFrame with columns 'new_index', 'word', 'onset', 'duration',
                      'word_end' and 'text', where each row aggregates words whose end time
                      (onset + duration) falls within the same new interval. Intervals with
                      no transcript words are represented with empty text and empty lists.
                      A transcript without any word yields a single empty interval (index 0).
    """
    def _parse_cell(cell):
        # Cells read from TSV are strings such as "['a', 'b']"; empty fields are NaN.
        if isinstance(cell, str):
            cell = ast.literal_eval(cell) if cell.strip() else []
        return list(cell) if isinstance(cell, (list, tuple)) else []

    all_words = []
    all_onsets = []
    all_durations = []

    rows = zip(transcript['words_per_tr'], transcript['onsets_per_tr'], transcript['durations_per_tr'])
    for raw_words, raw_onsets, raw_durations in rows:
        onsets = _parse_cell(raw_onsets)
        # Skip rows without valid onsets.
        if not onsets:
            continue
        all_words.extend(_parse_cell(raw_words))
        all_onsets.extend(onsets)
        all_durations.extend(_parse_cell(raw_durations))

    if not all_onsets:
        # No word at all: return the single empty interval explicitly instead of
        # relying on how pandas handles an empty group-by.
        return pd.DataFrame({
            'new_index': [0],
            'word': [[]],
            'onset': [[]],
            'duration': [[]],
            'word_end': [[]],
            'text': [''],
        })

    # Create a DataFrame with one row per word.
    df = pd.DataFrame({
        'word': all_words,
        'onset': all_onsets,
        'duration': all_durations
    })
    df['word_end'] = df['onset'] + df['duration']

    # Determine the new interval index for each word (based on word_end)
    df['new_index'] = (df['word_end'] // new_interval).astype(int)

    # Group by the new interval index.
    grouped = df.groupby('new_index').agg({
        'word': list,
        'onset': list,
        'duration': list,
        'word_end': list
    }).reset_index()

    max_index = int(df['new_index'].max())

    # Create a complete DataFrame with all interval indices from 0 up to the maximum.
    complete_intervals = pd.DataFrame({'new_index': range(max_index + 1)})

    # Merge the complete intervals with the grouped data so that empty intervals are kept.
    result = complete_intervals.merge(grouped, on='new_index', how='left')

    # Replace any missing values with empty lists.
    for col in ['word', 'onset', 'duration', 'word_end']:
        result[col] = result[col].apply(lambda x: x if isinstance(x, list) else [])

    # Create a text column that joins the words, resulting in an empty string for empty intervals.
    result['text'] = result['word'].apply(lambda x: ' '.join(x))

    return result

In [6]:
import math
import torch
import pandas as pd
from typing import Tuple, List
import einops

def extract_section(
    video: torch.Tensor,
    audio: torch.Tensor,
    transcript: pd.DataFrame,
    interval: float,
    index: int,
    sample_rate: int,
    modality: str = 'all',
    fps_video: float = 30,
    past_offset: int = 0,   # number of intervals (including current) to include from the past
    future_offset: int = 0,  # number of intervals after the current one to include
    split_by_tr: bool = False
) -> Tuple[torch.Tensor, torch.Tensor, List[str]]:
    """
    Extracts a section of audio, video, and transcript data based on the interval and index,
    with optional offsets for past and future intervals. If part of the requested window is
    out of bounds, the missing parts are padded with zeros (or empty strings for transcript).

    Parameters:
        video (torch.Tensor): Tensor containing video frames (num_frames, C, H, W). Only read for
            modality 'all' or 'video'.
        audio (torch.Tensor): Tensor containing audio waveform (channels, num_samples). Only read for
            modality 'all' or 'audio'.
        transcript (pd.DataFrame): DataFrame with a 'word' column holding one list of words per
            interval. Only read for modality 'all' or 'transcript'.
        interval (float): Duration (in seconds) of one segment/interval.
        index (int): Index (zero-indexed) of the current interval.
        sample_rate (int): Sample rate of the audio waveform.
        modality (str): Modality to extract features from. Options are 'video', 'audio', 'transcript', or 'all'.
        fps_video (float): Frames per second of the video.
        past_offset (int): Number of intervals to include from the past (including the current one).
            For example, past_offset=5 with index=100 returns intervals 96-100.
        future_offset (int): Number of intervals after the current one to include.
            For example, future_offset=2 with index=100 returns intervals 100-102.
        split_by_tr (bool): If True, every output gets a leading per-interval axis (see Returns).

    Returns:
        tuple: (video_section, audio_section, transcript_section) where:
            - video_section is None when video is not requested; otherwise a zero-padded float32
              tensor of shape (requested_frames, C, H, W), or (intervals, frames_per_interval, C, H, W)
              when split_by_tr is True (frames are sub-sampled evenly so every interval gets the
              same number of frames).
            - audio_section is None when audio is not requested; otherwise a zero-padded float32
              tensor of shape (channels, requested_samples), or
              (intervals, channels, samples_per_interval) when split_by_tr is True.
            - transcript_section is an empty list when the transcript is not requested. With
              split_by_tr it holds one entry per interval (the interval's word list, or "" for an
              out-of-range interval); otherwise it is the flat list of all words, with one ""
              added per out-of-range interval.
    """
    # Determine the range of intervals to extract.
    # If past_offset > 0, we include the current interval and the (past_offset - 1) preceding intervals.
    extraction_start_index = index - past_offset + 1 if past_offset > 0 else index
    extraction_end_index = index + future_offset  # inclusive
    total_intervals = extraction_end_index - extraction_start_index + 1

    # Determine the corresponding time boundaries.
    # Note that a given interval i spans [i * interval, (i+1) * interval).
    requested_start_time = extraction_start_index * interval
    requested_end_time = (extraction_end_index + 1) * interval  # exclusive end

    # ---- Audio Extraction ----
    audio_section = None
    if modality in ['all', 'audio']:
        # Total samples requested
        total_requested_samples = int(round(total_intervals * interval * sample_rate))
        # Create output tensor filled with zeros.
        audio_section = torch.zeros(audio.shape[0], total_requested_samples)

        # Compute the global sample indices corresponding to the requested time window.
        requested_start_sample = int(round(requested_start_time * sample_rate))
        requested_end_sample = int(round(requested_end_time * sample_rate))

        # Determine the part available from the source audio.
        source_start = max(0, requested_start_sample)
        source_end = min(audio.shape[1], requested_end_sample)

        # Determine where to paste the available audio in the output tensor.
        target_offset = 0
        if requested_start_sample < 0:
            target_offset = -requested_start_sample  # number of samples to pad at beginning

        # Compute the number of samples to copy. The window length and its two
        # end points are rounded independently and may disagree by one sample,
        # so never copy more than the output tensor can hold.
        num_samples_to_copy = min(source_end - source_start,
                                  total_requested_samples - target_offset)
        if num_samples_to_copy > 0:
            audio_section[:, target_offset:target_offset + num_samples_to_copy] = \
                audio[:, source_start:source_start + num_samples_to_copy]
        if split_by_tr:
            audio_section = einops.rearrange(audio_section, 'c (tr t) -> tr c t', tr=total_intervals)

    # ---- Video Extraction ----
    video_section = None
    if modality in ['all', 'video']:
        # Compute requested frame indices.
        requested_video_start = int(round(requested_start_time * fps_video))
        requested_video_end = int(round(requested_end_time * fps_video))  # exclusive end
        total_requested_frames = requested_video_end - requested_video_start

        # Create output tensor filled with zeros.
        video_section = torch.zeros(total_requested_frames, *video.shape[1:])

        # Determine the available frames.
        source_frame_start = max(0, requested_video_start)
        source_frame_end = min(video.shape[0], requested_video_end)

        # Determine target offset in frames.
        target_offset_frames = 0
        if requested_video_start < 0:
            target_offset_frames = -requested_video_start

        num_frames_to_copy = source_frame_end - source_frame_start
        if num_frames_to_copy > 0:
            video_section[target_offset_frames:target_offset_frames + num_frames_to_copy] = video[source_frame_start:source_frame_end]
        if split_by_tr:
            B, C, H, W = video_section.shape
            tr = total_intervals
            f = B // tr          # frames kept per interval
            new_len = f * tr     # largest frame count divisible by the number of intervals

            # pick new_len indices that span 0 … B-1 evenly
            # torch.linspace gives floats; .round().long() makes them integer
            inds = torch.linspace(0, B-1, steps=new_len).round().long()

            # gather those frames
            vs = video_section[inds]

            # reshape into (tr, f, C, H, W)
            video_section = einops.rearrange(vs, '(tr f) c h w -> tr f c h w', tr=tr)

    # ---- Transcript Extraction ----
    transcript_section = []
    if modality in ['all', 'transcript']:
        # For transcript, we assume one row per interval.
        for i in range(total_intervals):
            global_idx = extraction_start_index + i
            if global_idx < 0 or global_idx >= len(transcript):
                # Out-of-range interval: keep the "" placeholder.
                transcript_section.append("")
                continue
            words = transcript['word'].iloc[global_idx]
            if split_by_tr:
                # One entry (the word list) per interval.
                transcript_section.append(words)
            else:
                # Flat list of words across all intervals.
                transcript_section.extend(words)

    return video_section, audio_section, transcript_section


### Play with your model and the feature extractor here

In [8]:
import math
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms import InterpolationMode
from transformers import AutoConfig, AutoModel, AutoTokenizer
from typing import Dict, List, Tuple, Union
from pathlib import Path
from torchvision.transforms.functional import to_pil_image

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """
    Build the image preprocessing pipeline for one tile.

    The returned transform converts the image to RGB if needed, resizes it to
    input_size x input_size with bicubic interpolation, converts it to a tensor
    and normalizes it with IMAGENET_MEAN / IMAGENET_STD.
    """
    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        return img.convert('RGB') if img.mode != 'RGB' else img

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(ar, target_ratios, width, height, image_size):
    """
    Pick the (w, h) grid from target_ratios whose w / h is closest to `ar`.

    Candidates are examined in the given order. A candidate that exactly ties
    the best difference so far replaces it only when the image area
    (width * height) exceeds half the area of that candidate's grid,
    0.5 * image_size**2 * w * h. Returns (1, 1) when target_ratios is empty.
    """
    chosen = (1, 1)
    smallest_gap = float('inf')
    image_area = width * height
    for cols, rows in target_ratios:
        gap = abs(ar - cols / rows)
        if gap < smallest_gap:
            # Strictly better match.
            smallest_gap = gap
            chosen = (cols, rows)
        elif gap == smallest_gap and image_area > 0.5 * image_size**2 * cols * rows:
            # Tie: take this grid only if the image is large enough for it.
            chosen = (cols, rows)
    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Split an image into square tiles on the best-matching grid.

    Parameters
    ----------
    image : object exposing ``size``, ``resize`` and ``crop`` (a PIL image)
    min_num, max_num : inclusive bounds on the number of grid tiles
    image_size : side length of each square tile
    use_thumbnail : when True and more than one tile was produced, append a
        whole-image resize to ``image_size`` x ``image_size`` as a final entry

    Returns
    -------
    List of tiles in row-major order, optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # Enumerate every (columns, rows) grid whose tile count lies in [min_num, max_num].
    grids = set()
    for n in range(min_num, max_num + 1):
        for i in range(1, n + 1):
            for j in range(1, n + 1):
                if min_num <= i * j <= max_num:
                    grids.add((i, j))
    target_ratios = sorted(grids, key=lambda grid: grid[0] * grid[1])

    w_mul, h_mul = find_closest_aspect_ratio(src_ratio, target_ratios, src_w, src_h, image_size)
    target_w = image_size * w_mul
    target_h = image_size * h_mul

    resized = image.resize((target_w, target_h))
    tiles_per_row = target_w // image_size

    crops = []
    for tile_idx in range(w_mul * h_mul):
        row, col = divmod(tile_idx, tiles_per_row)
        crops.append(resized.crop((
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )))

    # A single tile already shows the whole image, so no thumbnail is added then.
    if use_thumbnail and len(crops) != 1:
        crops.append(image.resize((image_size, image_size)))

    return crops

def _preprocess_single_pil(pil_img, transform, input_size, max_num):
    """Tile one PIL image (thumbnail enabled) and run ``transform`` on each tile.

    Returns a list with one transformed entry per tile.
    """
    tiles = dynamic_preprocess(
        pil_img, image_size=input_size, use_thumbnail=True, max_num=max_num
    )
    return list(map(transform, tiles))


def load_image(
    imgs: Union[str, Path, List[Union[str, Path]], torch.Tensor],
    input_size: int = 448,
    max_num: int = 12,
) -> torch.Tensor:
    """Load one or more images and return their preprocessed tiles as one tensor.

    Parameters
    ----------
    imgs : one of
        * str / Path: path to a single image file
        * non-empty list / tuple whose first element is a path
        * torch.Tensor shaped [n,3,H,W] or [3,H,W]; non-uint8 data is rounded
          and clamped to 0-255 before conversion to PIL images
    input_size : side length of each square tile
    max_num : upper bound on grid tiles per image (a thumbnail may be added)

    Returns
    -------
    Tensor produced by stacking every transformed tile of every image.

    Raises
    ------
    TypeError : ``imgs`` is none of the supported kinds (this includes an
        empty list or tuple).
    AssertionError : a tensor input is not [n,3,H,W] after batching.
    RuntimeError : preprocessing produced no tiles.
    """
    transform = build_transform(input_size=input_size)

    # Step 1: turn whatever was passed in into a list of PIL images.
    if isinstance(imgs, (str, Path)):
        pil_images = [Image.open(imgs).convert('RGB')]
    elif isinstance(imgs, (list, tuple)) and imgs and isinstance(imgs[0], (str, Path)):
        pil_images = [Image.open(p).convert('RGB') for p in imgs]
    elif isinstance(imgs, torch.Tensor):
        batch = imgs.unsqueeze(0) if imgs.ndim == 3 else imgs
        assert batch.ndim == 4 and batch.shape[1] == 3, \
            "Expect tensor shape [n,3,H,W] or [3,H,W]"

        # PIL conversion happens on CPU with uint8 pixel values.
        batch = batch.detach().to('cpu')
        if batch.dtype != torch.uint8:
            batch = batch.round().clamp(0, 255).to(torch.uint8)
        pil_images = [to_pil_image(batch[i]) for i in range(batch.size(0))]
    else:
        raise TypeError("`imgs` must be a path, list of paths, or a [n,3,H,W] tensor")

    # Step 2: tile each image and apply the transform to every tile.
    pixel_tensors = [
        tile
        for pil in pil_images
        for tile in _preprocess_single_pil(pil, transform, input_size, max_num)
    ]

    # Step 3: stack into a single batch tensor.
    if not pixel_tensors:
        raise RuntimeError("No images found after preprocessing")
    return torch.stack(pixel_tensors)

def split_model(model_name_or_path):
    """Build a ``device_map`` that spreads the language-model layers over all GPUs.

    GPU 0 also hosts the vision tower, the projector and the embedding / head
    modules, so it is budgeted as half a GPU when the decoder layers are
    divided up.

    Parameters
    ----------
    model_name_or_path : identifier passed to ``AutoConfig.from_pretrained``;
        the config must expose ``llm_config.num_hidden_layers``.

    Returns
    -------
    dict mapping module names to GPU indices. Only layers
    ``0 .. num_hidden_layers - 1`` receive an entry.
    """
    world_size = max(1, torch.cuda.device_count())

    # load config
    config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True)
    num_layers = config.llm_config.num_hidden_layers

    # allocate layers (first GPU counts as half)
    per_gpu = math.ceil(num_layers / (world_size - 0.5))
    counts = [per_gpu] * world_size
    counts[0] = math.ceil(counts[0] * 0.5)

    device_map = {}
    layer_idx = 0
    for gpu_idx, cnt in enumerate(counts):
        for _ in range(cnt):
            # The rounded-up budgets can sum to more than num_layers
            # (e.g. 28 layers on 2 GPUs -> 10 + 19 = 29); stop at the last real
            # layer instead of emitting entries for layers that do not exist.
            if layer_idx >= num_layers:
                break
            device_map[f'language_model.model.layers.{layer_idx}'] = gpu_idx
            layer_idx += 1

    # map the rest to GPU 0
    for key in [
        'vision_model', 'mlp1',
        'language_model.model.tok_embeddings',
        'language_model.model.embed_tokens',
        'language_model.output',
        'language_model.model.norm',
        'language_model.model.rotary_emb',
        'language_model.lm_head'
    ]:
        device_map[key] = 0

    # ensure last layer lands on GPU 0 (next to the final norm and the head)
    if num_layers > 0:
        device_map[f'language_model.model.layers.{num_layers-1}'] = 0

    return device_map

# now load your model & tokenizer
# NOTE(review): `cache_dir` is an absolute path on one developer's machine;
# adjust it before running this cell elsewhere.
path = 'OpenGVLab/InternVL3-8B'
# Per-module GPU placement computed from the model config (see split_model).
device_map = split_model(path)
cache_dir = "/home/mihirneal/Developer/hf_cache"
# Weights are loaded in bfloat16 and the model is switched to eval mode right away.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    cache_dir=cache_dir,
    # load_in_8bit=True,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
    device_map=device_map
).eval()

# NOTE(review): unlike the model, the tokenizer is not given `cache_dir`, so it
# presumably uses the default Hugging Face cache location (confirm).
tokenizer = AutoTokenizer.from_pretrained(
    path,
    trust_remote_code=True,
    use_fast=False
)


Fetching 4 files: 100%|██████████| 4/4 [02:33<00:00, 38.28s/it] 
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.42s/it]


In [9]:
"""
Conversation prompt templates.
We kindly request that you import fastchat instead of copying this file if you wish to use it.
If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
Modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
"""

import dataclasses
from enum import IntEnum, auto


class SeparatorStyle(IntEnum):
    """Identifiers for the prompt layouts that a conversation can be rendered in.

    Members are numbered consecutively from 1 in declaration order.
    """

    ADD_COLON_SINGLE = 1
    ADD_COLON_TWO = 2
    ADD_COLON_SPACE_SINGLE = 3
    NO_COLON_SINGLE = 4
    NO_COLON_TWO = 5
    ADD_NEW_LINE_SINGLE = 6
    LLAMA2 = 7
    CHATGLM = 8
    CHATML = 9
    CHATINTERN = 10
    DOLLY = 11
    RWKV = 12
    PHOENIX = 13
    ROBIN = 14
    FALCON_CHAT = 15
    CHATGLM3 = 16
    INTERNVL_ZH = 17
    MPT = 18


@dataclasses.dataclass
class Conversation:
    """A class that manages prompt templates and keeps all conversation history.

    ``get_prompt`` renders ``system_message`` and ``messages`` into one string
    according to ``sep_style``; the remaining methods edit or export the
    message history.
    """

    # The name of this template
    name: str
    # The template of the system prompt
    system_template: str = '{system_message}'
    # The system message
    system_message: str = ''
    # The names of two roles
    roles: Tuple[str, str] = ('USER', 'ASSISTANT')
    # All messages. Each item is (role, message).
    # Each instance gets its own list: the earlier immutable ``()`` default made
    # ``append_message`` raise AttributeError on directly constructed instances.
    messages: List[List[str]] = dataclasses.field(default_factory=list)
    # The number of few shot examples
    offset: int = 0
    # The separator style and configurations
    sep_style: SeparatorStyle = SeparatorStyle.ADD_COLON_SINGLE
    sep: str = '\n'
    sep2: str = None
    # Stop criteria (the default one is EOS token)
    stop_str: Union[str, List[str]] = None
    # Stops generation if meeting any token in this list
    stop_token_ids: List[int] = None

    def get_prompt(self) -> str:
        """Get the prompt for generation.

        A message that is empty or None is rendered as the bare role prefix so
        that the model continues from there.

        Raises:
            ValueError: ``sep_style`` is not one of the handled styles.
        """
        system_prompt = self.system_template.format(system_message=self.system_message)
        if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + message + self.sep
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
            seps = [self.sep, self.sep2]
            ret = system_prompt + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ': ' + message + seps[i % 2]
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + message + self.sep
                else:
                    ret += role + ': '  # must be end with a space
            return ret
        elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
            ret = '' if system_prompt == '' else system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + '\n' + message + self.sep
                else:
                    ret += role + '\n'
            return ret
        elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret
        elif self.sep_style == SeparatorStyle.NO_COLON_TWO:
            seps = [self.sep, self.sep2]
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + message + seps[i % 2]
                else:
                    ret += role
            return ret
        elif self.sep_style == SeparatorStyle.RWKV:
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += (
                        role
                        + ': '
                        + message.replace('\r\n', '\n').replace('\n\n', '\n')
                    )
                    ret += '\n\n'
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.LLAMA2:
            seps = [self.sep, self.sep2]
            if self.system_message:
                ret = system_prompt
            else:
                ret = '[INST] '
            for i, (role, message) in enumerate(self.messages):
                tag = self.roles[i % 2]
                if message:
                    if i == 0:
                        ret += message + ' '
                    else:
                        ret += tag + ' ' + message + seps[i % 2]
                else:
                    ret += tag
            return ret
        elif self.sep_style == SeparatorStyle.CHATGLM:
            # source: https://huggingface.co/THUDM/chatglm-6b/blob/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py#L1302-L1308
            # source2: https://huggingface.co/THUDM/chatglm2-6b/blob/e186c891cf64310ac66ef10a87e6635fa6c2a579/modeling_chatglm.py#L926
            round_add_n = 1 if self.name == 'chatglm2' else 0
            if system_prompt:
                ret = system_prompt + self.sep
            else:
                ret = ''

            for i, (role, message) in enumerate(self.messages):
                if i % 2 == 0:
                    ret += f'[Round {i//2 + round_add_n}]{self.sep}'

                if message:
                    ret += f'{role}：{message}{self.sep}'
                else:
                    ret += f'{role}：'
            return ret
        elif self.sep_style == SeparatorStyle.CHATML:
            ret = '' if system_prompt == '' else system_prompt + self.sep + '\n'
            for role, message in self.messages:
                if message:
                    ret += role + '\n' + message + self.sep + '\n'
                else:
                    ret += role + '\n'
            return ret
        elif self.sep_style == SeparatorStyle.CHATGLM3:
            ret = ''
            if self.system_message:
                ret += system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + '\n' + ' ' + message
                else:
                    ret += role
            return ret
        elif self.sep_style == SeparatorStyle.CHATINTERN:
            # source: https://huggingface.co/internlm/internlm-chat-7b-8k/blob/bd546fa984b4b0b86958f56bf37f94aa75ab8831/modeling_internlm.py#L771
            seps = [self.sep, self.sep2]
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                # if i % 2 == 0:
                #     ret += "<s>"
                if message:
                    ret += role + ':' + message + seps[i % 2] + '\n'
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.DOLLY:
            seps = [self.sep, self.sep2]
            ret = system_prompt
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ':\n' + message + seps[i % 2]
                    if i % 2 == 1:
                        ret += '\n\n'
                else:
                    ret += role + ':\n'
            return ret
        elif self.sep_style == SeparatorStyle.PHOENIX:
            ret = system_prompt
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + '<s>' + message + '</s>'
                else:
                    ret += role + ': ' + '<s>'
            return ret
        elif self.sep_style == SeparatorStyle.ROBIN:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ':\n' + message + self.sep
                else:
                    ret += role + ':\n'
            return ret
        elif self.sep_style == SeparatorStyle.FALCON_CHAT:
            ret = ''
            if self.system_message:
                ret += system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    ret += role + ': ' + message + self.sep
                else:
                    ret += role + ':'

            return ret
        elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
            seps = [self.sep, self.sep2]
            ret = self.system_message + seps[0]
            for i, (role, message) in enumerate(self.messages):
                if message:
                    ret += role + ': ' + message + seps[i % 2]
                else:
                    ret += role + ':'
            return ret
        elif self.sep_style == SeparatorStyle.MPT:
            ret = system_prompt + self.sep
            for role, message in self.messages:
                if message:
                    # A 3-tuple message carries extra items; only the first
                    # element (the text) is rendered.
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
            return ret
        else:
            raise ValueError(f'Invalid style: {self.sep_style}')

    def set_system_message(self, system_message: str):
        """Set the system message."""
        self.system_message = system_message

    def append_message(self, role: str, message: str):
        """Append a new message."""
        self.messages.append([role, message])

    def update_last_message(self, message: str):
        """Update the last output.
        The last message is typically set to be None when constructing the prompt,
        so we need to update it in-place after getting the response from a model.
        """
        self.messages[-1][1] = message

    def to_gradio_chatbot(self):
        """Convert the conversation to gradio chatbot format.

        Messages before ``offset`` are skipped; the rest are paired up as
        ``[even-indexed message, odd-indexed message or None]``.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                ret.append([msg, None])
            else:
                ret[-1][-1] = msg
        return ret

    def to_openai_api_messages(self):
        """Convert the conversation to OpenAI chat completion format.

        Even-indexed messages (after ``offset``) become ``user`` entries and
        odd-indexed ones ``assistant`` entries; a None assistant message is
        dropped.
        """
        ret = [{'role': 'system', 'content': self.system_message}]

        for i, (_, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                ret.append({'role': 'user', 'content': msg})
            else:
                if msg is not None:
                    ret.append({'role': 'assistant', 'content': msg})
        return ret

    def copy(self):
        """Return a new Conversation with the same settings and its own message list."""
        return Conversation(
            name=self.name,
            system_template=self.system_template,
            system_message=self.system_message,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            stop_str=self.stop_str,
            stop_token_ids=self.stop_token_ids,
        )

    def dict(self):
        """Return the template name, system message, roles, messages and offset as a dict."""
        return {
            'template_name': self.name,
            'system_message': self.system_message,
            'roles': self.roles,
            'messages': self.messages,
            'offset': self.offset,
        }


# A global registry for all conversation templates
conv_templates: Dict[str, Conversation] = {}


def register_conv_template(template: Conversation, override: bool = False):
    """Add ``template`` to the global registry under ``template.name``.

    Unless ``override`` is true, registering a name that already exists
    fails an assertion.
    """
    key = template.name
    if not override:
        assert key not in conv_templates, f'{key} has been registered.'
    conv_templates[key] = template


def get_conv_template(name: str) -> Conversation:
    """Look up a registered template by name and return a copy of it.

    Returning a copy keeps the registered template itself unchanged when the
    caller edits the result.
    """
    registered = conv_templates[name]
    return registered.copy()


# Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
# is that during training, the preprocessing function for the Hermes-2 template doesn't add
# <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
# Therefore, they are completely equivalent during inference.
# As registered here the two differ only in `stop_str`, which is set for Hermes-2 alone.
register_conv_template(
    Conversation(
        name='Hermes-2',
        system_template='<|im_start|>system\n{system_message}',
        # note: The new system prompt was not used here to avoid changes in benchmark performance.
        # system_message='我是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
        system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。',
        roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
        sep_style=SeparatorStyle.MPT,
        sep='<|im_end|>',
        stop_str='<|endoftext|>',
    )
)


# Same layout as Hermes-2 above, without a stop string.
register_conv_template(
    Conversation(
        name='internlm2-chat',
        system_template='<|im_start|>system\n{system_message}',
        # note: The new system prompt was not used here to avoid changes in benchmark performance.
        # system_message='我是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
        system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。',
        roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
        sep_style=SeparatorStyle.MPT,
        sep='<|im_end|>',
    )
)


# Uses <|system|>/<|user|>/<|assistant|> role tags with <|end|> as separator.
register_conv_template(
    Conversation(
        name='phi3-chat',
        system_template='<|system|>\n{system_message}',
        # note: The new system prompt was not used here to avoid changes in benchmark performance.
        # system_message='我是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
        system_message='你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。',
        roles=('<|user|>\n', '<|assistant|>\n'),
        sep_style=SeparatorStyle.MPT,
        sep='<|end|>',
    )
)


# Uses the <|im_start|> role tags with a different system message; its separator
# ends with a newline, unlike the templates above.
register_conv_template(
    Conversation(
        name='internvl2_5',
        system_template='<|im_start|>system\n{system_message}',
        system_message='你是书生·万象，英文名是InternVL，是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
        roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
        sep_style=SeparatorStyle.MPT,
        sep='<|im_end|>\n',
    )
)


In [None]:
import requests
from io import BytesIO

# Fetch two sample images into ./examples for the demo cells below.
os.makedirs('examples', exist_ok=True)
urls = [
    "https://picsum.photos/512",
    "https://picsum.photos/512"
]
for idx, url in enumerate(urls, start=1):
    # A timeout keeps the cell from hanging forever; raise_for_status surfaces
    # HTTP errors here instead of as a confusing image-decoding failure.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    img = Image.open(BytesIO(resp.content)).convert("RGB")
    # Deliberately not named `path`: that module-level name holds the model id.
    image_path = f"./examples/image{idx}.jpg"
    img.save(image_path)
    print(f"Downloaded {image_path}")

Downloaded ./examples/image1.jpg
Downloaded ./examples/image2.jpg


: 

In [10]:
import torch

# ------------------------------------------------------------------
# Utility: prepare a single prompt exactly as `model.chat` does
# ------------------------------------------------------------------
def build_prompt(model, tokenizer, question, num_patches_list,
                 IMG_START_TOKEN='<img>', IMG_END_TOKEN='</img>',
                 IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'):
    """Render the chat prompt for ``question`` with image placeholders expanded.

    ``<image>`` is prepended to the question when it contains no placeholder.
    The conversation is built from the template named by ``model.template``,
    then, for each entry of ``num_patches_list`` in order, the next remaining
    ``<image>`` is replaced by the start token, ``model.num_image_token *
    n_patches`` context tokens and the end token.

    Side effect: sets ``model.img_context_token_id`` from the tokenizer.

    Returns the prompt string.
    """
    user_text = question if '<image>' in question else '<image>\n' + question

    conv = get_conv_template(model.template)
    conv.system_message = model.system_message
    conv.append_message(conv.roles[0], user_text)   # user turn
    conv.append_message(conv.roles[1], None)        # open assistant turn
    prompt = conv.get_prompt()

    # Expand one placeholder per image, left to right.
    for n_patches in num_patches_list:
        context_run = IMG_CONTEXT_TOKEN * model.num_image_token * n_patches
        expanded = IMG_START_TOKEN + context_run + IMG_END_TOKEN
        prompt = prompt.replace('<image>', expanded, 1)

    # forward() needs the id of the context token stored on the model.
    model.img_context_token_id = tokenizer.convert_tokens_to_ids(IMG_CONTEXT_TOKEN)
    return prompt


# ------------------------------------------------------------------
# Main helper: run forward() and return logits
# ------------------------------------------------------------------


@torch.no_grad()
def get_logits(model, tokenizer, pixel_values, question, num_patches_list, device='cuda'):
    """Run one forward pass for ``question`` plus images and return the logits.

    pixel_values     : image crops, concatenated along dim 0
    num_patches_list : number of crops per original image, in order
    device           : device the token tensors and image flags are placed on

    Side effects: prints the expanded prompt, sets ``tokenizer.padding_side``
    to 'left', and stores the detached attention mask in the module-level
    name ``am``.
    """
    # NOTE(review): `am` looks like a debugging leftover; kept because other
    # cells may read it.
    global am

    prompt = build_prompt(model, tokenizer, question, num_patches_list)
    print(prompt)

    tokenizer.padding_side = 'left'
    encoded = tokenizer(prompt, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # One flag per crop; every crop is marked as used.
    image_flags = torch.ones(
        pixel_values.size(0), 1, dtype=torch.long, device=device
    )
    am = attention_mask.detach()

    outputs = model(
        pixel_values=pixel_values.to(device),
        input_ids=input_ids,
        attention_mask=attention_mask,
        image_flags=image_flags,
        return_dict=True
    )
    return outputs.logits


# ------------------------------------------------------------------
# Example usage
# ------------------------------------------------------------------

# ① Load/prepare crops just like in your working chat example
# Requires ./examples/image1.jpg and image2.jpg (written by the download cell above).
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values  = torch.cat((pixel_values1, pixel_values2), dim=0)   # (n1 + n2, 3, 448, 448)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]  # [n1, n2]: crops per image
# The crop count depends on each image's size and aspect ratio: for a square
# 512x512 image dynamic_preprocess selects the 1x1 grid, i.e. a single crop.

# ② Ask your question
question = "<image>\nImage‑2: <image>\nDescribe the two images in detail."

# ③ Forward pass → logits
logits = get_logits(model, tokenizer, pixel_values, question, num_patches_list)

print("Logits shape:", logits.shape)

FileNotFoundError: [Errno 2] No such file or directory: './examples/image1.jpg'

In [12]:
def make_pair_chunk(words: List[str],
                    num_img_tokens: int,
                    IMG_START_TOKEN='<img>',
                    IMG_END_TOKEN='</img>',
                    IMG_CONTEXT_TOKEN='<IMG_CONTEXT>'):
    """Build the prompt fragment for one (image, words) pair.

    The fragment is the image start token, ``num_img_tokens`` copies of the
    context token, the image end token, one space, and the words joined by
    single spaces.
    """
    image_span = IMG_START_TOKEN + IMG_CONTEXT_TOKEN * num_img_tokens + IMG_END_TOKEN
    sentence = ' '.join(words)
    return image_span + ' ' + sentence

# ------------------------------------------------------------------
# Main helper
# ------------------------------------------------------------------
@torch.no_grad()
def logits_by_pair(extractor, tokenizer,
                   pixel_values: torch.Tensor,              # [n, 3, H, W]
                   sentences: List[List[str]],              # len == n
                   use_template: bool = False               # wrap in chat template?
                  ) -> Tuple[torch.Tensor,
                             List[Tuple[int,int]],
                             torch.Tensor]:
    """Forward n (image, words) pairs as one sequence and pool logits per pair.

    Each pair becomes one chunk (see ``make_pair_chunk``); chunks are joined
    with newlines and optionally wrapped in the model's chat template.

    Side effects: sets ``extractor.model.img_context_token_id`` and
    ``tokenizer.padding_side``.

    Returns:
        logits        : [1, seq_len, vocab]
        token_ranges  : list of inclusive (start, end) token indices per pair;
                        a pair starts at its ``<img>`` token and ends right
                        before the next ``<img>`` (or at the last token)
        avg_logits    : [n, vocab], mean of the logits over each range
    """
    assert len(pixel_values) == len(sentences), "n images must match n sentences"
    vlm = extractor.model
    num_pairs = len(sentences)
    device = next(vlm.parameters()).device

    # --- 1) prompt text: one chunk per pair, newline separated ---------
    tokens_per_image = vlm.num_image_token
    body = '\n'.join([
        make_pair_chunk(sentences[k], tokens_per_image)
        for k in range(num_pairs)
    ])

    if not use_template:
        prompt = body
    else:
        conv = get_conv_template(vlm.template)
        conv.system_message = vlm.system_message
        conv.append_message(conv.roles[0], body)
        conv.append_message(conv.roles[1], None)    # open assistant turn
        prompt = conv.get_prompt()

    # The image spans are already expanded; the model only needs the context-token id.
    vlm.img_context_token_id = tokenizer.convert_tokens_to_ids('<IMG_CONTEXT>')

    # --- 2) tokenise ----------------------------------------------------
    tokenizer.padding_side = 'left'
    encoded = tokenizer(prompt, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # --- 3) one "used" flag per image -----------------------------------
    image_flags = torch.ones(pixel_values.size(0), 1, dtype=torch.long, device=device)

    # --- 4) forward pass through the extractor --------------------------
    outputs = extractor(
        pixel_values=pixel_values.to(device),
        input_ids=input_ids,
        attention_mask=attention_mask,
        image_flags=image_flags,
        return_dict=True
    )
    logits = outputs.logits

    # --- 5) locate each pair via the position of its <img> token --------
    img_start_id = tokenizer.convert_tokens_to_ids('<img>')
    token_ids = input_ids[0]
    starts = torch.nonzero(token_ids == img_start_id, as_tuple=False).flatten().tolist()
    assert len(starts) == num_pairs, "didn't find <img> marker for every image"

    # A pair ends one token before the next pair starts; the last one runs to the end.
    ends = [following - 1 for following in starts[1:]] + [len(token_ids) - 1]
    token_ranges = list(zip(starts, ends))

    # --- 6) mean-pool the logits inside each range ----------------------
    pooled = [logits[0, lo:hi + 1].mean(dim=0) for (lo, hi) in token_ranges]
    avg_logits = torch.stack(pooled, dim=0)

    return logits, token_ranges, avg_logits

In [13]:
model

InternVLChatModel(
  (vision_model): InternVisionModel(
    (embeddings): InternVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14))
    )
    (encoder): InternVisionEncoder(
      (layers): ModuleList(
        (0): InternVisionEncoderLayer(
          (attn): InternAttention(
            (qkv): Linear8bitLt(in_features=1024, out_features=3072, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear8bitLt(in_features=1024, out_features=1024, bias=True)
          )
          (mlp): InternMLP(
            (act): GELUActivation()
            (fc1): Linear8bitLt(in_features=1024, out_features=4096, bias=True)
            (fc2): Linear8bitLt(in_features=4096, out_features=1024, bias=True)
          )
          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)
          (

In [14]:
# Modules whose activations are captured during the forward pass.
layers_to_extract = [
    "language_model.model.layers.10.post_attention_layernorm",
    "language_model.model.layers.15.post_attention_layernorm",
    "language_model.model.layers.20.post_attention_layernorm",
    "language_model.model.norm"
]

# Use the feature extractor as a context manager so that hooks are cleaned up automatically.
# `pixel_values` comes from the example-usage cell above (one crop per image).
with HuggingFaceFeatureExtractor(model, layers_to_extract, detach=True) as extractor:
    with torch.no_grad():
        sentences = [['primera', 'oracion', 'nnn en'], ['segunda', 'oracion', 'a veces 222']]

        logits, ranges, avg_logits = logits_by_pair(
            extractor, tokenizer,
            pixel_values, sentences,
            use_template=False   # or True
        )
        print(logits.shape)        # [1, seq_len, 151674]
        print(ranges)              # e.g. [(0,160), (161,280), ...]
        print(avg_logits.shape)    # [n, 151674]

        features = extractor.features

        # Pool every captured activation over the same per-pair token ranges
        # that were used for the logits, then report the shapes.
        for layer_name, activation in features.items():
            avg_activation = torch.stack([
                activation[0, s:e+1].mean(dim=0)              # mean over sequence dimension
                for (s, e) in ranges
            ], dim=0)
            print(f"Layer: {layer_name}, Feature shape: {activation.shape}, Averaged feature shape: {avg_activation.shape}, Sample: {avg_activation[0,0:5]}")

torch.Size([1, 532, 151674])
[(0, 264), (265, 531)]
torch.Size([2, 151674])
Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 532, 3584]), Averaged feature shape: torch.Size([2, 3584]), Sample: tensor([-0.0608, -0.1104, -0.6680, -0.2949,  0.1553], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 532, 3584]), Averaged feature shape: torch.Size([2, 3584]), Sample: tensor([-0.1865, -0.0942, -0.5039, -0.1406, -0.1523], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 532, 3584]), Averaged feature shape: torch.Size([2, 3584]), Sample: tensor([-0.0854, -0.1182, -0.8359, -0.2988, -0.3379], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 532, 3584]), Averaged feature shape: torch.Size([2, 3584]), Sample: tensor

### Write the extract_fn

In [15]:
from functools import wraps

# Module-level feature extractor used by extract_fn below. It is built on
# `model` for the layers listed in `layers_to_extract` (both defined in earlier
# cells), with detach=True passed through to the extractor.
# NOTE(review): unlike the context-manager usage shown in the class docstring,
# the extractor is created here without a `with` block, so its hooks are
# presumably not removed automatically -- confirm this is intended.
extractor = HuggingFaceFeatureExtractor(model, layers_to_extract, detach=True)


def select_16_frames(video_section, target_frames=16):
    """Resample a clip along its first (frame) axis to ``target_frames`` frames.

    Clips that already have at least ``target_frames`` frames are subsampled at
    evenly spaced positions between the first and the last frame. Shorter clips
    are padded by looping the clip from its start (``f0, f1, ..., f0, f1, ...``)
    until ``target_frames`` frames are available.

    Args:
        video_section: Tensor whose dim 0 indexes frames. Any number of
            trailing dimensions is accepted.
        target_frames: Number of frames to return (16 by default).

    Returns:
        A tensor with ``target_frames`` entries along dim 0 and the trailing
        dimensions of ``video_section`` unchanged.

    Raises:
        ValueError: If the clip has no frames and therefore cannot be padded.
    """
    num_frames = video_section.shape[0]
    if num_frames >= target_frames:
        # Evenly spaced float positions; .long() truncates them to frame indices.
        indices = torch.linspace(0, num_frames - 1, steps=target_frames).long()
    else:
        if num_frames == 0:
            raise ValueError("video_section has no frames; cannot pad an empty clip")
        # Cycling the index modulo the clip length yields the same frames as
        # tiling the clip and appending its first `remainder` frames, but does
        # not assume a fixed number of trailing dimensions.
        indices = torch.arange(target_frames, device=video_section.device) % num_frames
    return video_section[indices]

def extract_fn(
    video: torch.Tensor,
    audio: torch.Tensor,
    transcript: List[List[str]],
    verbose: bool
) -> Dict[str, torch.Tensor]:
    """Return per-range mean activations for every layer captured by `extractor`.

    Frame 0 along dim 1 of `video` is passed through `load_image`, paired with
    `transcript`, and handed to `logits_by_pair`. Each activation found in
    `extractor.features` afterwards is averaged over every inclusive token
    range `(s, e)` returned by `logits_by_pair`, and the averages are stacked
    into one row per range.

    Args:
        video: Video tensor; only `video[:, 0]` is used.
            NOTE(review): the original template comment described a 4-D
            [fps * interval, 3, height, width] tensor, but the code indexes
            dim 1 as "the first frame of each chunk", which suggests a leading
            chunk dimension -- confirm against extract_features.
        audio: Accepted for interface compatibility; not used here.
        transcript: One list of words per chunk, forwarded unchanged as the
            sentences argument of `logits_by_pair`.
        verbose: When true, print the shapes and a short sample of the averaged
            activation for every layer.

    Returns:
        Dict mapping layer name to a float16 CPU tensor with one averaged row
        per range.
    """
    dict_return = {}
    with torch.no_grad():
        pixel_values = video[:, 0]  # select the first frame of each chunk
        pixel_values = load_image(pixel_values).to(torch.bfloat16).cuda()

        # Only the token ranges are needed here; the logits are discarded.
        # Presumably this call runs the forward pass that fills
        # extractor.features -- verify against logits_by_pair.
        _, ranges, _ = logits_by_pair(
            extractor, tokenizer,
            pixel_values, transcript,
            use_template=False   # or True
        )

        for layer_name, activation in extractor.features.items():
            # Ranges are inclusive on both ends, hence the e+1 slice bound.
            avg_activation = torch.stack([
                activation[0, s:e+1].mean(dim=0)              # mean over sequence dimension
                for (s, e) in ranges
            ], dim=0)
            if verbose:
                # Gated on `verbose`: formatting the sample copies values back
                # from the device and floods the output on long runs.
                print(f"Layer: {layer_name}, Feature shape: {activation.shape}, Averaged feature shape: {avg_activation.shape}, Sample: {avg_activation[0,0:5]}")
            dict_return[layer_name] = avg_activation.to(torch.float16).cpu()
    return dict_return

### Start extraction

In [None]:
# Run the extraction over every stimulus part, writing results under out_dir.
parts = [
    's1', 's2', 's3', 's4', 's5', 's6', 's7',
    'wolf', 'life', 'bourne', 'figures',
]
movies_base = root_dir / "algonauts_2025.competitors/stimuli/movies"
transcripts_base = root_dir / "algonauts_2025.competitors/stimuli/transcripts"
out_dir = '/kaggle/working/'

# Output files handed to extract_features as `ignore_done`, one episode per
# row; an episode with a single entry has only that half listed.
ignore_done = [
    "friends_s01e01a.h5", "friends_s01e01b.h5",
    "friends_s01e02a.h5", "friends_s01e02b.h5",
    "friends_s01e03b.h5",
    "friends_s01e04a.h5", "friends_s01e04b.h5",
    "friends_s01e05a.h5", "friends_s01e05b.h5",
    "friends_s01e06a.h5", "friends_s01e06b.h5",
    "friends_s01e07a.h5", "friends_s01e07b.h5",
    "friends_s01e08b.h5",
    "friends_s01e09a.h5", "friends_s01e09b.h5",
    "friends_s01e10a.h5", "friends_s01e10b.h5",
    "friends_s01e11a.h5",
    "friends_s01e12a.h5", "friends_s01e12b.h5",
    "friends_s01e13a.h5", "friends_s01e13b.h5",
    "friends_s01e14a.h5",
    "friends_s01e15b.h5",
    "friends_s01e16a.h5",
    "friends_s01e17b.h5",
    "friends_s01e18a.h5", "friends_s01e18b.h5",
    "friends_s01e19a.h5", "friends_s01e19b.h5",
    "friends_s01e20a.h5", "friends_s01e20b.h5",
    "friends_s01e21a.h5", "friends_s01e21b.h5",
    "friends_s01e22a.h5", "friends_s01e22b.h5",
    "friends_s01e23a.h5", "friends_s01e23b.h5",
    "friends_s01e24a.h5", "friends_s01e24b.h5",
]

extract_features(
    parts=parts,
    movies_base=movies_base,
    transcripts_base=transcripts_base,
    output_dir=out_dir,
    extraction_fn=extract_fn,
    verbose=True,
    modality='all',
    past_context_in_seconds=20,
    splits_overlap=0.5,
    ignore_done=ignore_done,
)

friends/s1
Movie:      /kaggle/input/algonauts2025nsl/algonauts_2025.competitors/stimuli/movies/friends/s1/friends_s01e21b.mkv
Transcript: /kaggle/input/algonauts2025nsl/algonauts_2025.competitors/stimuli/transcripts/friends/s1/friends_s01e21b.tsv
friends/s1
Movie:      /kaggle/input/algonauts2025nsl/algonauts_2025.competitors/stimuli/movies/friends/s1/friends_s01e04a.mkv
Transcript: /kaggle/input/algonauts2025nsl/algonauts_2025.competitors/stimuli/transcripts/friends/s1/friends_s01e04a.tsv
friends/s1
Movie:      /kaggle/input/algonauts2025nsl/algonauts_2025.competitors/stimuli/movies/friends/s1/friends_s01e08b.mkv
Transcript: /kaggle/input/algonauts2025nsl/algonauts_2025.competitors/stimuli/transcripts/friends/s1/friends_s01e08b.tsv
friends/s1
Movie:      /kaggle/input/algonauts2025nsl/algonauts_2025.competitors/stimuli/movies/friends/s1/friends_s01e19a.mkv
Transcript: /kaggle/input/algonauts2025nsl/algonauts_2025.competitors/stimuli/transcripts/friends/s1/friends_s01e19a.tsv
friends/

  torchaudio.set_audio_backend("ffmpeg")


Total duration: 731.81 seconds
Number of intervals: 491
Sample rate: 48000
Output file: /kaggle/working/friends/s1/friends_s01e14b.h5
Num splits: 36


  1%|▏         | 1/71 [00:11<13:51, 11.88s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1260, -0.0693,  0.0708, -0.5938, -0.4180], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832, -0.2949, -0.0640, -0.1943, -0.0228], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0315,  0.0432, -0.5391, -0.1826, -0.6172], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 1.3594, -0.3477, -1.4453, -0.0173, -1.4688], device='cuda:0',
   

  3%|▎         | 2/71 [00:23<13:49, 12.02s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2617,  0.0977, -0.1426, -0.1973,  0.1089], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3340,  0.1865, -0.2383, -0.1934, -0.0327], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1230,  0.1953, -0.8945, -0.1729, -0.5000], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6172,  1.0078, -0.5508,  0.9414, -0.0767], device='cuda:0',
   

  4%|▍         | 3/71 [00:36<13:55, 12.29s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2070,  0.1875, -0.0996, -0.1670,  0.0796], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.2031,  0.0339, -0.1055, -0.0684], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.1309, -0.7266, -0.0947, -0.6953], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5508,  0.3926, -0.7227,  0.8945, -0.4375], device='cuda:0',
   

  6%|▌         | 4/71 [00:49<13:49, 12.38s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2012,  0.0566, -0.2324, -0.1182,  0.2891], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2891,  0.0903, -0.1895, -0.0649, -0.0315], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1133,  0.1826, -0.9492, -0.0181, -0.6172], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7188, -0.0317, -0.8594,  1.2891, -0.6758], device='cuda:0',
   

  7%|▋         | 5/71 [01:01<13:32, 12.31s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0317,  0.1816,  0.0762, -0.1396,  0.0447], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2676,  0.1641, -0.0444, -0.1084,  0.0703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1611,  0.1982, -0.7422, -0.1025, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1387,  0.2930, -0.3652,  1.0859,  0.4023], device='cuda:0',
   

  8%|▊         | 6/71 [01:13<13:18, 12.28s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0640,  0.1484, -0.1943, -0.1416,  0.1050], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3359,  0.1011, -0.0170, -0.0247, -0.0630], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.1279, -0.8828,  0.0278, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.2891e-01, -1.6797e-01, -2.0020e-01,  1.6797e+00,  7.4863e-05],


 10%|▉         | 7/71 [01:25<13:09, 12.33s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1592,  0.2129, -0.0815, -0.1992,  0.1060], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2314,  0.2412,  0.0152, -0.1289,  0.0057], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3418,  0.1221, -0.7461, -0.1211, -0.6797], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4297,  0.5000, -0.6562,  1.2422, -0.3867], device='cuda:0',
   

 11%|█▏        | 8/71 [01:39<13:11, 12.57s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.4824,  0.0035, -0.1602,  0.3203], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2314,  0.3086, -0.2266, -0.1396, -0.1089], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.2061, -0.8320, -0.1211, -0.6367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5078,  0.4238, -1.3516,  1.6016, -0.1279], device='cuda:0',
   

 13%|█▎        | 9/71 [01:52<13:10, 12.75s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3320,  0.1680, -0.1074, -0.1191,  0.4160], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3730,  0.0150, -0.1113, -0.1396, -0.0015], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1875,  0.0723, -0.8516, -0.1152, -0.5352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1128,  0.4707, -0.3320,  1.8828, -0.5352], device='cuda:0',
   

 14%|█▍        | 10/71 [02:04<12:57, 12.74s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0654,  0.0408, -0.2715, -0.1504,  0.1113], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.1099, -0.1035, -0.1777, -0.0161], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2275,  0.1904, -0.8516, -0.1299, -0.7812], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.6562, -0.7500,  1.1016, -0.6484], device='cuda:0',
   

 15%|█▌        | 11/71 [02:17<12:47, 12.79s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.2178, -0.1436, -0.2695,  0.1865], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2139,  0.1299, -0.1338, -0.0737,  0.0025], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1147,  0.1543, -0.7812, -0.0708, -0.5898], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0203, -0.2676, -1.0547,  1.3594, -0.1826], device='cuda:0',
   

 17%|█▋        | 12/71 [02:30<12:35, 12.80s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3535,  0.2129, -0.2295, -0.2305,  0.0703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1992,  0.1846, -0.3164, -0.2002, -0.0688], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0457,  0.1973, -0.8633, -0.2158, -0.5469], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1445,  1.2188, -1.2891,  1.1797, -0.3965], device='cuda:0',
   

 18%|█▊        | 13/71 [02:43<12:26, 12.87s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3635, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3848,  0.2354, -0.1768, -0.1748,  0.1128], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3635, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2021,  0.1367, -0.3828, -0.0767, -0.0388], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3635, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0356,  0.1982, -0.9219, -0.1069, -0.4375], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3635, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7578,  0.5977, -1.2188,  1.7109, -0.0693], device='cuda:0',
   

 20%|█▉        | 14/71 [02:56<12:17, 12.94s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3639, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2148,  0.1035, -0.3281, -0.2871,  0.0273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3639, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1406,  0.0996, -0.1738, -0.2383, -0.0649], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3639, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1187,  0.1514, -0.7852, -0.2695, -0.4941], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3639, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0762,  0.5117, -0.1875,  0.6211, -0.3184], device='cuda:0',
   

 21%|██        | 15/71 [03:10<12:15, 13.14s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.1787, -0.2910, -0.2520,  0.0762], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2451,  0.0957, -0.1641, -0.2139, -0.0432], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1699,  0.2100, -0.8945, -0.1455, -0.4902], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2695,  0.7734, -0.5859,  0.8594,  0.2314], device='cuda:0',
   

 23%|██▎       | 16/71 [03:23<12:05, 13.18s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3705, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572,  0.1523, -0.4238, -0.2832,  0.0352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3705, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-2.9297e-01,  1.7834e-04, -4.3457e-02, -1.8555e-01,  9.7168e-02],
       device='cuda:1', dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3705, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2129,  0.1846, -0.8008, -0.1318, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3705, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2949,  0.5703, -0.3457,  0.9258, -0.0447], 

 24%|██▍       | 17/71 [03:36<11:51, 13.18s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2051,  0.1270, -0.4199, -0.2637, -0.0388], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1846,  0.1553, -0.1953, -0.1504, -0.0688], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1064,  0.1167, -0.9570, -0.1650, -0.4648], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0427,  0.7461, -0.3633,  1.5391, -0.3711], device='cuda:0',
   

 25%|██▌       | 18/71 [03:49<11:37, 13.16s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1611,  0.1484, -0.3164, -0.2988, -0.0153], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1221,  0.0679, -0.2461, -0.1631,  0.0383], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1348,  0.1045, -0.8594, -0.0762, -0.3926], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1621,  0.6992, -0.2578,  1.1016, -0.3008], device='cuda:0',
   

 27%|██▋       | 19/71 [04:03<11:26, 13.21s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2227,  0.1816, -0.4238, -0.3027, -0.0267], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1953,  0.0947, -0.0913, -0.2061,  0.0198], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1089,  0.2236, -0.8789, -0.1187, -0.4434], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0066,  0.3262, -0.6523,  0.8477, -0.4141], device='cuda:0',
   

 28%|██▊       | 20/71 [04:16<11:12, 13.18s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.1943, -0.2734, -0.2910,  0.0889], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.1357, -0.1445, -0.2334,  0.0106], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0986,  0.1846, -0.8555, -0.2246, -0.5820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1006,  0.2930, -0.6484,  0.8047, -0.3086], device='cuda:0',
   

 30%|██▉       | 21/71 [04:29<10:59, 13.20s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2256,  0.1582, -0.1768, -0.2734,  0.0014], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.1709, -0.0035, -0.2197,  0.0432], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1631,  0.1650, -0.8242, -0.1768, -0.4688], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3027,  0.4082, -0.5430,  0.8672, -0.1416], device='cuda:0',
   

 31%|███       | 22/71 [04:42<10:48, 13.24s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1191,  0.1904, -0.1084, -0.3105, -0.0493], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1963,  0.1855, -0.0388, -0.2402,  0.1270], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1943,  0.1572, -0.8555, -0.1377, -0.5586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0850,  0.0479, -0.7500,  0.7383, -0.4688], device='cuda:0',
   

 32%|███▏      | 23/71 [04:56<10:38, 13.31s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3262,  0.0693, -0.5625, -0.2383, -0.1494], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1641,  0.1309, -0.2383, -0.0884, -0.1621], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1670,  0.2109, -0.8867, -0.0254, -0.4531], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3281,  0.3516, -0.5156,  0.8945, -0.5391], device='cuda:0',
   

 34%|███▍      | 24/71 [05:09<10:24, 13.29s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2031,  0.1465, -0.4199, -0.2930,  0.0576], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1592,  0.2031, -0.0693, -0.2129,  0.0068], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2275,  0.2012, -0.8320, -0.1758, -0.5312], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5586,  0.7617, -0.1484,  1.0234, -0.5664], device='cuda:0',
   

 35%|███▌      | 25/71 [05:23<10:14, 13.36s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2422,  0.1504, -0.2188, -0.2100,  0.1260], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1128, -0.1357, -0.1650, -0.0121], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0593,  0.1187, -0.8711, -0.1689, -0.6484], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1235,  0.1787, -0.5508,  0.6680, -0.2578], device='cuda:0',
   

 37%|███▋      | 26/71 [05:36<10:06, 13.47s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2285, -0.0457, -0.0996,  0.0282,  0.1553], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2168,  0.2422, -0.0913, -0.0184, -0.0129], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.2441, -0.5391, -0.0806, -0.6406], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1426, -0.1318, -0.9062,  0.6719,  0.1611], device='cuda:0',
   

 38%|███▊      | 27/71 [05:50<09:51, 13.45s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.0579, -0.1885, -0.0884,  0.3887], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1602,  0.1118, -0.2236, -0.0864, -0.1709], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2139,  0.1982, -0.5977, -0.1270, -0.4531], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7109, -0.6680, -0.5703,  1.2344,  0.6641], device='cuda:0',
   

 39%|███▉      | 28/71 [06:03<09:39, 13.48s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2344,  0.1719, -0.0554, -0.0903,  0.1797], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2012,  0.1152, -0.1670, -0.0123, -0.1748], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1514,  0.0117, -0.9180, -0.0085, -0.6211], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0334,  0.3379, -0.9648,  1.1172,  0.2119], device='cuda:0',
   

 41%|████      | 29/71 [06:17<09:28, 13.53s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1396,  0.2559, -0.1069, -0.0233,  0.3535], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1846,  0.1377, -0.0317,  0.0140, -0.1494], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1377,  0.1436, -0.8867,  0.0898, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7734,  0.0830, -0.6641,  1.6094,  0.4922], device='cuda:0',
   

 42%|████▏     | 30/71 [06:30<09:11, 13.45s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0796,  0.0491, -0.2129, -0.0165,  0.1187], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.1738, -0.2012, -0.0344,  0.0962], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754,  0.2930, -0.8008,  0.0070, -0.5547], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0723,  0.1387, -1.1562,  1.0469,  0.1338], device='cuda:0',
   

 44%|████▎     | 31/71 [06:44<08:57, 13.43s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4395,  0.0723, -0.1836, -0.1060,  0.2031], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1206,  0.0227, -0.3164, -0.1201, -0.0884], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1211,  0.1289, -0.7461, -0.1108, -0.4355], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4551, -1.3516, -0.3008,  0.6445,  0.9922], device='cuda:0',
   

 45%|████▌     | 32/71 [06:57<08:44, 13.44s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3633,  0.1128, -0.1152, -0.0131,  0.1934], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572,  0.0276, -0.2949, -0.1279, -0.0723], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1074,  0.1895, -0.7617, -0.0649, -0.4258], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6094, -1.6172, -0.3301,  0.8633,  0.9648], device='cuda:0',
   

 46%|████▋     | 33/71 [07:11<08:31, 13.46s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1904,  0.0327, -0.2832, -0.1328,  0.2051], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3535,  0.2793, -0.2617, -0.0320,  0.1289], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2256,  0.2637, -0.7891,  0.0991, -0.4668], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3398,  0.6055, -1.0625,  1.7500, -0.1533], device='cuda:0',
   

 48%|████▊     | 34/71 [07:24<08:16, 13.41s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2500,  0.2832, -0.3164, -0.1719,  0.2207], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3203,  0.3398, -0.3145, -0.0566,  0.0479], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1738,  0.3594, -0.9258,  0.1040, -0.5078], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1118,  1.4375, -1.2031,  2.4688, -0.2402], device='cuda:0',
   

 49%|████▉     | 35/71 [07:37<08:02, 13.40s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2490,  0.1709, -0.3574, -0.2754,  0.2422], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2373,  0.2637, -0.3086, -0.1816,  0.0977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0317,  0.2754, -0.8555, -0.1416, -0.5703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3359,  1.1172, -1.2344,  1.5000, -0.2178], device='cuda:0',
   

 51%|█████     | 36/71 [07:51<07:47, 13.37s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2324,  0.2148, -0.1289, -0.1133,  0.2275], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.2656, -0.1973,  0.0088,  0.1348], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1289,  0.2637, -0.9141,  0.0991, -0.5039], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0136,  1.4844, -1.6016,  2.4531,  0.2109], device='cuda:0',
   

 52%|█████▏    | 37/71 [08:04<07:36, 13.44s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.1367, -0.1572, -0.0801,  0.2227], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.1924, -0.3008,  0.0132,  0.0737], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0417,  0.2676, -0.9141,  0.0957, -0.5430], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2158,  1.2422, -1.3594,  2.3438, -0.0356], device='cuda:0',
   

 54%|█████▎    | 38/71 [08:17<07:20, 13.36s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3047,  0.1875, -0.2695, -0.1943,  0.2373], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1738,  0.2891, -0.3125, -0.1123,  0.0574], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0515,  0.2402, -0.9258, -0.0391, -0.6094], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3125,  1.3125, -1.0859,  1.7188,  0.0479], device='cuda:0',
   

 55%|█████▍    | 39/71 [08:31<07:06, 13.34s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1934,  0.2061, -0.1514, -0.1660,  0.2031], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2949,  0.2695, -0.1104, -0.0471,  0.2051], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2031,  0.3145, -0.8750,  0.1035, -0.4414], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2119,  1.0469, -1.6094,  2.5156, -0.1064], device='cuda:0',
   

 56%|█████▋    | 40/71 [08:44<06:53, 13.34s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1621,  0.2109, -0.1797, -0.1592,  0.0928], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3066,  0.2295, -0.1680, -0.0374,  0.2207], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0698,  0.2812, -0.8359,  0.0265, -0.4707], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1187,  1.1484, -1.7266,  1.9766,  0.0342], device='cuda:0',
   

 58%|█████▊    | 41/71 [08:57<06:37, 13.26s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0547,  0.2949, -0.1992, -0.2002,  0.1289], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3125,  0.3398,  0.0192, -0.1270,  0.3047], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2119,  0.2539, -0.8516, -0.0320, -0.4512], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4316,  1.3516, -1.2500,  2.0938, -0.2754], device='cuda:0',
   

 59%|█████▉    | 42/71 [09:11<06:26, 13.31s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1084,  0.2158, -0.2139, -0.1953,  0.2158], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3594,  0.2910, -0.1147, -0.0520,  0.2197], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1328,  0.2695, -0.8633,  0.0413, -0.4922], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0058,  1.2266, -1.3125,  2.1719,  0.1177], device='cuda:0',
   

 61%|██████    | 43/71 [09:24<06:10, 13.22s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3632, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2295,  0.1572, -0.1187, -0.1855,  0.3301], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3632, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3379,  0.2793, -0.1338, -0.0840,  0.1572], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3632, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0737,  0.2695, -0.9648, -0.0073, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3632, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4102,  1.5469, -1.5859,  2.2344, -0.0520], device='cuda:0',
   

 62%|██████▏   | 44/71 [09:37<05:56, 13.21s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1387,  0.1875, -0.0884, -0.2969,  0.1270], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1533,  0.2812, -0.3496, -0.1992,  0.0013], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0283,  0.2305, -0.9805, -0.1309, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2949,  0.3652, -0.9805,  1.0000,  0.3125], device='cuda:0',
   

 63%|██████▎   | 45/71 [09:50<05:46, 13.31s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1855, -0.0588, -0.2988, -0.1816,  0.0874], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1416,  0.2432, -0.1631, -0.0273, -0.0781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0884,  0.2412, -0.8711,  0.0908, -0.5352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.0938,  0.8164, -0.2676,  1.1250,  0.3555], device='cuda:0',
   

 65%|██████▍   | 46/71 [10:04<05:36, 13.48s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.0503, -0.0427, -0.3047,  0.0679], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2490,  0.0898, -0.1299, -0.2393,  0.0366], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2402,  0.1138, -0.8984, -0.2168, -0.5352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1875,  0.3906, -1.0781,  0.5078, -0.7852], device='cuda:0',
   

 66%|██████▌   | 47/71 [10:18<05:23, 13.48s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2432,  0.0123, -0.2168, -0.2832,  0.1230], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910,  0.1650, -0.2969, -0.2041,  0.1187], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1816,  0.1787, -0.8320, -0.2090, -0.5117], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3398,  0.1895, -1.1250,  0.4961, -0.6367], device='cuda:0',
   

 68%|██████▊   | 48/71 [10:31<05:10, 13.48s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2295,  0.0223, -0.3047, -0.3516, -0.0603], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2080,  0.0742, -0.1270, -0.2207,  0.0496], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1504,  0.1196, -0.8164, -0.2197, -0.4199], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1021,  0.4551, -0.5195,  0.6289, -0.4375], device='cuda:0',
   

 69%|██████▉   | 49/71 [10:45<04:56, 13.50s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2100,  0.0608, -0.1572, -0.2148,  0.0698], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3164,  0.1328, -0.1138, -0.1807,  0.1050], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1455,  0.1523, -0.7891, -0.1631, -0.4727], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2480,  0.6211, -0.5898,  0.5586, -0.5430], device='cuda:0',
   

 70%|███████   | 50/71 [10:58<04:45, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2373,  0.0009, -0.2559, -0.2090,  0.1934], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1396, -0.1602, -0.1406,  0.0097], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1318,  0.1729, -0.8516, -0.1436, -0.4766], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2988,  0.6680, -1.0078,  0.9922, -0.4141], device='cuda:0',
   

 72%|███████▏  | 51/71 [11:12<04:31, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2383,  0.0967, -0.1436, -0.3184,  0.1426], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.0859,  0.0339, -0.2246,  0.1309], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2168,  0.1709, -0.8398, -0.2256, -0.4141], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0967,  0.6797, -1.0234,  0.5977, -0.7344], device='cuda:0',
   

 73%|███████▎  | 52/71 [11:26<04:19, 13.66s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2002, -0.0952, -0.2197, -0.2715,  0.0864], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1836,  0.0728, -0.1602, -0.1523,  0.0410], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0574,  0.2363, -0.8203, -0.0806, -0.6367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3340,  0.7539, -0.3828,  0.8398, -0.7070], device='cuda:0',
   

 75%|███████▍  | 53/71 [11:40<04:08, 13.82s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2988,  0.0435, -0.1611, -0.3145,  0.0854], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.1064, -0.2168, -0.2061,  0.0942], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1650,  0.1807, -0.8516, -0.2021, -0.4004], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2197,  0.4297, -1.1562,  0.4453, -0.7344], device='cuda:0',
   

 76%|███████▌  | 54/71 [11:54<03:55, 13.86s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.0598, -0.2461, -0.3184,  0.0820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.1016, -0.1895, -0.2363,  0.1196], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1484,  0.1797, -0.8594, -0.2451, -0.4375], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2070,  0.5703, -0.9102,  0.5508, -0.8320], device='cuda:0',
   

 77%|███████▋  | 55/71 [12:08<03:42, 13.91s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.1357, -0.1611, -0.3125,  0.1079], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.1006, -0.1182, -0.2578,  0.1187], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2188,  0.1738, -0.8398, -0.2490, -0.4668], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0474,  0.2578, -0.9141,  0.7500, -0.5430], device='cuda:0',
   

 79%|███████▉  | 56/71 [12:22<03:27, 13.86s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3535,  0.1289, -0.4258, -0.2871,  0.0447], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1602,  0.0068, -0.2773, -0.1650, -0.0129], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1787,  0.1377, -0.8867, -0.1680, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6094,  0.6094, -0.3652,  0.8438, -0.5156], device='cuda:0',
   

 80%|████████  | 57/71 [12:35<03:12, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3438,  0.1299, -0.4023, -0.3027,  0.0776], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1709,  0.0776, -0.2246, -0.1602,  0.0249], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1416,  0.2051, -0.8789, -0.1377, -0.5547], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2021,  0.8984, -0.4766,  0.9922, -0.8086], device='cuda:0',
   

 82%|████████▏ | 58/71 [12:49<02:57, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.0056, -0.4492, -0.2393,  0.0884], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2539,  0.0359, -0.2930, -0.1885,  0.0693], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2461,  0.1836, -0.7227, -0.2373, -0.5117], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.3984,  0.8711, -0.9492,  0.3828, -1.2188], device='cuda:0',
   

 83%|████████▎ | 59/71 [13:03<02:44, 13.73s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2363,  0.1260, -0.4238, -0.2432,  0.1035], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1562,  0.0967, -0.4160, -0.2148,  0.0903], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1387,  0.1719, -0.8516, -0.2559, -0.4609], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2002,  1.1562, -0.4355,  0.3438, -0.5508], device='cuda:0',
   

 85%|████████▍ | 60/71 [13:16<02:30, 13.72s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2988,  0.1338, -0.4590, -0.2422,  0.0693], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1289,  0.1235, -0.3379, -0.1328,  0.0496], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1484,  0.2695, -0.8672, -0.0957, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1816,  0.8242, -0.6914,  1.4297, -0.7734], device='cuda:0',
   

 86%|████████▌ | 61/71 [13:30<02:18, 13.82s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1416,  0.1729, -0.1079, -0.3145,  0.0996], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.1016, -0.0762, -0.2422,  0.1270], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2158,  0.1660, -0.8086, -0.1963, -0.4824], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3496,  0.5078, -0.9648,  0.6953, -0.8906], device='cuda:0',
   

 87%|████████▋ | 62/71 [13:45<02:05, 13.95s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3638, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.1021, -0.2285, -0.2520,  0.1689], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3638, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2051,  0.2871, -0.0913, -0.1699, -0.1436], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3638, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1699,  0.1738, -0.8008, -0.1309, -0.6680], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3638, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.4375, -0.5234,  1.0234, -0.3770], device='cuda:0',
   

 89%|████████▊ | 63/71 [13:58<01:50, 13.85s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1299,  0.2354, -0.2402, -0.0840, -0.3262], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0483, -0.0854, -0.3926, -0.0121, -0.1777], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1196,  0.1050, -0.7852,  0.2129, -0.5273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0859, -0.0537, -0.7656,  1.1250,  0.4531], device='cuda:0',
   

 90%|█████████ | 64/71 [14:12<01:36, 13.82s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1406,  0.0339, -0.1992, -0.1719,  0.1670], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2314,  0.3535, -0.0942, -0.0623, -0.0684], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2363,  0.2949, -0.7773,  0.0117, -0.6797], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1074,  0.2949, -1.5781,  1.3516, -0.5664], device='cuda:0',
   

 92%|█████████▏| 65/71 [14:26<01:23, 13.86s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1689,  0.0188, -0.2285, -0.1992,  0.2676], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2676,  0.1021, -0.0918, -0.1455,  0.0942], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572,  0.2236, -0.8086, -0.2158, -0.5547], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.2773, -0.4785,  0.6797,  0.9492], device='cuda:0',
   

 93%|█████████▎| 66/71 [14:40<01:09, 13.96s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0005,  0.1182, -0.0674, -0.1895,  0.1934], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2422,  0.3301, -0.2188, -0.0737,  0.0791], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2109,  0.2598, -0.8984, -0.0439, -0.5078], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5352,  0.3262, -1.2812,  0.9336, -0.1982], device='cuda:0',
   

 94%|█████████▍| 67/71 [14:54<00:55, 13.95s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1104,  0.0010, -0.1855, -0.1592,  0.1992], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2256,  0.2988, -0.0269, -0.0718, -0.0928], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1592,  0.3086, -0.7812,  0.0118, -0.6289], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0864,  0.0447, -1.2500,  1.1562, -0.3457], device='cuda:0',
   

 96%|█████████▌| 68/71 [15:08<00:41, 13.91s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.2715,  0.0513, -0.1025,  0.1021], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.2969, -0.2471, -0.0415, -0.0874], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2617,  0.2285, -0.6992, -0.0018, -0.6016], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5117,  0.2734, -0.7500,  1.4922,  0.0547], device='cuda:0',
   

 97%|█████████▋| 69/71 [15:22<00:27, 13.95s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0205,  0.1416, -0.0981, -0.0515, -0.0957], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2500,  0.1904, -0.0840, -0.0413, -0.0703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3418, -0.0029, -0.6172, -0.0168, -0.6172], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0369,  0.2676, -0.6172,  1.2656, -0.2354], device='cuda:0',
   

 99%|█████████▊| 70/71 [15:28<00:11, 11.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 2092, 3584]), Averaged feature shape: torch.Size([8, 3584]), Sample: tensor([-0.3828,  0.0287, -0.6484, -0.0796,  0.1992], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 2092, 3584]), Averaged feature shape: torch.Size([8, 3584]), Sample: tensor([-0.2227,  0.1016, -0.1406, -0.1245, -0.2217], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 2092, 3584]), Averaged feature shape: torch.Size([8, 3584]), Sample: tensor([-0.1699,  0.2656, -0.7305, -0.1230, -0.4785], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 2092, 3584]), Averaged feature shape: torch.Size([8, 3584]), Sample: tensor([-0.4688, -0.2754, -1.1953,  0.1777,  0.6133], device='cuda:0',
       

100%|██████████| 71/71 [15:29<00:00, 13.09s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 259, 3584]), Averaged feature shape: torch.Size([1, 3584]), Sample: tensor([-0.1079, -0.0688,  0.0481, -0.6094, -0.4043], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 259, 3584]), Averaged feature shape: torch.Size([1, 3584]), Sample: tensor([-0.2930, -0.3066, -0.0199, -0.2002, -0.0427], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 259, 3584]), Averaged feature shape: torch.Size([1, 3584]), Sample: tensor([ 0.0251,  0.0084, -0.5312, -0.1885, -0.6211], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 259, 3584]), Averaged feature shape: torch.Size([1, 3584]), Sample: tensor([ 1.4219, -0.2637, -1.5469,  0.0292, -1.4922], device='cuda:0',
       dtyp




Total number of frames in the video: 21280.0
Original Resolution: (720.0, 480.0)
FPS: 29.968454258675077
Duration (seconds): 710.08
Target Resolution: (224, 224)
Read 21280 frames.
Frames shape: torch.Size([21280, 3, 224, 224])


  torchaudio.set_audio_backend("ffmpeg")


Total duration: 710.09 seconds
Number of intervals: 476
Sample rate: 48000
Output file: /kaggle/working/friends/s1/friends_s01e11b.h5
Num splits: 35


  1%|▏         | 1/69 [00:13<15:09, 13.38s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1133, -0.0549,  0.0742, -0.5938, -0.3945], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793, -0.2852, -0.0020, -0.1895, -0.0292], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0405,  0.0381, -0.5312, -0.1504, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 1.3047, -0.2598, -1.4922,  0.1309, -1.5000], device='cuda:0',
   

  3%|▎         | 2/69 [00:27<15:06, 13.52s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3535,  0.2236, -0.3105, -0.2305,  0.1758], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2402,  0.2656,  0.0967, -0.1934, -0.2041], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1826,  0.3945, -0.8242, -0.2617, -0.6562], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.0078,  0.7617, -0.6875,  0.4648,  0.0077], device='cuda:0',
   

  4%|▍         | 3/69 [00:40<14:53, 13.54s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.1953, -0.2754, -0.0361,  0.0503], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1846,  0.2012, -0.1348,  0.0034, -0.0850], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1299,  0.1631, -0.6367, -0.0664, -0.4902], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0029,  0.0674, -1.1719,  0.6445, -0.6133], device='cuda:0',
   

  6%|▌         | 4/69 [00:53<14:30, 13.40s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0967,  0.1680, -0.1768, -0.1211,  0.2422], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2373,  0.1953, -0.1533, -0.0085, -0.0347], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1445,  0.2500, -0.6289, -0.0100, -0.5703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0728, -0.3262, -0.8242,  1.3984, -0.7227], device='cuda:0',
   

  7%|▋         | 5/69 [01:07<14:20, 13.44s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0762,  0.1865, -0.1021, -0.2266,  0.2871], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.1475,  0.1289, -0.1807,  0.0581], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1807,  0.0981, -0.8516, -0.1270, -0.5469], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7109,  0.1533, -0.2949,  1.2188, -0.2266], device='cuda:0',
   

  9%|▊         | 6/69 [01:20<14:04, 13.41s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2383,  0.1006, -0.0635, -0.0840,  0.2637], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1875,  0.0972, -0.1299, -0.0142, -0.1104], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.1157, -0.7617, -0.0786, -0.2754], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.0312,  0.5234, -1.1719,  1.3672, -0.0420], device='cuda:0',
   

 10%|█         | 7/69 [01:34<13:53, 13.44s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1045,  0.1787, -0.1250,  0.0850,  0.1689], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2480,  0.0762, -0.0771,  0.0292, -0.0036], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1113,  0.1611, -0.8438,  0.0603, -0.6406], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6523,  0.7109, -1.1719,  1.1328,  0.5000], device='cuda:0',
   

 12%|█▏        | 8/69 [01:47<13:44, 13.51s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1240,  0.1367,  0.0903,  0.0209,  0.1348], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.0791, -0.1147,  0.0102, -0.0287], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1885,  0.2285, -0.8398,  0.0171, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6172,  0.7148, -1.0625,  1.2344,  0.4863], device='cuda:0',
   

 13%|█▎        | 9/69 [02:01<13:37, 13.62s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2217,  0.0674,  0.0728, -0.0249,  0.1436], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3320,  0.0540, -0.1621, -0.0586, -0.0029], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0483,  0.2246, -0.8398, -0.0547, -0.5547], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6094,  1.3672, -0.8750,  1.0234,  0.1748], device='cuda:0',
   

 14%|█▍        | 10/69 [02:15<13:19, 13.55s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.0977,  0.1973,  0.0116,  0.3320], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2461,  0.0137, -0.1318,  0.0601, -0.1055], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1187,  0.1895, -0.8750,  0.0767, -0.5781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7031,  1.1328, -0.7383,  1.7578,  0.4863], device='cuda:0',
   

 16%|█▌        | 11/69 [02:28<13:09, 13.61s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2217,  0.2812, -0.0391, -0.1973,  0.0366], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.0820,  0.1338, -0.1416, -0.0410], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1318,  0.1719, -0.8164, -0.1084, -0.4941], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5547,  0.0198, -0.9180,  0.1953, -0.2285], device='cuda:0',
   

 17%|█▋        | 12/69 [02:42<13:01, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1279,  0.1699, -0.4062, -0.1167,  0.2227], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3945,  0.1660, -0.1465, -0.0737,  0.0427], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3125,  0.2812, -0.7344, -0.1104, -0.5781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0786, -0.1108, -1.4766,  0.0090, -0.1846], device='cuda:0',
   

 19%|█▉        | 13/69 [02:56<12:54, 13.84s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.2598, -0.0233, -0.2578,  0.1572], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1865,  0.0148,  0.1572, -0.1709, -0.1069], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0047,  0.0757, -0.8047, -0.2178, -0.4941], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6992,  0.3867, -1.4844,  0.2754,  0.0610], device='cuda:0',
   

 20%|██        | 14/69 [03:10<12:44, 13.90s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4805,  0.2471, -0.1377, -0.2393,  0.3145], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.0928, -0.0952, -0.2012, -0.1147], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1108,  0.2109, -0.8555, -0.2676, -0.5273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4980,  0.7461, -1.1406,  0.5039, -0.1338], device='cuda:0',
   

 22%|██▏       | 15/69 [03:25<12:34, 13.97s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3723, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1543,  0.2500, -0.0991, -0.1094,  0.1641], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3723, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2471,  0.1235, -0.2363, -0.0430,  0.0264], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3723, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.1270, -0.7969, -0.0053, -0.6172], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3723, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2217, -0.0781, -0.6914,  0.7109,  0.1953], device='cuda:0',
   

 23%|██▎       | 16/69 [03:39<12:21, 13.99s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3750,  0.1797,  0.0571, -0.2793,  0.2295], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2383,  0.0039, -0.0388, -0.2490, -0.0942], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1699,  0.1226, -0.8125, -0.2969, -0.4902], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8633,  0.2314, -0.7695,  0.2354, -0.1641], device='cuda:0',
   

 25%|██▍       | 17/69 [03:52<12:03, 13.91s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1094,  0.1562, -0.0767, -0.0874, -0.0172], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.1396, -0.0309, -0.0520, -0.1279], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2148,  0.2578, -0.6914, -0.0791, -0.6992], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.4414, -1.1406,  0.2812, -0.4258], device='cuda:0',
   

 26%|██▌       | 18/69 [04:06<11:45, 13.83s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1855,  0.0947, -0.2910, -0.1338, -0.1187], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754,  0.1572,  0.0036, -0.1504, -0.1396], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2422,  0.1973, -0.7539, -0.1621, -0.6367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4336,  0.8203, -0.8203,  0.3145, -0.2559], device='cuda:0',
   

 28%|██▊       | 19/69 [04:20<11:29, 13.79s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.2930,  0.0640, -0.1602,  0.2734], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2139,  0.2109, -0.3848, -0.0796, -0.0986], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.1621e-01,  1.9727e-01, -9.2578e-01, -4.5776e-04, -5.2344e-01],
       device='cuda:1', dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1367,  0.6094, -0.8828,  1.8906,  0.1309], 

 29%|██▉       | 20/69 [04:33<11:10, 13.69s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1953,  0.2451, -0.0903, -0.1777,  0.2852], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2148,  0.2910, -0.1118, -0.0659, -0.0439], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1562,  0.3047, -0.8008,  0.0045, -0.4414], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1426,  0.3281, -0.8125,  1.8203,  0.0227], device='cuda:0',
   

 30%|███       | 21/69 [04:47<10:59, 13.75s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1650,  0.1631, -0.2461, -0.0613,  0.0413], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.2295, -0.1895, -0.0248, -0.0442], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1523,  0.2256, -0.8242,  0.1289, -0.5625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0089, -0.0903, -1.2734,  1.9688, -0.3555], device='cuda:0',
   

 32%|███▏      | 22/69 [05:00<10:42, 13.68s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3105,  0.2578, -0.1025, -0.1914,  0.3926], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3262,  0.2441, -0.3574, -0.0698, -0.0267], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0386,  0.2402, -0.8203, -0.0138, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.3418,  0.5391, -0.7734,  1.9062,  0.1416], device='cuda:0',
   

 33%|███▎      | 23/69 [05:14<10:21, 13.52s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2344,  0.0938, -0.0522, -0.2012,  0.3047], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2676,  0.2402, -0.3262, -0.0571, -0.0806], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1699,  0.2354, -0.8125,  0.0425, -0.5469], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1553,  0.3379, -0.5859,  1.9062, -0.4883], device='cuda:0',
   

 35%|███▍      | 24/69 [05:27<10:10, 13.56s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2061,  0.1982, -0.1084, -0.1895,  0.2832], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.2354, -0.0422, -0.0811,  0.0623], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1953,  0.2461, -0.9219,  0.0806, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5195,  0.6562, -1.6094,  2.1875,  0.3711], device='cuda:0',
   

 36%|███▌      | 25/69 [05:41<09:59, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1338,  0.1396, -0.1211, -0.0527,  0.2246], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2471,  0.3301, -0.2715,  0.0028, -0.0554], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1963,  0.2578, -0.6875,  0.1436, -0.5898], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4629,  0.5781, -0.6758,  1.7266, -0.2949], device='cuda:0',
   

 38%|███▊      | 26/69 [05:55<09:47, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0564,  0.0791, -0.2266,  0.0510,  0.2578], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.2051, -0.1562,  0.0233,  0.0442], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.2422, -0.7812,  0.1279, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5586,  0.4180, -1.5469,  1.9844, -0.2402], device='cuda:0',
   

 39%|███▉      | 27/69 [06:08<09:33, 13.65s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1396,  0.0908, -0.1924,  0.0282,  0.3984], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3594,  0.2617, -0.1934,  0.0242,  0.0669], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3223,  0.2432, -0.7031,  0.1079, -0.5586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4102,  0.4688, -1.0391,  1.5078, -0.6680], device='cuda:0',
   

 41%|████      | 28/69 [06:22<09:21, 13.70s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2021,  0.2041, -0.1318, -0.1709,  0.0835], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.2871, -0.1436, -0.0549,  0.0781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1504,  0.2129, -0.8359,  0.0269, -0.4004], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2773,  0.4629, -1.4062,  1.6250, -0.4336], device='cuda:0',
   

 42%|████▏     | 29/69 [06:36<09:12, 13.81s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1416,  0.0820, -0.1738, -0.1953,  0.1270], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2334,  0.2695,  0.0189, -0.1562,  0.0259], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1660,  0.2930, -0.9023, -0.0623, -0.5781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1299,  0.3496, -0.9375,  1.5156, -0.6406], device='cuda:0',
   

 43%|████▎     | 30/69 [06:50<09:01, 13.87s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2363,  0.1660, -0.2754, -0.1875,  0.0801], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1924,  0.2178, -0.2852, -0.0635, -0.0664], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1118,  0.2578, -0.7578, -0.0162, -0.5430], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0334,  0.4023, -1.4219,  1.5156, -0.1289], device='cuda:0',
   

 45%|████▍     | 31/69 [07:04<08:45, 13.82s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3712, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.1201, -0.1387, -0.1709,  0.2715], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3712, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2197,  0.1436, -0.0613, -0.0493, -0.0059], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3712, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0417,  0.1777, -0.8984,  0.0811, -0.6484], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3712, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4062,  0.6719, -1.2266,  1.6406,  0.1895], device='cuda:0',
   

 46%|████▋     | 32/69 [07:18<08:32, 13.84s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2256,  0.0938, -0.2656, -0.1455,  0.0771], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2539,  0.3125, -0.2656, -0.0122, -0.0481], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1963,  0.2275, -0.7578,  0.0933, -0.4961], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1533,  0.9219, -1.6719,  1.6953, -0.3613], device='cuda:0',
   

 48%|████▊     | 33/69 [07:32<08:18, 13.85s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3701, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2275,  0.3242,  0.0554, -0.3242,  0.2559], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3701, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2471,  0.0977, -0.0554, -0.1465, -0.0101], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3701, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0913,  0.2158, -0.8281, -0.1094, -0.5781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3701, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3398,  0.6367, -1.5547,  1.4766, -0.2832], device='cuda:0',
   

 49%|████▉     | 34/69 [07:46<08:11, 14.04s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3718, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1641,  0.3223,  0.0317, -0.2988,  0.2207], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3718, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1963,  0.0669, -0.0393, -0.1885, -0.0127], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3718, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1543,  0.1768, -0.8281, -0.1895, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3718, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4199,  0.6953, -1.3516,  0.4805, -0.2295], device='cuda:0',
   

 51%|█████     | 35/69 [08:00<07:55, 13.97s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2451,  0.4102,  0.0103, -0.2832,  0.3066], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2266,  0.0693, -0.0342, -0.1611,  0.0415], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0513,  0.2236, -0.7812, -0.1396, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5156,  0.9219, -1.4844,  0.7773, -0.2344], device='cuda:0',
   

 52%|█████▏    | 36/69 [08:14<07:39, 13.92s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2422,  0.0830, -0.4238, -0.2256,  0.2246], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2070,  0.3359, -0.3496, -0.1006,  0.0820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1855,  0.4082, -0.7031,  0.0302, -0.5742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  1.0391, -1.1875,  1.3203, -0.6211], device='cuda:0',
   

 54%|█████▎    | 37/69 [08:28<07:30, 14.08s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1992,  0.3789, -0.2266, -0.0791,  0.2656], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2383,  0.0635,  0.0708, -0.0540, -0.0079], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0869,  0.1631, -0.9336,  0.0242, -0.4453], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8008,  0.2305, -1.0938,  1.4766, -0.0840], device='cuda:0',
   

 55%|█████▌    | 38/69 [08:43<07:19, 14.17s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3713, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3301,  0.1543, -0.1367, -0.0859,  0.0234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3713, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2021,  0.2129,  0.0957, -0.0366, -0.2412], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3713, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2070,  0.2773, -0.6641, -0.0383, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3713, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6484, -0.0019, -0.8984,  1.3828,  0.0236], device='cuda:0',
   

 57%|█████▋    | 39/69 [08:57<07:04, 14.15s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3613,  0.2969, -0.0033,  0.0065,  0.3633], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1260, -0.1455, -0.0013, -0.2129], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0938,  0.1670, -0.8516, -0.0598, -0.5000], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4453,  0.3242, -0.8047,  1.6641,  0.3008], device='cuda:0',
   

 58%|█████▊    | 40/69 [09:10<06:44, 13.95s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3047,  0.3711, -0.0408,  0.0303,  0.3477], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3027,  0.1060, -0.0796,  0.0337, -0.2070], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0977,  0.1514, -0.8594,  0.0248, -0.5000], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7852,  0.6875, -0.8750,  1.4844,  0.2314], device='cuda:0',
   

 59%|█████▉    | 41/69 [09:24<06:26, 13.80s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3711,  0.3750, -0.1504,  0.0057,  0.3730], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3066,  0.0820, -0.0588,  0.0199, -0.1562], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1406,  0.1953, -0.8945, -0.0027, -0.5195], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5664,  0.4805, -0.8867,  1.6094,  0.2598], device='cuda:0',
   

 61%|██████    | 42/69 [09:37<06:11, 13.75s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.3359, -0.1562,  0.0444,  0.3574], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.0381,  0.0454,  0.0231, -0.1152], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0957,  0.1885, -0.8906,  0.0064, -0.4766], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7422,  0.4355, -0.7422,  1.5078,  0.3145], device='cuda:0',
   

 62%|██████▏   | 43/69 [09:51<05:59, 13.82s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.1934, -0.2676, -0.0510,  0.1172], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2695,  0.2852, -0.1099, -0.0108, -0.1426], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3652,  0.3301, -0.6758,  0.0197, -0.5000], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.0938,  0.8086, -0.7969,  1.3828, -0.2812], device='cuda:0',
   

 64%|██████▍   | 44/69 [10:06<05:48, 13.93s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4258,  0.2695, -0.0576,  0.0388,  0.3379], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2617,  0.0471, -0.1104,  0.0226, -0.1504], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1494,  0.2012, -0.8789,  0.0170, -0.4473], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4863,  0.2373, -1.1250,  1.7344,  0.0359], device='cuda:0',
   

 65%|██████▌   | 45/69 [10:19<05:32, 13.86s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3535,  0.2148, -0.4121, -0.1494,  0.2969], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2383,  0.0195, -0.1572, -0.0693, -0.1992], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0430,  0.0601, -1.0703,  0.0405, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4102, -0.3301, -1.0938,  1.2031,  0.1318], device='cuda:0',
   

 67%|██████▋   | 46/69 [10:33<05:18, 13.83s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3711, -0.1602, -0.4629, -0.2520,  0.0811], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3945, -0.0869,  0.0236, -0.3262, -0.2500], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1299,  0.1011, -1.0391, -0.4922, -0.4902], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.7266, -0.5938, -1.1797,  0.8789, -0.3340], device='cuda:0',
   

 68%|██████▊   | 47/69 [10:47<05:02, 13.74s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1465,  0.0109, -0.1611, -0.1553,  0.0130], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3340,  0.0903,  0.0229, -0.1157, -0.1416], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1650,  0.1631, -0.8594, -0.0806, -0.7891], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3340,  0.8945, -0.5156,  0.6133,  0.0282], device='cuda:0',
   

 70%|██████▉   | 48/69 [11:00<04:44, 13.57s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1758,  0.1816, -0.3496, -0.1514,  0.0796], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3496,  0.0957, -0.0276, -0.0576,  0.0464], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1924,  0.1436, -0.7383,  0.0498, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4512,  0.2969, -0.7422,  0.9297, -0.2480], device='cuda:0',
   

 71%|███████   | 49/69 [11:14<04:33, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1338,  0.2412, -0.4355, -0.2559, -0.0513], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.2656,  0.1621, -0.1118, -0.0967], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1807,  0.2207, -0.6484, -0.1318, -0.5664], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3945,  0.5195, -0.8555, -0.2119, -0.0742], device='cuda:0',
   

 72%|███████▏  | 50/69 [11:27<04:18, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0371,  0.1650, -0.0623, -0.3750,  0.1396], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3125,  0.0544, -0.1406, -0.3164,  0.0942], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2285,  0.1924, -0.7188, -0.3574, -0.5742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4570,  0.5352, -1.0156,  0.1123,  0.6250], device='cuda:0',
   

 74%|███████▍  | 51/69 [11:41<04:05, 13.62s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1226,  0.2412, -0.1050, -0.2354,  0.1533], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3691,  0.0317, -0.0576, -0.1177,  0.1797], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0977,  0.1187, -0.7812, -0.0796, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0278,  0.9102, -1.2734,  0.6484,  0.3379], device='cuda:0',
   

 75%|███████▌  | 52/69 [11:54<03:51, 13.61s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.2148, -0.0947, -0.2090,  0.0693], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2285,  0.0085,  0.2656, -0.1465, -0.0415], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0879,  0.1240, -0.8633, -0.1201, -0.6758], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3594,  0.0271, -0.9766,  0.6719,  0.2002], device='cuda:0',
   

 77%|███████▋  | 53/69 [12:08<03:39, 13.69s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3702, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2520,  0.1455, -0.3828, -0.1914, -0.0583], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3702, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2520,  0.1328,  0.1318, -0.1807, -0.1797], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3702, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0938,  0.1211, -0.7148, -0.2773, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3702, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6836,  0.0349, -0.8672,  0.3008, -0.2031], device='cuda:0',
   

 78%|███████▊  | 54/69 [12:22<03:24, 13.65s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4160,  0.2451, -0.1157, -0.2236,  0.2520], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3223,  0.0820,  0.0977, -0.1729, -0.0518], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0830,  0.1846, -0.8398, -0.2256, -0.4785], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5703,  1.1328, -1.0938,  0.4688,  0.3477], device='cuda:0',
   

 80%|███████▉  | 55/69 [12:36<03:12, 13.72s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3496,  0.3223, -0.1689, -0.1670,  0.2637], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2676,  0.0291, -0.0757, -0.1445, -0.0957], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0371,  0.1235, -0.8477, -0.1953, -0.5703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5352,  0.5312, -1.3281,  1.1016, -0.1738], device='cuda:0',
   

 81%|████████  | 56/69 [12:50<02:59, 13.78s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2539,  0.3594,  0.0104, -0.2246,  0.1855], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2275,  0.0608,  0.2334, -0.1504, -0.0383], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1270,  0.1758, -0.8750, -0.1777, -0.4922], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6562,  0.4727, -1.2344,  1.0312,  0.1992], device='cuda:0',
   

 83%|████████▎ | 57/69 [13:03<02:44, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3652,  0.2715, -0.3652, -0.3848, -0.0659], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.1021,  0.1660, -0.2539, -0.0391], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1709,  0.1777, -0.6875, -0.2871, -0.4980], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2695,  0.4570, -1.2344,  0.0153,  0.5078], device='cuda:0',
   

 84%|████████▍ | 58/69 [13:17<02:30, 13.69s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3750,  0.2314, -0.2129, -0.3984, -0.1143], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3047,  0.1196,  0.0212, -0.2754, -0.0579], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2100,  0.1484, -0.5859, -0.3301, -0.5000], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3828,  0.8125, -0.8789, -0.2656,  0.4727], device='cuda:0',
   

 86%|████████▌ | 59/69 [13:30<02:16, 13.62s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1328,  0.1953,  0.0654, -0.2432,  0.2852], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.1660, -0.0272, -0.1396,  0.0330], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1436,  0.2314, -0.7812, -0.0679, -0.6172], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5234,  1.0312, -0.5078,  1.2969,  0.1758], device='cuda:0',
   

 87%|████████▋ | 60/69 [13:44<02:02, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3164,  0.2246, -0.4473, -0.2910, -0.1069], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3223,  0.1377,  0.2734, -0.2637, -0.1318], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1279,  0.1660, -0.7969, -0.3145, -0.5312], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.3262, -0.9062, -0.4766,  0.1631], device='cuda:0',
   

 88%|████████▊ | 61/69 [13:58<01:49, 13.65s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5078,  0.1260, -0.3125, -0.1826,  0.3145], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3438,  0.0493, -0.0080, -0.1436, -0.1309], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2500,  0.1128, -0.7852, -0.2852, -0.5703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7852,  0.5039, -0.4023, -0.0070, -0.0058], device='cuda:0',
   

 90%|████████▉ | 62/69 [14:11<01:35, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3926,  0.2969, -0.1226, -0.2051,  0.1602], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2090, -0.0073,  0.0081, -0.1309, -0.1953], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0537,  0.1621, -0.8711, -0.1914, -0.5273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7734,  0.2266, -1.3203,  0.7773,  0.4004], device='cuda:0',
   

 91%|█████████▏| 63/69 [14:25<01:21, 13.62s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3027,  0.2285,  0.0913, -0.1885,  0.0962], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2344,  0.0928, -0.0576, -0.1592, -0.0938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0496,  0.2275, -0.8906, -0.1982, -0.5625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6211,  0.3418, -1.0781,  0.4219,  0.7891], device='cuda:0',
   

 93%|█████████▎| 64/69 [14:39<01:08, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0342,  0.0386, -0.1660, -0.2061,  0.0562], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1621,  0.4102, -0.1030, -0.0476, -0.0596], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3340,  0.2910, -0.7656,  0.0049, -0.6445], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4453,  0.8203, -1.1484,  1.2656, -0.6953], device='cuda:0',
   

 94%|█████████▍| 65/69 [14:52<00:54, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1069,  0.1934, -0.1914, -0.1348,  0.0889], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2363,  0.1045,  0.0211, -0.1089, -0.1099], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1582,  0.0864, -0.7617, -0.0430, -0.6758], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6445,  0.4590, -0.0869,  1.2109, -0.3047], device='cuda:0',
   

 96%|█████████▌| 66/69 [15:06<00:41, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1050,  0.2080, -0.3633, -0.1118,  0.0869], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3633,  0.2324, -0.2266, -0.0583,  0.0820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4219,  0.2637, -0.7031,  0.0199, -0.6914], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3867,  0.4531, -1.0938,  1.1172, -0.3164], device='cuda:0',
   

 97%|█████████▋| 67/69 [15:19<00:27, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0352,  0.1826, -0.3184, -0.0625,  0.0815], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2090,  0.2090, -0.2334, -0.0566, -0.0143], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2100,  0.2002, -0.6953, -0.0176, -0.5039], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0601,  0.7578, -1.3906,  1.1484, -0.0522], device='cuda:0',
   

 99%|█████████▊| 68/69 [15:25<00:13, 13.61s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 1815, 3584]), Averaged feature shape: torch.Size([7, 3584]), Sample: tensor([ 0.0464,  0.1309, -0.0894, -0.1875, -0.1377], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 1815, 3584]), Averaged feature shape: torch.Size([7, 3584]), Sample: tensor([-0.1885,  0.3223, -0.2773, -0.1592, -0.0219], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 1815, 3584]), Averaged feature shape: torch.Size([7, 3584]), Sample: tensor([-0.3848,  0.1592, -0.6797, -0.0510, -0.6445], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 1815, 3584]), Averaged feature shape: torch.Size([7, 3584]), Sample: tensor([-0.3301,  0.7617, -0.5898,  1.3906, -0.3301], device='cuda:0',
       




Total number of frames in the video: 21288.0
Original Resolution: (720.0, 480.0)
FPS: 29.968454258675077
Duration (seconds): 710.3469473684211
Target Resolution: (224, 224)
Read 21288 frames.
Frames shape: torch.Size([21288, 3, 224, 224])


  torchaudio.set_audio_backend("ffmpeg")


Total duration: 710.35 seconds
Number of intervals: 476
Sample rate: 48000
Output file: /kaggle/working/friends/s1/friends_s01e15a.h5
Num splits: 35


  1%|▏         | 1/69 [00:12<14:35, 12.88s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1191, -0.0571,  0.0569, -0.6211, -0.3887], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773, -0.2930, -0.0059, -0.2100, -0.0250], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0364,  0.0579, -0.5000, -0.1943, -0.6328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 1.2734, -0.1211, -1.5234,  0.1328, -1.4453], device='cuda:0',
   

  3%|▎         | 2/69 [00:25<14:21, 12.85s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2139,  0.1196, -0.2441, -0.2539,  0.1196], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1826,  0.2773, -0.3535, -0.0762,  0.1113], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0869,  0.1895, -0.8789,  0.0574, -0.4961], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1826,  0.6680, -0.5820,  1.8047, -0.2910], device='cuda:0',
   

  4%|▍         | 3/69 [00:38<14:20, 13.04s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2178,  0.1904, -0.1514, -0.1328,  0.0977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.3262, -0.3613, -0.0728,  0.0874], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1953,  0.2617, -0.8906, -0.0124, -0.4238], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3672,  0.8516, -0.9258,  1.8672,  0.2383], device='cuda:0',
   

  6%|▌         | 4/69 [00:52<14:14, 13.14s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0908, -0.0466,  0.0498, -0.6445, -0.2480], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3262, -0.2109,  0.0352, -0.1748, -0.0026], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0248,  0.0227, -0.5938, -0.1553, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 1.2734,  0.2080, -1.2969,  0.3672, -1.3203], device='cuda:0',
   

  7%|▋         | 5/69 [01:05<14:02, 13.17s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184, -0.0354,  0.0157, -0.1621,  0.1875], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1196,  0.2041,  0.0486,  0.0145, -0.0530], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2197,  0.1865, -0.7930, -0.0708, -0.4219], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.3125,  0.7656, -0.4473,  0.8633,  0.1992], device='cuda:0',
   

  9%|▊         | 6/69 [01:19<14:01, 13.35s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0359,  0.1396,  0.3203, -0.1025,  0.0762], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1895,  0.0708,  0.1865, -0.1455,  0.0254], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0076,  0.1475, -0.9375, -0.0265, -0.5430], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4375,  0.4277, -0.2734,  1.6406,  0.0216], device='cuda:0',
   

 10%|█         | 7/69 [01:32<13:53, 13.44s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4219,  0.0659,  0.0010, -0.1162,  0.1738], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-2.7344e-01,  2.2461e-01, -1.2695e-01,  2.2316e-04, -8.7402e-02],
       device='cuda:1', dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1748,  0.2178, -0.7109, -0.0422, -0.4668], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2354,  0.3770, -0.7539,  0.8867, -0.0762], 

 12%|█▏        | 8/69 [01:46<13:41, 13.46s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3730, -0.0483, -0.3906, -0.4551,  0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3750, -0.0024,  0.0125, -0.2910, -0.1416], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1260, -0.0503, -0.9023, -0.2949, -0.4668], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1631, -0.4727, -1.1719,  0.1904, -0.9336], device='cuda:0',
   

 13%|█▎        | 9/69 [01:59<13:28, 13.48s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1348,  0.2383, -0.2891, -0.1182,  0.0884], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2109,  0.3477, -0.1865, -0.0723,  0.0498], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1118,  0.3398, -0.9102,  0.0189, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4297,  0.8711, -0.5391,  2.5000, -0.4414], device='cuda:0',
   

 14%|█▍        | 10/69 [02:13<13:18, 13.53s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1992,  0.3223, -0.1328, -0.1118,  0.3242], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3164,  0.1079, -0.2002, -0.0879,  0.1787], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1816,  0.1348, -0.7500, -0.0786, -0.4336], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0187,  0.4062, -0.6250,  1.0547, -0.0442], device='cuda:0',
   

 16%|█▌        | 11/69 [02:27<13:15, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.1455, -0.3496, -0.0027,  0.2236], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.1533, -0.2393,  0.0540, -0.0569], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0608,  0.2168, -1.1328,  0.1177, -0.6641], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1001,  0.6406, -1.1328,  2.8594, -0.2256], device='cuda:0',
   

 17%|█▋        | 12/69 [02:41<13:05, 13.78s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.2334, -0.0757, -0.1602,  0.2617], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2236,  0.0669, -0.1367, -0.0243,  0.0664], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0859,  0.1104, -0.6328,  0.0077, -0.5117], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1377,  0.2539, -0.7305,  1.3281, -0.0688], device='cuda:0',
   

 19%|█▉        | 13/69 [02:55<12:49, 13.74s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.0077, -0.0815, -0.0496,  0.2266], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.0815, -0.2520,  0.0254, -0.1138], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0962,  0.1074, -1.0547,  0.1201, -0.6875], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2695,  0.5391, -0.7578,  2.8750, -0.0674], device='cuda:0',
   

 20%|██        | 14/69 [03:08<12:30, 13.65s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1758,  0.1328, -0.3027, -0.1172,  0.0503], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2217,  0.2412, -0.1289, -0.0030,  0.1172], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1455,  0.2002, -0.8359,  0.1758, -0.3555], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6641,  1.0703, -1.2969,  2.6094, -0.0129], device='cuda:0',
   

 22%|██▏       | 15/69 [03:21<12:11, 13.55s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0544,  0.1074, -0.0894, -0.0613,  0.3066], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.1953, -0.1592, -0.0732,  0.1777], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0017,  0.2432, -0.8398, -0.1128, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2100,  0.4688, -0.8711,  0.8164,  0.0259], device='cuda:0',
   

 23%|██▎       | 16/69 [03:35<11:55, 13.50s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1426,  0.2021, -0.3672, -0.1357,  0.1045], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2031,  0.1167, -0.1011, -0.0601,  0.0500], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1147,  0.0703, -0.9375,  0.0286, -0.6406], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.9258,  0.3926, -0.7852,  1.8203, -0.2520], device='cuda:0',
   

 25%|██▍       | 17/69 [03:48<11:38, 13.44s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2471,  0.0500, -0.3086, -0.1069,  0.3301], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3301,  0.2061, -0.2559,  0.0164, -0.0742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1748,  0.3105, -0.9805,  0.0718, -0.6367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1719,  1.0859, -0.6836,  2.5312,  0.1035], device='cuda:0',
   

 26%|██▌       | 18/69 [04:02<11:27, 13.47s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0466,  0.1904, -0.1689, -0.1069,  0.1699], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2158,  0.1270, -0.0148, -0.0698,  0.1182], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1338,  0.0869, -0.8867, -0.0082, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8320,  0.2754, -0.8750,  1.6719,  0.0493], device='cuda:0',
   

 28%|██▊       | 19/69 [04:15<11:15, 13.51s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1689,  0.0596, -0.2695, -0.0427,  0.3164], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3320,  0.2871, -0.3262,  0.0089, -0.0137], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1641,  0.2949, -0.9062,  0.0630, -0.5898], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1680,  0.6641, -0.6875,  2.6875,  0.2969], device='cuda:0',
   

 29%|██▉       | 20/69 [04:29<11:03, 13.53s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0618,  0.1641, -0.1631, -0.1904,  0.2373], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2480,  0.1475, -0.1572, -0.1328,  0.1270], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1445,  0.2451, -0.9375, -0.0791, -0.5703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8633,  0.4180, -0.6562,  1.5000,  0.3184], device='cuda:0',
   

 30%|███       | 21/69 [04:43<10:52, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.2295, -0.3008, -0.2441,  0.2031], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2490,  0.1729, -0.0029, -0.1914, -0.0801], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1562,  0.2100, -0.9766, -0.0206, -0.7070], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2793, -0.1719, -1.5078,  1.4609, -0.4277], device='cuda:0',
   

 32%|███▏      | 22/69 [04:56<10:40, 13.62s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0461, -0.0527, -0.1592, -0.1504,  0.0854], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2393,  0.3320, -0.0219, -0.0603,  0.0005], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1807,  0.3203, -0.8125,  0.0381, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5859,  1.0781, -0.8711,  0.8477, -0.0591], device='cuda:0',
   

 33%|███▎      | 23/69 [05:10<10:27, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1562,  0.0630, -0.1426, -0.0952,  0.2578], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3105,  0.0757,  0.1377, -0.0718,  0.0415], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2168,  0.1406, -0.8711,  0.0140, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5977,  0.7227, -0.2891,  1.1172, -0.1943], device='cuda:0',
   

 35%|███▍      | 24/69 [05:23<10:09, 13.55s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1934,  0.0889, -0.1934, -0.0171,  0.2012], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.2109, -0.1152, -0.0593,  0.0771], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2021,  0.2363, -0.8906, -0.0079, -0.6484], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3711,  0.6953, -0.9727,  0.9453, -0.1729], device='cuda:0',
   

 36%|███▌      | 25/69 [05:36<09:50, 13.41s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0332,  0.1309, -0.1279, -0.1758,  0.0361], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2002,  0.3457, -0.0820, -0.0596,  0.0245], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1128,  0.2422, -0.6836, -0.0432, -0.5586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0262,  1.0391, -0.7344,  0.9336,  0.1553], device='cuda:0',
   

 38%|███▊      | 26/69 [05:50<09:36, 13.42s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2227,  0.2910, -0.1396, -0.1592,  0.2734], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637, -0.0145, -0.2988, -0.0645, -0.0864], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1875,  0.0664, -0.8398, -0.0310, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0133,  0.4297, -0.8594,  1.2344, -0.3242], device='cuda:0',
   

 39%|███▉      | 27/69 [06:04<09:30, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3729, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1641,  0.1807, -0.1309, -0.1602,  0.2285], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3729, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3496,  0.0082, -0.2656, -0.0874,  0.0137], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3729, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1953,  0.1089, -0.7500, -0.0713, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3729, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1934,  0.7031, -0.0747,  1.0000, -0.3242], device='cuda:0',
   

 41%|████      | 28/69 [06:18<09:19, 13.66s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3714, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1377,  0.1060, -0.1748, -0.2656,  0.0098], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3714, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-3.1836e-01,  2.1582e-01,  1.1492e-04, -1.5723e-01,  1.3794e-02],
       device='cuda:1', dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3714, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.2451, -0.7344, -0.1021, -0.6680], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3714, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3730,  0.5625, -1.3906,  1.0859, -0.9453], 

 42%|████▏     | 29/69 [06:31<09:07, 13.68s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0006,  0.0193,  0.0781,  0.0151,  0.0957], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3848,  0.1475, -0.2344,  0.0378,  0.0106], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2158,  0.2012, -0.8398,  0.0095, -0.7266], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754, -0.3047, -0.8867,  1.3594, -0.2432], device='cuda:0',
   

 43%|████▎     | 30/69 [06:45<08:51, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1465,  0.0903, -0.1680, -0.0645,  0.0254], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.1689, -0.1279, -0.0859, -0.0018], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.1226, -0.8828, -0.0530, -0.6055], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0889,  0.4512, -0.9844,  1.1250, -0.2969], device='cuda:0',
   

 45%|████▍     | 31/69 [06:59<08:39, 13.68s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0605,  0.3340, -0.1377, -0.1953,  0.1836], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.2422, -0.0825, -0.1104,  0.1143], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3457,  0.1035, -0.8984, -0.1138, -0.6328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1602,  0.5039, -0.8672,  1.3672, -0.2520], device='cuda:0',
   

 46%|████▋     | 32/69 [07:12<08:27, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0518,  0.1406, -0.2617, -0.2275,  0.2832], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910,  0.2520, -0.1230, -0.1943,  0.0635], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1279,  0.2539, -0.8828, -0.1270, -0.6914], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4082,  1.0859, -0.8555,  1.0000,  0.4258], device='cuda:0',
   

 48%|████▊     | 33/69 [07:26<08:14, 13.75s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1079,  0.2012, -0.2090, -0.0986,  0.1147], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.1504, -0.2109, -0.0393,  0.0139], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.1328, -0.7031, -0.0232, -0.6367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4258,  0.5625, -1.4219,  0.7148, -0.6406], device='cuda:0',
   

 49%|████▉     | 34/69 [07:40<07:59, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0884, -0.0053, -0.2539, -0.1611,  0.2314], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.1191, -0.0776, -0.0854,  0.0267], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2461,  0.1562, -0.8984, -0.0295, -0.6484], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6797,  0.5273, -0.5859,  0.9375, -0.0928], device='cuda:0',
   

 51%|█████     | 35/69 [07:53<07:43, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1128, -0.0208, -0.2617, -0.1250,  0.1797], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3730,  0.2930, -0.1445, -0.0442, -0.0322], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2949,  0.3633, -0.7891, -0.0874, -0.6953], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1289,  0.7812, -0.3828,  1.3125, -0.2139], device='cuda:0',
   

 52%|█████▏    | 36/69 [08:07<07:32, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.0613, -0.0410, -0.1621,  0.2393], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2090,  0.1406, -0.1699, -0.1094, -0.0835], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1934,  0.1963, -0.8789, -0.1533, -0.6680], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7148,  0.7227, -0.9219,  0.9648, -0.1963], device='cuda:0',
   

 54%|█████▎    | 37/69 [08:21<07:18, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1777,  0.1416, -0.2061, -0.2197,  0.1699], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2275,  0.0884, -0.2021, -0.0859, -0.0030], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0986,  0.1689, -0.8281, -0.1680, -0.7031], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0442,  0.5156, -1.3594,  0.7773,  0.4766], device='cuda:0',
   

 55%|█████▌    | 38/69 [08:35<07:03, 13.66s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.3867,  0.0044, -0.1021,  0.2148], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.1689, -0.2402, -0.1240,  0.0591], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0405,  0.3906, -0.9102, -0.0688, -0.5312], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2451,  1.2656, -0.3203,  1.3594,  0.1768], device='cuda:0',
   

 57%|█████▋    | 39/69 [08:48<06:51, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2158,  0.2656, -0.2227, -0.0544,  0.2188], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3867,  0.1895, -0.4180, -0.1260, -0.0172], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.3262, -1.1016, -0.1025, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0854,  0.4102, -0.5312,  0.8672, -0.2637], device='cuda:0',
   

 58%|█████▊    | 40/69 [09:02<06:39, 13.77s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0752,  0.0835, -0.3379, -0.1406,  0.1113], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3262,  0.3438, -0.0030, -0.0270,  0.0273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3047,  0.3906, -0.8633, -0.0469, -0.4980], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1641,  0.5078, -0.7539,  1.5547, -0.7734], device='cuda:0',
   

 59%|█████▉    | 41/69 [09:16<06:25, 13.76s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1221,  0.3398, -0.0654, -0.1494,  0.0981], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3477,  0.2158, -0.1885, -0.1777,  0.1455], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2676,  0.2051, -0.7930, -0.1357, -0.5703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.5664, -0.1270,  1.5625, -0.4297], device='cuda:0',
   

 61%|██████    | 42/69 [09:30<06:10, 13.72s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1621,  0.2168, -0.4258, -0.2197,  0.2617], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3438,  0.2676, -0.0889, -0.1084,  0.1543], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2266,  0.2988, -0.7852, -0.0693, -0.5547], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0488,  0.5234, -0.5703,  1.6094, -0.3809], device='cuda:0',
   

 62%|██████▏   | 43/69 [09:43<05:57, 13.76s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1494,  0.2471, -0.1953, -0.2139,  0.1543], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2139,  0.0762, -0.0469, -0.0713, -0.0344], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1826,  0.1641, -0.7812, -0.0564, -0.3945], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8359, -0.3613, -0.8711,  0.9219, -0.1133], device='cuda:0',
   

 64%|██████▍   | 44/69 [09:57<05:43, 13.75s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0491,  0.0874, -0.2363, -0.2373,  0.1182], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3535,  0.3574,  0.2119, -0.1206,  0.0238], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2334,  0.2656, -0.7891, -0.1025, -0.5586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4805,  0.2578, -0.8906,  1.5781, -0.6367], device='cuda:0',
   

 65%|██████▌   | 45/69 [10:11<05:30, 13.77s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.2832, -0.1318, -0.3262,  0.1050], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1436, -0.0228, -0.1611,  0.0752], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0796,  0.0977, -0.8555, -0.1367, -0.5742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1069, -0.0947, -0.6758,  0.5977, -0.0317], device='cuda:0',
   

 67%|██████▋   | 46/69 [10:25<05:17, 13.81s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3164,  0.3145, -0.2344, -0.1494,  0.0942], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1729,  0.1279, -0.0272, -0.1279,  0.0415], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1143,  0.1465, -0.7695, -0.1592, -0.4824], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3633,  0.8008, -0.8789,  1.1328, -0.3926], device='cuda:0',
   

 68%|██████▊   | 47/69 [10:39<05:03, 13.80s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3281,  0.2695, -0.1953, -0.2832,  0.0962], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754,  0.1963,  0.0283, -0.1836,  0.0674], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1191,  0.1475, -0.8555, -0.1621, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.4492, -0.7812,  1.0078, -0.4961], device='cuda:0',
   

 70%|██████▉   | 48/69 [10:52<04:49, 13.80s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.2305, -0.1396, -0.2754,  0.0957], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.1914, -0.0825, -0.1846,  0.0342], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1299,  0.1279, -0.8242, -0.2080, -0.6094], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2100,  0.1670, -0.6992,  1.0859, -0.3887], device='cuda:0',
   

 71%|███████   | 49/69 [11:06<04:33, 13.70s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0015,  0.3164, -0.2500, -0.1396,  0.2578], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3301,  0.2119, -0.0505, -0.0708,  0.1260], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3867,  0.2363, -0.8516, -0.0145, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0693,  0.3984, -0.5312,  0.9609, -0.3438], device='cuda:0',
   

 72%|███████▏  | 50/69 [11:20<04:19, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2295,  0.2461, -0.2109, -0.0192,  0.1348], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3457,  0.2500, -0.0801, -0.0645,  0.0610], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3730,  0.2432, -0.8008, -0.0483, -0.5000], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2021,  0.5664, -0.2441,  1.2734,  0.3145], device='cuda:0',
   

 74%|███████▍  | 51/69 [11:33<04:05, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0179,  0.0342, -0.0344, -0.1572,  0.0510], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.0518, -0.0361, -0.0776, -0.1367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572,  0.1260, -0.9102, -0.0854, -0.4336], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2236,  0.5352, -0.8398,  0.7148,  0.2275], device='cuda:0',
   

 75%|███████▌  | 52/69 [11:47<03:50, 13.57s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2080,  0.2305, -0.0172, -0.1069,  0.3984], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.1025, -0.1147, -0.1211,  0.0059], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1230,  0.2168, -0.9023, -0.1562, -0.5273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3164,  0.6953, -0.3691,  1.3281,  0.0371], device='cuda:0',
   

 77%|███████▋  | 53/69 [12:00<03:36, 13.55s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2100,  0.1816,  0.0310, -0.1523,  0.2500], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2988,  0.2070,  0.1172, -0.1377, -0.0520], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1895,  0.2021, -0.9453, -0.0938, -0.6562], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2422,  1.1250, -0.3008,  1.8750, -0.1846], device='cuda:0',
   

 78%|███████▊  | 54/69 [12:14<03:23, 13.54s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1270,  0.1074, -0.2471, -0.2334,  0.0776], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.3145,  0.1455, -0.1201, -0.0260], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2695,  0.1904, -0.8086, -0.0510, -0.6484], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7227,  0.0649, -0.7734,  1.2109, -0.5234], device='cuda:0',
   

 80%|███████▉  | 55/69 [12:27<03:10, 13.60s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2129,  0.1201, -0.2500, -0.1436,  0.1885], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2441,  0.3086, -0.0253, -0.0732, -0.0352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2891,  0.2129, -0.7812, -0.0806, -0.6445], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8125,  0.2969, -0.7227,  1.3516, -0.2910], device='cuda:0',
   

 81%|████████  | 56/69 [12:41<02:57, 13.66s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4258,  0.0571, -0.2812, -0.1631,  0.1865], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.0645, -0.2617, -0.0255,  0.0530], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2471,  0.3281, -0.6172, -0.0265, -0.5273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0156,  1.3125, -0.5469,  1.2031, -0.5859], device='cuda:0',
   

 83%|████████▎ | 57/69 [12:54<02:42, 13.51s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3750,  0.2734, -0.2520, -0.1260,  0.1611], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2266,  0.1689, -0.1235, -0.1035,  0.1187], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0903,  0.1963, -0.8555, -0.0359, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1196,  1.4219, -0.7266,  1.6328, -0.2852], device='cuda:0',
   

 84%|████████▍ | 58/69 [13:08<02:29, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3984,  0.1631, -0.1729, -0.1924,  0.0588], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.1602, -0.0072, -0.1318,  0.1060], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1377,  0.3242, -0.7734, -0.0510, -0.5664], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2451,  1.7734, -0.5781,  1.2578, -0.0635], device='cuda:0',
   

 86%|████████▌ | 59/69 [13:22<02:16, 13.60s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3613,  0.2236, -0.1455, -0.1299,  0.0996], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1592, -0.1196, -0.0374,  0.0874], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0659,  0.2461, -0.7578,  0.0562, -0.5742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1475,  1.3281, -0.7891,  2.0000, -0.1846], device='cuda:0',
   

 87%|████████▋ | 60/69 [13:35<02:01, 13.51s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3555,  0.1875, -0.1406, -0.2217,  0.0649], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.1826, -0.1670, -0.1357,  0.0742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1050,  0.2500, -0.7539, -0.0947, -0.6016], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0503,  1.6328, -0.3262,  1.7109, -0.1147], device='cuda:0',
   

 88%|████████▊ | 61/69 [13:49<01:48, 13.62s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3574, -0.1309, -0.7109, -0.1719,  0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734, -0.0583,  0.1138, -0.1621,  0.0045], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2061, -0.0962, -0.9023, -0.1074, -0.2910], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0566, -0.4785, -1.8203,  1.1641, -0.4434], device='cuda:0',
   

 90%|████████▉ | 62/69 [14:02<01:35, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3702, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0425,  0.0554, -0.2891, -0.1157,  0.1504], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3702, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2676,  0.2119,  0.0630, -0.0359,  0.0198], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3702, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3574,  0.2520, -0.6602, -0.0488, -0.5586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3702, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1943,  0.2910, -1.0312,  0.1836, -0.1602], device='cuda:0',
   

 91%|█████████▏| 63/69 [14:16<01:21, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1992,  0.0864, -0.1211, -0.2715,  0.0957], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2373,  0.0938, -0.0376, -0.1973, -0.0204], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1543,  0.2656, -0.7852, -0.2988, -0.4941], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4551,  0.4707, -0.7500,  0.0170,  0.3145], device='cuda:0',
   

 93%|█████████▎| 64/69 [14:30<01:08, 13.61s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2295,  0.1621, -0.1348, -0.1709,  0.2715], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3203,  0.0559,  0.0012, -0.1079,  0.0187], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2119,  0.1768, -0.7578, -0.1631, -0.4336], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6875,  0.5312, -1.5391,  0.3496, -0.1025], device='cuda:0',
   

 94%|█████████▍| 65/69 [14:43<00:54, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2373,  0.0918, -0.0674, -0.2734,  0.0840], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572,  0.1001,  0.1797, -0.1299, -0.0439], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0569,  0.2051, -0.7812, -0.2637, -0.5664], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3965,  1.1094, -0.7344,  0.0703,  0.4531], device='cuda:0',
   

 96%|█████████▌| 66/69 [14:57<00:40, 13.61s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1709,  0.0928, -0.0378, -0.2500,  0.1523], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1357,  0.0364,  0.0947, -0.1216, -0.0566], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0894,  0.1416, -0.8633, -0.2080, -0.5625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4727,  1.0781, -0.8242,  0.1572,  0.4395], device='cuda:0',
   

 97%|█████████▋| 67/69 [15:10<00:27, 13.53s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2363,  0.1426,  0.0603, -0.2139,  0.1064], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1934,  0.1191, -0.1025, -0.1299, -0.1079], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0752,  0.1592, -0.8125, -0.2266, -0.5742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3672,  0.7148, -1.0547,  0.0869,  0.4551], device='cuda:0',
   

 99%|█████████▊| 68/69 [15:16<00:13, 13.47s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 1826, 3584]), Averaged feature shape: torch.Size([7, 3584]), Sample: tensor([-0.1680,  0.2363, -0.0903, -0.1309,  0.2178], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 1826, 3584]), Averaged feature shape: torch.Size([7, 3584]), Sample: tensor([-0.2754,  0.0864,  0.0159, -0.0864,  0.1270], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 1826, 3584]), Averaged feature shape: torch.Size([7, 3584]), Sample: tensor([-0.2354,  0.0405, -0.7617, -0.0483, -0.5352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 1826, 3584]), Averaged feature shape: torch.Size([7, 3584]), Sample: tensor([-0.2852, -0.2217, -1.7656,  0.4062, -0.4004], device='cuda:0',
       




Total number of frames in the video: 21222.0
Original Resolution: (720.0, 480.0)
FPS: 29.968454258675077
Duration (seconds): 708.1446315789474
Target Resolution: (224, 224)
Read 21222 frames.
Frames shape: torch.Size([21222, 3, 224, 224])


  torchaudio.set_audio_backend("ffmpeg")


Total duration: 708.14 seconds
Number of intervals: 475
Sample rate: 48000
Output file: /kaggle/working/friends/s1/friends_s01e08a.h5
Num splits: 35


  1%|▏         | 1/69 [00:13<14:53, 13.14s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1221, -0.0732,  0.0447, -0.6094, -0.4023], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832, -0.3184, -0.0435, -0.2031, -0.0300], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0287,  0.0267, -0.5312, -0.1865, -0.6211], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 1.4219, -0.4004, -1.5156,  0.0928, -1.5078], device='cuda:0',
   

  3%|▎         | 2/69 [00:26<14:36, 13.09s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3887,  0.0537, -0.1025, -0.1689,  0.2002], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2891,  0.2012,  0.0344, -0.0089,  0.1147], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3242,  0.3320, -0.8086,  0.0654, -0.4746], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3477,  0.6797, -1.1250,  0.8320, -0.6055], device='cuda:0',
   

  4%|▍         | 3/69 [00:39<14:25, 13.12s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.1553,  0.0835, -0.2715,  0.1904], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3262,  0.1338, -0.0752, -0.1338,  0.1152], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1777,  0.3105, -0.8164, -0.1309, -0.4258], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1963,  0.2275, -0.7422,  0.4375, -0.1514], device='cuda:0',
   

  6%|▌         | 4/69 [00:52<14:20, 13.23s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1689,  0.0464,  0.0400, -0.1113,  0.1377], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.1768, -0.2012, -0.0505, -0.0311], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2617,  0.0498, -0.8125, -0.0378, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.0154, -0.3535,  0.8945,  0.1396], device='cuda:0',
   

  7%|▋         | 5/69 [01:05<14:06, 13.22s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2676,  0.0815,  0.0339, -0.1572, -0.0845], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2383,  0.1504, -0.0410, -0.0417, -0.1602], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1084,  0.1719, -0.7969, -0.0518, -0.6094], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2305,  0.6680, -0.5742,  1.4453, -0.0981], device='cuda:0',
   

  9%|▊         | 6/69 [01:19<13:55, 13.27s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1245, -0.0513,  0.0576, -0.6172, -0.3984], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637, -0.2871, -0.0054, -0.2002, -0.0315], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0189,  0.0356, -0.5430, -0.1855, -0.6602], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 1.3750, -0.4277, -1.6406,  0.1182, -1.6328], device='cuda:0',
   

 10%|█         | 7/69 [01:32<13:50, 13.40s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0198,  0.2217, -0.2061, -0.1416, -0.0791], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2100,  0.2520, -0.1904, -0.1504,  0.1836], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2441,  0.2344, -0.7188, -0.0374, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2490,  0.8516, -0.6914,  1.1641,  0.5547], device='cuda:0',
   

 12%|█▏        | 8/69 [01:46<13:39, 13.43s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0114,  0.0635, -0.1992, -0.1338,  0.1953], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3770,  0.1807,  0.0322, -0.0544,  0.1631], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2500,  0.1631, -0.8242,  0.0471, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2949,  0.1079, -1.3906,  0.9414,  0.1050], device='cuda:0',
   

 13%|█▎        | 9/69 [02:00<13:32, 13.55s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0115,  0.1147, -0.3438, -0.1689, -0.0444], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2676,  0.1992, -0.1357, -0.1689,  0.1572], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.2129, -0.7344, -0.0588, -0.6328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1895,  1.1016, -1.0391,  1.4844, -0.0913], device='cuda:0',
   

 14%|█▍        | 10/69 [02:13<13:16, 13.51s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2354,  0.0410, -0.0393, -0.0737,  0.2520], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3340,  0.1289,  0.1670, -0.0728,  0.0835], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2490,  0.1553, -0.7852, -0.0493, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5078,  0.1943, -1.4062,  0.7969, -0.2891], device='cuda:0',
   

 16%|█▌        | 11/69 [02:27<13:00, 13.45s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3594, -0.0029, -0.1377, -0.0884,  0.1396], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3418,  0.1279,  0.1455, -0.0767,  0.0864], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2539,  0.0908, -0.8242, -0.0757, -0.5078], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7031,  0.1094, -1.4531,  0.4531, -0.1572], device='cuda:0',
   

 17%|█▋        | 12/69 [02:41<13:00, 13.69s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0781,  0.0879, -0.1660, -0.1836,  0.0869], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3945,  0.2598,  0.0518, -0.1187,  0.1689], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2197,  0.2598, -0.7422, -0.1104, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6367,  1.2188, -0.6719,  1.2031,  0.2314], device='cuda:0',
   

 19%|█▉        | 13/69 [02:55<12:54, 13.83s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1543, -0.0028, -0.1128, -0.1738,  0.1738], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3398,  0.2129, -0.0181, -0.1104,  0.0654], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3379,  0.0776, -0.8359, -0.0496, -0.5898], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4473, -0.1006, -1.3750,  0.8672, -0.4570], device='cuda:0',
   

 20%|██        | 14/69 [03:09<12:44, 13.89s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1016,  0.0859, -0.2139, -0.1562,  0.1709], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3086,  0.2754,  0.0566, -0.0859,  0.1030], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1514,  0.2236, -0.7852, -0.0295, -0.5742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4531,  1.7031, -0.6328,  1.1484, -0.1729], device='cuda:0',
   

 22%|██▏       | 15/69 [03:23<12:26, 13.83s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.1260, -0.0055, -0.1963,  0.0801], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1758,  0.1592, -0.1699, -0.1279, -0.1572], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1074,  0.1602, -0.7539, -0.1953, -0.5391], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1328,  0.0952, -0.4355,  0.0179, -0.2715], device='cuda:0',
   

 23%|██▎       | 16/69 [03:37<12:17, 13.91s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2324,  0.0339, -0.1445, -0.0957,  0.1514], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2432,  0.0977,  0.1670, -0.0913,  0.1001], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2412,  0.1973, -0.7891, -0.0413, -0.5273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4043,  0.4688, -1.4297,  0.6328, -0.5273], device='cuda:0',
   

 25%|██▍       | 17/69 [03:51<12:05, 13.95s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2148,  0.2178, -0.3828, -0.0742,  0.1699], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.2832,  0.0192, -0.0101,  0.1289], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2197,  0.3066, -0.8789,  0.0947, -0.5898], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.3047, -0.9531,  1.8516, -0.3418], device='cuda:0',
   

 26%|██▌       | 18/69 [04:05<11:56, 14.06s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1016,  0.0972, -0.4102, -0.0630,  0.0187], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.2773,  0.0972, -0.0366, -0.0552], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3320,  0.1177, -0.8789, -0.0009, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3281,  0.3066, -1.0000,  0.8906, -0.7227], device='cuda:0',
   

 28%|██▊       | 19/69 [04:19<11:46, 14.13s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1553,  0.0869, -0.2373, -0.1108,  0.1328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.1494, -0.0122, -0.0918,  0.0469], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1650,  0.1143, -0.7227, -0.0732, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5898,  0.4453, -1.7109,  0.6250, -0.9062], device='cuda:0',
   

 29%|██▉       | 20/69 [04:34<11:32, 14.13s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1318,  0.1738, -0.2773, -0.0393,  0.2617], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3203,  0.2295, -0.1021, -0.0154,  0.0498], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2188,  0.2500, -0.8477, -0.0303, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2246,  0.6055, -0.9844,  1.2500, -0.1768], device='cuda:0',
   

 30%|███       | 21/69 [04:47<11:14, 14.05s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3770,  0.0144, -0.4023, -0.2275,  0.1631], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1963,  0.0605, -0.2891, -0.1777, -0.2773], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0598,  0.1260, -0.8516, -0.1299, -0.5273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.5312, -0.5352, -1.3750,  0.9102, -0.0679], device='cuda:0',
   

 32%|███▏      | 22/69 [05:02<11:01, 14.08s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2139,  0.1074, -0.1436, -0.1689, -0.0238], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1982,  0.1963, -0.1108, -0.1533, -0.1592], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4082,  0.1484, -0.7891, -0.1055, -0.5078], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1113,  0.6406, -0.7617,  1.2812, -0.0654], device='cuda:0',
   

 33%|███▎      | 23/69 [05:16<10:46, 14.05s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4297,  0.1855,  0.0786, -0.1328,  0.2373], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.1475, -0.2354, -0.0243, -0.2598], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1260,  0.1768, -0.9805, -0.0286, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3164,  0.6797, -0.3379,  1.6406,  0.5352], device='cuda:0',
   

 35%|███▍      | 24/69 [05:29<10:28, 13.96s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.2148,  0.0762, -0.2578,  0.1475], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2314,  0.1592, -0.3184, -0.0811, -0.2168], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1494,  0.1973, -0.9531, -0.0674, -0.5469], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8945,  0.8359, -0.5234,  1.1797,  0.0459], device='cuda:0',
   

 36%|███▌      | 25/69 [05:43<10:14, 13.98s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4570,  0.2236,  0.0552, -0.2852,  0.0449], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3047,  0.0854, -0.2637, -0.1553, -0.0972], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1104,  0.0586, -0.7500, -0.1484, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2676,  0.7734, -0.9219,  1.2344, -0.2266], device='cuda:0',
   

 38%|███▊      | 26/69 [05:57<09:56, 13.88s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2188, -0.1328, -0.2793, -0.1406,  0.0864], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2480,  0.2969, -0.1270,  0.0317, -0.0239], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1631,  0.2988, -0.8047,  0.0933, -0.5664], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0898,  0.6055, -0.6484,  1.8203, -0.4082], device='cuda:0',
   

 39%|███▉      | 27/69 [06:12<09:52, 14.12s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2354, -0.0654, -0.1660, -0.1348,  0.1206], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2471,  0.3281, -0.0103,  0.0197, -0.0210], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1650,  0.2852, -0.7617,  0.0640, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3691,  0.3828, -0.9492,  1.7578, -0.5156], device='cuda:0',
   

 41%|████      | 28/69 [06:26<09:40, 14.15s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0850, -0.0952, -0.1758, -0.1426,  0.0747], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3379,  0.2949, -0.0674, -0.0669, -0.0767], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1875,  0.3223, -0.7812, -0.0488, -0.6602], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7188,  0.7227, -0.8359,  1.5391, -0.5742], device='cuda:0',
   

 42%|████▏     | 29/69 [06:41<09:41, 14.54s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2041,  0.2637, -0.1377, -0.1123,  0.0449], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.3281, -0.1162, -0.0583,  0.1074], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1670,  0.2832, -0.6133, -0.0457, -0.4805], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7656,  0.9062, -1.1172,  1.1406,  0.2871], device='cuda:0',
   

 43%|████▎     | 30/69 [06:56<09:32, 14.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3555,  0.2051,  0.0148, -0.0684,  0.1846], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.1670, -0.0771, -0.0064, -0.0591], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0109,  0.1475, -0.7109,  0.0047, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6641,  0.8477, -0.7930,  1.5312,  0.1602], device='cuda:0',
   

 45%|████▍     | 31/69 [07:11<09:20, 14.74s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3633, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3633,  0.3379, -0.1167, -0.0820,  0.2197], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3633, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3105,  0.1748, -0.3496, -0.0786,  0.0898], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3633, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1250,  0.2754, -0.8594, -0.0449, -0.4980], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3633, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4043,  1.2969, -0.7383,  1.5703,  0.2793], device='cuda:0',
   

 46%|████▋     | 32/69 [07:25<08:59, 14.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1494,  0.1748, -0.1963, -0.1592,  0.1416], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3281,  0.2832, -0.1543, -0.1206,  0.0574], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1982,  0.3281, -0.6641, -0.1357, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.0312,  1.0469, -1.1016,  1.1562, -0.2812], device='cuda:0',
   

 48%|████▊     | 33/69 [07:40<08:43, 14.54s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1533,  0.0090, -0.0447, -0.1328, -0.0796], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.2412,  0.1099, -0.0103, -0.0140], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2158,  0.2852, -0.6445, -0.0201, -0.5469], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1748,  0.7969, -0.8477,  1.4453, -0.1108], device='cuda:0',
   

 49%|████▉     | 34/69 [07:53<08:19, 14.27s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2441,  0.0957,  0.2324, -0.0464,  0.0540], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2441,  0.0520,  0.1836, -0.0234, -0.0123], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2070,  0.0620, -0.8203,  0.0038, -0.5312], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4355,  0.0625, -0.6289,  1.0391, -0.1504], device='cuda:0',
   

 51%|█████     | 35/69 [08:07<08:01, 14.15s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1914,  0.1670,  0.0033, -0.1328, -0.0957], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3086,  0.3105, -0.0322, -0.0273, -0.0422], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1367,  0.2734, -0.6484, -0.0811, -0.5352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2188,  1.1953, -0.9297,  1.0156, -0.0256], device='cuda:0',
   

 52%|█████▏    | 36/69 [08:21<07:42, 14.01s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3359,  0.3379, -0.1201, -0.1338,  0.1641], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.1621, -0.2432, -0.0698, -0.0234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1660,  0.2344, -0.7656, -0.0742, -0.4434], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5273,  1.1953, -0.7734,  1.3828,  0.5039], device='cuda:0',
   

 54%|█████▎    | 37/69 [08:35<07:25, 13.93s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1709,  0.1309, -0.1021, -0.1641,  0.0214], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2012,  0.2373,  0.0576,  0.0056, -0.1201], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1406,  0.2812, -0.6641,  0.0212, -0.5586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0396,  1.2500, -0.7070,  1.5078, -0.0491], device='cuda:0',
   

 55%|█████▌    | 38/69 [08:48<07:07, 13.78s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1553, -0.0100, -0.5742, -0.1865,  0.1235], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1216,  0.1367, -0.3320, -0.0515, -0.0300], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1982,  0.0466, -0.6758, -0.0664, -0.3711], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.5469, -0.4258, -0.0117,  1.0000, -0.0801], device='cuda:0',
   

 57%|█████▋    | 39/69 [09:02<06:52, 13.74s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1670,  0.2393,  0.0908, -0.2051,  0.3398], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2461,  0.2334, -0.1787, -0.0752,  0.0830], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0864,  0.2734, -0.6836, -0.0107, -0.4062], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0134,  0.2676, -0.5703,  1.7656,  0.2949], device='cuda:0',
   

 58%|█████▊    | 40/69 [09:16<06:42, 13.88s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2334,  0.0130, -0.2812, -0.2129, -0.0015], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2324,  0.2871, -0.2344, -0.0311,  0.0067], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2402,  0.2871, -0.7969,  0.1133, -0.5078], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1084,  0.8711, -1.0234,  1.9922, -0.1143], device='cuda:0',
   

 59%|█████▉    | 41/69 [09:31<06:33, 14.07s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2441,  0.2930, -0.2148, -0.2148,  0.2080], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.3164, -0.1445, -0.0830,  0.0381], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1230,  0.1719, -0.7070,  0.0131, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0693,  0.1973, -0.5352,  1.3047,  0.0684], device='cuda:0',
   

 61%|██████    | 42/69 [09:45<06:23, 14.21s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2100,  0.2197, -0.1758, -0.0903,  0.0461], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.1846, -0.0562, -0.0737,  0.0508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0669,  0.1758, -0.7773, -0.0439, -0.5312], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4082,  0.5664, -1.0547,  1.2500, -0.2793], device='cuda:0',
   

 62%|██████▏   | 43/69 [09:58<06:03, 13.96s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1709,  0.1836, -0.2080, -0.1006,  0.2480], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2256,  0.3262, -0.0084, -0.0598,  0.1016], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1167,  0.3711, -0.6641,  0.0664, -0.5430], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0747,  0.9023, -0.6523,  1.6016,  0.1035], device='cuda:0',
   

 64%|██████▍   | 44/69 [10:12<05:44, 13.78s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1621,  0.1758, -0.1260, -0.1924,  0.2285], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2949,  0.3477,  0.0211, -0.0947,  0.0874], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1060,  0.4258, -0.7109,  0.0117, -0.5039], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1719,  1.2109, -0.5234,  1.3047,  0.4316], device='cuda:0',
   

 65%|██████▌   | 45/69 [10:25<05:28, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1943,  0.0928, -0.2490, -0.0894,  0.0928], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3086,  0.2002, -0.1484, -0.0859,  0.0698], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2031,  0.1943, -0.9141,  0.0018, -0.4609], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.8984, -1.0234,  1.2188, -0.4336], device='cuda:0',
   

 67%|██████▋   | 46/69 [10:39<05:13, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2451,  0.2578, -0.0613, -0.0623,  0.1973], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.2949, -0.3574,  0.0238, -0.0249], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1611,  0.1514, -0.8086,  0.1089, -0.5547], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1162,  0.5703, -1.2422,  2.1875,  0.0864], device='cuda:0',
   

 68%|██████▊   | 47/69 [10:53<05:00, 13.68s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0039,  0.2168, -0.2676, -0.0972,  0.2080], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2363,  0.1465, -0.1943, -0.0227,  0.0991], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1309,  0.2002, -0.8398,  0.0530, -0.6211], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1807,  0.5234, -1.3359,  1.1250, -0.0659], device='cuda:0',
   

 70%|██████▉   | 48/69 [11:07<04:49, 13.79s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0282,  0.2451,  0.0476, -0.0923,  0.1348], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1836,  0.2773, -0.1523, -0.0034,  0.0845], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572,  0.2178, -0.7578,  0.1475, -0.4766], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5312,  0.3105, -1.3438,  1.9375, -0.0894], device='cuda:0',
   

 71%|███████   | 49/69 [11:20<04:35, 13.79s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0623,  0.0625, -0.2207, -0.2871,  0.1309], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3379,  0.3281,  0.0593, -0.1016, -0.0177], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.3887, -0.8281, -0.0549, -0.6680], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7734,  0.4238, -1.0078,  1.0703, -0.2031], device='cuda:0',
   

 72%|███████▏  | 50/69 [11:34<04:21, 13.75s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0525,  0.2754, -0.2236, -0.1514,  0.1377], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3926,  0.2461, -0.2070, -0.0811,  0.0586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.2451, -0.7266, -0.0732, -0.6562], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754,  0.6680, -0.8672,  0.9023,  0.1050], device='cuda:0',
   

 74%|███████▍  | 51/69 [11:48<04:07, 13.75s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0027,  0.2178, -0.0297, -0.2139,  0.0752], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2031,  0.1689, -0.0596, -0.1875,  0.1260], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0065,  0.2090, -1.0000, -0.1826, -0.5312], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1914,  0.6445, -0.8789,  1.1719,  0.1748], device='cuda:0',
   

 75%|███████▌  | 52/69 [12:02<03:54, 13.81s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1963,  0.0232, -0.1367, -0.1582,  0.1699], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2949,  0.2168, -0.1465, -0.0918,  0.0815], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0408,  0.2070, -0.9961, -0.1719, -0.6250], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0132,  0.8789, -1.2031,  1.3906,  0.0898], device='cuda:0',
   

 77%|███████▋  | 53/69 [12:15<03:39, 13.73s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0703, -0.0820, -0.1816, -0.1523,  0.0869], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1982,  0.1875, -0.2070, -0.0620, -0.1699], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3262,  0.0854, -0.7734, -0.1279, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7969,  0.2207, -0.3867,  0.9453,  0.1631], device='cuda:0',
   

 78%|███████▊  | 54/69 [12:29<03:25, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3634, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1758, -0.0034,  0.0047, -0.1367,  0.0728], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3634, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2402,  0.1748, -0.0986, -0.0786, -0.0101], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3634, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1396,  0.2197, -0.7812, -0.1396, -0.7070], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3634, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2490,  0.5156, -0.3164,  1.0078,  0.0493], device='cuda:0',
   

 80%|███████▉  | 55/69 [12:43<03:12, 13.76s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1494,  0.1807, -0.4414, -0.2256, -0.0449], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1689,  0.2354, -0.2363, -0.0952, -0.1836], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3105,  0.1030, -0.6953, -0.1758, -0.5312], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3631, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4824,  0.0908, -1.0391,  0.8438, -0.1221], device='cuda:0',
   

 81%|████████  | 56/69 [12:57<02:59, 13.82s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3647, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1230,  0.0286, -0.4570, -0.2490, -0.0172], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3647, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1943,  0.1318, -0.0486, -0.0850, -0.2441], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3647, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2432,  0.0801, -0.7812, -0.2334, -0.5664], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3647, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6914,  0.0933, -0.8008,  0.6445, -0.0491], device='cuda:0',
   

 83%|████████▎ | 57/69 [13:10<02:45, 13.79s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5391, -0.1934, -0.3203, -0.1338,  0.0347], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3262, -0.0342, -0.1494, -0.1182, -0.1973], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1504,  0.0938, -0.9023, -0.2432, -0.5273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.4570, -0.1338, -1.1875,  0.1465, -0.0078], device='cuda:0',
   

 84%|████████▍ | 58/69 [13:24<02:30, 13.69s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1777,  0.0420, -0.1348, -0.1040,  0.0354], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1367,  0.1680, -0.1338, -0.0093, -0.0186], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0889,  0.1069, -0.9375, -0.0608, -0.6016], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4570,  0.0427, -0.6758,  0.9844,  0.1729], device='cuda:0',
   

 86%|████████▌ | 59/69 [13:37<02:16, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0312, -0.0017, -0.0679, -0.0801,  0.2754], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2246,  0.1738, -0.0228, -0.0068,  0.0048], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0537,  0.1787, -1.0000, -0.0198, -0.6953], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8281,  1.1875, -0.0801,  2.1719,  0.5508], device='cuda:0',
   

 87%|████████▋ | 60/69 [13:51<02:02, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0156,  0.1650, -0.1147, -0.0996,  0.2676], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2432,  0.2168, -0.0547, -0.0698, -0.0522], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1338,  0.2305, -0.8867, -0.0815, -0.7227], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3669, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6719,  1.1484, -0.5195,  1.7969,  0.0089], device='cuda:0',
   

 88%|████████▊ | 61/69 [14:04<01:47, 13.49s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1123,  0.1016, -0.0786, -0.1191,  0.0520], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1230,  0.1748, -0.2363, -0.0918, -0.0039], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1533,  0.1270, -0.9297, -0.0557, -0.6289], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5898, -0.0063, -0.7812,  1.1406,  0.1963], device='cuda:0',
   

 90%|████████▉ | 62/69 [14:18<01:34, 13.50s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0097,  0.0708, -0.0928, -0.0625,  0.3477], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2295,  0.1396, -0.0282,  0.0004,  0.0474], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1089,  0.1328, -0.9219, -0.0046, -0.6523], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8086,  1.0391, -0.3359,  1.8281,  0.2891], device='cuda:0',
   

 91%|█████████▏| 63/69 [14:31<01:21, 13.57s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.1084,  0.1709, -0.1836,  0.1895], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1777,  0.0957, -0.1514, -0.0386, -0.0840], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2217, -0.0742, -0.6797, -0.0123, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0537,  0.1016, -1.1562,  0.9023, -0.2207], device='cuda:0',
   

 93%|█████████▎| 64/69 [14:45<01:07, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0415,  0.0659, -0.0757,  0.0008,  0.0806], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.1270,  0.0757,  0.0544, -0.0728], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1553,  0.2295, -0.8242,  0.0786, -0.5391], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-4.3945e-01,  6.5994e-04, -8.9453e-01,  7.5000e-01, -2.6953e-01],


 94%|█████████▍| 65/69 [14:58<00:54, 13.52s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0327,  0.1875,  0.2793, -0.1943,  0.1943], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3164,  0.0757,  0.1348, -0.1875,  0.1768], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2441,  0.2197, -0.8672, -0.1699, -0.4180], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.4766, -0.9688,  1.1016, -0.1641], device='cuda:0',
   

 96%|█████████▌| 66/69 [15:12<00:40, 13.47s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.3516,  0.0349, -0.1592,  0.0654], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.1221, -0.1328, -0.0938,  0.0245], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2334, -0.0559, -0.8398, -0.0260, -0.5391], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5625, -0.5703, -0.8906,  0.7305, -0.3887], device='cuda:0',
   

 97%|█████████▋| 67/69 [15:24<00:26, 13.14s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3415, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.2051,  0.3652, -0.0026, -0.2480,  0.1953], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3415, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.2695,  0.1089,  0.0928, -0.1338, -0.0625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3415, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.2178,  0.0371, -0.7773, -0.1143, -0.5312], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3415, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.5664, -0.4980, -1.1875,  0.8594, -0.3184], device='cuda:0',
   

 99%|█████████▊| 68/69 [15:29<00:13, 13.66s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 1562, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.0801,  0.0277,  0.0420, -0.0425,  0.2188], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 1562, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.2637,  0.0496,  0.0080, -0.0168,  0.1875], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 1562, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.1396,  0.2891, -0.8438,  0.0293, -0.4863], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 1562, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.3867,  0.6289, -0.9414,  0.8906,  0.0024], device='cuda:0',
       




Total number of frames in the video: 21079.0
Original Resolution: (720.0, 480.0)
FPS: 29.968454258675077
Duration (seconds): 703.3729473684211
Target Resolution: (224, 224)
Read 21079 frames.
Frames shape: torch.Size([21079, 3, 224, 224])


  torchaudio.set_audio_backend("ffmpeg")


Total duration: 703.37 seconds
Number of intervals: 472
Sample rate: 48000
Output file: /kaggle/working/friends/s1/friends_s01e03a.h5
Num splits: 35


  1%|▏         | 1/69 [00:13<14:49, 13.08s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1226, -0.0549,  0.0422, -0.6016, -0.3887], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910, -0.3086, -0.0339, -0.1973, -0.0045], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0347,  0.0378, -0.5469, -0.1816, -0.6328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 1.3438, -0.3125, -1.5625,  0.1504, -1.5078], device='cuda:0',
   

  3%|▎         | 2/69 [00:26<14:33, 13.03s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2266,  0.1108, -0.0559, -0.1250,  0.3555], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3281,  0.1562, -0.1836, -0.0190, -0.0039], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1895,  0.2041, -0.7227,  0.0091, -0.5820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5898,  0.7500, -0.8867,  1.0000, -0.1416], device='cuda:0',
   

  4%|▍         | 3/69 [00:39<14:21, 13.05s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1289,  0.0282, -0.0996, -0.1475,  0.1787], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2275,  0.3242, -0.1816, -0.0552,  0.0444], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2305,  0.2451, -0.7969,  0.0157, -0.5430], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0076,  1.0547, -1.0312,  1.5000, -0.2793], device='cuda:0',
   

  6%|▌         | 4/69 [00:52<14:13, 13.13s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1021,  0.1562, -0.0620, -0.0625,  0.3262], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2275,  0.2090, -0.0674, -0.0742,  0.0452], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0579,  0.2246, -0.8516,  0.0262, -0.7070], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.2109,  1.2969, -0.4590,  1.7188,  0.2061], device='cuda:0',
   

  7%|▋         | 5/69 [01:05<14:02, 13.17s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2227,  0.1416, -0.2188, -0.0791,  0.1709], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.2930, -0.1260, -0.0056,  0.1416], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1416,  0.2617, -0.8320,  0.0850, -0.5078], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3223,  0.8555, -1.4062,  1.7812, -0.2520], device='cuda:0',
   

  9%|▊         | 6/69 [01:19<13:53, 13.23s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1943,  0.1475, -0.2412, -0.0811,  0.1406], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2520,  0.3320, -0.0991, -0.0232,  0.1328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572,  0.3008, -0.8203,  0.1050, -0.5391], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1396,  0.7578, -1.4609,  2.1094, -0.1709], device='cuda:0',
   

 10%|█         | 7/69 [01:32<13:37, 13.19s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1465, -0.0294, -0.1226, -0.1553,  0.1768], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.3066, -0.0281, -0.0374,  0.0752], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1943,  0.3828, -0.8672,  0.0991, -0.5664], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3535,  1.2969, -0.9141,  2.3438,  0.1562], device='cuda:0',
   

 12%|█▏        | 8/69 [01:45<13:26, 13.21s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0703,  0.0972,  0.0134, -0.0767,  0.2578], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2217,  0.2256,  0.0089, -0.1338,  0.0698], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1064,  0.3105, -0.8320, -0.0728, -0.5352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5234,  1.0859, -0.8672,  1.4297,  0.5938], device='cuda:0',
   

 13%|█▎        | 9/69 [01:59<13:21, 13.35s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1196, -0.0654,  0.0486, -0.6055, -0.3789], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734, -0.3008, -0.0178, -0.2090, -0.0089], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0383,  0.0214, -0.5273, -0.1914, -0.6406], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 1.3906, -0.3223, -1.4844,  0.1318, -1.5547], device='cuda:0',
   

 14%|█▍        | 10/69 [02:12<13:12, 13.43s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1377, -0.0042, -0.1196, -0.1836,  0.1553], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2041, -0.0059, -0.3184, -0.1187,  0.0104], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0869,  0.0588, -0.8789, -0.0977, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1211, -0.7617, -0.7188,  1.2031, -0.3027], device='cuda:0',
   

 16%|█▌        | 11/69 [02:26<13:01, 13.48s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2520,  0.3145, -0.0479, -0.1436,  0.2051], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2305,  0.2344, -0.1338, -0.0649, -0.0693], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2334,  0.1719, -0.6797, -0.0459, -0.6250], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6367,  0.0442, -0.8047,  0.5547,  0.0228], device='cuda:0',
   

 17%|█▋        | 12/69 [02:39<12:40, 13.35s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4121,  0.0879, -0.0864, -0.1680,  0.2891], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2021,  0.1621, -0.1777, -0.0461, -0.1045], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0967,  0.1992, -0.8555, -0.0645, -0.6406], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0493, -0.1387, -1.7188,  1.2422,  0.1553], device='cuda:0',
   

 19%|█▉        | 13/69 [02:52<12:28, 13.36s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1982, -0.0791, -0.3008, -0.0664,  0.1777], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.1982, -0.0422, -0.0398, -0.1426], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3164,  0.2139, -0.7539, -0.0552, -0.7539], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2295,  0.5039, -1.4375,  0.5430, -0.1079], device='cuda:0',
   

 20%|██        | 14/69 [03:05<12:12, 13.33s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4160,  0.1768, -0.0571, -0.1240,  0.0119], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1924,  0.1934, -0.2275, -0.0850, -0.0894], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1963,  0.2002, -0.8398, -0.0859, -0.6914], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3730, -0.1416, -1.4609,  1.0391, -0.0542], device='cuda:0',
   

 22%|██▏       | 15/69 [03:19<12:01, 13.36s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0962, -0.0557, -0.2578, -0.0820,  0.1211], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.1982, -0.0071, -0.0254, -0.1040], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3301,  0.3164, -0.7695, -0.0684, -0.6328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.7188, -1.4453,  0.3516, -0.2773], device='cuda:0',
   

 23%|██▎       | 16/69 [03:32<11:47, 13.36s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.3633, -0.0820, -0.1953,  0.1758], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1914,  0.2168, -0.1885, -0.1104, -0.0376], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2383,  0.0791, -0.7305, -0.0513, -0.6211], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2480, -0.1494, -0.9727,  0.4707, -0.3789], device='cuda:0',
   

 25%|██▍       | 17/69 [03:46<11:36, 13.39s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1621, -0.0874, -0.2490, -0.1035,  0.1123], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2344,  0.2080, -0.0571, -0.0581, -0.1104], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.1904, -0.7617, -0.0723, -0.6055], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0620,  0.5234, -0.8828,  0.3887, -0.3613], device='cuda:0',
   

 26%|██▌       | 18/69 [03:59<11:24, 13.43s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3965,  0.1611, -0.1650, -0.1680,  0.2949], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754,  0.1846, -0.0713, -0.0630, -0.0408], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1953,  0.0991, -0.7461, -0.0173, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5117, -0.1211, -0.6289,  0.5781, -0.1133], device='cuda:0',
   

 28%|██▊       | 19/69 [04:13<11:12, 13.45s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1182,  0.0082, -0.2354, -0.1475, -0.0081], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2891,  0.2061,  0.0476, -0.1006, -0.1226], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.2734, -0.7539, -0.0640, -0.7148], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2314,  1.0781, -0.8047,  0.5703, -0.0874], device='cuda:0',
   

 29%|██▉       | 20/69 [04:26<10:57, 13.41s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1523,  0.1001, -0.4199, -0.0942,  0.0977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1089, -0.1621,  0.0376, -0.0234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1562,  0.2090, -0.7148,  0.0371, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.6484,  0.3594, -0.7188,  0.7227,  0.5625], device='cuda:0',
   

 30%|███       | 21/69 [04:40<10:47, 13.49s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4023,  0.2734, -0.0378, -0.1846,  0.2266], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2490,  0.0376, -0.0981, -0.0811, -0.0811], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0996,  0.1406, -0.7383, -0.1592, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0591,  0.5234, -1.3594,  0.4336,  0.4141], device='cuda:0',
   

 32%|███▏      | 22/69 [04:53<10:36, 13.53s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1689, -0.0889, -0.4316, -0.1216,  0.0415], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2441,  0.3691, -0.2812,  0.0063,  0.0210], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910,  0.3008, -0.8359,  0.1367, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0199,  0.6875, -0.8086,  1.8125, -0.8086], device='cuda:0',
   

 33%|███▎      | 23/69 [05:07<10:22, 13.54s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1377,  0.0596, -0.1279, -0.2012,  0.1426], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3750,  0.3789, -0.0427, -0.1406,  0.0432], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.4102, -0.7461, -0.0623, -0.5000], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1670,  1.1328, -1.3281,  1.4062,  0.0320], device='cuda:0',
   

 35%|███▍      | 24/69 [05:21<10:10, 13.57s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1089,  0.0491, -0.0732, -0.1348,  0.2373], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2148,  0.2656, -0.1504, -0.1030,  0.1445], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0688,  0.2217, -0.8359,  0.0080, -0.6367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4512,  1.2422, -0.2285,  1.7422,  0.0073], device='cuda:0',
   

 36%|███▌      | 25/69 [05:34<10:00, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1689, -0.0264, -0.2617, -0.1768,  0.0967], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2217,  0.4512, -0.1934,  0.0070,  0.0588], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1196,  0.4004, -0.7070,  0.1826, -0.5273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.3164,  1.3750, -0.8633,  2.2656, -0.3496], device='cuda:0',
   

 38%|███▊      | 26/69 [05:48<09:49, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1196, -0.0222, -0.0097, -0.0659,  0.2148], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.2734, -0.2754, -0.0476,  0.0581], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.2500, -0.7930,  0.0079, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3654, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0104,  0.9336, -0.5352,  1.6562, -0.4746], device='cuda:0',
   

 39%|███▉      | 27/69 [06:02<09:34, 13.68s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572, -0.0172, -0.2891, -0.1846, -0.0023], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2393,  0.3457, -0.2832, -0.0508,  0.0233], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2188,  0.2754, -0.8203,  0.1055, -0.6680], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0476,  0.8398, -0.9805,  2.0156, -0.9062], device='cuda:0',
   

 41%|████      | 28/69 [06:15<09:17, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1099,  0.0625, -0.0361, -0.1494,  0.1367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.1357, -0.0698, -0.1328,  0.0271], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1099,  0.1963, -0.9023, -0.1177, -0.5391], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5781,  0.9062, -0.2793,  1.3281,  0.4531], device='cuda:0',
   

 42%|████▏     | 29/69 [06:29<09:07, 13.69s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1494,  0.1709, -0.1245, -0.1621,  0.0898], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3926,  0.3281, -0.2383, -0.1367,  0.0762], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1030,  0.3477, -0.8633, -0.0515, -0.6055], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3066,  0.8945, -1.1172,  2.0312,  0.0679], device='cuda:0',
   

 43%|████▎     | 30/69 [06:43<08:58, 13.80s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0688,  0.0242,  0.0732, -0.1177,  0.2910], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3691,  0.2578, -0.0884, -0.0645,  0.1377], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1406,  0.1504, -0.8789,  0.0703, -0.6094], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3418,  0.3750, -1.0625,  1.7109, -0.0417], device='cuda:0',
   

 45%|████▍     | 31/69 [06:57<08:45, 13.82s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3709, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0830,  0.1299,  0.0076, -0.1748,  0.2793], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3709, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3242,  0.2305, -0.1592, -0.1172,  0.1123], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3709, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.1553, -0.9297,  0.0361, -0.5117], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3709, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1699,  0.4395, -1.0234,  2.0312, -0.0256], device='cuda:0',
   

 46%|████▋     | 32/69 [07:11<08:27, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0251,  0.0874,  0.0510, -0.0903,  0.2227], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3574,  0.3066, -0.2412, -0.0437,  0.1240], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1396,  0.1514, -0.8672,  0.0718, -0.5781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1377,  0.7305, -0.9805,  1.8125, -0.0393], device='cuda:0',
   

 48%|████▊     | 33/69 [07:24<08:13, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2051,  0.0010, -0.0820, -0.1226,  0.0396], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2070,  0.1680, -0.3105, -0.0444, -0.0388], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1143,  0.2441, -0.8242,  0.0182, -0.6094], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.5156,  0.5781, -1.0938,  2.1719,  0.1367], device='cuda:0',
   

 49%|████▉     | 34/69 [07:38<07:58, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3712, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2520,  0.1953, -0.1621, -0.2051,  0.0126], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3712, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.3535, -0.1875, -0.1133,  0.0996], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3712, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1025,  0.2832, -0.7461, -0.0156, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3712, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.6367, -0.8320,  1.7969, -0.7500], device='cuda:0',
   

 51%|█████     | 35/69 [07:51<07:44, 13.65s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1318,  0.1260, -0.2266, -0.2061,  0.2158], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3887,  0.1533, -0.2715, -0.1235,  0.1338], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1729,  0.3672, -0.7109, -0.0518, -0.5469], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6523,  0.5547,  0.0034,  1.4922,  0.1924], device='cuda:0',
   

 52%|█████▏    | 36/69 [08:05<07:29, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1865,  0.0166, -0.4277, -0.1533, -0.0008], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2617,  0.4062, -0.3184, -0.0160,  0.0449], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1982,  0.3359, -0.8242,  0.0386, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1484,  0.9336, -0.7891,  2.1094, -0.7109], device='cuda:0',
   

 54%|█████▎    | 37/69 [08:19<07:17, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1904,  0.0508, -0.1523, -0.1924,  0.0143], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754,  0.3574, -0.2500, -0.0264,  0.0549], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1152,  0.3047, -0.7266,  0.0801, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0586,  0.8203, -0.9180,  1.5000, -0.7656], device='cuda:0',
   

 55%|█████▌    | 38/69 [08:33<07:06, 13.76s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2520,  0.2422, -0.0332, -0.1543,  0.1914], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.1592, -0.2715, -0.0952, -0.1416], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572,  0.1367, -0.8438, -0.0086, -0.5742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.4395, -0.2021,  1.8438, -0.2969], device='cuda:0',
   

 57%|█████▋    | 39/69 [08:46<06:52, 13.75s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4219,  0.1270,  0.0233, -0.0898,  0.2656], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2393,  0.2168, -0.3652,  0.0024,  0.0400], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1196,  0.2871, -0.6797,  0.1006, -0.6523], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.3340,  0.4023, -0.6328,  1.6797, -0.0508], device='cuda:0',
   

 58%|█████▊    | 40/69 [09:00<06:41, 13.83s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1172,  0.1328, -0.0135, -0.2500,  0.1309], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2402,  0.2656, -0.0439, -0.0977,  0.0608], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1475,  0.2871, -0.7930, -0.0036, -0.6055], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0718,  0.3164, -0.6680,  2.1250, -0.6680], device='cuda:0',
   

 59%|█████▉    | 41/69 [09:14<06:26, 13.79s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3707, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1904,  0.1035, -0.4082, -0.2695,  0.0645], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3707, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871, -0.0461, -0.1416, -0.2217, -0.1973], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3707, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1260, -0.0204, -0.8789, -0.1816, -0.5820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3707, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.6680, -0.5781, -1.0156,  1.6406, -0.4082], device='cuda:0',
   

 61%|██████    | 42/69 [09:27<06:08, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3704, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0052,  0.0903,  0.0645, -0.0398,  0.1416], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3704, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3008,  0.1426, -0.0791, -0.0713,  0.1299], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3704, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0276,  0.1543, -0.8203, -0.0078, -0.5273], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3704, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3320,  1.0703, -0.1191,  1.9141, -0.2910], device='cuda:0',
   

 62%|██████▏   | 43/69 [09:41<05:54, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0134,  0.1133, -0.1079, -0.1187,  0.0942], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2178,  0.1787, -0.1875, -0.1484,  0.0732], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0352,  0.1934, -0.8164, -0.1387, -0.5547], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3242,  0.7109,  0.2500,  2.2656,  0.0378], device='cuda:0',
   

 64%|██████▍   | 44/69 [09:55<05:40, 13.61s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2041,  0.1963, -0.1299, -0.0894,  0.2051], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2451,  0.1670, -0.2578, -0.1270,  0.1328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1758,  0.1201, -0.7930, -0.0476, -0.5781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1475,  0.1797, -0.1855,  2.1875, -0.1523], device='cuda:0',
   

 65%|██████▌   | 45/69 [10:08<05:26, 13.61s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1768,  0.1099, -0.0879, -0.1299,  0.2480], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.2480,  0.1206, -0.0933,  0.1768], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.8457e-01,  2.1191e-01, -7.3047e-01,  7.1716e-04, -5.0000e-01],
       device='cuda:1', dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5508,  0.6055, -1.2578,  1.1953,  0.5273], 

 67%|██████▋   | 46/69 [10:22<05:13, 13.61s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1494, -0.0025, -0.4375, -0.1001,  0.1157], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1758,  0.3418,  0.0026, -0.0206, -0.1084], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2285,  0.1592, -0.8320,  0.0588, -0.6562], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3650, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3457,  0.3809, -1.2109,  1.7109, -0.5156], device='cuda:0',
   

 68%|██████▊   | 47/69 [10:35<04:57, 13.54s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3809, -0.0337, -0.4258, -0.1348,  0.1187], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1406,  0.1533, -0.3535, -0.1162, -0.2324], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.0757, -1.0078, -0.1040, -0.4727], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2676,  0.0898, -1.2578,  0.8281, -0.4512], device='cuda:0',
   

 70%|██████▉   | 48/69 [10:49<04:45, 13.60s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1533,  0.0150, -0.1221,  0.0232], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.2051, -0.0349, -0.1099, -0.0220], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 3.1662e-04,  2.4414e-01, -7.1875e-01, -1.0742e-01, -6.2109e-01],
       device='cuda:1', dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1426,  0.2383, -1.1250,  0.6133,  0.5156], 

 71%|███████   | 49/69 [11:03<04:34, 13.73s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1226,  0.2422, -0.1787, -0.0845,  0.3750], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3281,  0.3301, -0.1338, -0.0645,  0.0088], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2178,  0.2754, -0.8008,  0.0586, -0.6055], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2344,  0.3652, -0.6602,  1.7734, -0.3125], device='cuda:0',
   

 72%|███████▏  | 50/69 [11:17<04:21, 13.77s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0206,  0.1660, -0.1777, -0.1543,  0.1582], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2500,  0.2031, -0.1592, -0.0737,  0.0811], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2539,  0.2031, -0.8047,  0.0075, -0.5195], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1201,  0.6641, -0.5859,  1.3516, -0.1738], device='cuda:0',
   

 74%|███████▍  | 51/69 [11:31<04:08, 13.78s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0986, -0.0276, -0.3164, -0.1040,  0.1050], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.1270, -0.1064, -0.0435, -0.0322], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2109,  0.1328, -0.8438,  0.0212, -0.6445], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.3281, -0.3379, -0.4414,  0.9141,  0.1226], device='cuda:0',
   

 75%|███████▌  | 52/69 [11:45<03:55, 13.85s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1445,  0.2168, -0.2598, -0.1768,  0.1611], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2432,  0.0977,  0.0132, -0.0552, -0.1133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1621,  0.0630, -0.7969, -0.0044, -0.5312], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8633,  0.2598, -1.1953,  0.9102, -0.2812], device='cuda:0',
   

 77%|███████▋  | 53/69 [11:59<03:44, 14.02s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1147,  0.1660, -0.3613, -0.1069, -0.0140], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.3770, -0.0284, -0.0045, -0.0244], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1895,  0.3184, -0.7578,  0.0325, -0.6211], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4648,  0.2656, -0.9492,  1.1250, -0.5117], device='cuda:0',
   

 78%|███████▊  | 54/69 [12:13<03:29, 13.94s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.1484, -0.1748, -0.1221,  0.0908], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2891,  0.1846,  0.1445, -0.0713, -0.0776], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0317,  0.2266, -0.7500, -0.0525, -0.6328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4707,  0.7734, -1.0312,  0.8359,  0.0757], device='cuda:0',
   

 80%|███████▉  | 55/69 [12:27<03:14, 13.88s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2949, -0.2012, -0.6641, -0.4609,  0.3770], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2520, -0.2676,  0.2090, -0.2070, -0.0654], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1729, -0.0908, -0.9375, -0.2256, -0.6211], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3656, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3574,  0.2217, -1.2344,  1.0703, -0.6445], device='cuda:0',
   

 81%|████████  | 56/69 [12:41<03:00, 13.91s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0033,  0.0559, -0.2266, -0.1504,  0.0427], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.3320,  0.1084, -0.0698,  0.0109], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.2314, -0.6953, -0.0403, -0.5625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2188,  0.7383, -0.9062,  1.2109, -0.4961], device='cuda:0',
   

 83%|████████▎ | 57/69 [12:55<02:47, 13.99s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0757,  0.0879, -0.0635, -0.0674,  0.1289], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2236,  0.0923,  0.0405,  0.0102,  0.0801], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1260,  0.1465, -0.8086,  0.0386, -0.4863], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7539,  0.3945, -1.4922,  1.0000,  0.0527], device='cuda:0',
   

 84%|████████▍ | 58/69 [13:09<02:34, 14.00s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2441,  0.0986, -0.1055, -0.0559,  0.1118], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.1060, -0.1309,  0.0128,  0.0454], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0065,  0.1582, -0.8867,  0.0282, -0.6328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3906, -0.0801, -0.5742,  1.5781, -0.2412], device='cuda:0',
   

 86%|████████▌ | 59/69 [13:23<02:20, 14.09s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1504,  0.1465,  0.0238, -0.0591,  0.0757], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1768,  0.2129,  0.0728, -0.0796, -0.0957], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1455,  0.1582, -0.7148, -0.0913, -0.6484], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4414,  0.4258, -0.8320,  1.4297, -0.0312], device='cuda:0',
   

 87%|████████▋ | 60/69 [13:37<02:06, 14.03s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1299,  0.0349, -0.1123, -0.0640,  0.2910], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.1777, -0.2119, -0.1021,  0.0593], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1084,  0.2314, -0.9219, -0.0322, -0.6562], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0703,  0.8398, -0.5898,  0.7812, -0.8008], device='cuda:0',
   

 88%|████████▊ | 61/69 [13:51<01:52, 14.06s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1167,  0.1807, -0.1318, -0.0981,  0.1992], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2178,  0.1963, -0.0630, -0.1030,  0.0574], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0669,  0.2275, -0.8945, -0.1279, -0.4551], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3690, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.9844,  0.8125, -0.9023,  0.8945,  0.4766], device='cuda:0',
   

 90%|████████▉ | 62/69 [14:05<01:38, 14.05s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2373,  0.0732, -0.0688, -0.1206,  0.3867], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910,  0.1143, -0.1060, -0.1162, -0.1299], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1494,  0.0488, -0.8984, -0.1377, -0.6406], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1855,  0.7383, -0.3535,  1.0156,  0.3086], device='cuda:0',
   

 91%|█████████▏| 63/69 [14:19<01:24, 14.06s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0840,  0.2012, -0.1279, -0.1592, -0.0396], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2354,  0.1836,  0.0302, -0.1099, -0.0820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2432,  0.0840, -0.7930, -0.1309, -0.4844], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0454,  0.2871, -1.0781,  0.4980, -0.2539], device='cuda:0',
   

 93%|█████████▎| 64/69 [14:33<01:09, 13.95s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0605,  0.1836, -0.3008, -0.1592,  0.0193], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.3887,  0.1270, -0.0413, -0.0679], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.2500, -0.7266, -0.0115, -0.7070], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3630, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5430,  0.4688, -1.0938,  1.0781, -0.5391], device='cuda:0',
   

 94%|█████████▍| 65/69 [14:47<00:55, 13.93s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3086,  0.0728, -0.5898, -0.2559,  0.2266], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2412,  0.0344, -0.4199, -0.0996, -0.1357], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2002,  0.0337, -0.7070, -0.1729, -0.4473], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.4512, -0.4434, -0.4258,  1.5859, -0.1387], device='cuda:0',
   

 96%|█████████▌| 66/69 [15:01<00:41, 13.89s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1318,  0.3125, -0.0752, -0.1104,  0.0737], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3281,  0.2988, -0.1982, -0.0322, -0.0654], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1855,  0.1865, -0.6953,  0.0310, -0.6250], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0388, -0.0664, -1.2109,  1.5234, -0.2559], device='cuda:0',
   

 97%|█████████▋| 67/69 [15:09<00:24, 12.34s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 2627, 3584]), Averaged feature shape: torch.Size([10, 3584]), Sample: tensor([-0.2168,  0.2217, -0.0981, -0.0830,  0.2314], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 2627, 3584]), Averaged feature shape: torch.Size([10, 3584]), Sample: tensor([-0.2393,  0.2188, -0.1533, -0.0520, -0.0688], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 2627, 3584]), Averaged feature shape: torch.Size([10, 3584]), Sample: tensor([-0.2070,  0.1865, -0.6875, -0.0106, -0.5820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 2627, 3584]), Averaged feature shape: torch.Size([10, 3584]), Sample: tensor([-0.5234, -0.1934, -1.4297,  1.6406, -0.0128], device='cuda:0',
   

 99%|█████████▊| 68/69 [15:11<00:13, 13.41s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 777, 3584]), Averaged feature shape: torch.Size([3, 3584]), Sample: tensor([-0.1040, -0.0378,  0.0491, -0.6211, -0.3828], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 777, 3584]), Averaged feature shape: torch.Size([3, 3584]), Sample: tensor([-0.2910, -0.2812, -0.0135, -0.2100, -0.0248], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 777, 3584]), Averaged feature shape: torch.Size([3, 3584]), Sample: tensor([ 0.0203,  0.0493, -0.5195, -0.1924, -0.6250], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 777, 3584]), Averaged feature shape: torch.Size([3, 3584]), Sample: tensor([ 1.2734, -0.2832, -1.6406,  0.0354, -1.5625], device='cuda:0',
       dtyp




Total number of frames in the video: 21554.0
Original Resolution: (720.0, 480.0)
FPS: 29.968454258675077
Duration (seconds): 719.222947368421
Target Resolution: (224, 224)
Read 21554 frames.
Frames shape: torch.Size([21554, 3, 224, 224])


  torchaudio.set_audio_backend("ffmpeg")


Total duration: 719.23 seconds
Number of intervals: 482
Sample rate: 48000
Output file: /kaggle/working/friends/s1/friends_s01e17a.h5
Num splits: 35


  1%|▏         | 1/69 [00:12<14:43, 12.99s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1133, -0.0520,  0.0320, -0.6055, -0.3887], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910, -0.2949,  0.0277, -0.1865, -0.0327], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0347,  0.0649, -0.5273, -0.1670, -0.6562], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 1.3125, -0.4414, -1.5156,  0.1611, -1.5703], device='cuda:0',
   

  3%|▎         | 2/69 [00:26<14:42, 13.17s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754,  0.1001, -0.0505, -0.0645,  0.2969], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3555,  0.1299,  0.0544, -0.0291,  0.0294], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1523,  0.1631, -0.8398,  0.0356, -0.5352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4180,  0.8906, -0.9727,  1.6641,  0.4707], device='cuda:0',
   

  4%|▍         | 3/69 [00:39<14:23, 13.09s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4219,  0.0669,  0.0388, -0.1338,  0.3828], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3066,  0.0771, -0.2910, -0.0066, -0.0845], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2012,  0.1543, -0.9727,  0.0471, -0.5586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5195,  0.4219, -1.1719,  1.9531, -0.1206], device='cuda:0',
   

  6%|▌         | 4/69 [00:52<14:14, 13.15s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3887,  0.0796, -0.0601, -0.1235,  0.2734], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.0952, -0.1729, -0.0535, -0.0742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2324,  0.1299, -0.9570, -0.0564, -0.5039], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2207,  0.5898, -1.3516,  1.3516,  0.0339], device='cuda:0',
   

  7%|▋         | 5/69 [01:05<13:58, 13.10s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1631,  0.0251,  0.0004, -0.1582,  0.1206], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2266,  0.2090, -0.3457, -0.0762, -0.0620], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1309,  0.1187, -0.8008,  0.0420, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.6445,  0.4766, -0.8281,  1.6953, -0.3340], device='cuda:0',
   

  9%|▊         | 6/69 [01:18<13:50, 13.18s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3555,  0.0476,  0.1011, -0.2344,  0.0811], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.2812, -0.0669, -0.0732, -0.0742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1494,  0.2275, -0.7344, -0.0081, -0.4980], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.4121,  0.7773, -0.3496,  2.0000,  0.1611], device='cuda:0',
   

 10%|█         | 7/69 [01:32<13:45, 13.31s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4316,  0.0654,  0.0175, -0.2832,  0.0967], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.2773, -0.1191, -0.0471, -0.1426], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1934,  0.2246, -0.7734, -0.0100, -0.4746], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2852,  1.0938, -0.4746,  2.1406,  0.2578], device='cuda:0',
   

 12%|█▏        | 8/69 [01:45<13:26, 13.22s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1523,  0.0796, -0.1582, -0.2266,  0.0850], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3027,  0.3535, -0.3066, -0.0708,  0.0201], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2051,  0.3086, -0.8750,  0.0508, -0.5703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1138,  0.6367, -1.2500,  1.7109, -0.2363], device='cuda:0',
   

 13%|█▎        | 9/69 [01:58<13:14, 13.24s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1089,  0.2246, -0.0659, -0.2246,  0.1089], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.1797, -0.2217, -0.1553,  0.0654], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0488,  0.2373, -0.8359, -0.0444, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1240,  0.5469, -0.7539,  1.6328,  0.0530], device='cuda:0',
   

 14%|█▍        | 10/69 [02:12<13:02, 13.26s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0781,  0.2246, -0.2012, -0.2793,  0.1328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3516,  0.2119, -0.1533, -0.1934,  0.0581], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1035,  0.2930, -0.7930, -0.1963, -0.5312], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4355,  0.9336, -0.8789,  1.3672, -0.2236], device='cuda:0',
   

 16%|█▌        | 11/69 [02:25<12:55, 13.37s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1133,  0.2969, -0.1289, -0.2012,  0.2793], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2520,  0.1348, -0.1602, -0.1982,  0.0325], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0496,  0.2129, -0.8828, -0.1621, -0.5195], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1953,  0.4629, -0.7500,  1.0859,  0.0767], device='cuda:0',
   

 17%|█▋        | 12/69 [02:39<12:50, 13.52s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3706, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1660,  0.2021, -0.1641, -0.2617,  0.1289], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3706, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3203,  0.1279, -0.2334, -0.2100, -0.0391], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3706, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0430,  0.1953, -0.8438, -0.1553, -0.6289], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3706, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3711,  0.3027, -0.7305,  1.1953, -0.1504], device='cuda:0',
   

 19%|█▉        | 13/69 [02:53<12:44, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1367,  0.1904, -0.0942, -0.1777,  0.2246], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910,  0.2109, -0.2461, -0.1650,  0.0476], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0405,  0.3125, -0.8047, -0.1167, -0.5820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3887,  0.7461, -0.9688,  1.2188, -0.0479], device='cuda:0',
   

 20%|██        | 14/69 [03:07<12:29, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2275,  0.1318, -0.2734, -0.2295,  0.2197], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2432,  0.2871, -0.1963, -0.0369,  0.0757], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2266,  0.3125, -0.7930, -0.0293, -0.5039], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0154,  0.5156, -1.2188,  1.5312,  0.2314], device='cuda:0',
   

 22%|██▏       | 15/69 [03:20<12:13, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1963,  0.0718, -0.2051, -0.1709,  0.1621], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.1191, -0.1475, -0.0996,  0.0474], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0669,  0.1514, -0.8477,  0.0156, -0.5820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4336,  0.6562, -0.6367,  1.6406,  0.0432], device='cuda:0',
   

 23%|██▎       | 16/69 [03:34<12:03, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.0557, -0.2012, -0.0864,  0.2461], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3008,  0.2314, -0.2100,  0.0166, -0.0376], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910,  0.1904, -0.8594,  0.0625, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0287,  0.4941, -1.3828,  1.4141,  0.0537], device='cuda:0',
   

 25%|██▍       | 17/69 [03:48<11:50, 13.65s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1865,  0.0005, -0.2695, -0.2217,  0.1582], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2178,  0.2930, -0.1504, -0.0187,  0.0280], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2480,  0.3789, -0.8594,  0.0520, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0366,  0.8242, -1.2969,  1.8047, -0.1846], device='cuda:0',
   

 26%|██▌       | 18/69 [04:01<11:40, 13.74s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2617,  0.0403, -0.2246, -0.1934,  0.0645], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2695,  0.2480, -0.2500, -0.0476, -0.0059], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2227,  0.2158, -0.7930, -0.0305, -0.5781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3699, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0021,  0.5898, -1.2734,  1.5547, -0.5117], device='cuda:0',
   

 28%|██▊       | 19/69 [04:15<11:30, 13.81s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2197,  0.1465, -0.1406, -0.1299,  0.3223], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2520,  0.2236, -0.1475, -0.0654,  0.0859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1191,  0.2578, -0.8047, -0.0344, -0.6367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0889,  0.5508, -1.4609,  1.4766,  0.4160], device='cuda:0',
   

 29%|██▉       | 20/69 [04:29<11:13, 13.75s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930, -0.0332, -0.2285, -0.1670,  0.1494], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2539,  0.3320, -0.2637,  0.0039, -0.0371], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2119,  0.2988, -0.7461,  0.0273, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3661, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1270,  0.9570, -1.3750,  1.9375, -0.2461], device='cuda:0',
   

 30%|███       | 21/69 [04:43<11:01, 13.78s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2412,  0.1855, -0.0140, -0.0530,  0.1357], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.0688, -0.0806, -0.0250, -0.0310], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0933,  0.1631, -0.7578,  0.0273, -0.6719], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0786,  0.7852, -0.5430,  1.4062,  0.2578], device='cuda:0',
   

 32%|███▏      | 22/69 [04:57<10:48, 13.80s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3707, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5859,  0.2930, -0.2734, -0.0155,  0.1621], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3707, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.0109,  0.0654, -0.0020,  0.0282], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3707, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0110,  0.0815, -0.8281,  0.0923, -0.4805], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3707, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0454, -0.1377, -0.5859,  0.4805,  0.1406], device='cuda:0',
   

 33%|███▎      | 23/69 [05:10<10:34, 13.78s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6992,  0.3242, -0.0364, -0.0869,  0.2969], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.1309, -0.1602, -0.0884,  0.0400], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0894,  0.2461, -0.7969, -0.0781, -0.3887], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0718, -0.3887, -0.8320,  0.4512,  0.3730], device='cuda:0',
   

 35%|███▍      | 24/69 [05:24<10:20, 13.79s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2676,  0.1318, -0.1719, -0.0859,  0.1196], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.1040, -0.1221, -0.0106, -0.0640], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1543,  0.1416, -0.8242,  0.0361, -0.6289], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2344,  0.8516, -0.7578,  1.3047,  0.3789], device='cuda:0',
   

 36%|███▌      | 25/69 [05:38<10:07, 13.80s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2207, -0.0796, -0.5273, -0.3262,  0.2754], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3262,  0.0728,  0.1719, -0.2100, -0.1484], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1206,  0.1738, -0.8828, -0.1787, -0.4707], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.7656, -0.1133, -1.7891,  0.5469, -0.3340], device='cuda:0',
   

 38%|███▊      | 26/69 [05:52<09:49, 13.70s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0967,  0.3164, -0.1982, -0.1777,  0.1562], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.2412, -0.1758, -0.0771,  0.0625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1914,  0.2461, -0.7773, -0.0386, -0.5820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3703, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4824,  0.4199, -0.7188,  1.2656, -0.1416], device='cuda:0',
   

 39%|███▉      | 27/69 [06:05<09:32, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0952,  0.2832, -0.0762, -0.0835,  0.2119], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.3320, -0.0723, -0.0183,  0.0042], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2148,  0.2930, -0.7891,  0.0148, -0.5391], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1074,  0.3008, -0.4414,  1.5156,  0.1689], device='cuda:0',
   

 41%|████      | 28/69 [06:18<09:14, 13.52s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1162,  0.2910, -0.0444, -0.1670,  0.2158], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3965,  0.3379, -0.0564, -0.1357,  0.0854], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2031,  0.2158, -0.7812, -0.1396, -0.5430], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6211,  0.6953, -0.6602,  1.0859,  0.4609], device='cuda:0',
   

 42%|████▏     | 29/69 [06:32<08:58, 13.45s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2168,  0.2227, -0.2324, -0.2578,  0.0078], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.2539,  0.0408, -0.1729,  0.0082], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2285,  0.1904, -0.6953, -0.1992, -0.5820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3789,  0.5156, -0.3848,  0.5664,  0.1885], device='cuda:0',
   

 43%|████▎     | 30/69 [06:45<08:40, 13.34s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1797,  0.2715, -0.1641, -0.3008, -0.0271], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.3047,  0.1182, -0.2598, -0.0115], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2539,  0.2871, -0.6797, -0.3223, -0.6055], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3301,  0.6875, -0.5469,  0.2988, -0.2852], device='cuda:0',
   

 45%|████▍     | 31/69 [06:58<08:26, 13.33s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2158,  0.1289, -0.1768, -0.3730,  0.0014], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2402,  0.1904, -0.0457, -0.1514, -0.0520], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3770,  0.0767, -0.8359, -0.1797, -0.4961], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5508,  1.0469, -0.9336,  1.3438,  0.0063], device='cuda:0',
   

 46%|████▋     | 32/69 [07:11<08:08, 13.19s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1147,  0.2090, -0.0918, -0.2178,  0.2852], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.3555, -0.1953, -0.1533,  0.0415], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1641,  0.2676, -0.7891, -0.1250, -0.6719], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.0938,  0.7539, -0.8438,  1.1094, -0.3281], device='cuda:0',
   

 48%|████▊     | 33/69 [07:24<07:54, 13.18s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754,  0.2793, -0.1885, -0.1216,  0.2773], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2295,  0.1143, -0.1133, -0.0430, -0.0096], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2031,  0.0747, -0.9023, -0.0138, -0.5820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.9414,  0.7305, -0.4551,  1.6875, -0.0869], device='cuda:0',
   

 49%|████▉     | 34/69 [07:37<07:41, 13.18s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3242,  0.2344, -0.0977,  0.0200,  0.2891], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2500,  0.1533, -0.2617, -0.0498, -0.1621], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0272,  0.1807, -0.8867,  0.0110, -0.6445], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3672, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0679,  0.3066, -1.0469,  1.6250, -0.4824], device='cuda:0',
   

 51%|█████     | 35/69 [07:50<07:28, 13.20s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4375,  0.1709, -0.1982, -0.3008,  0.2236], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2305,  0.1797, -0.3574, -0.1543, -0.1748], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2139,  0.0442, -0.9219, -0.1001, -0.5742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3642, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8945,  0.6797, -0.0791,  1.5156, -0.0654], device='cuda:0',
   

 52%|█████▏    | 36/69 [08:04<07:20, 13.36s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.1562, -0.1216, -0.1836,  0.3242], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3379,  0.2354, -0.1040, -0.0786, -0.1328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2246,  0.0635, -0.9336, -0.1309, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-1.1719,  0.5156, -0.8867,  1.7109, -0.1982], device='cuda:0',
   

 54%|█████▎    | 37/69 [08:18<07:09, 13.42s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3701, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2354,  0.0757, -0.6289, -0.2314,  0.1504], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3701, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1523,  0.2139, -0.0630, -0.0688, -0.0306], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3701, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1553,  0.1758, -0.9844,  0.0178, -0.5039], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3701, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3164,  0.3262, -0.5820,  1.0391, -0.2793], device='cuda:0',
   

 55%|█████▌    | 38/69 [08:32<06:59, 13.53s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3750,  0.0535, -0.2266, -0.1963,  0.0928], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2217,  0.0496, -0.1807, -0.0547,  0.1084], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1787,  0.1592, -0.8477,  0.0188, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3685, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572,  0.6523, -0.9805,  1.3516, -0.5117], device='cuda:0',
   

 57%|█████▋    | 39/69 [08:45<06:44, 13.48s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3965, -0.0104, -0.1738, -0.1689,  0.0613], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1953, -0.0576, -0.2363, -0.0255,  0.0630], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1992,  0.0786, -0.8164, -0.0107, -0.5117], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4727,  0.6406, -0.9648,  1.0703, -0.7617], device='cuda:0',
   

 58%|█████▊    | 40/69 [08:58<06:29, 13.44s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637, -0.0508, -0.2617, -0.2373,  0.0913], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2539,  0.1289, -0.2578, -0.1089, -0.0591], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1787,  0.2695, -0.9023, -0.0913, -0.6523], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.8711, -0.7031,  0.5547, -0.2520], device='cuda:0',
   

 59%|█████▉    | 41/69 [09:12<06:17, 13.50s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3770,  0.1123, -0.1797, -0.1768,  0.0845], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2256,  0.0752, -0.1904, -0.0435,  0.0688], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1963,  0.1807, -0.8438,  0.0476, -0.4785], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3697, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2148,  0.4512, -0.9062,  1.0859, -0.6875], device='cuda:0',
   

 61%|██████    | 42/69 [09:26<06:06, 13.58s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.0508, -0.1816, -0.1895,  0.0840], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2275,  0.0042, -0.2139, -0.0767,  0.0544], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.0938, -0.8984,  0.0195, -0.5000], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2676,  0.7031, -0.7773,  1.2500, -0.8516], device='cuda:0',
   

 62%|██████▏   | 43/69 [09:39<05:51, 13.52s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3066,  0.1011, -0.1924, -0.2578,  0.0488], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2188,  0.0522, -0.1387, -0.1348,  0.1025], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910,  0.1396, -0.8203, -0.0425, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.6445, -0.8906,  1.1797, -0.6406], device='cuda:0',
   

 64%|██████▍   | 44/69 [09:52<05:37, 13.50s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2891, -0.0005, -0.3320, -0.1885,  0.1338], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2207,  0.1133, -0.1943, -0.0581, -0.0659], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0874,  0.2383, -0.9336, -0.0503, -0.6523], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5703,  0.7578, -0.7109,  0.5391, -0.3320], device='cuda:0',
   

 65%|██████▌   | 45/69 [10:06<05:26, 13.60s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4180,  0.1069, -0.1875, -0.2754,  0.1455], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2188,  0.2578, -0.3027, -0.0630,  0.0142], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0771,  0.3203, -0.9062, -0.0193, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2275,  0.4551, -0.8320,  1.1875, -0.2676], device='cuda:0',
   

 67%|██████▋   | 46/69 [10:19<05:09, 13.46s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598, -0.0046, -0.3906, -0.2354,  0.1377], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1826,  0.1089, -0.2275, -0.0618, -0.0776], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1270,  0.2812, -0.8555, -0.1182, -0.6680], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3965,  0.9414, -0.5625,  0.5469, -0.1201], device='cuda:0',
   

 68%|██████▊   | 47/69 [10:33<04:56, 13.49s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3691,  0.0513, -0.2598, -0.2930,  0.0337], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1650,  0.1777, -0.1357, -0.1367, -0.0266], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1021,  0.2695, -0.9023, -0.1016, -0.6367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.5195, -0.6406,  0.6523, -0.3066], device='cuda:0',
   

 70%|██████▉   | 48/69 [10:46<04:43, 13.49s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715, -0.0361, -0.3535, -0.1973,  0.1069], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1787,  0.1079, -0.2314, -0.0820, -0.0938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0693,  0.2178, -0.8203, -0.1426, -0.6250], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5391,  0.5117, -0.6641,  0.3145, -0.1021], device='cuda:0',
   

 71%|███████   | 49/69 [11:00<04:29, 13.47s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1279,  0.0747, -0.2734, -0.1079,  0.0243], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.1699, -0.1982, -0.0579, -0.1138], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3008,  0.0840, -0.8086, -0.0466, -0.6758], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0038,  0.8789, -0.5625,  0.8594, -0.4902], device='cuda:0',
   

 72%|███████▏  | 50/69 [11:14<04:16, 13.52s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1973,  0.2520, -0.2080, -0.1973,  0.2383], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.2461,  0.0417, -0.1465,  0.0457], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2500,  0.1953, -0.7578, -0.1128, -0.6328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7188,  0.5469, -1.0938,  1.1641, -0.4023], device='cuda:0',
   

 74%|███████▍  | 51/69 [11:27<04:04, 13.58s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1670,  0.2451, -0.1875, -0.2285,  0.0654], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2695,  0.1182,  0.1299, -0.1338, -0.1094], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1514,  0.2676, -0.6836, -0.1270, -0.4570], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8555,  0.0815, -0.7500,  0.8516,  0.5469], device='cuda:0',
   

 75%|███████▌  | 52/69 [11:41<03:52, 13.69s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1348,  0.1582, -0.0361, -0.2256,  0.1660], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754,  0.2285, -0.0903, -0.1572, -0.0684], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2109,  0.2217, -0.7891, -0.1602, -0.5781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3594,  0.7695, -0.8945,  0.8906, -0.1030], device='cuda:0',
   

 77%|███████▋  | 53/69 [11:55<03:38, 13.66s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3728, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0496,  0.1177, -0.0771,  0.0859,  0.1162], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3728, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2178,  0.2061, -0.1943,  0.0520, -0.0129], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3728, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3086,  0.2217, -0.6836, -0.0061, -0.5352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3728, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.5000,  0.3984, -0.5039,  1.1172,  0.1602], device='cuda:0',
   

 78%|███████▊  | 54/69 [12:08<03:24, 13.61s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1787,  0.2930, -0.0806, -0.1426,  0.2344], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.1494, -0.1641, -0.1260,  0.0074], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1787,  0.1348, -0.8008, -0.1699, -0.5430], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3696, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1445,  0.4570, -0.0065,  0.8281,  0.2832], device='cuda:0',
   

 80%|███████▉  | 55/69 [12:22<03:09, 13.52s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3965,  0.1611, -0.2949, -0.1826,  0.2109], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.0923, -0.0552, -0.0908,  0.0004], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1406,  0.1406, -0.8633, -0.0806, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.9453, -0.1011, -1.3828,  0.7539, -0.0913], device='cuda:0',
   

 81%|████████  | 56/69 [12:35<02:54, 13.41s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3398,  0.1738, -0.2852, -0.2012,  0.2832], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.1797, -0.0659, -0.1748, -0.0011], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2100,  0.2305, -0.8984, -0.1562, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.9766,  0.2051, -1.6094,  0.5273, -0.0869], device='cuda:0',
   

 83%|████████▎ | 57/69 [12:48<02:40, 13.41s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1089,  0.2295, -0.0096, -0.1128,  0.0057], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2305,  0.1758, -0.2578, -0.1079, -0.0177], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2715,  0.2314, -0.8359, -0.1299, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([0.1729, 0.6289, 0.1094, 0.6172, 0.2354], device='cuda:0',
       d

 84%|████████▍ | 58/69 [13:02<02:27, 13.43s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1157,  0.2256, -0.2617, -0.2197,  0.1299], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2695,  0.2051, -0.1836, -0.1592, -0.0510], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2891,  0.1729, -0.8008, -0.1309, -0.5391], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3677, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0317,  0.2832, -0.4590,  0.7500, -0.3047], device='cuda:0',
   

 86%|████████▌ | 59/69 [13:15<02:13, 13.40s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1387,  0.0815, -0.2598, -0.1562,  0.1230], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.3262, -0.0278, -0.0532,  0.0117], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812,  0.2002, -0.8242, -0.0266, -0.6211], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3658, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1562,  1.0078, -0.8203,  1.2656, -0.4141], device='cuda:0',
   

 87%|████████▋ | 60/69 [13:28<02:00, 13.35s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0908,  0.1030, -0.2354, -0.1270,  0.1533], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1943,  0.1133, -0.0060, -0.0398,  0.0200], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0649,  0.2021, -0.8047, -0.1196, -0.6250], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3242,  0.3730, -1.1094,  0.8359, -0.0096], device='cuda:0',
   

 88%|████████▊ | 61/69 [13:42<01:46, 13.37s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0737,  0.1455, -0.1709, -0.2617,  0.1924], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3555,  0.2393,  0.0222, -0.1230,  0.1904], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2090,  0.1836, -0.7852, -0.1436, -0.5195], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4688,  1.0234, -0.7969,  1.1797, -0.2139], device='cuda:0',
   

 90%|████████▉ | 62/69 [13:55<01:33, 13.38s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1729,  0.1211, -0.2334,  0.0525,  0.2422], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.0732, -0.2471,  0.0304, -0.0608], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0403,  0.1406, -0.9180,  0.1113, -0.6719], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3659, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4258,  1.0078, -0.1914,  1.5625, -0.0608], device='cuda:0',
   

 91%|█████████▏| 63/69 [14:08<01:20, 13.37s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2617,  0.1650, -0.2432, -0.2734,  0.1475], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3281,  0.2988,  0.1504, -0.1631, -0.0542], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2793,  0.2314, -0.7812, -0.2129, -0.6523], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3645, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5977,  0.6875, -1.2344,  0.7461, -0.6328], device='cuda:0',
   

 93%|█████████▎| 64/69 [14:22<01:07, 13.43s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.2197, -0.2373, -0.1807,  0.1582], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3281,  0.2236, -0.1807, -0.0947,  0.1196], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1875,  0.2295, -0.8672, -0.1426, -0.5703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3665, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2988,  0.9805, -0.8086,  1.0469,  0.1748], device='cuda:0',
   

 94%|█████████▍| 65/69 [14:35<00:53, 13.46s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1904,  0.2129, -0.2080, -0.2461,  0.0413], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3086,  0.2715, -0.0275, -0.1187,  0.0918], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1504,  0.2051, -0.8242, -0.1289, -0.6250], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0605,  1.0156, -0.6172,  0.9688, -0.0508], device='cuda:0',
   

 96%|█████████▌| 66/69 [14:49<00:40, 13.45s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2061,  0.1846, -0.1641, -0.2285,  0.1953], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1826,  0.1992, -0.2520, -0.0918, -0.0080], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0253,  0.2275, -0.8984, -0.0713, -0.6953], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3662, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4160,  0.5586, -1.1094,  1.0078, -0.3145], device='cuda:0',
   

 97%|█████████▋| 67/69 [15:02<00:26, 13.48s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1172,  0.1104, -0.1118, -0.0233,  0.1025], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3438,  0.0679, -0.0498,  0.0082,  0.0209], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0688,  0.1562, -0.8789,  0.0732, -0.6016], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3644, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  1.2266, -0.1904,  1.0625, -0.3340], device='cuda:0',
   

 99%|█████████▊| 68/69 [15:15<00:13, 13.11s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3378, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.1206, -0.0092, -0.3711, -0.1680,  0.0320], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3378, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.3242,  0.4180, -0.0576, -0.0776,  0.0417], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3378, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.2812,  0.2969, -0.9062, -0.0422, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3378, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.4004,  0.8555, -0.8086,  1.3906, -0.7031], device='cuda:0',
   

100%|██████████| 69/69 [15:19<00:00, 13.33s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 1559, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.5039,  0.1963, -0.5469, -0.1455,  0.2275], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 1559, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.2393,  0.2334, -0.1064, -0.0408,  0.1084], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 1559, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.1611,  0.2715, -0.8672,  0.0598, -0.5742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 1559, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.0359,  0.2539, -1.5547,  1.1172, -0.2539], device='cuda:0',
       




Total number of frames in the video: 21554.0
Original Resolution: (720.0, 480.0)
FPS: 29.968454258675077
Duration (seconds): 719.222947368421
Target Resolution: (224, 224)
Read 21554 frames.
Frames shape: torch.Size([21554, 3, 224, 224])


  torchaudio.set_audio_backend("ffmpeg")


Total duration: 719.23 seconds
Number of intervals: 482
Sample rate: 48000
Output file: /kaggle/working/friends/s1/friends_s01e16b.h5
Num splits: 35


  1%|▏         | 1/69 [00:13<14:47, 13.05s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1201, -0.0488,  0.0815, -0.6055, -0.3887], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2812, -0.2910, -0.0079, -0.1895, -0.0308], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0266,  0.0527, -0.5352, -0.1689, -0.6211], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 1.3906, -0.1738, -1.5234,  0.0879, -1.5000], device='cuda:0',
   

  3%|▎         | 2/69 [00:26<14:35, 13.07s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2256,  0.0669,  0.0703, -0.1069,  0.2432], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3027,  0.1406, -0.1523,  0.0029,  0.0234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1387,  0.0417, -0.9648,  0.0613, -0.4434], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0305,  0.7656, -1.0078,  2.4219,  0.0425], device='cuda:0',
   

  4%|▍         | 3/69 [00:39<14:31, 13.20s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1602,  0.0908, -0.1465, -0.1050,  0.1201], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1387, -0.1641, -0.0070, -0.0366], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1943,  0.1426, -0.9258,  0.0141, -0.5391], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2070,  0.2773, -0.6211,  1.8750, -0.3438], device='cuda:0',
   

  6%|▌         | 4/69 [00:52<14:19, 13.22s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1875,  0.1406,  0.0698, -0.1211,  0.2832], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2773,  0.1689, -0.1177, -0.0532,  0.1187], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1650,  0.0801, -0.8906,  0.0304, -0.5430], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0400,  0.4609, -1.1875,  2.0938, -0.3262], device='cuda:0',
   

  7%|▋         | 5/69 [01:05<14:04, 13.19s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3398, -0.0014, -0.1836, -0.1533,  0.0354], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2051, -0.0111,  0.0295, -0.1592,  0.1992], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1157,  0.2129, -0.8398, -0.1777, -0.5742], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1035, -0.6328, -1.0312,  1.2812, -0.4062], device='cuda:0',
   

  9%|▊         | 6/69 [01:19<13:51, 13.20s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1602,  0.3438, -0.0256, -0.1699,  0.1797], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.2393, -0.1904, -0.0728,  0.0942], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2539,  0.2246, -0.7578, -0.1138, -0.4727], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1611,  0.3594, -1.2344,  0.5586, -0.1895], device='cuda:0',
   

 10%|█         | 7/69 [01:32<13:38, 13.20s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1406,  0.1279, -0.0164, -0.2061,  0.2051], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3477,  0.2393, -0.0157, -0.1064,  0.0684], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.2676, -0.7773, -0.1279, -0.4844], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5547,  0.4258, -0.8125,  1.1172,  0.0977], device='cuda:0',
   

 12%|█▏        | 8/69 [01:45<13:32, 13.33s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2236,  0.3652, -0.1963, -0.1260,  0.1943], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3613,  0.2461, -0.3691, -0.0693,  0.1377], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3301,  0.2324, -0.8164, -0.0869, -0.5703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0403,  0.1973, -0.7695,  1.3984,  0.1279], device='cuda:0',
   

 13%|█▎        | 9/69 [01:58<13:14, 13.24s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0354,  0.2295, -0.1904, -0.1328,  0.1406], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3418,  0.0295, -0.1128, -0.0400,  0.0099], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2246,  0.0574, -0.8555, -0.0349, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3668, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1104,  0.0037, -1.0234,  0.9102, -0.1885], device='cuda:0',
   

 14%|█▍        | 10/69 [02:12<13:01, 13.25s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1455,  0.2344,  0.0090, -0.2119,  0.0356], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-3.0273e-01,  2.6953e-01, -6.4373e-05, -1.5820e-01, -1.4746e-01],
       device='cuda:1', dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2471,  0.1074, -0.7383, -0.2021, -0.5664], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4121,  0.5586, -1.1250,  0.3867, -0.0430], 

 16%|█▌        | 11/69 [02:25<12:51, 13.30s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3320,  0.1846,  0.2100, -0.1816,  0.1348], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1030,  0.1133, -0.0732, -0.0045], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0084,  0.1943, -0.7812,  0.0084, -0.6367], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0186,  0.6328, -0.9492,  1.1484,  0.3164], device='cuda:0',
   

 17%|█▋        | 12/69 [02:38<12:35, 13.26s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0684,  0.1069, -0.1138, -0.1357,  0.1494], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2871,  0.1338, -0.1123, -0.0586, -0.0454], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3711,  0.1758, -0.9102, -0.0859, -0.5859], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3684, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.3320,  0.1943, -0.8203,  0.8359, -0.2314], device='cuda:0',
   

 19%|█▉        | 13/69 [02:52<12:27, 13.35s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0150,  0.0234, -0.2754, -0.1650,  0.2598], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.2197, -0.0825, -0.0540,  0.0588], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2471,  0.2412, -0.6680, -0.0381, -0.4961], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4102,  0.9141, -1.2969,  0.9375,  0.4824], device='cuda:0',
   

 20%|██        | 14/69 [03:05<12:16, 13.39s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0116,  0.1787, -0.1182, -0.1133,  0.2656], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2227,  0.2422, -0.0732, -0.0469, -0.0103], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.2383, -0.7891, -0.0228, -0.5820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0894,  0.8047, -1.8438,  1.0000,  0.1807], device='cuda:0',
   

 22%|██▏       | 15/69 [03:19<12:11, 13.54s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1934,  0.1729, -0.3711, -0.1162,  0.0811], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3125,  0.2832, -0.2461, -0.0128,  0.0669], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.2324, -0.8281, -0.0242, -0.5430], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3688, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1211,  0.1108, -0.7422,  1.5625,  0.1235], device='cuda:0',
   

 23%|██▎       | 16/69 [03:33<11:59, 13.58s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0025,  0.2832,  0.0391, -0.2832,  0.1260], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3008,  0.1748, -0.0535, -0.2451,  0.0806], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.1152, -0.8125, -0.2227, -0.5625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2139,  0.5625, -1.0234,  1.2969, -0.3262], device='cuda:0',
   

 25%|██▍       | 17/69 [03:47<11:47, 13.60s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1357,  0.1641, -0.1572, -0.1191,  0.1533], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.1001, -0.0703, -0.0437, -0.0850], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1064,  0.1367, -0.8438, -0.0192, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0126,  0.4121, -1.1094,  0.5938,  0.9297], device='cuda:0',
   

 26%|██▌       | 18/69 [04:00<11:28, 13.50s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1641,  0.2832,  0.0771, -0.2021,  0.1533], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3555,  0.3008, -0.2490, -0.1377,  0.1035], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.2344, -0.8008, -0.1709, -0.5547], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3643, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0035,  0.0664, -1.1641,  1.1953, -0.2041], device='cuda:0',
   

 28%|██▊       | 19/69 [04:13<11:16, 13.53s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1904, -0.4707, -0.2637,  0.1138], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2217,  0.1338, -0.0791, -0.2354, -0.0908], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2412,  0.0635, -0.8594, -0.2656, -0.5703], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3457,  0.7734, -0.6914,  0.3809, -0.3516], device='cuda:0',
   

 29%|██▉       | 20/69 [04:27<11:01, 13.50s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4023,  0.1206,  0.1523, -0.3262,  0.0942], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2695,  0.0669, -0.0508, -0.1621, -0.1016], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0405,  0.0540, -0.7695, -0.1196, -0.5352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3678, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2031,  1.0391, -0.4238,  0.4492,  0.1196], device='cuda:0',
   

 30%|███       | 21/69 [04:40<10:50, 13.55s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2461,  0.2305, -0.0654, -0.3164,  0.1562], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2119,  0.0425,  0.1201, -0.2754, -0.0854], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1147,  0.1157, -0.8711, -0.2383, -0.5117], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3691, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.9258, -0.0297, -0.6836,  0.1436, -0.2021], device='cuda:0',
   

 32%|███▏      | 22/69 [04:54<10:36, 13.55s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3066,  0.2021,  0.1221, -0.1914,  0.1328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2119,  0.0757, -0.1533, -0.1504, -0.0603], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1416,  0.2178, -0.8359, -0.0688, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3689, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5391,  0.8008, -1.3281,  1.1016, -0.2930], device='cuda:0',
   

 33%|███▎      | 23/69 [05:08<10:26, 13.61s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2412,  0.2061,  0.1416, -0.2412,  0.2002], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2129,  0.0830, -0.0042, -0.1650, -0.0136], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0349,  0.1855, -0.7617, -0.1270, -0.5664], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6680,  0.4277, -0.7891,  0.4199, -0.2334], device='cuda:0',
   

 35%|███▍      | 24/69 [05:21<10:12, 13.60s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.0101, -0.0503, -0.2559, -0.1094], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1089,  0.2061,  0.0845, -0.1641, -0.1826], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2285,  0.2441, -0.8359, -0.1768, -0.5586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8672,  0.9609, -1.0312,  0.5938, -0.2002], device='cuda:0',
   

 36%|███▌      | 25/69 [05:35<09:59, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2461,  0.1953,  0.1055, -0.1235,  0.0378], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1924, -0.0520,  0.0510, -0.0786,  0.0640], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1055,  0.1768, -0.7852,  0.0165, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3680, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0786,  1.2109, -0.5039,  0.6914,  0.0383], device='cuda:0',
   

 38%|███▊      | 26/69 [05:49<09:47, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1533,  0.0659, -0.1279, -0.1465,  0.1807], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.2354, -0.2285, -0.1348,  0.0476], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1797,  0.1689, -0.9102, -0.1377, -0.6094], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3664, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0044,  0.9141, -1.4766,  1.3516, -0.3672], device='cuda:0',
   

 39%|███▉      | 27/69 [06:03<09:34, 13.69s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1099,  0.2363, -0.1289, -0.1299,  0.1245], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2236,  0.1777, -0.1895, -0.0947, -0.0762], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0532,  0.1836, -0.7617, -0.0767, -0.5586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1108,  0.5820, -0.3027,  0.7930,  0.0825], device='cuda:0',
   

 41%|████      | 28/69 [06:17<09:24, 13.77s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2051,  0.1631, -0.2969, -0.1660,  0.2197], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2070,  0.1904, -0.3242,  0.0096, -0.0947], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0903,  0.0413, -0.8125,  0.0376, -0.4746], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.5625,  0.1113, -1.5156,  1.4688, -0.1572], device='cuda:0',
   

 42%|████▏     | 29/69 [06:30<09:10, 13.76s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2354,  0.1270,  0.0549, -0.1719, -0.0713], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.1348, -0.2266, -0.1201, -0.1128], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0898,  0.2051, -0.9375, -0.1279, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.9297,  0.9141, -0.7188,  1.3047,  0.1514], device='cuda:0',
   

 43%|████▎     | 30/69 [06:44<08:57, 13.77s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2305,  0.3340, -0.0518, -0.1514,  0.1016], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.2520, -0.3945, -0.0417, -0.0820], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2090,  0.2197, -0.7500,  0.0239, -0.5977], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0293,  0.7305, -1.0391,  0.8711,  0.1738], device='cuda:0',
   

 45%|████▍     | 31/69 [06:58<08:45, 13.84s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1484,  0.2520, -0.1357, -0.1582,  0.2373], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2637,  0.2793, -0.2324, -0.0547, -0.0149], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910,  0.1592, -0.7227,  0.0282, -0.5586], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3682, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.1543,  1.0938, -1.0625,  1.5625,  0.0674], device='cuda:0',
   

 46%|████▋     | 32/69 [07:12<08:31, 13.83s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0147,  0.1602,  0.0142, -0.2480,  0.1777], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2930,  0.1992, -0.0796, -0.1416, -0.0299], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2295,  0.1416, -0.7070, -0.1387, -0.4023], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3683, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.2275,  0.3633, -0.9961,  0.6367,  0.6250], device='cuda:0',
   

 48%|████▊     | 33/69 [07:25<08:14, 13.75s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.1367, -0.1206, -0.0781,  0.1191], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1738,  0.2695, -0.0649,  0.0439,  0.1338], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1807,  0.1963, -0.6875,  0.0216, -0.4824], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3692, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1416,  0.6367, -1.2891,  1.3203, -0.0132], device='cuda:0',
   

 49%|████▉     | 34/69 [07:39<08:01, 13.77s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2598,  0.2266, -0.2002, -0.1328,  0.0327], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2070,  0.2793, -0.1484, -0.0081,  0.1196], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2109,  0.2119, -0.7188,  0.0317, -0.4941], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4023,  0.6562, -1.2500,  1.6016,  0.0942], device='cuda:0',
   

 51%|█████     | 35/69 [07:53<07:49, 13.80s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1777,  0.1494, -0.0138, -0.1279,  0.3320], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3477,  0.1475, -0.0791, -0.0947,  0.0625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0574,  0.0698, -0.8555, -0.0190, -0.5625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3649, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5977,  1.4609, -0.8555,  1.6953, -0.0123], device='cuda:0',
   

 52%|█████▏    | 36/69 [08:07<07:34, 13.78s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2656,  0.3320, -0.0444, -0.1543,  0.1484], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2500,  0.1709, -0.0674, -0.0364,  0.0898], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0781,  0.2080, -0.7891,  0.0544, -0.4297], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5117,  0.2988, -0.8438,  1.8516,  0.1084], device='cuda:0',
   

 54%|█████▎    | 37/69 [08:20<07:18, 13.70s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2412, -0.0182, -0.0791, -0.0193,  0.2197], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2617,  0.1611, -0.0884,  0.0378,  0.0461], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0669,  0.2363, -0.9258,  0.1357, -0.6602], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3651, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4590,  0.6250, -0.9531,  2.0469, -0.2578], device='cuda:0',
   

 55%|█████▌    | 38/69 [08:34<07:02, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1641,  0.1191, -0.1030, -0.1377,  0.1001], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3477,  0.3125, -0.3438, -0.0114,  0.0559], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.2383, -0.7188,  0.0408, -0.5234], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3648, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0952,  0.3594, -1.5391,  1.9375, -0.4785], device='cuda:0',
   

 57%|█████▋    | 39/69 [08:48<06:49, 13.66s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2344, -0.0182, -0.2832, -0.4180,  0.2236], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1138, -0.0791, -0.1973, -0.1680,  0.0203], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0505,  0.1289, -1.0312, -0.3730, -0.5195], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3671, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5469, -0.0271, -0.2832,  0.1299, -0.0483], device='cuda:0',
   

 58%|█████▊    | 40/69 [09:01<06:36, 13.68s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2256, -0.1504, -0.1514, -0.1875, -0.0732], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2559,  0.0947,  0.2715, -0.0388, -0.1035], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1631,  0.0564, -0.7969,  0.0025, -0.4961], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5664,  0.7461, -1.0625,  0.8828, -0.3984], device='cuda:0',
   

 59%|█████▉    | 41/69 [09:15<06:20, 13.59s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3164,  0.0874,  0.0796, -0.2500, -0.0381], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1992,  0.1367,  0.0172, -0.1133, -0.0144], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1191,  0.1396, -0.7188, -0.0723, -0.5469], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3670, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6797,  0.7930, -0.6914,  0.7344, -0.2793], device='cuda:0',
   

 61%|██████    | 42/69 [09:28<06:04, 13.49s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1309,  0.1060, -0.2812, -0.2490,  0.1387], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2168,  0.0933,  0.0055, -0.1797, -0.0811], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1201,  0.1396, -0.7539, -0.1816, -0.5898], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7578,  0.2734, -0.4590,  0.3613, -0.0270], device='cuda:0',
   

 62%|██████▏   | 43/69 [09:41<05:49, 13.46s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3809,  0.2598,  0.0791, -0.1377,  0.1455], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2314,  0.0513, -0.1377, -0.1055, -0.0625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0312,  0.0608, -0.7734, -0.0591, -0.4590], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3667, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4434,  1.2656, -0.4414,  0.9180,  0.1836], device='cuda:0',
   

 64%|██████▍   | 44/69 [09:55<05:37, 13.52s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2754,  0.0889, -0.2656, -0.1973,  0.0698], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2139,  0.0742, -0.0503, -0.0659, -0.0811], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1045,  0.0967, -0.7734, -0.0688, -0.5508], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3657, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.9688,  0.3457, -0.7109,  1.1875, -0.0693], device='cuda:0',
   

 65%|██████▌   | 45/69 [10:09<05:28, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1553,  0.0796, -0.1953, -0.1128,  0.1328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2432,  0.2266, -0.1235, -0.0635,  0.0422], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1699,  0.2129, -0.7148,  0.0874, -0.5625], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3687, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5703,  0.7266, -1.1328,  1.4141, -0.3340], device='cuda:0',
   

 67%|██████▋   | 46/69 [10:23<05:16, 13.78s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1973,  0.0767, -0.1650, -0.1914,  0.1797], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2617,  0.1177, -0.1113, -0.0698,  0.1084], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1006,  0.0659, -0.7539,  0.0317, -0.5781], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3675, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4004,  0.3711, -0.8242,  1.0156, -0.5664], device='cuda:0',
   

 68%|██████▊   | 47/69 [10:37<05:01, 13.71s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2617,  0.1953, -0.0767, -0.1318,  0.1719], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.2637, -0.2754, -0.0986,  0.0459], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2070,  0.2676, -0.8477, -0.0139, -0.6289], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4121,  0.3926, -1.3984,  1.3828, -0.5742], device='cuda:0',
   

 70%|██████▉   | 48/69 [10:51<04:50, 13.83s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1118,  0.1226, -0.0942, -0.1592,  0.1484], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1943,  0.1992, -0.2207, -0.1206,  0.0596], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1543,  0.1338, -0.8086, -0.1289, -0.6016], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3695, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6250,  0.4648, -0.9023,  1.1484, -0.0134], device='cuda:0',
   

 71%|███████   | 49/69 [11:05<04:37, 13.88s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0259, -0.0051, -0.0095, -0.1040,  0.2715], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3320,  0.2031, -0.0874, -0.0762,  0.0388], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1357,  0.2021, -0.9844, -0.0298, -0.6641], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3693, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-6.1768e-02,  8.2422e-01, -1.0781e+00,  1.1875e+00,  6.4468e-04],


 72%|███████▏  | 50/69 [11:18<04:19, 13.66s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3704, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0913,  0.1113, -0.1001, -0.1738,  0.1338], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3704, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3027,  0.1035, -0.1836, -0.1514,  0.0427], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3704, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1777,  0.1826, -0.7891, -0.0757, -0.6484], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3704, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6484,  0.9648, -1.1641,  1.0547,  0.2871], device='cuda:0',
   

 74%|███████▍  | 51/69 [11:31<04:04, 13.60s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0081, -0.0137,  0.1240, -0.1484,  0.2988], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.2656, -0.0535, -0.1504,  0.0771], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1089,  0.2559, -0.8906, -0.0942, -0.6797], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3694, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2461,  0.9609, -0.8477,  1.1172, -0.0291], device='cuda:0',
   

 75%|███████▌  | 52/69 [11:45<03:49, 13.53s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0310,  0.0403,  0.0505, -0.2197,  0.2949], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3633,  0.2754, -0.0383, -0.1934,  0.0830], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1377,  0.3047, -0.9375, -0.1279, -0.6641], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3673, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2969,  0.6836, -0.6797,  1.2031, -0.3125], device='cuda:0',
   

 77%|███████▋  | 53/69 [11:58<03:35, 13.44s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0206,  0.1318, -0.1143, -0.1943,  0.0120], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2949,  0.2598, -0.0938, -0.0747,  0.0228], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2041,  0.2207, -0.7422, -0.1055, -0.6133], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3676, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1777,  0.8164, -1.0625,  1.0703, -0.2285], device='cuda:0',
   

 78%|███████▊  | 54/69 [12:11<03:22, 13.49s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0977,  0.0221,  0.1514, -0.1680,  0.2637], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3008,  0.2207, -0.1396, -0.1143,  0.0015], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1167,  0.1660, -0.9609, -0.0204, -0.6758], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3681, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.0913,  0.4316, -0.7656,  1.4531,  0.2539], device='cuda:0',
   

 80%|███████▉  | 55/69 [12:25<03:10, 13.62s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1289,  0.0284, -0.1338, -0.1787,  0.1650], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3145,  0.3652, -0.1523, -0.1143, -0.0349], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2910,  0.1748, -0.7930, -0.0564, -0.6328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3698, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4141,  0.4727, -0.8555,  1.6953, -0.3418], device='cuda:0',
   

 81%|████████  | 56/69 [12:39<02:55, 13.53s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0513,  0.1885, -0.0742, -0.1475,  0.1445], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3203,  0.1914, -0.1006, -0.0356,  0.0566], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2295,  0.1895, -0.8164,  0.0112, -0.6250], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3686, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5078,  0.5430, -0.9141,  1.1016, -0.4258], device='cuda:0',
   

 83%|████████▎ | 57/69 [12:52<02:40, 13.40s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0474,  0.0845,  0.0928, -0.0559,  0.2129], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.2119, -0.0835, -0.0957,  0.0444], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1572,  0.2109, -0.8203, -0.0184, -0.6406], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3640, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.7148,  0.7930, -0.6602,  1.1406,  0.0483], device='cuda:0',
   

 84%|████████▍ | 58/69 [13:05<02:27, 13.41s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0508,  0.2246, -0.0737, -0.1299,  0.0698], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2734,  0.2793, -0.0381, -0.0669,  0.0220], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2402,  0.2637, -0.7812, -0.0564, -0.6328], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1611,  0.8008, -0.8047,  0.8359, -0.2598], device='cuda:0',
   

 86%|████████▌ | 59/69 [13:19<02:13, 13.39s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3613,  0.1631, -0.2080, -0.2373,  0.2168], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.1396,  0.0457, -0.2480, -0.0253], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1914,  0.1289, -1.0078, -0.2559, -0.5898], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3626, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1719, -0.2637, -0.6133,  0.1064, -0.0981], device='cuda:0',
   

 87%|████████▋ | 60/69 [13:32<02:01, 13.47s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3184,  0.2090, -0.4141, -0.1182,  0.0918], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3633,  0.1162,  0.1406, -0.1816, -0.1016], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1553,  0.2070, -0.8633, -0.1729, -0.6406], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3655, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8828,  0.3027, -0.7773,  0.7734,  0.0615], device='cuda:0',
   

 88%|████████▊ | 61/69 [13:46<01:47, 13.47s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5781, -0.1641, -0.4609, -0.2451,  0.3848], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3203,  0.1475, -0.0374, -0.2129,  0.0364], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3652,  0.0188, -0.9141, -0.2109, -0.5039], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3666, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.6250,  0.1328, -0.6367,  1.0781, -1.1641], device='cuda:0',
   

 90%|████████▉ | 62/69 [13:59<01:34, 13.53s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4766,  0.1133, -0.3652, -0.2070,  0.4766], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3027,  0.1562, -0.0669, -0.1206, -0.0112], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.0752, -0.7305, -0.2021, -0.3945], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3660, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.5078,  0.1235, -0.5312,  0.6992, -1.1797], device='cuda:0',
   

 91%|█████████▏| 63/69 [14:13<01:21, 13.62s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4180,  0.0146, -0.3477, -0.1553,  0.3008], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1118,  0.2334, -0.2949, -0.1650, -0.1040], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1191,  0.2715, -1.0000, -0.1221, -0.4551], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3674, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.0452,  0.2773, -0.9883,  1.3203, -0.6680], device='cuda:0',
   

 93%|█████████▎| 64/69 [14:27<01:08, 13.67s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3691,  0.1074, -0.6172, -0.3965,  0.2168], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2383, -0.1826, -0.0540, -0.2217, -0.0459], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1934, -0.0043, -0.8477, -0.3535, -0.3496], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3679, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.8281, -0.1816, -0.8164, -0.0097,  0.5898], device='cuda:0',
   

 94%|█████████▍| 65/69 [14:41<00:54, 13.64s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2578,  0.1396,  0.1006, -0.2539,  0.2041], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2832,  0.2969, -0.1045, -0.0933, -0.0640], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1846,  0.2021, -0.7578, -0.0549, -0.4961], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3663, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.4883,  0.8438, -0.2695,  1.8281,  0.5781], device='cuda:0',
   

 96%|█████████▌| 66/69 [14:54<00:41, 13.68s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3359,  0.2295,  0.0713, -0.1650,  0.1157], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3047,  0.1680,  0.0337, -0.0110, -0.0171], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.3359,  0.1895, -0.7148,  0.1099, -0.4824], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3653, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([ 0.5156,  0.4316, -0.6133,  1.4766,  0.1553], device='cuda:0',
   

 97%|█████████▋| 67/69 [15:08<00:27, 13.63s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.4004,  0.1592, -0.0854, -0.1021,  0.3340], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2891,  0.1729, -0.0312,  0.0020, -0.1064], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.1118,  0.2178, -0.9297,  0.0276, -0.5352], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3652, 3584]), Averaged feature shape: torch.Size([14, 3584]), Sample: tensor([-0.2852,  0.9648, -1.2734,  1.4062,  0.4121], device='cuda:0',
   

 99%|█████████▊| 68/69 [15:20<00:13, 13.16s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 3381, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.4258,  0.0654, -0.1011, -0.1099,  0.3379], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 3381, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.2754,  0.1553, -0.2256, -0.0238, -0.0679], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 3381, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.1748,  0.1670, -0.9648,  0.0786, -0.5156], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 3381, 3584]), Averaged feature shape: torch.Size([13, 3584]), Sample: tensor([-0.0608,  1.0781, -0.8555,  2.1406,  0.2178], device='cuda:0',
   

100%|██████████| 69/69 [15:24<00:00, 13.40s/it]

Layer: language_model.model.layers.10.post_attention_layernorm, Feature shape: torch.Size([1, 1559, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.4160,  0.0201,  0.0398, -0.0486,  0.3906], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.15.post_attention_layernorm, Feature shape: torch.Size([1, 1559, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.2637,  0.0986, -0.2344,  0.0043, -0.1494], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.layers.20.post_attention_layernorm, Feature shape: torch.Size([1, 1559, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.1128,  0.1514, -1.0234,  0.0508, -0.5938], device='cuda:1',
       dtype=torch.bfloat16)
Layer: language_model.model.norm, Feature shape: torch.Size([1, 1559, 3584]), Averaged feature shape: torch.Size([6, 3584]), Sample: tensor([-0.0981,  1.0156, -0.7891,  2.1562,  0.4043], device='cuda:0',
       




In [17]:
# --- Disabled scratch cell: load one episode and count its intervals ---------
# Every line below is commented out, so running this cell does nothing.
# If re-enabled, it loads a single movie/transcript pair for the chosen
# modality and derives how many fixed-length intervals fit in the episode.
# It calls load_video, load_audio, load_transcript and resample_transcript,
# none of which are defined in this cell (presumably defined earlier in the
# notebook -- verify before re-enabling).
# NOTE(review): the two paths below use the Kaggle input layout, while the
# notebook's root_dir (set in cell 2) points at a local checkout -- adjust the
# paths before re-enabling.
# modality = 'video'
# verbose = True
# movie_file = "/kaggle/input/algonauts2025nsl/algonauts_2025.competitors/stimuli/movies/friends/s5/friends_s05e02a.mkv"
# transcript_file = "/kaggle/input/algonauts2025nsl/algonauts_2025.competitors/stimuli/transcripts/friends/s5/friends_s05e02a.tsv"
# interval = 1.49  # interval length, same unit as total_duration (printed as seconds below)

# ####################

# video, audio, transcript, sample_rate, fps_video = None, None, None, None, None
# if modality == 'all' or modality == 'video':
#     video, fps_video = load_video(movie_file, verbose=verbose)
# # Audio is loaded for every listed modality: the duration computation below
# # reads audio.shape[1] and sample_rate.
# if modality == 'all' or modality == 'audio' or modality == 'video' or modality == 'transcript':
#     audio, sample_rate = load_audio(movie_file)
# if modality == 'all' or modality == 'transcript':
#     transcript = load_transcript(transcript_file)

# # round fps video
# # NOTE(review): placeholder only -- no rounding code follows this comment.

# if transcript is not None:
#     transcript = resample_transcript(transcript, interval)
    
# # assumes audio is laid out as (channels, samples) -- TODO confirm
# total_duration = audio.shape[1] / sample_rate
# # Floor division: a trailing partial interval is not counted.
# num_intervals = int(total_duration // interval)

# if verbose:
#     print(f"Total duration: {total_duration:.2f} seconds")
#     print(f"Number of intervals: {num_intervals}")

In [18]:
# --- Disabled scratch cell: slice the episode into per-interval sections ------
# Every line below is commented out, so running this cell does nothing.
# If re-enabled, it depends on names assigned in the previous (also disabled)
# cell: video, audio, transcript, interval, sample_rate, modality, fps_video
# and num_intervals. extract_section is not defined in this cell.
# for i in range(num_intervals):
#     video_section, audio_section, transcript_section = extract_section(
#         video, audio, transcript, interval, i, sample_rate, modality, fps_video
#     )
#     print(video_section.shape)
    

# NOTE(review): at this indentation the call below would run once, after the
# loop, on the last extracted sections -- indent it if it is meant to run per
# interval. extract_fn is not defined in this cell.
# # output_features = extract_fn(video_section, audio_section, transcript_section, verbose)

In [19]:
# --- Disabled scratch cell: inspect a generated HDF5 feature file -------------
# Every line below is commented out, so running this cell does nothing.
# If re-enabled, it opens one .h5 file read-only and, for each top-level key,
# prints the shape, the dtype, and the first five flattened values of up to 50
# leading entries.
# import h5py
# import numpy as np

# # Path to the generated h5 file.
# file_path = '/kaggle/working/friends/s5/friends_s05e10b.h5'

# with h5py.File(file_path, 'r') as f:
#     print("Datasets in the file:")
#     # assumes every top-level key is a dataset (not a group) whose first
#     # axis indexes intervals -- TODO confirm against the writer
#     for layer_name in f.keys():
#         dataset = f[layer_name]
#         print(f"Layer: {layer_name}")
#         print(f" - Shape: {dataset.shape}")
#         print(f" - Data type: {dataset.dtype}")
        
#         # Cap the printout at 50 entries so large files do not flood the output.
#         num_intervals_to_print = min(50, dataset.shape[0])
#         for i in range(num_intervals_to_print):
#             # Get the i-th interval data, flatten it, and print the first 5 elements.
#             interval_data = dataset[i]
#             flat_data = interval_data.flatten()
#             first_five = flat_data[:5] if flat_data.size >= 5 else flat_data
#             print(f" Interval {i}: first 5 elements: {first_five}")
#         print("-" * 50)