# CSV structure examination

In [None]:
import pandas as pd
import glob

# Collect the exported CSV paths for the "before" and "after" PN runs
pn_before_files, pn_after_files = (
    glob.glob(pattern)
    for pattern in ('PreTrain - PNbefore*.csv', 'PreTrain - PNafter*.csv')
)

def examine_csv(filepath):
    """Print a short structural summary of a CSV file.

    Writes to stdout, in order: a header naming the file, a 50-character
    dashed rule, the list of column names, and the first five rows.

    Args:
        filepath: Path handed to ``pd.read_csv``.

    Returns:
        None. The summary is printed, not returned.
    """
    divider = "-" * 50
    print(f"\nExamining {filepath}:")
    print(divider)

    # The header above is printed before the file is loaded
    frame = pd.read_csv(filepath)

    column_names = list(frame.columns)
    print("\nColumns:")
    print(column_names)

    preview = frame.head(5)
    print("\nFirst 5 rows:")
    print(preview)
    print("\n")

# Summarise every matched file, visiting the paths in sorted order
for file in sorted([*pn_before_files, *pn_after_files]):
    examine_csv(file)


Examining PreTrain - PNafter gpu-time.csv:
--------------------------------------------------

Columns:
['Relative Time (Process)', 'denim-lake-75 - system/gpu.process.0.memory', 'denim-lake-75 - system/gpu.process.0.memory__MIN', 'denim-lake-75 - system/gpu.process.0.memory__MAX']

First 5 rows:
   Relative Time (Process)  denim-lake-75 - system/gpu.process.0.memory  \
0                61.132072                                         4.73   
1                91.132639                                        58.20   
2               121.133184                                        69.13   
3               151.133758                                        69.07   
4               181.134467                                        70.07   

   denim-lake-75 - system/gpu.process.0.memory__MIN  \
0                                              4.73   
1                                             58.20   
2                                             69.13   
3                             

In [None]:
import pandas as pd
import glob

def examine_csv(filepath):
    """Load a CSV and print its column names plus its first five rows.

    Args:
        filepath: Location of the CSV file, passed to ``pd.read_csv``.

    Returns:
        None. Everything is written to stdout.
    """
    # Banner: file name followed by a 50-character rule
    print(f"\nExamining {filepath}:", "-" * 50, sep="\n")

    table = pd.read_csv(filepath)

    # Column listing
    print("\nColumns:", table.columns.tolist(), sep="\n")

    # Head of the table, then a trailing blank gap between files
    print("\nFirst 5 rows:", table.head(), sep="\n")
    print("\n")

# Every exported CSV whose name starts with 'PreTrain -'
files = glob.glob('PreTrain -*.csv')

# Print a summary per file, in sorted path order
files_in_order = sorted(files)
for file in files_in_order:
    examine_csv(file)


Examining PreTrain - LN gpu-time.csv:
--------------------------------------------------

Columns:
['Relative Time (Process)', 'prime-snowball-26 - system/gpu.process.0.memory', 'prime-snowball-26 - system/gpu.process.0.memory__MIN', 'prime-snowball-26 - system/gpu.process.0.memory__MAX']

First 5 rows:
   Relative Time (Process)  prime-snowball-26 - system/gpu.process.0.memory  \
0                60.562657                                             4.53   
1                90.563732                                             0.00   
2               120.566191                                            32.93   
3               150.569060                                            34.67   
4               180.571015                                            33.80   

   prime-snowball-26 - system/gpu.process.0.memory__MIN  \
0                                               4.53      
1                                               0.00      
2                                         

# Main ADA

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker

# Dark matplotlib base plus seaborn grid colours, to match the W&B dark theme
plt.style.use('dark_background')
sns.set_style(
    style="darkgrid",
    rc={'axes.edgecolor': '#2B2B2B', 'grid.color': '#2B2B2B'},
)

# Load the tokens-per-second exports for both runs
pn_after = pd.read_csv('PreTrain - PNafter tok-sec.csv')
pn_before = pd.read_csv('PreTrain - PNbefore tok-sec.csv')

# Rescale the PNafter throughput column in place
# NOTE(review): the origin of the 19073/21841 ratio is not shown here — confirm
scaling_factor = 19073/21841
pn_after['denim-lake-75 - tok per sec'] *= scaling_factor

def create_wandb_style_plot(figsize=(12, 6)):
    """Create a figure/axes pair styled to resemble the W&B dark theme.

    The axes get a '#111111' background, '#2B2B2B' spines, white tick and
    label text, a translucent grid, and a y-axis formatter that renders
    values in 'k' notation (or 'e+6' notation from one million upwards).

    Args:
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        tuple: ``(fig, ax)`` as returned by ``plt.subplots``, after styling.
    """
    background = '#111111'
    border = '#2B2B2B'

    fig, ax = plt.subplots(figsize=figsize)
    ax.grid(True, alpha=0.4)

    # Same dark background on the plotting area and the surrounding figure
    ax.set_facecolor(background)
    fig.patch.set_facecolor(background)

    # Darker borders, white text
    for side in ('bottom', 'top', 'right', 'left'):
        ax.spines[side].set_color(border)
    ax.tick_params(colors='white')
    ax.yaxis.label.set_color('white')
    ax.xaxis.label.set_color('white')
    ax.title.set_color('white')

    def y_fmt(x, p):
        """Format a tick value as e.g. '19.5k' or '1.5e+6'.

        ``:g`` keeps fractional parts, so neighbouring ticks such as 19000
        and 19500 no longer collapse to the same truncated '19k' label, and
        1.5e6 is no longer rounded to '2e+6'.
        """
        if abs(x) >= 1e6:
            return f'{x/1e6:g}e+6'
        return f'{x/1000:g}k'

    ax.yaxis.set_major_formatter(ticker.FuncFormatter(y_fmt))

    return fig, ax

# Line colours chosen to match the W&B theme
after_color = '#00CCBB'  # cyan/turquoise
before_color = '#FF9966'  # orange

# Draw both runs on one set of axes
fig, ax = create_wandb_style_plot()
for frame, run_label, line_color in (
    (pn_after, 'denim-lake-75', after_color),
    (pn_before, 'sleek-thunder-73', before_color),
):
    ax.plot(frame['Step'], frame[f'{run_label} - tok per sec'],
            color=line_color, label=run_label, alpha=0.8)

# The metric name goes in the title; no y-axis label is set
ax.set_xlabel('Step')
ax.set_title('tok per sec')
ax.legend(facecolor='#111111', edgecolor='#2B2B2B', labelcolor='white')

# Write the figure to disk, then release it
fig.savefig('pn_comparison_tok_sec.png', dpi=100, bbox_inches='tight', facecolor='#111111')
plt.close(fig)

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import os

# Easy configuration
STEP_SIZE = 3000  # Width, in steps, of each averaging bin; change to switch bin size
OUTPUT_SUFFIX = f'{STEP_SIZE}_steps'  # Embedded in every output filename

# Make sure the results directory exists.
# exist_ok=True replaces the separate existence check and is a no-op when the
# directory is already present.
os.makedirs('analysis_results', exist_ok=True)

# Maps each model type (as used in the CSV filenames) to its run name
# (as used in the CSV column headers)
model_names = {
    'LN': 'prime-snowball-26',
    'RMSN': 'autumn-energy-2',
    'PNbefore': 'sleek-thunder-73',
    'PNafter': 'denim-lake-75'
}

def calculate_average_by_steps(df, value_column, step_size=STEP_SIZE):
    """Average ``value_column`` over consecutive bins of ``step_size`` steps.

    Each row is assigned to the bin ``(Step // step_size) * step_size``, so a
    bin labelled ``b`` covers steps ``b`` through ``b + step_size - 1``. Rows
    beyond the last full multiple of ``step_size`` fall into the final bin;
    they are counted once, there, and not repeated in an extra trailing row.

    The input frame is not modified.

    Args:
        df: DataFrame with a 'Step' column and a ``value_column`` column.
        value_column: Name of the column to average.
        step_size: Bin width in steps; must be positive.

    Returns:
        DataFrame with two columns, 'step_bin' and ``value_column`` (the
        per-bin mean), one row per bin, ordered by 'step_bin'.

    Raises:
        ValueError: If ``step_size`` is not positive.
    """
    if step_size <= 0:
        raise ValueError(f"step_size must be positive, got {step_size}")

    # Build the bin labels as a standalone Series rather than writing a
    # 'step_bin' column into the caller's frame.
    step_bin = ((df['Step'] // step_size) * step_size).rename('step_bin')

    # Grouping by the aligned Series yields a 'step_bin' index, which
    # reset_index turns into the first output column.
    return df.groupby(step_bin)[value_column].mean().reset_index()

def filter_outliers(df, value_column, percentile_threshold=1):
    """Keep rows whose ``value_column`` lies within a central percentile band.

    The band runs from the ``percentile_threshold``-th percentile to the
    ``(100 - percentile_threshold)``-th percentile, both inclusive. The
    percentiles ignore NaN entries, so a few missing values no longer turn
    both bounds into NaN and discard every row. Rows whose value is NaN are
    themselves excluded from the result.

    Args:
        df: DataFrame containing ``value_column``.
        value_column: Name of the numeric column to filter on.
        percentile_threshold: Percentile cut off at each tail (default 1).

    Returns:
        The subset of ``df`` rows inside the band, with the original columns
        and index labels. Empty if the column has no non-NaN values.
    """
    values = df[value_column]

    # No usable values means no percentiles to compute: return an empty
    # frame with the same columns.
    if values.notna().sum() == 0:
        return df.iloc[0:0]

    lower_bound = np.nanpercentile(values, percentile_threshold)
    upper_bound = np.nanpercentile(values, 100 - percentile_threshold)

    # NaN compares False on both sides, so NaN rows drop out here
    return df[(values >= lower_bound) & (values <= upper_bound)]

# Per-model results: results[model][result_key] -> DataFrame
results = {model: {} for model in model_names.keys()}

# For each model, read its four exported CSVs and derive one frame per metric.
# Columns are renamed by name (not by position) and the 'model' tag is added
# with .assign(), so each stored frame is a fresh object rather than a
# modified slice of a filtered frame (avoids SettingWithCopyWarning).
for model, run_name in model_names.items():
    # loss/train: mean per step bin
    loss_train_df = pd.read_csv(f'PreTrain - {model} loss-train.csv')
    value_col = f'{run_name} - loss/train'
    avg_loss = (
        calculate_average_by_steps(loss_train_df, value_col)
        .rename(columns={value_col: 'average_loss'})
        .assign(model=model)
    )
    results[model]['loss_train_avg'] = avg_loss

    # global gradient norm: mean per step bin
    grad_norm_df = pd.read_csv(f'PreTrain - {model} grad-norm.csv')
    value_col = f'{run_name} - global gradient norm'
    avg_grad = (
        calculate_average_by_steps(grad_norm_df, value_col)
        .rename(columns={value_col: 'average_gradient_norm'})
        .assign(model=model)
    )
    results[model]['grad_norm_avg'] = avg_grad

    # tokens per second: outlier-filtered
    tok_sec_df = pd.read_csv(f'PreTrain - {model} tok-sec.csv')
    value_col = f'{run_name} - tok per sec'
    if model == 'PNafter':
        # NOTE(review): provenance of the 19073/21841 ratio is not shown in
        # this notebook — confirm before reusing the scaled numbers
        tok_sec_df[value_col] *= (19073/21841)
    filtered_tok = (
        filter_outliers(tok_sec_df, value_col)[['Step', value_col]]
        .rename(columns={'Step': 'step', value_col: 'tokens_per_second'})
        .assign(model=model)
    )
    results[model]['tok_sec_filtered'] = filtered_tok

    # GPU process memory over relative time: outlier-filtered
    gpu_time_df = pd.read_csv(f'PreTrain - {model} gpu-time.csv')
    value_col = f'{run_name} - system/gpu.process.0.memory'
    filtered_gpu = (
        filter_outliers(gpu_time_df, value_col)[['Relative Time (Process)', value_col]]
        .rename(columns={'Relative Time (Process)': 'time', value_col: 'gpu_memory_usage'})
        .assign(model=model)
    )
    results[model]['gpu_time_filtered'] = filtered_gpu

# Stack each metric's per-model frames and write one CSV per metric
model_order = list(model_names)

# 1. Binned training-loss averages
loss_train_combined = pd.concat([results[name]['loss_train_avg'] for name in model_order])
loss_train_combined.to_csv(f'analysis_results/average_loss_train_by_{OUTPUT_SUFFIX}.csv', index=False)

# 2. Binned gradient-norm averages
grad_norm_combined = pd.concat([results[name]['grad_norm_avg'] for name in model_order])
grad_norm_combined.to_csv(f'analysis_results/average_gradient_norm_by_{OUTPUT_SUFFIX}.csv', index=False)

# 3. Outlier-filtered tokens per second
tok_sec_combined = pd.concat([results[name]['tok_sec_filtered'] for name in model_order])
tok_sec_combined.to_csv(f'analysis_results/filtered_tokens_per_second_{OUTPUT_SUFFIX}.csv', index=False)

# 4. Outlier-filtered GPU memory usage
gpu_time_combined = pd.concat([results[name]['gpu_time_filtered'] for name in model_order])
gpu_time_combined.to_csv(f'analysis_results/filtered_gpu_memory_usage_{OUTPUT_SUFFIX}.csv', index=False)

# Report what was written (one line per item)
print(
    f"Results have been saved to the 'analysis_results' directory (Step Size: {STEP_SIZE}):",
    f"1. average_loss_train_by_{OUTPUT_SUFFIX}.csv",
    f"2. average_gradient_norm_by_{OUTPUT_SUFFIX}.csv",
    f"3. filtered_tokens_per_second_{OUTPUT_SUFFIX}.csv",
    f"4. filtered_gpu_memory_usage_{OUTPUT_SUFFIX}.csv",
    sep="\n",
)

Results have been saved to the 'analysis_results' directory:
1. average_loss_train_by_3000_steps.csv
2. average_gradient_norm_by_3000_steps.csv
3. filtered_tokens_per_second.csv
4. filtered_gpu_memory_usage.csv
