## 1. Setup & Imports

In [None]:
# Standard imports
import os
import sys
import json
import yaml
from pathlib import Path

# Data processing
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# ML Libraries
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM

# Make the parent of the current working directory importable
PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

# Plot appearance: whitegrid matplotlib style plus the "husl" seaborn palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

# Display settings
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Report the PyTorch build and, when CUDA is usable, details of device 0
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"Memory: {total_bytes / 1e9:.2f} GB")

## 2. Load Configuration

In [None]:
# Load project configuration
config_path = PROJECT_ROOT / 'configs' / 'config.yaml'

# Decode the YAML explicitly as UTF-8 so parsing does not depend on the
# platform's default locale encoding (which is not UTF-8 everywhere).
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# Echo the settings this notebook reads from the config
print("üìã Project Configuration:")
print(f"  Model: {config['model']['base_model']}")
print(f"  Dataset: {config['data']['dataset_name']}")
print(f"  LoRA Rank: {config['lora']['rank']}")
print(f"  Batch Size: {config['training']['batch_size']}")
print(f"  Learning Rate: {config['training']['learning_rate']}")

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Load the Databricks Dolly 15k dataset through `load_dataset`
print("Loading dataset...")
dataset = load_dataset("databricks/databricks-dolly-15k")

# Summarise the train split: number of rows and declared features
train_split = dataset['train']
print("\nüìä Dataset Overview:")
print(f"  Total samples: {len(train_split)}")
print(f"  Features: {train_split.features}")

In [None]:
# Materialise the train split as a pandas DataFrame for analysis
df = pd.DataFrame(dataset['train'])

# Character length of each text field, stored as `<field>_len`
for field in ('instruction', 'context', 'response'):
    df[f'{field}_len'] = df[field].str.len()

# A sample "has context" when its context string is non-empty
df['has_context'] = df['context_len'] > 0

length_columns = ['instruction_len', 'context_len', 'response_len']
print("\nüìà Length Statistics:")
print(df[length_columns].describe())

In [None]:
# Category distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: horizontal bar chart of sample counts per category.
# value_counts() sorts by descending count; inverting the y-axis puts the
# largest category at the top.
category_counts = df['category'].value_counts()
axes[0].barh(category_counts.index, category_counts.values, color='steelblue')
axes[0].set_xlabel('Count')
axes[0].set_title('Samples per Category', fontweight='bold')
axes[0].invert_yaxis()

# Right panel: share of samples with / without a context passage.
# value_counts() orders its rows by frequency, so the first row is whichever
# of True/False is more common. Reindex to a fixed [True, False] order so the
# counts always line up with the hard-coded labels and colours below
# (fill_value=0 covers the case where one of the two values never occurs).
context_counts = df['has_context'].value_counts().reindex([True, False], fill_value=0)
axes[1].pie(context_counts.values, labels=['With Context', 'No Context'],
            autopct='%1.1f%%', colors=['#4ECDC4', '#FF6B6B'])
axes[1].set_title('Samples with Context', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# One histogram per text field, annotated with its mean and median length
panel_titles = {
    'instruction_len': 'Instruction Length',
    'context_len': 'Context Length',
    'response_len': 'Response Length',
}
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for ax, (col, title) in zip(axes, panel_titles.items()):
    lengths = df[col]
    mean_len = lengths.mean()
    median_len = lengths.median()

    ax.hist(lengths, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
    ax.axvline(mean_len, color='red', linestyle='--', label=f'Mean: {mean_len:.0f}')
    ax.axvline(median_len, color='green', linestyle='--', label=f'Median: {median_len:.0f}')
    ax.set(xlabel='Characters', ylabel='Count')
    ax.set_title(title, fontweight='bold')
    ax.legend(fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
# Print the first row of each of the first three categories (in order of appearance)
divider = "-" * 80
print("\nüìù Sample Examples:")
print("=" * 80)

shown_categories = df['category'].unique()[:3]
for label in shown_categories:
    first_row = df.loc[df['category'] == label].iloc[0]
    # Previews are capped at 200 characters; the trailing "..." is always printed
    instruction_preview = first_row['instruction'][:200]
    response_preview = first_row['response'][:200]
    print(f"\nüè∑Ô∏è Category: {label}")
    print(f"üìå Instruction: {instruction_preview}...")
    print(f"üí¨ Response: {response_preview}...")
    print(divider)

## 4. Data Preparation

In [None]:
# Pull in the project's dataset preparation class
from data.prepare_data import DatasetPreparator

# Build the prepared splits from the loaded configuration
preparator = DatasetPreparator(config)
datasets = preparator.prepare_data()

# Report how many samples each returned split holds
print("\n‚úÖ Prepared datasets:")
for split_name, split_data in datasets.items():
    print(f"  {split_name}: {len(split_data)} samples")

In [None]:
# Preview the first 1000 characters of the first prepared training sample
sample = datasets['train'][0]
print(
    "üìù Formatted Prompt Example:",
    "=" * 80,
    sample['text'][:1000],
    "...",
    sep="\n",
)

## 5. Model Loading & Inspection

In [None]:
# Instantiate the tokenizer for the base model named in the config
model_name = config['model']['base_model']
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Print the tokenizer's headline properties in a single call
print(
    "\nüî§ Tokenizer Info:",
    f"  Model: {model_name}",
    f"  Vocab size: {tokenizer.vocab_size}",
    f"  Model max length: {tokenizer.model_max_length}",
    sep="\n",
)

In [None]:
# Analyze token lengths
def count_tokens(text):
    """Return the number of token ids the notebook-level ``tokenizer`` yields for ``text``.

    Truncation is switched off so the full length is measured.
    """
    token_ids = tokenizer.encode(text, truncation=False)
    return len(token_ids)

# Work on a reproducible subset of at most 1000 rows to keep tokenization fast
sample_size = min(1000, len(df))
sample_df = df.sample(sample_size, random_state=42)

# Join the three text fields with single spaces, then count tokens per row
combined_text = (
    sample_df['instruction'] + ' ' + sample_df['context'] + ' ' + sample_df['response']
)
sample_df['total_tokens'] = combined_text.map(count_tokens)

print("\nüéØ Token Statistics:")
print(sample_df['total_tokens'].describe())

In [None]:
# Histogram of per-sample token totals, with the 2048 mark and the mean drawn in
max_len = 2048
token_counts = sample_df['total_tokens']
mean_tokens = token_counts.mean()

fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(token_counts, bins=50, color='steelblue', edgecolor='black', alpha=0.7)
ax.axvline(max_len, color='red', linestyle='--', linewidth=2,
           label=f'Max length ({max_len})')
ax.axvline(mean_tokens, color='green', linestyle='--',
           label=f'Mean: {mean_tokens:.0f}')

ax.set(xlabel='Total Tokens', ylabel='Count')
ax.set_title('Token Length Distribution', fontweight='bold')
ax.legend()

plt.tight_layout()
plt.show()

# Count the sampled rows whose token total is above the mark
truncated = (token_counts > max_len).sum()
over_pct = truncated / len(sample_df) * 100
print(f"\n‚ö†Ô∏è Samples exceeding {max_len} tokens: {truncated} ({over_pct:.1f}%)")

## 6. Quick Inference Test (Base Model)

In [None]:
# BitsAndBytesConfig is only used for this quantized load, so it is imported here
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization; compute dtype is float16
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.float16,
    "bnb_4bit_use_double_quant": True,
}
bnb_config = BitsAndBytesConfig(**quantization_settings)

print("Loading base model with 4-bit quantization...")
# NOTE(review): trust_remote_code=True permits code shipped with the model
# repository to run during loading — confirm the configured model is trusted.
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

print("\n‚úÖ Model loaded!")
print(f"  Parameters: {base_model.num_parameters():,}")
allocated_gb = torch.cuda.memory_allocated() / 1e9
print(f"  Memory: {allocated_gb:.2f} GB")

In [None]:
# Test inference
def generate_response(prompt, model, tokenizer, max_new_tokens=256):
    """Sample a continuation of ``prompt`` and return only the newly generated text.

    Args:
        prompt: Text the model is conditioned on.
        model: Object exposing ``device`` and ``generate(**inputs, ...)``; the
            generated sequence is expected to start with the prompt tokens.
        tokenizer: Callable that encodes ``prompt`` into tensors (including
            ``input_ids``) and provides ``decode`` and ``eos_token_id``.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded completion with special tokens removed and surrounding
        whitespace stripped. The prompt itself is not included.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[-1]

    # Inference only: no gradients are needed
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the tokens that follow the prompt. Cutting the decoded string
    # at len(prompt) is wrong whenever decoding does not reproduce the prompt
    # character-for-character, e.g. when skip_special_tokens removes markers
    # such as "<|im_start|>" that the prompt contains.
    completion_ids = outputs[0][prompt_token_count:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Single-turn prompt written with <|im_start|>/<|im_end|> role markers,
# ending where the assistant's reply should begin
test_prompt = (
    "<|im_start|>user\n"
    "Explain what machine learning is in simple terms.<|im_end|>\n"
    "<|im_start|>assistant\n"
)

print("ü§ñ Base Model Response:", "=" * 80, sep="\n")
response = generate_response(test_prompt, base_model, tokenizer)
print(response)

## 7. Results Loading & Visualization

In [None]:
# Load results (after running main.py)
results_dir = PROJECT_ROOT / 'results'

# Collect every pipeline results file in the results directory
results_files = list(results_dir.glob('pipeline_results_*.json'))

if results_files:
    # Pick the most recently written file. Modification time is used because
    # os.path.getctime is platform-dependent: creation time on Windows, but
    # last metadata change on Unix.
    latest_results = max(results_files, key=os.path.getmtime)
    print(f"üìÅ Loading results from: {latest_results}")

    # Decode explicitly as UTF-8 rather than the locale's default encoding
    with open(latest_results, 'r', encoding='utf-8') as f:
        results = json.load(f)

    print("\nüìä Available results:")
    for key in results:
        print(f"  - {key}")
else:
    # `results` stays defined so later cells can test it for truthiness
    print("‚ö†Ô∏è No results found. Run main.py first.")
    results = None

In [None]:
# Project plotting helper; the import runs whether or not results were loaded
from visualize import ResultsVisualizer

if results:
    viz = ResultsVisualizer(output_dir=str(results_dir / 'visualizations'))

    # Benchmark figure: drawn only when the key exists and holds non-empty data
    benchmark_data = results.get('benchmark')
    if benchmark_data:
        print("\nüìä Benchmark Results:")
        fig = viz.plot_benchmark_combined(benchmark_data)
        plt.show()

    # Quality-metric figure: same rule for the 'evaluation' key
    evaluation_data = results.get('evaluation')
    if evaluation_data:
        print("\nüìà Quality Metrics:")
        fig = viz.plot_quality_metrics(evaluation_data)
        plt.show()

## 8. Custom Analysis

In [None]:
# Placeholder cell for exploratory work: replace the print below with your
# own analysis and experiments
print(
    "üî¨ Custom Analysis Section",
    "Add your own analysis code here!",
    sep="\n",
)

---

## 📝 Notes & Observations

Use this section to document your findings and observations during the analysis.

### Key Findings:
1. 
2. 
3. 

### Recommendations:
1. 
2. 
3. 