# A/B Testing Guide - Interactive Learning
# دليل اختبارات A/B - تعلم تفاعلي

This notebook covers:
- Understanding A/B testing concepts
- Designing and running experiments
- User assignment strategies
- Statistical significance testing

تغطي هذه المفكرة:
- فهم مفاهيم اختبارات A/B
- تصميم وتشغيل التجارب
- استراتيجيات تعيين المستخدمين
- اختبار الدلالة الإحصائية

## Part 1: Understanding A/B Testing
## الجزء 1: فهم اختبارات A/B

### What is A/B Testing?
### ما هي اختبارات A/B؟

A/B testing compares two versions to determine which performs better.

In [None]:
# Simulating A/B testing
import random
from dataclasses import dataclass
from typing import Dict

@dataclass
class Variant:
    """One arm of an A/B test together with its observed traffic counts."""

    # Human-readable label for the arm (e.g. the button colour under test).
    name: str
    # Allocation for this arm; the example below passes 0.5 for each variant
    # (presumably the fraction of traffic it receives).
    allocation: float
    # Number of visitors who converted; defaults to 0.
    conversions: int = 0
    # Number of visitors who saw this arm; defaults to 0.
    visitors: int = 0

# Two arms with equal allocation and 1000 visitors each.
variant_a = Variant(
    name="Blue Button",
    allocation=0.5,
    conversions=200,
    visitors=1000,
)
variant_b = Variant(
    name="Green Button",
    allocation=0.5,
    conversions=240,
    visitors=1000,
)

# Calculate conversion rates
def conversion_rate(variant: Variant) -> float:
    """Return the variant's conversion rate as a percentage (0-100).

    Args:
        variant: Object exposing ``conversions`` and ``visitors`` counts.

    Returns:
        ``conversions / visitors * 100``, or ``0.0`` when the variant has not
        received any visitors yet (instead of raising ``ZeroDivisionError``).
    """
    # A freshly created Variant defaults to visitors=0; treat that as "no data".
    if not variant.visitors:
        return 0.0
    return (variant.conversions / variant.visitors) * 100

# Compute each rate once so the summary lines below stay in sync.
rate_a = conversion_rate(variant_a)
rate_b = conversion_rate(variant_b)

print("A/B Test Results:")
print(f"{variant_a.name}: {rate_a:.2f}% conversion rate")
print(f"{variant_b.name}: {rate_b:.2f}% conversion rate")
# The difference of two percentages is measured in percentage points; the
# relative lift divides that difference by the baseline rate. Report both so
# the numbers are not confused with each other.
print(f"\nAbsolute lift: {rate_b - rate_a:.2f} percentage points")
if rate_a > 0:
    print(f"Relative lift: {(rate_b - rate_a) / rate_a * 100:.2f}%")

### Exercise 1: Design an A/B Test
### تمرين 1: تصميم اختبار A/B

Design an A/B test for comparing two chat answer models:
- Define hypothesis
- Choose metrics
- Calculate sample size

In [None]:
# TODO: Replace the placeholder strings below with your own test design.

hypothesis = dict(
    change="TODO: What are you testing?",
    expected_outcome="TODO: What do you expect?",
    primary_metric="TODO: How to measure?",
    baseline=0.20,  # 20% satisfaction rate
    target=0.25,  # 25% satisfaction rate
)

print("Your A/B Test Design:")
# Free-text fields are printed verbatim.
for label, field in (
    ("Change", "change"),
    ("Expected Outcome", "expected_outcome"),
    ("Primary Metric", "primary_metric"),
):
    print(f"{label}: {hypothesis[field]}")
# Rates are rendered as percentages with two decimals.
for label, field in (("Baseline", "baseline"), ("Target", "target")):
    print(f"{label}: {hypothesis[field]:.2%}")

### Solution / الحل

In [None]:
# Reference solution: swap the chat model and measure user satisfaction.
hypothesis = dict(
    change="Use GPT-4 instead of GPT-3.5 for chat answers",
    expected_outcome="User satisfaction increases by 25%",
    primary_metric="satisfaction_score",
    baseline=0.20,
    target=0.25,
)

print("A/B Test Design (Solution):")
for field in hypothesis:
    entry = hypothesis[field]
    # Float entries are rates and are shown as percentages; the rest print as-is.
    rendered = f"{entry:.2%}" if isinstance(entry, float) else f"{entry}"
    print(f"  {field}: {rendered}")

## Part 2: User Assignment
## الجزء 2: تعيين المستخدمين

In [None]:
import hashlib

def assign_variant_deterministic(
    user_id: str,
    variants: list,
    experiment_id: str = "experiment_123",
) -> dict:
    """
    Assign a variant using deterministic hashing.

    The same (user_id, experiment_id) pair always gets the same variant.

    Args:
        user_id: Identifier of the user being bucketed.
        variants: Dicts with a 'name' and an 'allocation' (fraction of
            traffic); allocations are expected to sum to 1.0.
        experiment_id: Salt mixed into the hash so that different experiments
            bucket users differently. Defaults to "experiment_123".

    Returns:
        A dict with 'user_id', 'variant' (the chosen variant's name) and
        'hash_value' (the bucket position in [0.0, 1.0)), or None when no
        variant covers that position (empty list, or allocations that sum
        to less than 1.0).
    """
    # MD5 is used only as a fast, stable hash here, not for security.
    digest = hashlib.md5(f"{user_id}:{experiment_id}".encode()).hexdigest()
    # The digest is 128 bits wide; keep the first 32 bits so the quotient is
    # a value in [0.0, 1.0). Dividing the whole digest by 65536 would give a
    # huge number that never falls inside any allocation bucket.
    normalized_value = int(digest[:8], 16) / 2**32

    cumulative = 0.0
    for variant in variants:
        cumulative += variant['allocation']
        # Strict comparison: buckets are half-open intervals [start, end).
        if normalized_value < cumulative:
            return {"user_id": user_id, "variant": variant['name'], "hash_value": normalized_value}

    # Float rounding can leave the running total a hair below 1.0 (e.g. three
    # allocations of 1/3); in that case the last variant owns the remainder.
    if variants and abs(cumulative - 1.0) < 1e-9:
        return {"user_id": user_id, "variant": variants[-1]['name'], "hash_value": normalized_value}

    return None

# Two equally weighted arms.
variants = [
    dict(name=label, allocation=0.5)
    for label in ("Model A", "Model B")
]

# Repeated ids are included on purpose so the output shows them landing on
# the same variant each time.
print("User Assignment Test:")
sample_ids = ("user_1", "user_2", "user_3", "user_1", "user_2")
for user_id in sample_ids:
    assignment = assign_variant_deterministic(user_id, variants)
    chosen, position = assignment['variant'], assignment['hash_value']
    print(f"  {user_id}: {chosen} (hash: {position:.4f})")

### Exercise 2: Verify Consistency
### تمرين 2: التحقق من الاتساق

Verify that the same user ID always gets the same variant.

In [None]:
# Assign the same user ten times and collect the variant names returned.
user_id = "test_user_123"
assignments = []
for _ in range(10):
    assignments.append(assign_variant_deterministic(user_id, variants)['variant'])

# A single distinct name means every call agreed.
distinct = set(assignments)
if len(distinct) != 1:
    print(f"✗ Inconsistent: Got different assignments: {distinct}")
else:
    print("✓ Consistent: All assignments are the same")
    print(f"  User {user_id} always gets: {assignments[0]}")

## Part 3: Statistical Significance
## الجزء 3: الدلالة الإحصائية

In [None]:
import math

def z_test_two_proportions(
    conversions_a: int,
    total_a: int,
    conversions_b: int,
    total_b: int,
) -> dict:
    """
    Two-sided z-test for comparing two proportions (B versus A).

    Args:
        conversions_a: Number of conversions in group A.
        total_a: Number of observations in group A (must be non-zero).
        conversions_b: Number of conversions in group B.
        total_b: Number of observations in group B (must be non-zero).

    Returns:
        A dict with:
        - "z_score": (p_b - p_a) divided by the pooled standard error
          (0.0 when that standard error is zero).
        - "p_value": two-sided p-value under the standard normal.
        - "significant": True when p_value < 0.05.
        - "lift": relative lift of B over A in percent; ``math.inf`` when A
          has no conversions but B does, 0.0 when neither has any.
        - "confidence_interval": 95% interval for the absolute difference
          p_b - p_a, expressed in percentage points.

    Raises:
        ZeroDivisionError: If ``total_a`` or ``total_b`` is zero.
    """
    p1 = conversions_a / total_a
    p2 = conversions_b / total_b
    diff = p2 - p1

    # Under the null hypothesis both groups share one rate, so the test
    # statistic uses the pooled proportion for its standard error.
    p_pooled = (conversions_a + conversions_b) / (total_a + total_b)
    se_pooled = math.sqrt(
        p_pooled * (1 - p_pooled) * (1/total_a + 1/total_b)
    )

    # The pooled SE is zero when nobody (or everybody) converted in both
    # groups; the rates are then identical and there is nothing to detect.
    z = diff / se_pooled if se_pooled > 0 else 0.0

    # Two-sided p-value: 2 * (1 - Phi(|z|)) == erfc(|z| / sqrt(2)).
    # erfc needs only the standard library and stays accurate in the far tail.
    p_value = math.erfc(abs(z) / math.sqrt(2))

    # The confidence interval estimates the true difference, so it must not
    # assume the null hypothesis: use the unpooled standard error.
    se_diff = math.sqrt(
        p1 * (1 - p1) / total_a + p2 * (1 - p2) / total_b
    )
    ci_lower = diff - 1.96 * se_diff
    ci_upper = diff + 1.96 * se_diff

    # Relative lift is undefined for a zero baseline; avoid dividing by zero.
    if p1 > 0:
        lift = diff / p1 * 100
    elif diff > 0:
        lift = math.inf
    else:
        lift = 0.0

    return {
        "z_score": z,
        "p_value": p_value,
        "significant": p_value < 0.05,
        "lift": lift,
        "confidence_interval": (ci_lower * 100, ci_upper * 100),
    }

# Re-run the Part 1 example counts (200/1000 vs 240/1000) through the z-test.
result = z_test_two_proportions(
    conversions_a=200, total_a=1000,
    conversions_b=240, total_b=1000,
)
ci_low, ci_high = result['confidence_interval']

print("Statistical Analysis:")
print(f"  Z-score: {result['z_score']:.4f}")
print(f"  P-value: {result['p_value']:.6f}")
print(f"  Significant (p<0.05): {result['significant']}")
# 'lift' is relative to A's rate, while the interval bounds the absolute
# difference between the two rates; label them so they are not mistaken
# for the same quantity.
print(f"  Relative lift: {result['lift']:.2f}%")
print(f"  95% CI (absolute difference): [{ci_low:.2f}, {ci_high:.2f}] percentage points")

### Exercise 3: Interpret Results
### تمرين 3: تفسير النتائج

Given the results above, what would you conclude?

In [None]:
# Interpret results
result = z_test_two_proportions(
    conversions_a=200, total_a=1000,
    conversions_b=240, total_b=1000,
)
ci_low, ci_high = result['confidence_interval']

print("Result Interpretation:")

if not result['significant']:
    print(f"✗ No significant difference (p={result['p_value']:.6f})")
    print("\nConclusion: Need more data or results are inconclusive")
else:
    print(f"✓ Significant difference detected (p={result['p_value']:.6f})")
    # A significant result can go either way: only recommend B when it wins.
    b_wins = result['lift'] > 0
    if b_wins:
        print(f"✓ Variant B is {result['lift']:.2f}% better than A (relative lift)")
    else:
        print(f"✗ Variant B is {abs(result['lift']):.2f}% worse than A (relative lift)")
    # The interval bounds the absolute difference in rates (percentage
    # points), not the relative lift printed above.
    print(f"✓ 95% confident the true absolute difference is between {ci_low:.2f} and {ci_high:.2f} percentage points")
    if b_wins:
        print("\nConclusion: Implement Variant B")
    else:
        print("\nConclusion: Keep Variant A")

## Part 4: GraphQL Integration
## الجزء 4: التكامل مع GraphQL

In [None]:
# GraphQL documents used throughout this part of the notebook.

# Lists up to 20 experiments, optionally filtered by status, with their variants.
query_list_experiments = """
query ListExperiments($status: ExperimentStatus) {
  experiments(status: $status, limit: 20) {
    id
    name
    description
    status
    created_at
    variants {
      id
      name
      allocation
    }
  }
}
"""

# Fetches the metrics and significance summary for a single experiment.
query_get_results = """
query GetResults($experimentId: ID!) {
  experimentResults(experimentId: $experimentId) {
    experiment {
      id
      name
      status
    }
    metrics {
      metricName
      metricValue
    }
    significant
    summary
  }
}
"""

# Creates an experiment; variants are passed as a list of strings.
mutation_create_experiment = """
mutation CreateExperiment(
  $name: String!
  $description: String!
  $variants: [String!]!
) {
  createExperiment(
    name: $name
    description: $description
    variantsInput: $variants
  ) {
    id
    name
    status
  }
}
"""

print("GraphQL Queries and Mutations:")
# Each entry pairs a numbered heading with the document printed beneath it.
for heading, document in (
    ("\n1. List Experiments:", query_list_experiments),
    ("\n2. Get Results:", query_get_results),
    ("\n3. Create Experiment:", mutation_create_experiment),
):
    print(heading)
    print(document)

### Exercise 4: Create Experiment Payload
### تمرين 4: إنشاء حمولة التجربة

Create a GraphQL mutation payload to create an experiment with 2 variants.

In [None]:
import json

# TODO: Fill in the placeholders to build the createExperiment request body.
payload_variables = {
    "name": "TODO: Experiment name",
    "description": "TODO: Experiment description",
    # TODO: Add variant configs
    "variants": [],
}
experiment_payload = {
    "query": mutation_create_experiment,
    "variables": payload_variables,
}

print("Create Experiment Payload:")
print(json.dumps(experiment_payload, indent=2))

### Solution / الحل

In [None]:
# Solution: the mutation declares $variants as [String!]!, so each variant
# config is serialized to its own JSON string.
variant_specs = [
    {"name": "GPT-4", "allocation": 0.5, "config": {"model": "gpt-4"}},
    {"name": "GPT-3.5", "allocation": 0.5, "config": {"model": "gpt-3.5"}},
]
experiment_payload = {
    "query": mutation_create_experiment,
    "variables": {
        "name": "Chat Model Comparison",
        "description": "Compare GPT-4 vs GPT-3.5 for answer quality",
        "variants": [json.dumps(spec) for spec in variant_specs],
    },
}

print("Create Experiment Payload (Solution):")
print(json.dumps(experiment_payload, indent=2))

## Part 5: Best Practices Quiz
## الجزء 5: اختبار أفضل الممارسات

### Quiz Questions / أسئلة الاختبار

**Q1:** Why should you test one change at a time?
a) It's easier
b) To know which change caused the effect
c) Multiple changes are better
d) It's faster

**Q2:** What's the purpose of pre-committing to sample size?
a) To stop earlier
b) To ensure adequate statistical power and prevent early-stopping (peeking) bias
c) To reduce costs
d) To test more variants

**Q3:** What p-value indicates statistical significance?
a) > 0.05
b) < 0.05
c) < 0.10
d) = 0.50

### أسئلة الاختبار

**س1:** لماذا يجب اختبار تغيير واحد في كل مرة؟
أ) أسهل
ب) لمعرفة التغيير الذي تسبب الأثر
ج) التغييرات المتعددة أفضل
د) أسرع

In [None]:
# Answer key for the quiz questions above.
quiz_answers = dict(Q1="b", Q2="b", Q3="b")

for question in quiz_answers:
    print(f"{question}: {quiz_answers[question]}")

## Part 6: Simulation
## الجزء 6: المحاكاة

In [None]:
# Simulate running an A/B test
import numpy as np

def simulate_ab_test(
    days: int,
    daily_users: int,
    variants: list,
    seed: "int | None" = None,
) -> dict:
    """Simulate an A/B test over multiple days.

    The same ``daily_users`` user ids ("user_0", "user_1", ...) visit on every
    day, so each user keeps the same variant and 'visitors' counts visits
    rather than unique users.

    Args:
        days: Number of simulated days.
        daily_users: Number of user visits per day.
        variants: Dicts with 'name', 'allocation' and an optional
            'conversion_rate' (defaults to 0.20).
        seed: Optional seed for a private random generator, making the run
            reproducible. When None, the module-level ``random`` state is
            used.

    Returns:
        Mapping of variant name to {'conversions': int, 'visitors': int}.

    Raises:
        ValueError: If a user cannot be assigned to any variant.
    """
    results = {v['name']: {'conversions': 0, 'visitors': 0} for v in variants}

    # Conversion rates do not change during the run; look them up once.
    # setdefault keeps the first entry if two variants share a name.
    rate_by_variant = {}
    for v in variants:
        rate_by_variant.setdefault(v['name'], v.get('conversion_rate', 0.20))

    rng = random if seed is None else random.Random(seed)

    # Assignment depends only on the user id, so compute it once per user
    # instead of once per user per day.
    variant_by_user = []
    for user_id in range(daily_users):
        assignment = assign_variant_deterministic(f"user_{user_id}", variants)
        if assignment is None:
            raise ValueError(
                f"user_{user_id} could not be assigned to a variant; "
                "check that the allocations sum to 1.0"
            )
        variant_by_user.append(assignment['variant'])

    for _ in range(days):
        for variant_name in variant_by_user:
            tally = results[variant_name]
            # Simulate conversion (one random draw per visit).
            if rng.random() < rate_by_variant[variant_name]:
                tally['conversions'] += 1
            tally['visitors'] += 1

    return results

# Run simulation
variants = [
    {"name": "Model A", "allocation": 0.5, "conversion_rate": 0.20},
    # +3 percentage points over Model A, i.e. a 15% relative lift
    {"name": "Model B", "allocation": 0.5, "conversion_rate": 0.23},
]

print("Simulating A/B Test (7 days, 1000 users/day)...")
results = simulate_ab_test(days=7, daily_users=1000, variants=variants)

for variant_name, data in results.items():
    # Guard against an arm that received no traffic.
    conv_rate = (data['conversions'] / data['visitors']) * 100 if data['visitors'] else 0.0
    print(f"\n{variant_name}:")
    print(f"  Visitors: {data['visitors']}")
    print(f"  Conversions: {data['conversions']}")
    print(f"  Conversion Rate: {conv_rate:.2f}%")

# Statistical analysis
stat_result = z_test_two_proportions(
    conversions_a=results['Model A']['conversions'],
    total_a=results['Model A']['visitors'],
    conversions_b=results['Model B']['conversions'],
    total_b=results['Model B']['visitors'],
)
ci_low, ci_high = stat_result['confidence_interval']

print("\nStatistical Analysis:")
print(f"  Significant: {stat_result['significant']} (p={stat_result['p_value']:.6f})")
# The lift is relative to Model A; the interval bounds the absolute
# difference in conversion rates.
print(f"  Relative lift: {stat_result['lift']:.2f}%")
print(f"  95% CI (absolute difference): [{ci_low:.2f}, {ci_high:.2f}] percentage points")

## Summary / الملخص

**Key concepts covered / المفاهيم الرئيسية المشمولة:**

1. **A/B testing**: Compare two versions to determine the winner
2. **User assignment**: Deterministic hashing for consistent assignment
3. **Statistical significance**: Z-test for proportions, p-values
4. **GraphQL integration**: Queries and mutations for experiments
5. **Best practices**: One change, sufficient sample, pre-commitment

**النقاط الرئيسية المشمولة:**

1. **اختبارات A/B**: قارن نسختين لتحديد الفائز
2. **تعيين المستخدمين**: التجزئة الحتمية لتخصيص متسق
3. **الدلالة الإحصائية**: اختبار Z للنسب، قيم P
4. **التكامل مع GraphQL**: استعلامات وتغييرات للتجارب
5. **أفضل الممارسات**: تغيير واحد، عينة كافية، التزام مسبق