# ⚡ Notebook 09: Performance Optimization

## Write faster pandas code

- Vectorization
- Memory optimization
- Efficient operations
- Chunking

In [None]:
# Shared setup for the whole notebook: the benchmark cells below use `pd`,
# `time`, and the `df` frame loaded here.
import time

import numpy as np
import pandas as pd

# Employee records used by every example. Later cells read the 'bonus',
# 'salary', 'age', 'department', 'position', 'city', 'status',
# 'employee_id', and 'performance_score' columns.
df = pd.read_csv('../datasets/employees.csv')

## 1. Vectorization vs Loops

In [None]:
# BAD: Loop approach
# Row-by-row assignment, kept deliberately slow as the baseline that the
# vectorized cell is compared against.

# Copy outside the timed region so only the loop itself is measured and
# `df` is never modified by this cell.
df_copy = df.copy()

# perf_counter() is the high-resolution clock intended for timing short
# sections of code; time.time() can be too coarse on some platforms.
start = time.perf_counter()
for i in range(len(df_copy)):
    # .loc is label-based, so using the positional counter `i` assumes the
    # frame still has the default 0..n-1 index.
    df_copy.loc[i, 'bonus_taxed'] = (
        df_copy.loc[i, 'bonus'] * 0.7 if pd.notna(df_copy.loc[i, 'bonus']) else 0
    )
loop_time = time.perf_counter() - start

print(f'Loop approach: {loop_time:.4f} seconds')

In [None]:
# GOOD: Vectorized approach
# One column-wide expression replaces the whole Python loop.
start = time.perf_counter()
df['bonus_taxed'] = df['bonus'].fillna(0) * 0.7  # missing bonus -> 0, then keep 70%
vector_time = time.perf_counter() - start
print(f'Vectorized approach: {vector_time:.4f} seconds')

# `loop_time` is set by the loop-approach cell. Guard the ratio: an elapsed
# reading of exactly 0 would otherwise raise ZeroDivisionError.
if vector_time > 0:
    print(f'Speedup: {loop_time/vector_time:.1f}x faster!')
else:
    print('Speedup: vectorized run was too fast to measure')

## 2. Memory Optimization

In [None]:
# Check memory usage
# deep=True also counts the Python string objects held in object columns,
# not just the pointers to them.
original_mb = df.memory_usage(deep=True).sum() / 1024**2

print('Original memory:')
print(original_mb, 'MB')

In [None]:
# Optimize data types
# Build one column -> dtype plan and apply it in a single astype() call.
narrow_int_cols = ['employee_id', 'age', 'performance_score']        # Int64 -> Int32 for smaller numbers
low_cardinality_cols = ['department', 'position', 'city', 'status']  # Object -> Category for low cardinality

dtype_plan = {name: 'int32' for name in narrow_int_cols}
dtype_plan.update({name: 'category' for name in low_cardinality_cols})

# astype() returns a new frame, so `df` keeps its original dtypes.
df_optimized = df.astype(dtype_plan)

optimized_mb = df_optimized.memory_usage(deep=True).sum() / 1024**2
print('\nOptimized memory:')
print(optimized_mb, 'MB')

## 3. Efficient Filtering

In [None]:
# SLOW: Multiple filters separately
# Each step builds an intermediate DataFrame before the next condition is
# evaluated. `slow_time` is read by the combined-filter cell.
# perf_counter() is used because these filters can finish faster than the
# resolution of time.time() on some platforms.
start = time.perf_counter()
result = df[df['salary'] > 70000]
result = result[result['age'] < 40]
result = result[result['department'] == 'Engineering']
slow_time = time.perf_counter() - start

In [None]:
# FAST: Single combined filter
# All three conditions are AND-ed into one boolean mask and `df` is indexed once.
start = time.perf_counter()
result = df[
    (df['salary'] > 70000) &
    (df['age'] < 40) &
    (df['department'] == 'Engineering')
]
fast_time = time.perf_counter() - start

# `slow_time` is set by the chained-filter cell. Guard the ratio: an elapsed
# reading of exactly 0 would otherwise raise ZeroDivisionError.
if fast_time > 0:
    print(f'Combined filter: {(slow_time/fast_time):.1f}x faster')
else:
    print('Combined filter: too fast to measure')

## 4. Using query() for Speed

In [None]:
# query() can be faster for complex conditions
# Same three-condition filter as the previous section, written as a single
# expression string. perf_counter() keeps short runs from being reported
# as 0.0000 seconds.
start = time.perf_counter()
result = df.query('salary > 70000 and age < 40 and department == "Engineering"')
query_time = time.perf_counter() - start
print(f'Query time: {query_time:.4f} seconds')

## 5. Chunking Large Files

In [None]:
# Read large file in chunks
chunksize = 1000  # rows handed to each iteration of the loop below
total_rows = 0    # running count accumulated across chunks

# Uncomment to test with large file
# for chunk in pd.read_csv('../datasets/website_traffic.csv', chunksize=chunksize):
#     # Process each chunk
#     total_rows += len(chunk)
#     # Do calculations on chunk
# print(f'Processed {total_rows} rows in chunks')

## 6. Best Practices Summary

In [None]:
# BEST PRACTICES:
# Printed as a numbered checklist, one item per line.
print("""
1. Use vectorized operations instead of loops
2. Convert to appropriate data types (int32, category)
3. Use single combined boolean masks
4. Use .loc[] for assignments to avoid chained indexing
5. Read only needed columns: pd.read_csv(file, usecols=['col1', 'col2'])
6. Use query() for complex filters
7. Process large files in chunks
8. Avoid inplace=True: it rarely saves memory and prevents method chaining
""")

## Practice

### Exercise 1: Optimize the sales dataset

In [None]:
# Exercise 1 starting point: load the sales dataset to optimize.
sales_path = '../datasets/sales_data.csv'
sales = pd.read_csv(sales_path)

# Check memory usage

# Optimize data types

# Compare before/after

### Exercise 2: Benchmark filtering methods

In [None]:
# Compare speed of:
# 1. Chained filters
# 2. Combined boolean mask
# 3. query() method

**Next**: Notebook 10 - Real-World Projects