# TG-03: Feature Engineering

This notebook creates new features from the cleaned dataset to improve ML model performance.

**Course:** DATA 230 (Data Visualization) at SJSU


In [9]:
import pandas as pd
import numpy as np

# Read the cleaned dataset; every feature in this notebook is derived from this frame.
cleaned_csv_path = 'data/cleaned/cleaned_data.csv'
df = pd.read_csv(cleaned_csv_path)

# Report the starting dimensions and column names before any columns are added.
starting_shape = df.shape
starting_columns = list(df.columns)
print(f"Original dataset shape: {starting_shape}")
print(f"Columns: {starting_columns}")


Original dataset shape: (5000, 26)
Columns: ['agent_id', 'agent_type', 'model_architecture', 'deployment_environment', 'task_category', 'task_complexity', 'autonomy_level', 'success_rate', 'accuracy_score', 'efficiency_score', 'execution_time_seconds', 'response_latency_ms', 'memory_usage_mb', 'cpu_usage_percent', 'cost_per_task_cents', 'human_intervention_required', 'error_recovery_rate', 'multimodal_capability', 'edge_compatibility', 'privacy_compliance_score', 'bias_detection_score', 'timestamp', 'data_quality_score', 'performance_index', 'cost_efficiency_ratio', 'autonomous_capability_score']


## A. Composite Performance & Efficiency Metrics


In [10]:
# Composite metrics: each new column condenses several existing columns into one value.

# 1. overall_performance_score: unweighted mean of success, accuracy and efficiency
core_score_total = df['success_rate'] + df['accuracy_score'] + df['efficiency_score']
df['overall_performance_score'] = core_score_total / 3

# 2. resource_efficiency_score: efficiency divided by the memory x CPU product,
#    multiplied by 1000
resource_load = df['memory_usage_mb'] * df['cpu_usage_percent']
efficiency_per_load = df['efficiency_score'] / resource_load
df['resource_efficiency_score'] = efficiency_per_load * 1000

# 3. cost_effectiveness: overall performance obtained per cent spent on a task
task_cost_cents = df['cost_per_task_cents']
df['cost_effectiveness'] = df['overall_performance_score'] / task_cost_cents

# 4. weighted_quality_score: 40% success, 40% accuracy, 20% data quality
success_part = df['success_rate'] * 0.4
accuracy_part = df['accuracy_score'] * 0.4
data_quality_part = df['data_quality_score'] * 0.2
df['weighted_quality_score'] = success_part + accuracy_part + data_quality_part

# Summary statistics for the four columns created above
composite_columns = [
    'overall_performance_score',
    'resource_efficiency_score',
    'cost_effectiveness',
    'weighted_quality_score',
]
print("Composite metrics created:")
print(df[composite_columns].describe())


Composite metrics created:
       overall_performance_score  resource_efficiency_score  \
count                5000.000000                5000.000000   
mean                    0.550981                   0.031034   
std                     0.130022                   0.029572   
min                     0.333333                   0.003922   
25%                     0.438183                   0.012578   
50%                     0.543467                   0.021317   
75%                     0.650842                   0.038069   
max                     0.891700                   0.308478   

       cost_effectiveness  weighted_quality_score  
count         5000.000000             5000.000000  
mean            37.510978                0.598361  
std             25.350139                0.112947  
min              6.156125                0.430100  
25%             19.234706                0.496685  
50%             30.846935                0.588560  
75%             49.210490                

## B. Interaction Features


In [11]:
# Interaction features: ratios and products of pairs of existing columns.
autonomy = df['autonomy_level']

# 5. complexity_autonomy_ratio: autonomy level divided by task complexity
df['complexity_autonomy_ratio'] = autonomy / df['task_complexity']

# 6. success_autonomy_interaction: product of success rate and autonomy level
df['success_autonomy_interaction'] = df['success_rate'] * autonomy

# 7. latency_per_operation: response latency divided by execution time
latency = df['response_latency_ms']
execution_time = df['execution_time_seconds']
df['latency_per_operation'] = latency / execution_time

# A zero denominator yields +/-inf; convert those to NaN across the whole frame
# so they are counted as missing values.
infinite_values = [np.inf, -np.inf]
df.replace(infinite_values, np.nan, inplace=True)

# Summary statistics for the three columns created above
interaction_columns = [
    'complexity_autonomy_ratio',
    'success_autonomy_interaction',
    'latency_per_operation',
]
print("Interaction features created:")
print(df[interaction_columns].describe())


Interaction features created:
       complexity_autonomy_ratio  success_autonomy_interaction  \
count                5000.000000                   5000.000000   
mean                    1.001217                      2.661647   
std                     0.288918                      0.794789   
min                     0.333333                      0.575900   
25%                     0.800000                      2.104050   
50%                     1.000000                      2.700000   
75%                     1.200000                      3.160575   
max                     2.000000                      5.131200   

       latency_per_operation  
count            5000.000000  
mean               27.042479  
std                35.661450  
min                 0.790298  
25%                 7.762539  
50%                15.928157  
75%                32.011225  
max               629.284916  


## C. Categorical Grouping & Aggregation


In [12]:
# 8. arch_performance_benchmark: signed gap between each agent's overall score and
#    the median overall score of the agents that share its model architecture
scores_by_architecture = df.groupby('model_architecture')['overall_performance_score']
arch_median_performance = scores_by_architecture.transform('median')
own_score = df['overall_performance_score']
df['arch_performance_benchmark'] = own_score - arch_median_performance

# 9. env_avg_cost_per_complexity: mean cost per unit of task complexity within
#    each deployment environment, broadcast back to every row of that environment.
#    cost_per_complexity is an intermediate column; the save cell drops it.
df['cost_per_complexity'] = df['cost_per_task_cents'] / df['task_complexity']
cost_by_environment = df.groupby('deployment_environment')['cost_per_complexity']
env_avg_cost_complexity = cost_by_environment.transform('mean')
df['env_avg_cost_per_complexity'] = env_avg_cost_complexity

# Summary statistics for the two group-based columns
grouping_columns = ['arch_performance_benchmark', 'env_avg_cost_per_complexity']
print("Categorical grouping features created:")
print(df[grouping_columns].describe())


Categorical grouping features created:
       arch_performance_benchmark  env_avg_cost_per_complexity
count                 5000.000000                  5000.000000
mean                     0.008380                     0.003336
std                      0.128960                     0.000030
min                     -0.221917                     0.003290
25%                     -0.103579                     0.003301
50%                      0.000000                     0.003353
75%                      0.108671                     0.003354
max                      0.330450                     0.003375


## D. Temporal Features


In [13]:
# Parse the timestamp column so the .dt accessor can be used on it.
df['timestamp'] = pd.to_datetime(df['timestamp'])
event_time = df['timestamp']

# 10. hour_of_day: hour component of the timestamp
# NOTE(review): the recorded output of this cell shows hour 4 for all 5000 rows,
# so this column is constant for this dataset - confirm whether the timestamps
# really carry time-of-day information before using it as a feature.
df['hour_of_day'] = event_time.dt.hour

# 11. is_weekend: True for Saturday/Sunday (dayofweek 5 or 6; Monday is 0)
df['is_weekend'] = event_time.dt.dayofweek >= 5

# Show how the rows are distributed over both new columns
hour_counts = df['hour_of_day'].value_counts().sort_index()
weekend_counts = df['is_weekend'].value_counts()
print("Temporal features created:")
print(f"Hour distribution:\n{hour_counts}")
print(f"\nWeekend distribution:\n{weekend_counts}")


Temporal features created:
Hour distribution:
hour_of_day
4    5000
Name: count, dtype: int64

Weekend distribution:
is_weekend
False    3551
True     1449
Name: count, dtype: int64


## Summary of Engineered Features


In [14]:
# Names of the engineered columns, grouped by the notebook section that creates them.
composite_feature_names = [
    'overall_performance_score',
    'resource_efficiency_score',
    'cost_effectiveness',
    'weighted_quality_score',
]
interaction_feature_names = [
    'complexity_autonomy_ratio',
    'success_autonomy_interaction',
    'latency_per_operation',
]
grouping_feature_names = [
    'arch_performance_benchmark',
    'env_avg_cost_per_complexity',
]
temporal_feature_names = [
    'hour_of_day',
    'is_weekend',
]

# Flat list of all engineered columns, in creation order
new_features = (
    composite_feature_names
    + interaction_feature_names
    + grouping_feature_names
    + temporal_feature_names
)

feature_count = len(new_features)
print(f"Total new features created: {feature_count}")
print(f"\nNew features: {new_features}")
# The shape printed here still includes the intermediate cost_per_complexity column.
print(f"\nFinal dataset shape: {df.shape}")
print("\nPreview of new features:")

# Last expression of the cell: rendered as the cell's output table
df[new_features].head(10)


Total new features created: 11

New features: ['overall_performance_score', 'resource_efficiency_score', 'cost_effectiveness', 'weighted_quality_score', 'complexity_autonomy_ratio', 'success_autonomy_interaction', 'latency_per_operation', 'arch_performance_benchmark', 'env_avg_cost_per_complexity', 'hour_of_day', 'is_weekend']

Final dataset shape: (5000, 38)

Preview of new features:


Unnamed: 0,overall_performance_score,resource_efficiency_score,cost_effectiveness,weighted_quality_score,complexity_autonomy_ratio,success_autonomy_interaction,latency_per_operation,arch_performance_benchmark,env_avg_cost_per_complexity,hour_of_day,is_weekend
0,0.593867,0.040073,56.025157,0.63992,0.6,1.4364,17.098573,0.061117,0.003301,4,False
1,0.544567,0.018484,80.083333,0.57616,0.833333,2.4165,13.696774,0.0216,0.00329,4,False
2,0.805367,0.090926,151.955975,0.81286,2.0,3.2464,210.72999,0.2824,0.003301,4,False
3,0.440133,0.012899,22.57094,0.50082,0.75,2.1444,42.130673,-0.104033,0.00329,4,False
4,0.6684,0.171486,63.657143,0.66896,1.333333,2.2824,11.046916,0.141067,0.003375,4,False
5,0.5662,0.028425,79.746479,0.60744,1.4,3.5875,31.958333,0.048517,0.003375,4,False
6,0.700167,0.034878,85.386179,0.75068,0.5,1.321,45.190891,0.147983,0.003341,4,False
7,0.3975,0.011096,18.661972,0.43432,1.0,2.4,10.659114,-0.165783,0.003375,4,False
8,0.3888,0.011529,8.452174,0.47732,1.0,2.408,7.812428,-0.158133,0.003375,4,False
9,0.525467,0.015013,25.141946,0.60882,0.857143,3.1176,25.973961,-0.040817,0.00329,4,False


In [15]:
# Count nulls per engineered column; a non-zero count means a feature
# computation produced a missing value (including infinities converted to NaN).
print("Missing values in new features:")
null_counts = df[new_features].isnull().sum()
print(null_counts)


Missing values in new features:
overall_performance_score       0
resource_efficiency_score       0
cost_effectiveness              0
weighted_quality_score          0
complexity_autonomy_ratio       0
success_autonomy_interaction    0
latency_per_operation           0
arch_performance_benchmark      0
env_avg_cost_per_complexity     0
hour_of_day                     0
is_weekend                      0
dtype: int64


## Save Feature-Engineered Dataset


In [16]:
# Remove the intermediate column that only existed to compute
# env_avg_cost_per_complexity. errors='ignore' keeps this cell re-runnable:
# without it, a second run raises KeyError because the column is already gone.
df = df.drop(columns=['cost_per_complexity'], errors='ignore')

# Write the feature-engineered dataset and report the path that was actually written.
# NOTE(review): the message used to announce data/ML/ while the file was written
# to data/analytics/. The write location is kept unchanged here; confirm which
# directory downstream steps read from.
output_path = 'data/analytics/feature_engineered_data.csv'
df.to_csv(output_path, index=False)
print(f"Feature-engineered dataset saved to {output_path}")
print(f"Final shape: {df.shape}")


Feature-engineered dataset saved to data/ML/feature_engineered_data.csv
Final shape: (5000, 37)
