# Data Preparation & Validation

### step 1) Importing libraries and loading datasets

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
df = pd.read_csv('data/raw/agentic_ai.csv')

In [3]:
df.head()

Unnamed: 0,agent_id,agent_type,model_architecture,deployment_environment,task_category,task_complexity,autonomy_level,success_rate,accuracy_score,efficiency_score,...,error_recovery_rate,multimodal_capability,edge_compatibility,privacy_compliance_score,bias_detection_score,timestamp,data_quality_score,performance_index,cost_efficiency_ratio,autonomous_capability_score
0,AG_01012,Project Manager,PaLM-2,Server,Text Processing,5,3,0.4788,0.6455,0.6573,...,0.4999,False,False,0.939,0.8061,2024-12-24 04:16:15,0.951,0.58236,50.203448,64.993
1,AG_00758,Marketing Assistant,Mixtral-8x7B,Hybrid,Decision Making,6,5,0.4833,0.566,0.5844,...,0.558,False,False,0.8281,0.7816,2024-12-24 04:16:15,0.7822,0.53844,69.030769,89.06
2,AG_00966,QA Tester,Mixtral-8x7B,Server,Communication,2,4,0.8116,0.8395,0.765,...,0.9196,False,False,0.745,0.8214,2024-12-24 04:16:15,0.7621,0.80599,127.934921,124.372
3,AG_00480,Code Assistant,CodeT5+,Hybrid,Creative Writing,8,6,0.3574,0.4888,0.4742,...,0.3809,True,False,0.9653,0.8684,2024-12-24 04:16:15,0.8117,0.43186,21.066341,86.663
4,AG_01050,QA Tester,Falcon-180B,Edge,Planning & Scheduling,3,4,0.5706,0.7137,0.7209,...,0.6717,False,True,0.9042,0.8417,2024-12-24 04:16:15,0.7762,0.65862,57.271304,87.019


### Step 2) Dataset overview and data type inspection

In [4]:
# Display concise summary of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 26 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   agent_id                     5000 non-null   object 
 1   agent_type                   5000 non-null   object 
 2   model_architecture           5000 non-null   object 
 3   deployment_environment       5000 non-null   object 
 4   task_category                5000 non-null   object 
 5   task_complexity              5000 non-null   int64  
 6   autonomy_level               5000 non-null   int64  
 7   success_rate                 5000 non-null   float64
 8   accuracy_score               5000 non-null   float64
 9   efficiency_score             5000 non-null   float64
 10  execution_time_seconds       5000 non-null   float64
 11  response_latency_ms          5000 non-null   float64
 12  memory_usage_mb              5000 non-null   float64
 13  cpu_usage_percent 

In [5]:
# Quick statistical overview of numeric columns
df.describe()

Unnamed: 0,task_complexity,autonomy_level,success_rate,accuracy_score,efficiency_score,execution_time_seconds,response_latency_ms,memory_usage_mb,cpu_usage_percent,cost_per_task_cents,error_recovery_rate,privacy_compliance_score,bias_detection_score,data_quality_score,performance_index,cost_efficiency_ratio,autonomous_capability_score
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,6.083,6.0306,0.490683,0.573375,0.588884,46.224444,899.599622,404.43856,68.95692,0.019453,0.565884,0.841938,0.774397,0.863688,0.544951,34.321394,102.345902
std,2.143789,2.403085,0.158827,0.127175,0.115969,25.997988,803.829179,111.743654,19.827232,0.009245,0.164149,0.081324,0.101954,0.066905,0.13255,21.864002,16.806164
min,2.0,1.0,0.3,0.4,0.3,1.0,100.01,112.9,10.9,0.0027,0.3001,0.7,0.6002,0.75,0.33,5.951597,56.284
25%,4.0,4.0,0.338975,0.465475,0.501975,26.6575,338.5325,322.2,54.6,0.0124,0.42665,0.772,0.685275,0.8064,0.428152,18.107412,89.82825
50%,6.0,6.0,0.47005,0.56595,0.58895,40.32,637.68,402.7,68.5,0.0175,0.5473,0.8422,0.7738,0.86245,0.5361,28.916278,102.633
75%,8.0,8.0,0.61335,0.667925,0.6747,60.1825,1202.3575,488.325,83.5,0.0245,0.690025,0.913425,0.864325,0.9222,0.647397,45.149029,116.005
max,10.0,10.0,0.9765,0.9596,0.8855,157.15,5478.15,686.6,133.6,0.0585,0.95,0.9799,0.95,0.9799,0.88747,219.327027,147.694


In [6]:
# Check unique values for categorical/object columns
print("Unique value counts (for categorical columns):")
cat_cols = df.select_dtypes(include=['object', 'bool']).columns
for col in cat_cols:
    print(f"{col}: {df[col].nunique()} ")


Unique value counts (for categorical columns):
agent_id: 5000 
agent_type: 16 
model_architecture: 10 
deployment_environment: 6 
task_category: 10 
human_intervention_required: 2 
multimodal_capability: 2 
edge_compatibility: 2 
timestamp: 362 


In [7]:
# Check unique values for numeric columns
print("Unique value counts (for numeric columns):")
cat_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in cat_cols:
    print(f"{col}: {df[col].nunique()} ")


Unique value counts (for numeric columns):
task_complexity: 9 
autonomy_level: 10 
success_rate: 2832 
accuracy_score: 2710 
efficiency_score: 2992 
execution_time_seconds: 3698 
response_latency_ms: 4929 
memory_usage_mb: 2957 
cpu_usage_percent: 897 
cost_per_task_cents: 459 
error_recovery_rate: 3283 
privacy_compliance_score: 2339 
bias_detection_score: 2660 
data_quality_score: 2069 
performance_index: 4707 
cost_efficiency_ratio: 4998 
autonomous_capability_score: 4557 


In [8]:
df.dtypes

agent_id                        object
agent_type                      object
model_architecture              object
deployment_environment          object
task_category                   object
task_complexity                  int64
autonomy_level                   int64
success_rate                   float64
accuracy_score                 float64
efficiency_score               float64
execution_time_seconds         float64
response_latency_ms            float64
memory_usage_mb                float64
cpu_usage_percent              float64
cost_per_task_cents            float64
human_intervention_required       bool
error_recovery_rate            float64
multimodal_capability             bool
edge_compatibility                bool
privacy_compliance_score       float64
bias_detection_score           float64
timestamp                       object
data_quality_score             float64
performance_index              float64
cost_efficiency_ratio          float64
autonomous_capability_sco

### step 3) Inspect for missing values, duplicates, and inconsistent types

In [9]:
# Check count of missing (NaN) values in each column
df.isna().sum().sort_values(ascending=False)

agent_id                       0
agent_type                     0
cost_efficiency_ratio          0
performance_index              0
data_quality_score             0
timestamp                      0
bias_detection_score           0
privacy_compliance_score       0
edge_compatibility             0
multimodal_capability          0
error_recovery_rate            0
human_intervention_required    0
cost_per_task_cents            0
cpu_usage_percent              0
memory_usage_mb                0
response_latency_ms            0
execution_time_seconds         0
efficiency_score               0
accuracy_score                 0
success_rate                   0
autonomy_level                 0
task_complexity                0
task_category                  0
deployment_environment         0
model_architecture             0
autonomous_capability_score    0
dtype: int64

In [10]:
# Check duplicate rows in the dataset
duplicates = df.duplicated().sum()
print(f"Total duplicate rows: {duplicates}")

Total duplicate rows: 0


### Saving the cleaned dataset


In [12]:
# Defining the cleaned data path
cleaned_folder = "/Users/srinidhigowda/Desktop/Git-Clones/agentic-cost-performance-analysis/data/cleaned"


In [13]:
# To make sure the folder exists before saving
if not os.path.exists(cleaned_folder):
    os.makedirs(cleaned_folder)
    print(f"üìÅ Created folder: {cleaned_folder}")
else:
    print(f"üìÅ Folder already exists: {cleaned_folder}")

üìÅ Folder already exists: /Users/srinidhigowda/Desktop/Git-Clones/agentic-cost-performance-analysis/data/cleaned


In [16]:
# Define the file name and save the cleaned dataset
cleaned_file_path = os.path.join(cleaned_folder, "cleaned_data.csv")
df.to_csv(cleaned_file_path, index=False)

# As of now the datasets seems clean and ready for EDA. There are no duplicates, no missing values and ready for analysis