#### Extracting schema from data

In [6]:
import pandas as pd

In [7]:
# Read the raw body-performance CSV and show its first five rows.
df = pd.read_csv(filepath_or_buffer="../Data/bodyPerformance.csv")
df.head(n=5)

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [8]:
# Inspect the column labels of the loaded frame.
df.columns

Index(['age', 'gender', 'height_cm', 'weight_kg', 'body fat_%', 'diastolic',
       'systolic', 'gripForce', 'sit and bend forward_cm', 'sit-ups counts',
       'broad jump_cm', 'class'],
      dtype='object')

In [9]:
# Remove the measurements that are not kept in the cleaned dataset.
# Rebinding `df` to the result of `drop(columns=...)` replaces the previous
# `inplace=True, axis=1` call: the resulting frame is the same, but the
# operation is an explicit assignment rather than a hidden in-place mutation.
df = df.drop(
    columns=[
        "diastolic",
        "systolic",
        "sit and bend forward_cm",
        "broad jump_cm",
        "gripForce",
    ]
)

In [10]:
# Preview the first rows of the frame after the column drop.
df.head()

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,sit-ups counts,class
0,27.0,M,172.3,75.24,21.3,60.0,C
1,25.0,M,165.0,55.8,15.7,53.0,A
2,31.0,M,179.6,78.0,20.1,49.0,C
3,32.0,M,174.5,71.1,18.4,53.0,B
4,28.0,M,173.8,67.7,17.1,45.0,B


In [11]:
# Summarise dtypes and non-null counts of the remaining columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13393 entries, 0 to 13392
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             13393 non-null  float64
 1   gender          13393 non-null  object 
 2   height_cm       13393 non-null  float64
 3   weight_kg       13393 non-null  float64
 4   body fat_%      13393 non-null  float64
 5   sit-ups counts  13393 non-null  float64
 6   class           13393 non-null  object 
dtypes: float64(5), object(2)
memory usage: 732.6+ KB


In [12]:
# Write the trimmed frame to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="../Data/cleaned_Data.csv", index=False)

In [13]:
import pandas as pd
import yaml

# Build a YAML schema (column dtypes, numeric ranges, categorical values)
# from the cleaned CSV, save it under ../Schema/ and print it.

# Load the cleaned CSV that the schema describes.
df = pd.read_csv("../Data/cleaned_Data.csv")

# Map every column name to the string form of its pandas dtype.
columns_info = {col: str(df[col].dtype) for col in df.columns}

# Numeric columns: record the observed min/max and whether the dtype is
# integer or float. Selecting with include="number" (rather than
# exclude="O") keeps bool, datetime, category and string-dtype columns out
# of this loop, where float(min) would either raise or mislabel them as
# "float".
numerical_columns = {}

for col in df.select_dtypes(include="number"):
    is_int = pd.api.types.is_integer_dtype(df[col])
    # Cast to built-in int/float before the values are handed to yaml.dump.
    cast = int if is_int else float
    numerical_columns[col] = {
        "min": cast(df[col].min()),
        "max": cast(df[col].max()),
        "type": "int" if is_int else "float",
    }

# Categorical columns: every column that is neither numeric nor
# date/time-like, with its distinct values as returned by Series.unique().
categorical_columns = {
    col: df[col].unique().tolist()
    for col in df.select_dtypes(
        exclude=["number", "datetime", "datetimetz", "timedelta"]
    )
}

# Assemble the schema; key order is preserved in the YAML (sort_keys=False).
schema = {
    "columns": columns_info,
    "numerical": numerical_columns,
    "categorical": categorical_columns
}

# Serialise to block-style YAML.
schema_yaml = yaml.dump(schema, default_flow_style=False, sort_keys=False)

# Save to file (the ../Schema directory must already exist).
with open("../Schema/data_schema.yaml", "w") as f:
    f.write(schema_yaml)

# Print the YAML output
print(schema_yaml)

columns:
  age: float64
  gender: object
  height_cm: float64
  weight_kg: float64
  body fat_%: float64
  sit-ups counts: float64
  class: object
numerical:
  age:
    min: 21.0
    max: 64.0
    type: float
  height_cm:
    min: 125.0
    max: 193.8
    type: float
  weight_kg:
    min: 26.3
    max: 138.1
    type: float
  body fat_%:
    min: 3.0
    max: 78.4
    type: float
  sit-ups counts:
    min: 0.0
    max: 80.0
    type: float
categorical:
  gender:
  - M
  - F
  class:
  - C
  - A
  - B
  - D

