#### Extracting schema from data

In [10]:
import pandas as pd

In [11]:
# Read the airline CSV into a DataFrame, then display its first five rows.
raw_csv_path = "../Data/Airline_Data.csv"
df = pd.read_csv(raw_csv_path)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,0,70172,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,1,5047,Male,disloyal Customer,25,Business travel,Business,235,3,2,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,2,110028,Female,Loyal Customer,26,Business travel,Business,1142,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,3,24026,Female,Loyal Customer,25,Business travel,Business,562,2,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,4,119299,Male,Loyal Customer,61,Business travel,Business,214,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [12]:
# Display the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'id', 'Gender', 'Customer Type', 'Age', 'Type of Travel',
       'Class', 'Flight Distance', 'Inflight wifi service',
       'Departure/Arrival time convenient', 'Ease of Online booking',
       'Gate location', 'Food and drink', 'Online boarding', 'Seat comfort',
       'Inflight entertainment', 'On-board service', 'Leg room service',
       'Baggage handling', 'Checkin service', 'Inflight service',
       'Cleanliness', 'Departure Delay in Minutes', 'Arrival Delay in Minutes',
       'satisfaction'],
      dtype='object')

In [13]:
# Remove the "Unnamed: 0" and "id" columns.
# Reassigning the result (rather than inplace=True) together with
# errors="ignore" keeps this cell idempotent: re-running it after the columns
# are already gone no longer raises KeyError.
df = df.drop(columns=["Unnamed: 0", "id"], errors="ignore")

In [14]:
# Preview the first rows again after the column drop in the previous cell.
df.head()

Unnamed: 0,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,...,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
0,Male,Loyal Customer,13,Personal Travel,Eco Plus,460,3,4,3,1,...,5,4,3,4,4,5,5,25,18.0,neutral or dissatisfied
1,Male,disloyal Customer,25,Business travel,Business,235,3,2,3,3,...,1,1,5,3,1,4,1,1,6.0,neutral or dissatisfied
2,Female,Loyal Customer,26,Business travel,Business,1142,2,2,2,2,...,5,4,3,4,4,4,5,0,0.0,satisfied
3,Female,Loyal Customer,25,Business travel,Business,562,2,5,5,5,...,2,2,5,3,1,4,2,11,9.0,neutral or dissatisfied
4,Male,Loyal Customer,61,Business travel,Business,214,3,3,3,3,...,3,3,4,4,3,3,3,0,0.0,satisfied


In [15]:
# Print the per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103904 entries, 0 to 103903
Data columns (total 23 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   Gender                             103904 non-null  object 
 1   Customer Type                      103904 non-null  object 
 2   Age                                103904 non-null  int64  
 3   Type of Travel                     103904 non-null  object 
 4   Class                              103904 non-null  object 
 5   Flight Distance                    103904 non-null  int64  
 6   Inflight wifi service              103904 non-null  int64  
 7   Departure/Arrival time convenient  103904 non-null  int64  
 8   Ease of Online booking             103904 non-null  int64  
 9   Gate location                      103904 non-null  int64  
 10  Food and drink                     103904 non-null  int64  
 11  Online boarding                    1039

In [16]:
# Write the current frame to disk; index=False leaves the row index out of the CSV.
cleaned_csv_path = "../Data/cleaned_Data.csv"
df.to_csv(cleaned_csv_path, index=False)

In [17]:
from pathlib import Path

import pandas as pd
import yaml

# Load CSV data
df = pd.read_csv("../Data/cleaned_Data.csv")

# Store column info: column name -> pandas dtype name (e.g. "int64", "object")
columns_info = {col: str(dtype) for col, dtype in df.dtypes.items()}

# Describe numerical columns with their observed range and a coarse type tag.
# include="number" (instead of exclude="O") restricts this to genuinely numeric
# columns, so bool or datetime columns are never pushed through float().
numerical_columns = {}

for col in df.select_dtypes(include="number"):
    is_int = pd.api.types.is_integer_dtype(df[col])
    # Cast to built-in int/float so the YAML holds plain scalars rather than
    # NumPy objects. min()/max() skip NaN by default.
    cast = int if is_int else float
    numerical_columns[col] = {
        "min": cast(df[col].min()),
        "max": cast(df[col].max()),
        "type": "int" if is_int else "float",
    }

# Store categorical columns as the list of distinct values, in order of first
# appearance. Missing values are dropped so NaN is not recorded as a category.
categorical_columns = {
    col: df[col].dropna().unique().tolist()
    for col in df.select_dtypes(include="O")
}

# Define schema
schema = {
    "columns": columns_info,
    "numerical": numerical_columns,
    "categorical": categorical_columns
}

# Convert to YAML format. safe_dump only emits standard YAML tags, so a stray
# non-plain object fails loudly here instead of producing a file that
# yaml.safe_load cannot read back.
schema_yaml = yaml.safe_dump(schema, default_flow_style=False, sort_keys=False)

# Save to file, creating the target directory first so a fresh checkout
# without ../Schema does not fail with FileNotFoundError.
schema_path = Path("../Schema/data_schema.yaml")
schema_path.parent.mkdir(parents=True, exist_ok=True)
schema_path.write_text(schema_yaml, encoding="utf-8")

# Print the YAML output
print(schema_yaml)

columns:
  Gender: object
  Customer Type: object
  Age: int64
  Type of Travel: object
  Class: object
  Flight Distance: int64
  Inflight wifi service: int64
  Departure/Arrival time convenient: int64
  Ease of Online booking: int64
  Gate location: int64
  Food and drink: int64
  Online boarding: int64
  Seat comfort: int64
  Inflight entertainment: int64
  On-board service: int64
  Leg room service: int64
  Baggage handling: int64
  Checkin service: int64
  Inflight service: int64
  Cleanliness: int64
  Departure Delay in Minutes: int64
  Arrival Delay in Minutes: float64
  satisfaction: object
numerical:
  Age:
    min: 7
    max: 85
    type: int
  Flight Distance:
    min: 31
    max: 4983
    type: int
  Inflight wifi service:
    min: 0
    max: 5
    type: int
  Departure/Arrival time convenient:
    min: 0
    max: 5
    type: int
  Ease of Online booking:
    min: 0
    max: 5
    type: int
  Gate location:
    min: 0
    max: 5
    type: int
  Food and drink:
    min: 0
  