In [32]:
import pandas as pd
import numpy as np

import psycopg2
from psycopg2 import sql  # SQL composition

# Project level modules
from modules import database_connection, sql_statements
from modules.database_connection import (
    postgresql_connection, get_table_data_types, dataframe_to_csv,
    execute_sql_statement, get_descriptive_statistics, postgresql_to_csv
)

# Make PostgreSQL database connection

# Table Names

In [20]:
def db_schema_to_csv(csv_path: 'str | None' = None,
                     table_names: 'list[str] | None' = None):
    """
    For each table in the database, the column names and datatypes are
    aggregated and saved into a single csv file.

    Parameters
    ----------
    csv_path : string, optional
        Destination path of the csv file. When None (the default) the
        function returns immediately without connecting to the database.
    table_names : list of string, optional
        Tables whose data types are exported. Defaults to
        ['flights', 'flights_test', 'fuel_comsumption', 'passengers'].

    Returns
    -------
    None
    """

    # Do not execute unless a file path is provided
    if csv_path is None:
        return None

    if table_names is None:
        table_names = ['flights', 'flights_test', 'fuel_comsumption',
                       'passengers']

    # Create database connection
    con = postgresql_connection()
    try:
        # One dataframe of column data types per table
        frames = [get_table_data_types(connection=con, table_name=name)
                  for name in table_names]
    finally:
        # Release the connection even when a lookup fails; previously it was
        # left open for the lifetime of the kernel
        con.close()

    # Place the per-table frames side by side
    data_types_df = pd.concat(frames, axis=1)

    # export db schema to csv file
    dataframe_to_csv(data_types_df, csv_path)

    return None

In [47]:
# Read the table data types csv into a dataframe and preview the first rows
schema_df = pd.read_csv(filepath_or_buffer='../data/table_data_types.csv')
schema_df.head(n=5)

Unnamed: 0,table_name,flights_column_name,flights_data_type,flights_is_categorical,flights_unit,table_name.1,flights_test_column_name,flights_test_data_type,flights_test_is_categorical,flights_test_unit,table_name.2,fuel_comsumption_column_name,fuel_comsumption_data_type,fuel_comsumption_is_categorical,fuel_comsumption_unit,table_name.3,passengers_column_name,passengers_data_type,passengers_is_categorical,passengers_unit
0,flights,fl_date,text,1.0,yyyy-mm-dd,flights_test,fl_date,timestamp without time zone,1.0,yyy-mm-dd,fuel_comsumption,month,bigint,1.0,index,passengers,departures_scheduled,double precision,0.0,count
1,flights,mkt_unique_carrier,text,1.0,,flights_test,mkt_unique_carrier,text,1.0,,fuel_comsumption,airline_id,double precision,1.0,,passengers,departures_performed,double precision,0.0,count
2,flights,branded_code_share,text,1.0,,flights_test,branded_code_share,text,1.0,,fuel_comsumption,unique_carrier,text,1.0,,passengers,payload,double precision,0.0,pounds
3,flights,mkt_carrier,text,1.0,,flights_test,mkt_carrier,text,1.0,,fuel_comsumption,carrier,text,1.0,,passengers,seats,double precision,0.0,count
4,flights,mkt_carrier_fl_num,integer,1.0,,flights_test,mkt_carrier_fl_num,bigint,1.0,,fuel_comsumption,carrier_name,text,1.0,,passengers,passengers,double precision,0.0,count


In [54]:
# Empty placeholder frame; num_features is reassigned from schema_df in a
# following cell
num_features = pd.DataFrame()

In [58]:
# Per-table names of the columns flagged as non-categorical
# (``<table>_is_categorical == 0``) in the schema frame. Each selection keeps
# its original row labels, so concatenating along the columns aligns them on
# the schema index and leaves NaN where a table has no numeric feature at
# that row.
num_features = pd.concat(
    [
        schema_df.loc[schema_df[f'{name}_is_categorical'] == 0,
                      [f'{name}_column_name']]
        for name in ('flights', 'flights_test',
                     'fuel_comsumption', 'passengers')
    ],
    axis=1,
)
# Label each column with the table it came from
num_features.columns = ['flights', 'flights_test',
                        'fuel_comsumption', 'passengers']

In [59]:
# Display the numeric feature names selected for each table
num_features

Unnamed: 0,flights,flights_test,fuel_comsumption,passengers
0,,,,departures_scheduled
1,,,,departures_performed
2,,,,payload
3,,,,seats
4,,,,passengers
5,,,,freight
6,,,sdomt_gallons,mail
7,,,satl_gallons,distance
8,,,spac_gallons,ramp_to_ramp
9,,,slat_gallons,air_time


In [17]:
# Category frequency and relative frequency.
# COUNT(*) is used rather than COUNT({feature}): COUNT(expression) skips NULL
# values, so the NULL category would otherwise be reported with a frequency
# of 0 and be missing from the relative-frequency denominator.
categorical_statistics_sql = """
SELECT 
 {feature},
 COUNT(*) AS frequency
FROM {table}
GROUP BY {feature}
ORDER BY frequency DESC
"""

# NOTE: `tables` and `cat_features_flights` are defined in other cells, which
# must be run before this one.
with postgresql_connection() as con:
    # Only the first table and a two-element slice of the categorical flights
    # columns are processed here
    for table in tables[:1]:
        for feat in cat_features_flights['flights_column_name'][2:4]:

            # SQL composition: table and column names are passed as quoted
            # identifiers instead of being formatted in as raw text
            query = sql.SQL(categorical_statistics_sql).format(
                feature=sql.Identifier(table, feat),
                table=sql.Identifier(table)
            ).as_string(context=con)

            df = execute_sql_statement(con, query=query)

            # Share of all rows that fall into each category
            df['relative_frequency'] = df['frequency'] / df['frequency'].sum()

            # One csv per (table, feature) pair
            dataframe_to_csv(df,
                             csv_path=(
                f'../data/descriptive_stats/{table}_{feat}_stats.csv'
            ))

Connected


In [4]:
# Preview the slice of categorical flights columns used by the frequency loop
cat_features_flights['flights_column_name'][2:4]

2    branded_code_share
3           mkt_carrier
Name: flights_column_name, dtype: object

In [9]:
# Database table names iterated over by the statistics cells
tables = ['flights', 'flights_test', 'fuel_comsumption', 'passengers']

In [25]:
# Single-column frame holding the names of the flights columns whose
# is_categorical flag equals 1 in the schema
cat_features_flights = schema_df.loc[
    schema_df['flights_is_categorical'] == 1,
    ['flights_column_name'],
]

Calculate in pandas:
- range (max - min)
- skewness

Statistics to report: count, mean, standard deviation, variance, range, minimum, Q1/25%, median/Q2/50%, Q3/75%, maximum, interquartile range (IQR), skewness



In [10]:
# NOTE(review): num_features_flights is not defined in any visible cell —
# presumably built like cat_features_flights but with is_categorical == 0;
# verify before relying on a fresh-kernel run.
num_features_flights

Unnamed: 0,flights_column_name
14,crs_dep_time
15,dep_time
16,dep_delay
17,taxi_out
18,wheels_off
19,wheels_on
20,taxi_in
21,crs_arr_time
22,arr_time
23,arr_delay


In [None]:
# numeric descriptive statistics
# count, mean, standard deviation, variance, range, minimum, Q1/25%,
# median/Q2/50%, Q3/75%, maximum, interquartile range (IQR), skewness
#
# The aggregates below are computed in the database; range, interquartile
# range and skewness are derived from them in pandas afterwards.
numerical_statistics_sql = """
 SELECT
  SUM(CASE WHEN {feature} IS NULL THEN 1 ELSE 0 END) as null_count,
  COUNT({feature}) AS count,
  AVG({feature}::NUMERIC) AS mean,
  STDDEV({feature}::NUMERIC) AS standard_deviation,
  VARIANCE({feature}::NUMERIC) AS variance,
  MIN({feature}::NUMERIC) AS min,
  PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY {feature}::NUMERIC) AS q1,
  PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {feature}::NUMERIC) AS median,
  PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY {feature}::NUMERIC) AS q3,
  MAX({feature}::NUMERIC) AS max
   FROM {table};
"""

# Descriptive labels for the statistics, in the row order produced below.
# NOTE(review): these labels are not applied to the output; the csv rows keep
# the SQL alias names (min, q1, median, q3, max, ...).
stats_index = ['null_count', 'count', 'mean', 'standard_deviation', 
               'variance', 'range', 'minimum', 'Q1/25%', 'median/Q2/50%',
               'Q3/75%', 'maximum', 'interquartile_range (IQR)', 'skewness']

# NOTE: `tables` and `num_features` are defined in other cells, which must be
# run before this one.
for table in tables:
    # Numeric feature names of this table (NaN padding rows dropped)
    features = num_features[table].dropna()
    if features.empty:
        # Nothing to summarise; pd.concat would raise on an empty list
        continue

    frames = []
    with postgresql_connection() as con:
        for feat in features:
            # SQL composition
            query = sql.SQL(numerical_statistics_sql).format(
                feature=sql.Identifier(feat),
                table=sql.Identifier(table)
            ).as_string(context=con)

            df = execute_sql_statement(con, query=query)

            # Label the single result row with the feature name
            df.index = [feat]

            # Position 5 places range right after variance, assuming the
            # helper keeps the SELECT column order
            df.insert(loc=5, column='range', value=(df['max'] - df['min']))
            df['interquartile_range (IQR)'] = df['q3'] - df['q1']
            # Pearson's second skewness coefficient: 3 * (mean - median) / std
            df['skewness'] = (3 * (df['mean'].astype(float) 
                                   - df['median'].astype(float))
                              / df['standard_deviation'].astype(float))

            # Transpose so each feature becomes one column of statistics
            frames.append(df.T[feat])

    # One column per feature, one row per statistic
    stats = pd.concat(frames, axis=1)

    stats.to_csv(f'../data/descriptive_stats/{table}_numeric_stats.csv')

Connected
