In [1]:
import pandas as pd
import numpy as np

import psycopg2
from psycopg2 import sql  # SQL composition

# Project level modules
from modules import database_connection, sql_statements
from modules.database_connection import (
    postgresql_connection, get_table_data_types, dataframe_to_csv,
    execute_sql_statement, get_descriptive_statistics, postgresql_to_csv
)

# Make PostgreSQL database connection

# Table Names

In [20]:
def db_schema_to_csv(csv_path: 'str | None' = None,
                     table_names: 'list[str] | None' = None):
    """
    For each table in the database, the column names and datatypes are
    aggregated and saved into a single csv file.

    The per-table dataframes returned by ``get_table_data_types`` are
    concatenated side by side (``axis=1``) and handed to
    ``dataframe_to_csv``.

    Parameters
    ----------
    csv_path : str or None, default None
        Destination path of the csv file. When ``None`` the function
        returns immediately without opening a database connection.
    table_names : list of str or None, default None
        Tables whose schema is exported. When ``None`` the four tables
        that were previously hard-coded are used: ``flights``,
        ``flights_test``, ``fuel_comsumption`` and ``passengers``.

    Returns
    -------
    None
    """

    # Do not execute unless a file path is provided
    if csv_path is None:
        return None

    if table_names is None:
        table_names = ['flights', 'flights_test', 'fuel_comsumption',
                       'passengers']

    # Create database connection
    con = postgresql_connection()
    try:
        # One dataframe of column names / datatypes per table
        frames = [
            get_table_data_types(connection=con, table_name=name)
            for name in table_names
        ]

        # Concatenate frames side by side (one column group per table)
        data_types_df = pd.concat(frames, axis=1)
    finally:
        # The connection was previously never released.
        # NOTE(review): assumes postgresql_connection() returns a psycopg2
        # connection (the same object is passed as ``context`` to
        # ``sql.SQL(...).as_string`` elsewhere in this notebook) - confirm.
        con.close()

    # export db schema to csv file
    dataframe_to_csv(data_types_df, csv_path)

    return None

In [2]:
# Load the table data-types csv (presumably the file written by
# db_schema_to_csv - confirm) and preview its first two rows.
schema_df = pd.read_csv('../data/table_data_types.csv')
schema_df.head(2)

Unnamed: 0,table_name,flights_column_name,flights_data_type,flights_is_categorical,flights_unit,table_name.1,flights_test_column_name,flights_test_data_type,flights_test_is_categorical,flights_test_unit,table_name.2,fuel_comsumption_column_name,fuel_comsumption_data_type,fuel_comsumption_is_categorical,fuel_comsumption_unit,table_name.3,passengers_column_name,passengers_data_type,passengers_is_categorical,passengers_unit
0,flights,fl_date,text,1.0,yyyy-mm-dd,flights_test,fl_date,timestamp without time zone,1.0,yyy-mm-dd,fuel_comsumption,month,bigint,1.0,index,passengers,departures_scheduled,double precision,0.0,count
1,flights,mkt_unique_carrier,text,1.0,,flights_test,mkt_unique_carrier,text,1.0,,fuel_comsumption,airline_id,double precision,1.0,,passengers,departures_performed,double precision,0.0,count


# Categorical

In [3]:
# Empty placeholder; `cat_features` is rebound to the concatenated result
# once the per-table frames have been collected.
cat_features = pd.DataFrame()
# Table names; each one is the prefix of the `<table>_is_categorical` and
# `<table>_column_name` columns of `schema_df`.
tables = ['flights', 'flights_test', 'fuel_comsumption', 'passengers']

In [4]:
# Collect, per table, a single-column frame with the names of the columns
# whose `<table>_is_categorical` flag equals 1 in the schema.
frames = []
for table in tables:
    flag_label = f'{table}_is_categorical'
    name_label = f'{table}_column_name'
    frames.append(schema_df.loc[schema_df[flag_label] == 1, [name_label]])

# Put the per-table frames side by side. Rows are aligned on the original
# schema row index, so tables without a match on a given row hold NaN there.
cat_features = pd.concat(frames, axis=1)
cat_features.columns = tables

In [8]:
# Display the categorical column names, one column per table.
cat_features

Unnamed: 0,flights,flights_test,fuel_comsumption,passengers
0,fl_date,fl_date,month,
1,mkt_unique_carrier,mkt_unique_carrier,airline_id,
2,branded_code_share,branded_code_share,unique_carrier,
3,mkt_carrier,mkt_carrier,carrier,
4,mkt_carrier_fl_num,mkt_carrier_fl_num,carrier_name,
5,op_unique_carrier,op_unique_carrier,carrier_group_new,
6,tail_num,tail_num,,
7,op_carrier_fl_num,op_carrier_fl_num,,
8,origin_airport_id,origin_airport_id,,
9,origin,origin,,


In [8]:
# Category frequency and relative frequency
# COUNT(*) is used instead of COUNT({feature}): COUNT(column) ignores NULLs,
# so the NULL group produced by GROUP BY would always be reported with a
# frequency of 0 instead of its real row count.
categorical_statistics_sql = """
SELECT 
 {feature},
 COUNT(*) AS frequency
FROM (SELECT {feature}
       FROM {table}) AS t
GROUP BY {feature}
ORDER BY frequency DESC
"""

tables = ['flights', 'flights_test', 'fuel_comsumption', 'passengers']

for table in tables[3:]:  # [3:] is just 'passengers'
    # NOTE(review): if postgresql_connection() returns a plain psycopg2
    # connection, `with` only ends the transaction and does not close the
    # connection - confirm and close it explicitly if so.
    with postgresql_connection() as con:
        # Only the rows that hold a column name for this table
        for feat in cat_features[table].dropna():
            # SQL composition: identifiers are quoted by psycopg2 rather than
            # interpolated as raw strings
            query = sql.SQL(categorical_statistics_sql).format(
                feature=sql.Identifier(feat),
                table=sql.Identifier(table)
            ).as_string(context=con)

            # One row per category with its frequency
            df = execute_sql_statement(con, query=query)

            # Label the columns axis with the feature name
            df.columns.name = feat

            # Share of each category among all counted rows
            df['relative_frequency'] = df['frequency'] / df['frequency'].sum()

            # One csv per (table, feature) pair
            dataframe_to_csv(df, csv_path=(
                f'../data/descriptive_stats/{table}_{feat}_cat_stats.csv'
            ))

Connected


In [9]:
# Generate numeric statistics based on categorical frequencies
tables = ['flights', 'flights_test', 'fuel_comsumption', 'passengers']
# Row labels of the final (transposed) statistics frame, in column order
stats_index = ['count', 'mean', 'standard_deviation', 
               'variance', 'range', 'minimum', 'Q1/25%', 'median/Q2/50%',
               'Q3/75%', 'maximum', 'interquartile_range (IQR)', 'skewness']

for table in tables[3:]:  # [3:] is just 'passengers'
    # One single-row frame of statistics per categorical feature
    frames = []
    for feat in cat_features[table].dropna():
        # Frequencies written by the categorical statistics cell
        df = pd.read_csv(
            f'../data/descriptive_stats/{table}_{feat}_cat_stats.csv'
        )

        # describe() gives count, mean, std, min, 25%, 50%, 75%, max;
        # transpose so the feature becomes the row label
        dfs = (df['frequency']
               .describe()
               .to_frame()
               .rename(columns={'frequency' : feat})
               .T)

        # Variance goes right after the standard deviation
        dfs.insert(loc=3, column='variance', value=df['frequency'].var())

        frames.append(dfs)

    # Concatenate on the rows: one row per feature
    stats = pd.concat(frames, axis=0)

    # Derived statistics; positions must match the order of stats_index
    stats.insert(loc=4, column='range', value=(stats['max'] - stats['min']))
    stats['interquartile_range (IQR)'] = stats['75%'] - stats['25%']
    # Skewness estimated as 3 * (mean - median) / standard deviation
    stats['skewness'] = (3 * (stats['mean'] - stats['50%'])
                         / stats['std'])

    # Rows are the statistics, columns are the features
    stats = stats.T
    stats.index = stats_index

    stats.to_csv(f'../data/descriptive_stats/{table}_cat_freq_stats.csv')

# Numeric

In [3]:
# Empty placeholder; `num_features` is rebound to the concatenated result
# once the per-table frames have been collected.
num_features = pd.DataFrame()

In [4]:
# Table names; each one is the prefix of the `<table>_is_categorical` and
# `<table>_column_name` columns of `schema_df`.
tables = ['flights', 'flights_test', 'fuel_comsumption', 'passengers']

In [5]:
# Collect, per table, a single-column frame with the names of the columns
# whose `<table>_is_categorical` flag equals 0 in the schema.
frames = []
for table in tables:
    flag_label = f'{table}_is_categorical'
    name_label = f'{table}_column_name'
    frames.append(schema_df.loc[schema_df[flag_label] == 0, [name_label]])

# Put the per-table frames side by side. Rows are aligned on the original
# schema row index, so tables without a match on a given row hold NaN there.
num_features = pd.concat(frames, axis=1)
num_features.columns = tables

In [12]:
# Display the numeric column names, one column per table.
num_features

Unnamed: 0,flights,flights_test,fuel_comsumption,passengers
0,,,,departures_scheduled
1,,,,departures_performed
2,,,,payload
3,,,,seats
4,,,,passengers
5,,,,freight
6,,,sdomt_gallons,mail
7,,,satl_gallons,distance
8,,,spac_gallons,ramp_to_ramp
9,,,slat_gallons,air_time


# count, mean, standard deviation, variance, range, minimum, Q1/25%, median/Q2/50%, Q3/75%, maximum, interquartile range (IQR), skewness

In [11]:
# Numeric descriptive statistics
# null_count, count, mean, standard deviation, variance, range, minimum,
# Q1/25%, median/Q2/50%, Q3/75%, maximum, interquartile range (IQR), skewness
numerical_statistics_sql = """
SELECT
  SUM(CASE WHEN {feature} IS NULL THEN 1 ELSE 0 END) as null_count,
  COUNT({feature}) AS count,
  AVG({feature}::NUMERIC) AS mean,
  STDDEV({feature}::NUMERIC) AS standard_deviation,
  VARIANCE({feature}::NUMERIC) AS variance,
  MIN({feature}::NUMERIC) AS min,
  PERCENTILE_CONT(ARRAY[0.25, 0.5, 0.75])
    WITHIN GROUP (ORDER BY {feature}::NUMERIC) AS "q1,q2,q3",
  MAX({feature}::NUMERIC) AS max
   FROM (SELECT {feature}
         FROM {table}) AS t;
"""

# Row labels of the final (transposed) statistics frame, in column order
stats_index = ['null_count', 'count', 'mean', 'standard_deviation', 
               'variance', 'range', 'minimum', 'Q1/25%', 'median/Q2/50%',
               'Q3/75%', 'maximum', 'interquartile_range (IQR)', 'skewness']

tables = ['flights', 'flights_test', 'fuel_comsumption', 'passengers']
# For each table in database
for table in tables[:1]:  # [:1] is just 'flights'
    # List to store dataframe results for each feature
    frames = []

    # Create PostgreSQL connection
    # NOTE(review): if postgresql_connection() returns a plain psycopg2
    # connection, `with` only ends the transaction and does not close the
    # connection - confirm and close it explicitly if so.
    with postgresql_connection() as con:
        # For each feature in the numeric features of this table
        for feat in num_features[table].dropna():
            # SQL composition: identifiers are quoted by psycopg2 rather than
            # interpolated as raw strings
            query = sql.SQL(numerical_statistics_sql).format(
                feature=sql.Identifier(feat),
                table=sql.Identifier(table)
            ).as_string(context=con)

            # Get query results in a dataframe
            # 1 row representing the feature
            # 8 columns for the descriptive statistics
            df = execute_sql_statement(con, query=query)

            # Label the single row with the feature name
            df.index = [feat]

            # Per-feature backup. The original call passed the directory
            # itself as the target (no file name), which cannot be written.
            # The backup directory must already exist.
            df.to_csv(
                f'../data/descriptive_stats/backup/'
                f'{table}_{feat}_numeric_stats.csv'
            )
            # Append dataframe to list
            frames.append(df)

    # Concatenate on the rows: one row per feature
    stats = pd.concat(frames, axis=0)

    # Add additional statistics
    # Set the quartiles to their own columns. Each row holds its own
    # [q1, q2, q3] sequence, so the elements are extracted row by row;
    # indexing with [0][0] would broadcast the first feature's quartile to
    # every feature.
    quartiles = stats['q1,q2,q3']
    stats.insert(loc=6, column='q1', value=quartiles.str[0].astype(float))
    stats.insert(loc=7, column='median',
                 value=quartiles.str[1].astype(float))
    stats.insert(loc=8, column='q3', value=quartiles.str[2].astype(float))
    stats = stats.drop('q1,q2,q3', axis=1)
    # Add range
    stats.insert(loc=5, column='range', value=(stats['max'] - stats['min']))
    # Add IQR
    stats['interquartile_range (IQR)'] = stats['q3'] - stats['q1']
    # Add skewness, estimated as 3 * (mean - median) / standard deviation;
    # the operands are cast to float before the arithmetic
    stats['skewness'] = (3 * (stats['mean'].astype(float) 
                           - stats['median'].astype(float))
                      / stats['standard_deviation'].astype(float))

    # Transpose dataframe
    # Rows are the descriptive statistics
    # Columns are the features
    stats = stats.T

    # Replace the row labels with the readable statistic names
    stats.index = stats_index

    # Write to csv
    stats.to_csv(f'../data/descriptive_stats/{table}_numeric_stats_TEST.csv')

Connected
