In [1]:
import pandas as pd
from pandas.core.frame import DataFrame
import matplotlib.pyplot as plt
from IPython.display import display, Markdown

In [2]:
def read_csv(path='chicago_taxi_trips_2016_12.csv') -> DataFrame:
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. Defaults to the file name that used to be
        hard-coded here, so calling ``read_csv()`` with no argument reads
        the same file as before.

    Returns
    -------
    DataFrame
        The parsed contents of the file.
    """
    # `infer_datetime_format` is intentionally no longer passed: pandas 2.0
    # deprecated it and passing it only produces a warning.
    # NOTE(review): per the pandas docs `parse_dates=True` only tries to parse
    # the *index* as dates; no column is converted. Confirm whether specific
    # timestamp columns were meant to be listed here instead.
    return pd.read_csv(path, parse_dates=True)

In [3]:
def get_mode_stats(df) -> list:
    """Report the two most frequent values of every column in ``df``.

    Parameters
    ----------
    df : DataFrame
        Frame whose columns are summarised one at a time.

    Returns
    -------
    list of dict
        One dict per column, in column order, with the keys ``col``,
        ``first_mode``, ``fm_freq``, ``fm_perc``, ``second_mode``,
        ``sm_freq`` and ``sm_perc``. Frequencies come from
        ``Series.value_counts()`` (which skips missing values by default);
        percentages are relative to ``len(df)``. When a column has fewer
        than two distinct counted values, the absent mode is reported as
        ``None`` with a frequency of ``0`` and a percentage of ``0.0``.
    """
    total_rows = len(df)
    # (rank in the frequency table, key for the value, prefix for freq/perc keys)
    ranks = ((0, 'first_mode', 'fm'), (1, 'second_mode', 'sm'))

    res = []
    for column in df:
        frequencies = df[column].value_counts()
        entry = {'col': column}
        for rank, mode_key, prefix in ranks:
            if rank < len(frequencies):
                mode = frequencies.index[rank]
                # .iloc is required: plain `frequencies[rank]` is a *label*
                # lookup when the counted values are integers, which returns
                # the count of the value `rank` rather than of this mode.
                freq = frequencies.iloc[rank]
                # total_rows cannot be 0 here: at least one value was counted.
                perc = (freq / total_rows) * 100
            else:
                mode, freq, perc = None, 0, 0.0
            entry[mode_key] = mode
            entry[prefix + '_freq'] = freq
            entry[prefix + '_perc'] = perc
        res.append(entry)
    return res

In [4]:
def get_stats(df, numeric = True):
    """Collect per-column summary statistics for ``df``.

    Parameters
    ----------
    df : DataFrame
        Frame to summarise.
    numeric : bool, optional
        When truthy, the rows ``min``, ``max``, ``25%``, ``75%``, ``mean``,
        ``50%`` and ``std`` of ``df.describe(include='all')`` are included.
        Otherwise the result of ``get_mode_stats(df)`` is included instead.

    Returns
    -------
    dict
        Always holds ``val_num`` (row count), ``missing_values_perc`` and
        ``cardinality``. Numeric mode adds ``min_val``, ``max_val``,
        ``first_quartile``, ``third_quartile``, ``average``, ``median`` and
        ``st_dev``; non-numeric mode adds ``modes``.
    """
    described = pd.DataFrame(df.describe(include='all'))
    row_count = len(df)

    # Entries shared by both modes.
    summary = {
        'val_num': row_count,
        # share of rows that `describe` did not count, as a percentage
        'missing_values_perc': 100 - (described.loc['count'] / row_count) * 100,
        'cardinality': df.nunique(),
    }

    if not numeric:
        summary['modes'] = get_mode_stats(df)
        return summary

    # result key -> row label in the `describe` output
    describe_rows = (
        ('min_val', 'min'),
        ('max_val', 'max'),
        ('first_quartile', '25%'),
        ('third_quartile', '75%'),
        ('average', 'mean'),
        ('median', '50%'),
        ('st_dev', 'std'),
    )
    for result_key, row_label in describe_rows:
        summary[result_key] = described.loc[row_label]
    return summary
        

In [5]:
def print_plots(data, numeric = True):
    """Show one histogram per summary statistic stored in ``data``.

    Parameters
    ----------
    data : mapping
        Maps statistic names (``missing_values_perc``, ``cardinality``,
        ``min_val``, ``max_val``, ``first_quartile``, ``third_quartile``,
        ``average``, ``median``, ``st_dev``) to objects exposing a pandas
        style ``.plot`` method.
    numeric : bool, optional
        When falsy nothing is drawn and the function returns immediately.

    Returns
    -------
    None
    """
    if not numeric:
        # No plots are produced for non-numeric summaries.
        return

    # (key in `data`, figure title), in display order
    histograms = (
        ('missing_values_perc', 'Percentage of missing values'),
        ('cardinality', 'Cardinality'),
        ('min_val', 'Minimum values'),
        ('max_val', 'Maximum values'),
        ('first_quartile', 'First quartile'),
        ('third_quartile', 'Third quartile'),
        ('average', 'Mean'),
        ('median', 'Median'),
        ('st_dev', 'Standard deviation'),
    )
    for stat_key, figure_title in histograms:
        data[stat_key].plot(kind='hist', title=figure_title)
        # flush each figure on its own so the histograms do not overlap
        plt.show()

IndentationError: expected an indented block (<ipython-input-5-9ef4873bd567>, line 21)

In [None]:
def main() -> None:
    """Load the data, summarise it and display the plots.

    The frame returned by ``read_csv()`` is split with ``select_dtypes``
    into number and non-number columns; each part is summarised with
    ``get_stats`` and then handed to ``print_plots`` under its own
    Markdown heading.
    """
    frame: DataFrame = read_csv()

    number_columns = frame.select_dtypes(include='number')
    other_columns = frame.select_dtypes(exclude='number')

    number_summary = get_stats(number_columns)
    other_summary = get_stats(other_columns, False)

    # (heading, summary, numeric flag passed to print_plots), in display order
    sections = (
        ('# Continuous values plot', number_summary, True),
        ('# Categorical values plot', other_summary, False),
    )
    for heading, summary, is_numeric in sections:
        display(Markdown(heading))
        print_plots(summary, is_numeric)

In [None]:
# Run the analysis only when this code is executed directly, not on import.
if __name__ == '__main__':
    main()