In [1]:
import pandas as pd
from pandas.core.frame import DataFrame
import matplotlib.pyplot as plt

In [2]:
def read_csv(path: str = 'chicago_taxi_trips_2016_12.csv') -> DataFrame:
    """Load a trips CSV file into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the file name this notebook
        originally hard-coded, so existing ``read_csv()`` calls still work.

    Returns
    -------
    DataFrame
        The parsed contents of the file.
    """
    # parse_dates=True only asks pandas to try parsing the index as dates.
    # The previously passed infer_datetime_format flag is deprecated since
    # pandas 2.0 and has no effect there, so it is no longer forwarded.
    return pd.read_csv(path, parse_dates=True)

In [3]:
def get_mode_stats(df) -> list:
    """Report the two most frequent values of every column in ``df``.

    Parameters
    ----------
    df : DataFrame
        Frame whose columns are summarised one by one.

    Returns
    -------
    list of dict
        One dict per column, in column order, with the keys:

        * ``col`` - the column label
        * ``first_mode`` / ``second_mode`` - most and second most frequent value
        * ``fm_freq`` / ``sm_freq`` - how many rows hold that value
        * ``fm_perc`` / ``sm_perc`` - that count as a percentage of ``len(df)``

        When a column has fewer than two distinct non-missing values, the
        absent mode is reported as ``None`` with a frequency of ``0`` and a
        percentage of ``0.0``.

    Notes
    -----
    ``value_counts`` leaves missing values out by default, while the
    percentages are taken over every row of ``df``; the two percentages
    therefore need not add up to the share of non-missing rows.
    """
    total_rows = len(df)

    def _share(count):
        # An empty frame has no rows to take a share of.
        return (count / total_rows) * 100 if total_rows else 0.0

    res = []
    for column in df:
        frequencies = df[column].value_counts()

        # Slice by position: the index of `frequencies` holds the column's
        # own values, so `frequencies[0]` would be a label lookup and fail
        # (or pick the wrong entry) for integer-valued columns.
        top_two = frequencies.iloc[:2]

        # Pad so columns with fewer than two distinct values still yield
        # a complete record instead of raising IndexError.
        labels = list(top_two.index) + [None, None]
        counts = [int(count) for count in top_two] + [0, 0]

        res.append(
            {
                'col': column,
                'first_mode': labels[0],
                'fm_freq': counts[0],
                'fm_perc': _share(counts[0]),
                'second_mode': labels[1],
                'sm_freq': counts[1],
                'sm_perc': _share(counts[1])
            })
    return res

In [4]:
def get_stats(df, numeric=True):
    """Build a per-column data-quality summary of ``df``.

    Parameters
    ----------
    df : DataFrame
        Frame to summarise.
    numeric : bool, optional
        When truthy (the default) the result carries the numeric rows of
        ``df.describe(include='all')``; otherwise it carries the mode
        statistics produced by ``get_mode_stats``.

    Returns
    -------
    dict
        Always contains ``val_num`` (row count), ``missing_values_perc``
        (100 minus the percentage of counted values per column) and
        ``cardinality`` (``df.nunique()``).

        With ``numeric`` truthy it additionally contains ``min_val``,
        ``max_val``, ``first_quartile``, ``third_quartile``, ``average``,
        ``median`` and ``st_dev``, each a row of the describe table.
        Otherwise it additionally contains ``modes``.
    """
    n_rows = len(df)
    summary = pd.DataFrame(df.describe(include='all'))

    # Entries shared by both flavours of the report.
    report = {
        'val_num': n_rows,
        'missing_values_perc': 100 - (summary.loc['count'] / n_rows) * 100,
        'cardinality': df.nunique(),
    }

    if not numeric:
        report['modes'] = get_mode_stats(df)
        return report

    # Output key -> row label of the describe table, in output order.
    describe_rows = (
        ('min_val', 'min'),
        ('max_val', 'max'),
        ('first_quartile', '25%'),
        ('third_quartile', '75%'),
        ('average', 'mean'),
        ('median', '50%'),
        ('st_dev', 'std'),
    )
    for key, row_label in describe_rows:
        report[key] = summary.loc[row_label]
    return report
        

In [5]:
def main() -> None:
    """Load the trips data, summarise it and plot the numeric summaries.

    Reads the CSV via ``read_csv``, splits the columns into numeric and
    non-numeric frames, runs ``get_stats`` on each, and then shows one
    histogram per numeric summary statistic.
    """
    trips: DataFrame = read_csv()

    numeric_columns: DataFrame = trips.select_dtypes(include='number')
    other_columns: DataFrame = trips.select_dtypes(exclude='number')

    numeric_stats = get_stats(numeric_columns)
    # Still computed as before; nothing below reads this result.
    categorical_stats = get_stats(other_columns, False)

    # (key in numeric_stats, figure title), in display order.
    histograms = (
        ('missing_values_perc', 'Percentage of missing values'),
        ('cardinality', 'Cardinality'),
        ('min_val', 'Minimum values'),
        ('max_val', 'Maximum values'),
        ('first_quartile', 'First quartile'),
        ('third_quartile', 'Third quartile'),
        ('average', 'Mean'),
        ('median', 'Median'),
        ('st_dev', 'Standard deviation'),
    )
    for stat_key, figure_title in histograms:
        numeric_stats[stat_key].plot(kind='hist', title=figure_title)
        plt.show()

In [6]:
# Run the analysis when this code is executed as the top-level module.
if __name__ == '__main__':
    main()

            taxi_id  trip_seconds    trip_miles  pickup_census_tract  \
count  1.245094e+06  1.245614e+06  1.245690e+06                  0.0   
mean   4.361896e+03  7.565623e+02  2.722639e+00                  NaN   
std    2.511681e+03  1.090555e+03  4.914539e+00                  NaN   
min    0.000000e+00  0.000000e+00  0.000000e+00                  NaN   
25%    2.229000e+03  3.600000e+02  4.000000e-01                  NaN   
50%    4.316000e+03  5.400000e+02  1.100000e+00                  NaN   
75%    6.524000e+03  9.000000e+02  2.400000e+00                  NaN   
max    8.760000e+03  8.634000e+04  9.000000e+02                  NaN   

       dropoff_census_tract  pickup_community_area  dropoff_community_area  \
count          787242.00000           1.131361e+06            1.111901e+06   
mean              514.28259           2.281085e+01            2.106795e+01   
std               357.14152           1.879816e+01            1.694041e+01   
min                 2.00000           1