In [None]:
# Descriptive
# https://baselinesupport.campuslabs.com/hc/en-us/articles/204305665-Types-of-Descriptive-Statistics

# TESTS:
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3116565/
# ebook100

In [14]:
# Imports
# -------

import numpy as np
import scipy.stats
from scipy import stats
from scipy.stats import trim_mean, kurtosis
from scipy.stats.mstats import mode # gmean, hmean

import collections
from operator import itemgetter

import pandas as pd
from pandas import DataFrame as df
from pandas.api.types import is_numeric_dtype
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.expand_frame_repr', False)

from IPython.display import display, HTML
from IPython.core.interactiveshell import InteractiveShell # check doc
InteractiveShell.ast_node_interactivity = "all" # check doc

# Old, propably not used:
from __future__ import print_function, division
%matplotlib inline


# Get dataset
# -----------

def get_ds(path, delim=","):
    return pd.read_csv(path, delimiter=delim)


# Data exploration
# ----------------

def get_column_values_and_counts(ds, show_only=9): # len(ds)    
    vals_counts_descs = []
    for column in ds.columns:
        values, counts = np.unique(ds[column], return_counts=True)
        vals_counts = zip(values[:show_only], counts)
        vals_counts_desc = list(reversed(sorted(vals_counts,key=itemgetter(1))))
        vals_counts_strings = []
        for vc in vals_counts_desc:
            vc_string = str(vc[0]) + " (" + str(vc[1]) + ")"
            vals_counts_strings.append(vc_string)
        vals_counts_descs.append(vals_counts_strings)
    
    val_dicts = []
    for i in range(show_only):
        temp_dict = collections.OrderedDict()
        key_name = "Top {} val(counts):".format(str(i+1))
        temp_dict[key_name] = []
        for vc in vals_counts_descs:
            if i < len(vc):
                value = vc[i]
            else:
                value = None
            temp_dict[key_name].append(value)
        val_dicts.append(temp_dict)
            
    return val_dicts

def get_descriptive_stats(ds):    
    numeric_data = [] 
    for column in ds.columns:
        if is_numeric_dtype(ds[column]):
            numeric_data.append(column)
        else:
            numeric_data.append(None)
            
    desc_stats = collections.OrderedDict()
    desc_stats = {
        'dtypes' : [ds[column].dtype for column in ds.columns],
        
        'means' : [],
        'medians' : [(ds[col]).median() if col else None for col in numeric_data],
        'modes' : [list(mode(ds[col]).mode)[0] if col else None for col in numeric_data],
        
        'variances' : [],
        'stds' : [np.std(ds[col]) if col else None for col in numeric_data],
        
        'skewnesses' : [],
        'kurtosises' : [],
        
        'minis' : [],
        'q25s' : [],
        'q50s' : [],
        'q75s' : [],
        'maxes' : [],
        }
    for col in numeric_data:
        if col:
            q1, q2, q3 = ds[col].quantile([.25, 0.50, .75])
            desc_stats['q25s'].append(q1)
            desc_stats['q50s'].append(q2)
            desc_stats['q75s'].append(q3)
            
            descriptives = list(stats.describe(ds[col]))
            desc_stats['minis'].append(descriptives[1][0])
            desc_stats['maxes'].append(descriptives[1][1])
            desc_stats['means'].append(descriptives[2])
            desc_stats['variances'].append(descriptives[3])
            desc_stats['skewnesses'].append(descriptives[4])
            desc_stats['kurtosises'].append(descriptives[5])
        else:
            desc_stats['q25s'].append(None)
            desc_stats['q50s'].append(None)
            desc_stats['q75s'].append(None)
            
            desc_stats['minis'].append(None)
            desc_stats['maxes'].append(None)
            desc_stats['means'].append(None)
            desc_stats['variances'].append(None)
            desc_stats['skewnesses'].append(None)
            desc_stats['kurtosises'].append(None)
    return desc_stats

def data_exploration(ds):
    descriptive_stats = get_descriptive_stats(ds)
    
    for vc_dict in get_column_values_and_counts(ds, show_only=9):
        key = list(vc_dict.keys())[0]
        val = list(vc_dict.values())[0]
        descriptive_stats[key] = val
    
    descriptive_stats['column_names'] = list(ds.columns)

    return descriptive_stats
    
    
# Display (Table)
# ---------------

def display_explored_data(explored_data):
    col_names = explored_data.keys()
    dat = pd.DataFrame.from_dict(explored_data,orient='index').transpose()
    dat.index = dat['column_names']
    dat = dat.transpose()
    
    dat['statistics'] = dat.index
    
    return dat


# Run exploration function
# ------------------------

def Run_exploration(ds=None, raw_display_rows=5, dataset_file=None, dilim=None, tables=True):
    
    if ds is None:
        if dataset_file:
            ds = get_ds(dataset_file, dilim)
        else:
            print("Run_exploration(ds, raw_display_rows, dataset_file=None, dilim=None)\nSpecify 'ds' or 'dataset_file' with 'dilim'")
            return None
        
    explored_data = data_exploration(ds)
    exploration_table = display_explored_data(explored_data)
    
    if tables:
        display(ds.head(raw_display_rows))
        display(exploration_table)
    
    return ds


# Parameters & Run
# ----------------

ds = None
raw_display_rows = 2
dataset_file = 'student-mat.csv'
dilimeter = ';'
tables = True

if __name__ == '__main__':
    ds = Run_exploration(ds, raw_display_rows, dataset_file, dilimeter, tables)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,4,5,5,6


column_names,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,statistics
dtypes,object,object,int64,object,object,object,int64,int64,object,object,object,object,int64,int64,int64,object,object,object,object,object,object,object,object,int64,int64,int64,int64,int64,int64,int64,int64,int64,int64,dtypes
means,,,16.6962,,,,2.74937,2.52152,,,,,1.4481,2.03544,0.334177,,,,,,,,,3.9443,3.23544,3.10886,1.48101,2.29114,3.55443,5.70886,10.9089,10.7139,10.4152,means
medians,,,17,,,,3,2,,,,,1,2,0,,,,,,,,,4,3,3,1,2,4,4,11,11,11,medians
modes,,,16,,,,4,2,,,,,1,2,0,,,,,,,,,4,3,3,1,1,5,0,10,9,10,modes
variances,,,1.62829,,,,1.19845,1.18418,,,,,0.486513,0.704324,0.553017,,,,,,,,,0.803997,0.997725,1.23939,0.79342,1.65868,1.93294,64.0495,11.0171,14.1489,20.9896,variances
stds,,,1.27443,,,,1.09335,1.08682,,,,,0.696621,0.838177,0.742709,,,,,,,,,0.895523,0.997597,1.11187,0.889613,1.28627,1.38854,7.99296,3.31499,3.75674,4.57564,stds
skewnesses,,,0.464498,,,,-0.31717,-0.0315517,,,,,1.60092,0.629739,2.37795,,,,,,,,,-0.948263,-0.16273,0.11606,2.18243,0.609634,-0.492723,3.65762,0.239699,-0.430004,-0.729887,skewnesses
kurtosises,,,-0.0163579,,,,-1.09142,-1.19857,,,,,2.29947,-0.0294013,4.92645,,,,,,,,,1.11024,-0.313152,-0.775686,4.68431,-0.796021,-1.01644,21.43,-0.700229,0.604637,0.383181,kurtosises
minis,,,15,,,,0,0,,,,,1,1,0,,,,,,,,,1,1,1,1,1,1,0,3,0,0,minis
q25s,,,16,,,,2,2,,,,,1,1,0,,,,,,,,,4,3,2,1,1,3,0,8,9,8,q25s


In [11]:
ds.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [94]:
# Reduce dataset size to intresting data only
# -------------------------------------------

def minimize_ds(ds, categories):
    return ds[categories]

def reduce_dataset(ds=None, categories=None, input_categories=False, display_rows=10, explore=True):
    
    if input_categories:
        categories = [cat.strip() for cat in input().split(',')]
    ds_mini = minimize_ds(ds, categories) 
    
    print("Categories: {}.".format(", ".join(categories)))
    if explore:
        Run_exploration(ds_mini, display_rows, tables=True)
    else:
        display(ds_mini.head(display_rows))

    return ds_mini


# Initial parameters
# ------------------

ds = Run_exploration(None, 3, 'student-mat.csv', ';', False) # ds from prev exploration part
categories = ['sex', 'Mjob', 'Fjob', 'G1', 'G2', 'G3']
input_categories = False # default
display_rows = 10
explore = True


# Run
# ---

ds_mini = reduce_dataset(ds, categories, input_categories, display_rows, explore)

Categories: sex, Mjob, Fjob, G1, G2, G3.


Unnamed: 0,sex,Mjob,Fjob,G1,G2,G3
0,F,at_home,teacher,5,6,6
1,F,at_home,other,5,5,6
2,F,at_home,other,7,8,10
3,F,health,services,15,14,15
4,F,other,other,6,10,10
5,M,services,other,15,15,15
6,M,other,other,12,12,11
7,F,other,teacher,6,5,6
8,M,services,other,16,18,19
9,M,other,other,14,15,15


column_names,sex,Mjob,Fjob,G1,G2,G3
dtypes,object,object,object,int64,int64,int64
means,,,,10.9089,10.7139,10.4152
medians,,,,11,11,11
modes,,,,10,9,10
variances,,,,11.0171,14.1489,20.9896
stds,,,,3.31499,3.75674,4.57564
skewnesses,,,,0.239699,-0.430004,-0.729887
kurtosises,,,,-0.700229,0.604637,0.383181
minis,,,,3,0,0
q25s,,,,8,9,8
