In [1]:
# Pandas and numpy for data manipulation
import pandas as pd
import numpy as np
np.random.seed(42)
from collections import OrderedDict
 
# Matplotlib and seaborn for plotting
import matplotlib.pyplot as plt
%matplotlib inline

import matplotlib
matplotlib.rcParams['font.size'] = 16
matplotlib.rcParams['figure.figsize'] = (9, 9)

import seaborn as sns

from IPython.core.pylabtools import figsize

# Scipy helper functions
from scipy.stats import percentileofscore
from scipy import stats

# Import summary

In [2]:
def get_import_summary(str_file_path, 
                    str_sep = ",", 
                    str_decimal = ".", 
                    b_doublequote = False, 
                    dict_verify_dtype = None,
                    ls_exclude_var = None):
    
    # Read in data
    df = pd.read_csv(str_file_path, 
                     sep=str_sep, 
                     decimal=str_decimal, 
                     doublequote=b_doublequote, 
                     dtype=dict_verify_dtype)
    
    # initial summary
    ls_var = []
    ls_var_type = []
    ls_dist_val = []
    ls_mean = []
    ls_std = []
    ls_min = []
    ls_25 = []
    ls_50 = []
    ls_75 = []
    ls_max = []
    ls_sample = []
    
    # selected columns
    if ls_exclude_var is None:
        ls_selected_col = [col for col in df.columns]
    else:
        ls_selected_col = [col for col in df.columns if col not in ls_exclude_var]        
    df = df[ls_selected_col]
    
    # Describe for Numerical Columns
    df_num_desc = df.describe()
    
    # Value Counts for Categorical Columns
    
    # loop through each variables
    for col in df.columns:
        # get var and var type
        ls_var.append(col)
        ls_var_type.append(df[col].dtype)
        
        # calculate distinct value
        i_dist_val = df[col].value_counts().shape[0]
        ls_dist_val.append(i_dist_val)
        
        # get numerical stats
        if df[col].dtype != "object":
            ls_mean.append(df_num_desc.loc["mean", col])
            ls_std.append(df_num_desc.loc["std", col])
            ls_min.append(df_num_desc.loc["min", col])
            ls_25.append(df_num_desc.loc["25%", col])
            ls_50.append(df_num_desc.loc["50%", col])
            ls_75.append(df_num_desc.loc["75%", col])
            ls_max.append(df_num_desc.loc["max", col])
        else:
            ls_mean.append(None)
            ls_std.append(None)
            ls_min.append(None)
            ls_25.append(None)
            ls_50.append(None)
            ls_75.append(None)
            ls_max.append(None)
            
        # get sample values
        i_num_sample = min([5, i_dist_val])
        str_sample = str(df[col][0:i_num_sample].tolist())
        ls_sample.append(str_sample)
        
    df_summary = pd.DataFrame(OrderedDict((
            ("variable", pd.Series(ls_var)),
            ("variable_type", pd.Series(ls_var_type)),
            ("n_distinct_values", pd.Series(ls_dist_val)),
            ("mean", pd.Series(ls_mean)),
            ("std", pd.Series(ls_std)),
            ("min", pd.Series(ls_min)),
            ("25p", pd.Series(ls_25)),
            ("50p", pd.Series(ls_50)),
            ("75p", pd.Series(ls_75)),
            ("max", pd.Series(ls_max)),
            ("sample_values", pd.Series(ls_sample))
        ))
    )
    return df, df_summary

In [3]:
# variable params
str_target_var = "address"
ls_exclude_var = [
    "school"
]

# read data params
str_file_path = "data/student.csv"
str_sep = ","
str_decimal = "."
b_doublequote = True

dict_verify_dtype = {
    "G3": np.float
}

In [4]:
df, df_summary = get_import_summary(str_file_path, 
                                dict_verify_dtype=dict_verify_dtype,
                                ls_exclude_var=ls_exclude_var)

print "Number of observations:", df.shape[0]
print "Number of variables:", df.shape[1]
print "Proportion of the target variable"
print df[str_target_var].value_counts()/df.shape[0]
display(df_summary)

Number of observations: 649
Number of variables: 32
Proportion of the target variable
U    0.696456
R    0.303544
Name: address, dtype: float64


Unnamed: 0,variable,variable_type,n_distinct_values,mean,std,min,25p,50p,75p,max,sample_values
0,sex,object,2,,,,,,,,"['F', 'F']"
1,age,int64,8,16.744222,1.218138,15.0,16.0,17.0,18.0,22.0,"[18, 17, 15, 15, 16]"
2,address,object,2,,,,,,,,"['U', 'U']"
3,famsize,object,2,,,,,,,,"['GT3', 'GT3']"
4,Pstatus,object,2,,,,,,,,"['A', 'T']"
5,Medu,int64,5,2.514638,1.134552,0.0,2.0,2.0,4.0,4.0,"[4, 1, 1, 4, 3]"
6,Fedu,int64,5,2.306626,1.099931,0.0,1.0,2.0,3.0,4.0,"[4, 1, 1, 2, 3]"
7,Mjob,object,5,,,,,,,,"['at_home', 'at_home', 'at_home', 'health', 'o..."
8,Fjob,object,5,,,,,,,,"['teacher', 'other', 'other', 'services', 'oth..."
9,reason,object,4,,,,,,,,"['course', 'course', 'other', 'home']"
