# Exploratory Data Analysis

When placed in Metapack data package, this notebook will load the package and run a variety of common EDA operations on the first resource. 


In [1]:
import matplotlib.pyplot as plt 
import seaborn as sns
import metapack as mp
import pandas as pd
import numpy as np

%matplotlib inline
sns.set_context('notebook')


In [2]:

try:
    pkg = mp.jupyter.open_package()
except mp.exc.PackageError:
    pkg = mp.open_package('http://s3.amazonaws.com/library.metatab.org/cde.ca.gov-accountability_dashboard-2.zip')

pkg

In [3]:
first_resource = next(iter(pkg.resources())).name
first_resource

'ela38'

In [4]:

pkg.resource(first_resource)


Header,Type,Description
cds,integer,CDS Code
rtype,text,Record Type
studentgroup,text,Student Group
currdenom,integer,Current year number of valid students
priordenom,integer,CURRENT STATUS - Average distance from level 3
currstatus,text,Prior year number of valid students
priorstatus,text,PRIOR STATUS - Average distance from level 3
change,text,CHANGE - Difference between current status and prior status
statuslevel,integer,Status Level
changelevel,integer,Change Level


In [5]:
df = pkg.resource(first_resource).read_csv(parse_dates=True)

In [6]:
df.head()

Unnamed: 0,cds,rtype,studentgroup,currdenom,priordenom,currstatus,priorstatus,change,statuslevel,changelevel,color,box
0,1100170123968,S,AA,14,23,-87.7,-41.6,-46.1,1,1,0,0
1,1100170123968,S,AI,2,0,,,,0,0,0,0
2,1100170123968,S,ALL,86,81,-58.8,-48.0,-10.8,2,2,2,170
3,1100170123968,S,AS,4,5,,,,0,0,0,0
4,1100170123968,S,EL,38,23,-79.9,-70.0,-9.9,1,2,0,0


In [7]:
empty_col_names = [cn for cn in df.columns if df[cn].nunique() == 0]
const_col_names= [cn for cn in df.columns if df[cn].nunique() == 1]
ignore_cols = empty_col_names+const_col_names
dt_col_names= list(df.select_dtypes(include=[np.datetime64]).columns)

number_col_names = [ cn for cn in df.select_dtypes(include=[np.number]).columns if cn not in ignore_cols ]
other_col_names = [cn for cn in df.columns if cn not in (empty_col_names+const_col_names+dt_col_names+number_col_names) ]

# Constant Columns

In [8]:
df[const_col_names].drop_duplicates().T

ValueError: not enough values to unpack (expected 2, got 0)

# Empty Columns

In [None]:
df[empty_col_names].drop_duplicates().T

# Date and Time Columns

In [None]:
df[dt_col_names].info()

In [None]:
df[dt_col_names].describe()

# Number Columns

In [None]:
df[number_col_names].info()

In [None]:
df[number_col_names].describe()

In [None]:
def plot_histograms(df):

    col_names = list(df.columns)

    n_cols = np.ceil(np.sqrt(len(col_names)))
    n_rows = np.ceil(np.sqrt(len(col_names)))

    plt.figure(figsize=(2*n_cols,5*n_rows))

    for i in range(0,len(col_names)):
        plt.subplot(n_rows + 1,n_cols,i+1)
        try:
            sns.distplot(df[col_names[i]].fillna(0),kde=True) 
        except:
            pass


In [None]:
plot_histograms(df[number_col_names])

# Other Columns

In [None]:
df[other_col_names].info()

In [None]:
df[other_col_names].describe().T