## Grouping operations 

#### References
1. Python for data analysis
2. Think stats: exploratory data analysis
3. https://pandas.pydata.org

#### Purpose
1. Work in the abstract (it's good for the brain)
2. Keep up to date with changes in the library
3. Explore new ways of doing common tasks --- get better



In [2]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pickle
import re


In [3]:
import os
# Current working directory of the running kernel.
here = os.getcwd()
# Absolute path of the directory that holds the pickled input files.
data = '/home/mw-shovel/web/notes/basel/data'
# Echo the working directory as the cell output.
here

'/home/mw-shovel/web/notes/pandas'

In [4]:
#os.listdir(data)

In [5]:
# Load the pickled inputs from the data directory.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
slr = pd.read_pickle(data + '/combined.p')
mcb = pd.read_pickle(data + '/combined_mc.p')
mcb_slr = pd.read_pickle(data + '/mcb_slr.p')
# g_all is indexed by water-body name further down (e.g. g_all['Aare']).
g_all = pd.read_pickle(data + '/water_bodies_all.p')
g = pd.read_pickle(data + '/water_bodies.p')
# codes is iterated below as records with 'code', 'description', 'material' and 'source' keys.
codes = pd.read_pickle(data + '/codes.p')

In [6]:
# Rename the 'density' column to 'pcs_m', in place, on both survey frames.
for survey_frame in (slr, mcb):
    survey_frame.rename(columns={'density': 'pcs_m'}, inplace=True)

In [7]:
# Map every code to [description, material, source], one entry per record in `codes`.
code_dict = {
    record['code']: [record['description'], record['material'], record['source']]
    for record in codes
}

In [8]:
# Codes in code_dict order (name kept: `a` is a module-level variable).
a = list(code_dict)
# Group the codes by their source (third item of each code_dict value) in a
# single pass. Dict insertion order keeps the sources in first-seen order and
# the codes of each source in code_dict order, without rescanning every code
# once per source.
source_dict = {}
for code in a:
    source_dict.setdefault(code_dict[code][2], []).append(code)
# Distinct sources, in the order they were first encountered.
sources = list(source_dict)

In [9]:

def group_codes(the_source, the_key):
    """Map each record's 'code' to the value it holds under `the_key`.

    `the_source` is an iterable of dict-like records that each have a 'code'
    entry and a `the_key` entry. When a code occurs more than once, the value
    from the last record wins.
    """
    return {record['code']: record[the_key] for record in the_source}
# code -> source and code -> material lookups, used below as groupby mappings.
source_group = group_codes(codes, 'source')
material_group = group_codes(codes, 'material')

In [10]:
# Inspect the columns available in the combined survey frame.
mcb_slr.columns

Index(['code_id', 'date', 'pcs_m', 'length', 'location_id', 'quantity', 'city',
       'latitude', 'longitude', 'post', 'water', 'pop', 'area', 'pop_dens',
       'day'],
      dtype='object')

In [11]:
# One row per (date, day, water, city, location_id), one column per code_id,
# holding the aggregated pcs_m value; missing combinations are filled with 0.
dfA = mcb_slr.pivot_table(
    values='pcs_m',
    index=['date', 'day', 'water', 'city', 'location_id'],
    columns=['code_id'],
    fill_value=0,
)

In [12]:
# Row-wise sum over every column currently in the pivot table.
dfA['total'] = dfA.sum(axis=1)

In [13]:
# Shorthand for building multi-level index slices inside .loc[...].
idx = pd.IndexSlice

In [14]:
# Keep every date/day/water/city and restrict the fifth index level (location_id)
# to the labels in g_all['Aare'] (presumably that river's location ids — confirm),
# then show the first ten 'total' values.
dfA.loc[idx[:, :, :, :, g_all['Aare']], 'total'][:10]

date        day  water  city           location_id               
2017-04-02  6    river  Bern           aare_bern_scheurerk           0.022
                                       aarezufluss_bern_scheurerk    0.503
                        Muri bei Bern  Aare_Bern_CaveltiN            1.681
2017-04-07  4    river  Belp           aare_kehrsatz_stolten         0.175
2017-04-12  2    river  Brugg          Aare_Brugg_BuchiE             2.450
2017-04-13  3    river  Aarau          aare_suhrespitz_badert        2.506
                        Rupperswil     aare_rupperswil_badert        0.480
2017-04-14  4    river  Bern           Aare_bern_gerberm             1.728
2017-04-21  4    river  Solothurn      aare_solothurn_nguyena        0.351
2017-04-23  6    river  Köniz          aare_köniz_hoppej             1.677
Name: total, dtype: float64

In [15]:
# Same selection for g_all['Reuss'], additionally fixing the second index level
# (day) to 6; show the first five 'total' values.
dfA.loc[idx[:, 6, :, :,g_all['Reuss']], 'total'][:5]

date        day  water  city          location_id                    
2017-04-30  6    river  Luzern        Reuss_St.Karli_LinigerS/ImhofY     8.710
2017-05-28  6    river  Merenschwand  reuss_ottenbach_schoenenbergerl    3.365
2017-07-09  6    river  Hünenberg     reuss_hünenberg_eberhardy          0.965
2017-07-30  6    river  Merenschwand  reuss_ottenbach_schoenenbergerl    3.819
2017-08-27  6    river  Merenschwand  reuss_ottenbach_schoenenbergerl    2.364
Name: total, dtype: float64

In [16]:
# Inspect the pivot table's columns (the code_id values plus 'total').
dfA.columns

Index(['G1', 'G10', 'G100', 'G101', 'G102', 'G11', 'G12', 'G124', 'G125',
       'G126',
       ...
       'G9', 'G90', 'G91', 'G92', 'G93', 'G95', 'G96', 'G97', 'G99', 'total'],
      dtype='object', name='code_id', length=121)

In [21]:
def cycle_through_index(a):
    """Print the name of every index level of `a`, one per line."""
    for level_name in a.index.names:
        print(level_name)
# List the index level names of the pivot table.
cycle_through_index(dfA)

date
day
water
city
location_id


In [22]:
# Group dfA's columns by mapping each code_id through the code -> source and
# code -> material dictionaries.
# NOTE(review): groupby(axis=1) is deprecated as of pandas 2.1 — confirm the
# pandas version in use before upgrading the environment.
group_source = dfA.groupby(source_group, axis=1)
group_material = dfA.groupby(material_group, axis=1)

In [24]:
# Sum of the grouped columns for each source.
g_s = group_source.sum()
# describe() summary of the same groups.
g_s_d = group_source.describe()

In [29]:
# 'max' entry of the describe() summary for the 'Fragmented' source group.
frag_max = g_s_d.loc['Fragmented']['max']

In [36]:
# Display the maxima selected above.
frag_max

code_id
G124     2.808000
G66      0.764706
G67      6.100000
G76      0.189189
G79      9.823529
G80      1.273000
G82     13.875000
G83      3.107143
G89      1.575000
G93      0.100000
Name: max, dtype: float64

In [None]:
# subset the survey data to a single city
def city(df, x):
    """Return an independent copy of the rows of `df` whose `city` column equals `x`."""
    in_city = df.city == x
    return df[in_city].copy()
# total pieces per metre for each survey
def group_pcs_m(df, column='pcs_per_m'):
    """Sum `column` for every (date, location_id) pair.

    Parameters
    ----------
    df : DataFrame with 'date' and 'location_id' columns plus `column`.
    column : name of the column to total. Defaults to 'pcs_per_m', the name
        that used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    A one-column DataFrame indexed by (date, location_id).
    """
    # Double brackets keep the result a DataFrame rather than a Series.
    return df.groupby(['date', 'location_id'])[[column]].sum()
# graph the city surveys
def graph_city_surveys(df):
    """Draw `df` as a line plot on a 12x5 inch figure and show it."""
    # Hand the axes to pandas explicitly: DataFrame.plot() opens a figure of
    # its own when no `ax` is given, which leaves a separately created
    # plt.figure(figsize=...) empty and the plot at the default size.
    _, ax = plt.subplots(figsize=(12, 5))
    df.plot(ax=ax)
    plt.show()
# get the top n surveys
def top_ten_surveys(df, n=10, column='pcs_per_m'):
    """Return the `n` rows of `df` with the largest `column` values.

    Rows come back in ascending order of `column`, so the largest value is
    last. `n=0` yields an empty frame.
    """
    # tail(n) matches [-n:] for every non-zero n, and returns no rows for
    # n == 0, where the slice [-0:] would return the whole frame.
    return df.sort_values(by=column).tail(n)
# graph the top surveys
def graph_top(df):
    """Draw `df` as a bar chart on a 12x5 inch figure and show it."""
    # Hand the axes to pandas explicitly: DataFrame.plot.bar() opens a figure
    # of its own when no `ax` is given, which leaves a separately created
    # plt.figure(figsize=...) empty and the chart at the default size.
    _, ax = plt.subplots(figsize=(12, 5))
    df.plot.bar(ax=ax)
    plt.show()
# make a pivot table
def make_pivot(df, column='pcs_per_m'):
    """Pivot `df` so every code_id becomes a column of `column` values.

    Parameters
    ----------
    df : DataFrame with 'date', 'post', 'water', 'location_id' and 'code_id'
        columns plus `column`.
    column : name of the value column. Defaults to 'pcs_per_m', the name that
        used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    A DataFrame indexed by (date, post, water, location_id). Because the
    value column is passed as a list, the columns are two-level:
    (`column`, code_id). Cells use pivot_table's default aggregation.
    """
    return df.pivot_table(
        [column],
        index=['date', 'post', 'water', 'location_id'],
        columns=['code_id'],
    )
# get the grouping data
def use_pickle(file):
    """Load and return the object stored in the pickle file at path `file`.

    NOTE: unpickling can execute arbitrary code; only use this on files from
    a trusted source.
    """
    # The context manager closes the handle even if loading fails, and the
    # loaded object is returned to the caller instead of being discarded.
    with open(file, 'rb') as handle:
        return pickle.load(handle)
# make the grouping dicts
def code_desc(x):
    """Build three code-keyed lookups from the records in `x`.

    Each record must provide 'code', 'description', 'material' and 'source'.
    Returns the tuple (code -> description, code -> material, code -> source).
    """
    descriptions = {}
    materials = {}
    origins = {}
    for record in x:
        key = record['code']
        descriptions[key] = record['description']
        materials[key] = record['material']
        origins[key] = record['source']
    return descriptions, materials, origins
# NOTE(review): `z_pivot_q` is not defined anywhere in the visible notebook, and
# `sources` is the list built in In [8] — confirm the intended pivot table and
# grouping before running this line.
dfZQ = z_pivot_q.groupby(sources, level='code_id', axis=1).sum()
# group a pivot table's code columns, e.g. by source or by material
def make_a_group(pivot_table, group):
    """Sum the columns of `pivot_table` that `group` assigns to the same label.

    Parameters
    ----------
    pivot_table : DataFrame whose columns carry a level named 'code_id'.
    group : mapping (or function) from a code_id value to its group label,
        such as a code -> source or code -> material dictionary.

    Returns
    -------
    A DataFrame with the same rows as `pivot_table` and one column per group
    label.
    """
    # groupby(axis=1) is deprecated in pandas 2.1; grouping the rows of the
    # transposed frame and transposing back is the documented replacement.
    return pivot_table.T.groupby(group, level='code_id').sum().T
# get the distinct values of one index level
def get_index_names(df, level):
    """Return the unique values of index level `level` (name or position) of `df`."""
    level_values = df.index.get_level_values(level)
    return level_values.unique()