In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display

%matplotlib inline

DIR = r'c://downloads'

plt.style.use('ggplot')

# Grouping and aggregation

In [None]:
# Read the VIPERdb table from the data directory and show it.
viperdb_csv_path = os.path.join(DIR, 'viperdb.csv')
viperdb = pd.read_csv(viperdb_csv_path)
display(viperdb)

In [None]:
# Baseline: the mean inner radius per family, computed with a hand-written loop over the families.
# This is verbose and slow, and the result is a plain dict rather than a Pandas object.

inner_radius_mean_by_family = {}
for family in viperdb['Family'].unique():
    is_family = viperdb['Family'] == family
    inner_radius_mean_by_family[family] = viperdb.loc[is_family, 'Inner Radius'].mean()

print(inner_radius_mean_by_family)

In [None]:
# The same per-family mean, obtained from 'groupby' in a single call
inner_radius_mean = viperdb.groupby('Family')['Inner Radius'].mean()
print(inner_radius_mean.head(10))

In [None]:
# Time the hand-written per-family comprehension against the equivalent groupby call.
%timeit {family: viperdb.loc[viperdb['Family'] == family, 'Inner Radius'].mean() for family in viperdb['Family'].unique()}
%timeit viperdb.groupby('Family')['Inner Radius'].mean()

In [None]:
# Mean of every numeric column per family.
# numeric_only is passed explicitly because text columns (e.g. 'Genus') cannot be averaged:
# pandas >= 2.0 raises a TypeError for them instead of silently dropping them.
display(viperdb.groupby('Family').mean(numeric_only = True).head())

In [None]:
# Two ways to count the records of each family
records_per_family = viperdb.groupby('Family').size()
family_counts = viperdb['Family'].value_counts()

print(records_per_family.head(10))
print('*' * 10)
print(family_counts.head(10))

In [None]:
# Multiple aggregates at once
display(viperdb.groupby('Family')['Inner Radius'].agg(['mean', 'std']).head())

# When aggregating the whole frame, restrict it to the numeric columns: 'mean' and 'std' are
# undefined for text columns such as 'Genus', and pandas >= 2.0 raises for them instead of
# silently dropping them.
numeric_columns = viperdb.select_dtypes(include = 'number').columns.tolist()
display(viperdb.groupby('Family')[numeric_columns].agg(['mean', 'std']).head())

In [None]:
# Can give any function for aggregation

def second_highest(values):
    """Return the second-largest item of ``values``.

    Meant to be passed to ``groupby(...).agg`` as a custom aggregation function.

    Parameters
    ----------
    values : iterable
        Mutually comparable items (for example the values of one group).

    Returns
    -------
    The second-largest item; the only item when there is exactly one;
    ``nan`` when ``values`` is empty.
    """
    sorted_values = sorted(values, reverse = True)

    if not sorted_values:
        # Nothing to rank: report a missing value instead of raising IndexError.
        return float('nan')

    # NOTE(review): NaN compares as unordered, so the ranking is unreliable
    # if ``values`` contains NaN - confirm whether groups can contain missing values.

    # With a single item there is no runner-up, so fall back to that item.
    return sorted_values[min(1, len(sorted_values) - 1)]
        
# Restrict the aggregation to the numeric columns: np.average is undefined for text columns
# such as 'Genus', and pandas >= 2.0 raises for them instead of silently dropping them.
numeric_columns = viperdb.select_dtypes(include = 'number').columns.tolist()
display(viperdb.groupby('Family')[numeric_columns].agg([np.average, second_highest]).head())

In [None]:
# What does groupby actually return?

family_groups = viperdb.groupby(by = 'Family')
print(family_groups)

In [None]:
# The 'groups' attribute maps each group key to the index labels of its rows
rows_by_family = family_groups.groups
print(rows_by_family)

In [None]:
# Pull out the rows of one family
bromoviridae = family_groups.get_group('Bromoviridae')
display(bromoviridae.head(3))

In [None]:
# Iterating over the groupby object yields (group key, group rows) pairs
for name, members in family_groups:
    print(name, type(members))

In [None]:
# Time splitting the frame by family: one boolean filter per family versus iterating a groupby.
%timeit [viperdb[viperdb['Family'] == family] for family in viperdb['Family'].unique()]
%timeit [family_group for family_name, family_group in viperdb.groupby('Family')]

In [None]:
# Grouping by multiple columns

# Number of groups when grouping by genus alone, then by (family, genus) pairs.
print(len(viperdb.groupby('Genus')))
print(len(viperdb.groupby(['Family', 'Genus'])))

# numeric_only is passed explicitly: any remaining non-numeric column cannot be averaged,
# and pandas >= 2.0 raises a TypeError for it instead of silently dropping it.
display(viperdb.groupby(['Family', 'Genus']).mean(numeric_only = True).head(10))

In [None]:
# Example:
# Plot the mean and standard deviation of the radii per family.
# Every genus is counted only once; its values are the maxima over its records.

radius_columns = ['Outer Radius', 'Inner Radius', 'Average Radius']

# Several columns must be selected from a groupby with a list of names. Selecting them with
# bare comma-separated keys (a tuple) was deprecated in pandas 1.0 and raises in pandas 2.0.
genera = viperdb.groupby(['Family', 'Genus'])[radius_columns].max()

# reset_index() turns 'Genus' back into a text column; keep only the radius columns so that
# mean() and std() are not applied to it.
family_groups = genera.reset_index().groupby('Family')[radius_columns]

fig, ax = plt.subplots(figsize = (16, 8))
family_groups.mean().plot(kind = 'bar', ax = ax, yerr = family_groups.std())
ax.set(title = 'Mean radius per family (error bars: std over genera)', ylabel = 'Radius')

Read more in the pandas "Group by: split-apply-combine" user guide: https://pandas.pydata.org/docs/user_guide/groupby.html

# Merging and concatenating

In [None]:
# Two small frames with the same columns (A-D) and disjoint indexes: rows 0-3 and rows 4-7.
# Each cell holds its column letter followed by its row number, e.g. 'B5'.
column_names = ['A', 'B', 'C', 'D']

df1 = pd.DataFrame({col: [col + str(row) for row in range(0, 4)] for col in column_names},
                   index = list(range(0, 4)))

df2 = pd.DataFrame({col: [col + str(row) for row in range(4, 8)] for col in column_names},
                   index = list(range(4, 8)))

display(df1)
display(df2)

In [None]:
# Stack the two frames on top of each other (row-wise is the default for concat)
stacked = pd.concat([df1, df2], axis = 0)
display(stacked)

In [None]:
# Redefine df2: same index as df1 (0-3) but different columns (E and F)
df2 = pd.DataFrame({col: [col + str(row) for row in range(4)] for col in ['E', 'F']},
                   index = list(range(4)))

display(df1)
display(df2)

In [None]:
# Place the two frames side by side, matching rows by index label
side_by_side = pd.concat([df1, df2], axis = 'columns')
display(side_by_side)

In [None]:
# Redefine df2 again: it now shares column 'A' with df1 but none of df1's index labels
df2 = pd.DataFrame(
    [['A2', 'E2', 'F2'],
     ['A1', 'E1', 'F1'],
     ['A0', 'E0', 'F0']],
    columns = ['A', 'E', 'F'],
    index = [66, 12, 55],
)

display(df1)
display(df2)

# concat matches rows by index label, not by the values in column 'A'
display(pd.concat([df1, df2], axis = 'columns'))

In [None]:
# Join the two frames on the values of their shared column 'A'
merged_on_a = df1.merge(df2, on = 'A')
display(merged_on_a)

In [None]:
# Outer join: rows without a match on either side are kept
merged_outer = df1.merge(df2, how = 'outer', left_on = 'A', right_on = 'A')
display(merged_outer)

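**merge** has several other important optional parameters (for example `how`, `left_on`/`right_on`, `suffixes` and `indicator`). See `help(pd.merge)` or the pandas documentation for details.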

# Example - merge ViralZone and VIPERdb

In [None]:
import json


def genome_length(genome):
    """Return the summed 'size' of all fragments of one decoded genome."""
    return sum(fragment['size'] for fragment in genome)


# Load ViralZone; the 'Genome' column holds JSON text, which is decoded into Python objects.
viralzone = pd.read_csv(os.path.join(DIR, 'viralzone.csv'))
viralzone['Genome'] = viralzone['Genome'].apply(json.loads)
viralzone['Genome Length'] = viralzone['Genome'].apply(genome_length)

# One row per (group, family, genus), keeping the longest genome found for that genus.
viralzone_genome_length_per_genus = (
    viralzone
    .groupby(['Group', 'Family', 'Genus'])['Genome Length']
    .max()
    .reset_index()
)

display(viralzone_genome_length_per_genus)

In [None]:
# One row per (family, genus), keeping the largest inner and outer radius among its records
max_radii_by_genus = viperdb.groupby(['Family', 'Genus'])[['Inner Radius', 'Outer Radius']].max()
viperdb_radiuses_per_genera = max_radii_by_genus.reset_index()
display(viperdb_radiuses_per_genera)

In [None]:
# Join the two per-genus tables on (family, genus), then add the log10 of the genome length
combined_genera = (
    viralzone_genome_length_per_genus
    .merge(viperdb_radiuses_per_genera, on = ['Family', 'Genus'])
    .assign(**{'Log10 Genome Length': lambda merged: np.log10(merged['Genome Length'])})
)
display(combined_genera)

In [None]:
# Scatter plot of inner radius against log10 genome length, one point per genus
fig, ax = plt.subplots(figsize = (16, 8))
combined_genera.plot.scatter(x = 'Log10 Genome Length', y = 'Inner Radius', s = 50, ax = ax)

# MultiIndex

In [None]:
# Grouping by two columns gives a result whose row index has two levels (Family, Genus)
genus_groups = viperdb.groupby(['Family', 'Genus'])
df = genus_groups[['Inner Radius', 'Outer Radius']].max()
display(df.head())

In [None]:
# A full (family, genus) tuple selects one row; adding a column name selects a single value
genus_key = ('Birnaviridae', 'Aquabirnavirus')

print(df.loc[genus_key])
print('*' * 50)
print(df.loc[genus_key, 'Inner Radius'])

In [None]:
# Indexing with only the first-level key selects the rows of that family
birnaviridae = df.loc['Birnaviridae']
display(birnaviridae)

In [None]:
# The columns can be a MultiIndex too: here each column is a (radius column, aggregate) pair

family_radii = viperdb.groupby('Family')[['Inner Radius', 'Outer Radius']]
df = family_radii.agg(['mean', 'std'])
display(df.head())

In [None]:
# Address a single column by its full (radius column, aggregate) tuple, or a group of columns
# by the first level alone
inner_mean = ('Inner Radius', 'mean')

print(df[inner_mean].head())
print('*' * 50)
print(df.loc['Caliciviridae', inner_mean])
print('*' * 50)
print(df.loc['Caliciviridae', 'Inner Radius'])

In [None]:
# Selecting by the first column level returns the sub-frame of its aggregates
inner_radius_stats = df['Inner Radius']
display(inner_radius_stats.head())