# Groupby and Aggregate Practice Using Phone Data

In [1]:
#---Step 0: Import necessary libraries and render plots inline---

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#---Step 1: Read in CSV file---

phone_data = pd.read_csv("/Users/cheon/phone_data.csv")
phone_data["date"] = pd.to_datetime(phone_data["date"])

phone_data.head()

Unnamed: 0,index,date,duration,item,month,network,network_type
0,0,2014-10-15 06:58:00,34.429,data,2014-11,data,data
1,1,2014-10-15 06:58:00,13.0,call,2014-11,Vodafone,mobile
2,2,2014-10-15 14:46:00,23.0,call,2014-11,Meteor,mobile
3,3,2014-10-15 14:48:00,4.0,call,2014-11,Tesco,mobile
4,4,2014-10-15 17:27:00,4.0,call,2014-11,Tesco,mobile


# Getting a Feel for the Data

In [3]:
# Find number of records
print("Number of records: " + str(phone_data.shape[0]))

# Find longest phone call / data entry
print("Longest phone call / data entry: " + str(phone_data["duration"].max()))

# Find total time (seconds) of phone calls recorded
print("Total phone call time: " + str(round(phone_data.loc[phone_data["item"] == "call", "duration"].sum(), 1)))

# Find number of non-null unique network entries
print("Number of non-null network entries: " + str(phone_data["network"].nunique()))

Number of records: 830
Longest phone call / data entry: 10528.0
Total phone call time: 92321.0
Number of non-null network entries: 9


In [4]:
# Find number of entries per month
phone_data["month"].value_counts()

2014-11    230
2015-01    205
2014-12    157
2015-02    137
2015-03    101
Name: month, dtype: int64

In [5]:
# Find number of entries per month (alternative method)
month_counts = pd.DataFrame(phone_data.groupby("month", as_index=False).size())
month_counts.columns=["count"]

month_counts

Unnamed: 0_level_0,count
month,Unnamed: 1_level_1
2014-11,230
2014-12,157
2015-01,205
2015-02,137
2015-03,101


# Practicing Groupby

In [6]:
# Find keys of groupby object when grouping data by month
phone_data.groupby(["month"]).groups.keys()

dict_keys(['2014-11', '2014-12', '2015-01', '2015-02', '2015-03'])

In [7]:
# Find number of entries associated with a specific group
len(phone_data.groupby(['month']).groups['2014-11'])

230

# Adding Functions to Groupby

Functions like max(), min(), mean(), first(), last() can be quickly applied to the GroupBy object to obtain summary statistics for each group.

In [8]:
# Find first entry for each month
phone_data.groupby(["month"]).first()

Unnamed: 0_level_0,index,date,duration,item,network,network_type
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-11,0,2014-10-15 06:58:00,34.429,data,data,data
2014-12,228,2014-11-13 06:58:00,34.429,data,data,data
2015-01,381,2014-12-13 06:58:00,34.429,data,data,data
2015-02,577,2015-01-13 06:58:00,34.429,data,data,data
2015-03,729,2015-12-02 20:15:00,69.0,call,landline,landline


In [9]:
# Find duration sum per month
phone_data.groupby(["month"])["duration"].sum()

month
2014-11    26639.441
2014-12    14641.870
2015-01    18223.299
2015-02    15522.299
2015-03    22750.441
Name: duration, dtype: float64

In [10]:
phone_data.groupby(["month"])["duration"].std()

month
2014-11     287.548051
2014-12     243.151296
2015-01     216.853383
2015-02     250.778963
2015-03    1076.018287
Name: duration, dtype: float64

In [11]:
# Find number of dates (i.e., entries) per month
phone_data.groupby(["month"])["date"].count()

month
2014-11    230
2014-12    157
2015-01    205
2015-02    137
2015-03    101
Name: date, dtype: int64

In [12]:
# Find number of dates (i.e., entries) per month - can actually use any column name besides "duration"
phone_data.groupby(["month"])["duration"].count()

# Dataframe version
# pd.DataFrame(phone_data.groupby(["month"], as_index=False)["duration"].count())

month
2014-11    230
2014-12    157
2015-01    205
2015-02    137
2015-03    101
Name: duration, dtype: int64

In [13]:
# Find duration sum for calls only to each network
phone_data.loc[phone_data["item"] == "call", :].groupby(["network"])["duration"].sum()

# Dataframe version
# pd.DataFrame(phone_data.loc[phone_data["item"] == "call", :].groupby(["network"], as_index=False)["duration"].sum())

network
Meteor        7200.0
Tesco        13828.0
Three        36464.0
Vodafone     14621.0
landline     18433.0
voicemail     1775.0
Name: duration, dtype: float64

# Grouping by More Than One Variable

In [14]:
# Find number of calls, sms, and data entries (under "item" column) in each month
phone_data.groupby(["month", "item"])["date"].count()

# Dataframe version
# pd.DataFrame(phone_data.groupby(["month", "item"], as_index=False)["date"].count())

month    item
2014-11  call    107
         data     29
         sms      94
2014-12  call     79
         data     30
         sms      48
2015-01  call     88
         data     31
         sms      86
2015-02  call     67
         data     31
         sms      39
2015-03  call     47
         data     29
         sms      25
Name: date, dtype: int64

In [15]:
# Find number of calls, texts, and data sent per month
phone_data.groupby(["month", "network_type"])["date"].count()

month    network_type
2014-11  data             29
         landline          5
         mobile          189
         special           1
         voicemail         6
2014-12  data             30
         landline          7
         mobile          108
         voicemail         8
         world             4
2015-01  data             31
         landline         11
         mobile          160
         voicemail         3
2015-02  data             31
         landline          8
         mobile           90
         special           2
         voicemail         6
2015-03  data             29
         landline         11
         mobile           54
         voicemail         4
         world             3
Name: date, dtype: int64

In [16]:
phone_data.groupby('month')[['duration']].sum()

Unnamed: 0_level_0,duration
month,Unnamed: 1_level_1
2014-11,26639.441
2014-12,14641.87
2015-01,18223.299
2015-02,15522.299
2015-03,22750.441


# Using Agg To Calculate Multiple Statistics per Group

In [17]:
# Find duration sum by month using agg() function
phone_data.groupby('month', as_index=False).agg({"duration": "sum"})

Unnamed: 0,month,duration
0,2014-11,26639.441
1,2014-12,14641.87
2,2015-01,18223.299
3,2015-02,15522.299
4,2015-03,22750.441


In [18]:
# Group by month and item and extract some stats from each group
phone_data.groupby(["month","item"]).agg({"duration" : "sum",
                                          "network_type" : "count",
                                          "date" : "first",
                                         })

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,network_type,date
month,item,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-11,call,25547.0,107,2014-10-15 06:58:00
2014-11,data,998.441,29,2014-10-15 06:58:00
2014-11,sms,94.0,94,2014-10-16 22:18:00
2014-12,call,13561.0,79,2014-11-14 17:24:00
2014-12,data,1032.87,30,2014-11-13 06:58:00
2014-12,sms,48.0,48,2014-11-14 17:28:00
2015-01,call,17070.0,88,2014-12-15 20:03:00
2015-01,data,1067.299,31,2014-12-13 06:58:00
2015-01,sms,86.0,86,2014-12-15 19:56:00
2015-02,call,14416.0,67,2015-01-15 10:36:00


In [19]:
# Define aggregation parameters separately
aggregations = {
    "duration":"sum",
    "date": "first"
}

phone_data.groupby(["month"]).agg(aggregations)

Unnamed: 0_level_0,duration,date
month,Unnamed: 1_level_1,Unnamed: 2_level_1
2014-11,26639.441,2014-10-15 06:58:00
2014-12,14641.87,2014-11-13 06:58:00
2015-01,18223.299,2014-12-13 06:58:00
2015-02,15522.299,2015-01-13 06:58:00
2015-03,22750.441,2015-12-02 20:15:00


In [20]:
# Apply multiple functions to columns in groups
aggregations = {
    "duration" : ["min", "max", "sum"],
    "network_type" : "count",
    "date" : ["min", "first", "nunique"]
}

# Applying above aggregation parameters yields a multi-index row and multi-index column dataframe
phone_data.groupby(["month", "item"]).agg(aggregations)

Unnamed: 0_level_0,Unnamed: 1_level_0,duration,duration,duration,network_type,date,date,date
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,sum,count,min,first,nunique
month,item,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2014-11,call,1.0,1940.0,25547.0,107,2014-01-11 15:13:00,2014-10-15 06:58:00,104
2014-11,data,34.429,34.429,998.441,29,2014-01-11 06:58:00,2014-10-15 06:58:00,29
2014-11,sms,1.0,1.0,94.0,94,2014-03-11 08:40:00,2014-10-16 22:18:00,79
2014-12,call,2.0,2120.0,13561.0,79,2014-02-12 11:40:00,2014-11-14 17:24:00,76
2014-12,data,34.429,34.429,1032.87,30,2014-01-12 06:58:00,2014-11-13 06:58:00,30
2014-12,sms,1.0,1.0,48.0,48,2014-01-12 12:51:00,2014-11-14 17:28:00,41
2015-01,call,2.0,1859.0,17070.0,88,2014-12-15 20:03:00,2014-12-15 20:03:00,84
2015-01,data,34.429,34.429,1067.299,31,2014-12-13 06:58:00,2014-12-13 06:58:00,31
2015-01,sms,1.0,1.0,86.0,86,2014-12-15 19:56:00,2014-12-15 19:56:00,58
2015-02,call,1.0,1863.0,14416.0,67,2015-01-02 13:33:00,2015-01-15 10:36:00,67
