# Aggregating DataFrames

### Import library and dataset

In [2]:
# Import library
import os

import pandas as pd

# Import data
# The CSV location can be overridden with the WALMART_SALES_CSV environment
# variable so the notebook also runs on machines where the author's home
# directory does not exist. The original local path is kept as the default,
# so behavior is unchanged when the variable is not set.
DATA_PATH = os.environ.get(
    'WALMART_SALES_CSV',
    '/home/gustavo/Documents/Study/007_skills/000_data/walmart_sales.csv',
)
dataset = pd.read_csv(DATA_PATH)

## Summary statistics

Common pandas methods for summarizing numerical data:
* .mean()
* .median()
* .mode()
* .min()
* .max()
* .var()
* .std()
* .sum()
* .quantile()

### mean, median and mode

In [11]:
# Arithmetic mean of the weekly_sales column, displayed with two decimals
mws = dataset.loc[:, "weekly_sales"].mean()
print(round(mws, ndigits=2))

23843.95


In [14]:
# Median (middle value) of the weekly_sales column, displayed with two decimals
mews = dataset.loc[:, "weekly_sales"].median()
print(round(mews, ndigits=2))

12049.06


In [17]:
# Most frequent value(s) of the weekly_sales column.
# .mode() returns a Series rather than a scalar, so round it with Series.round.
mows = dataset.loc[:, "weekly_sales"].mode()
print(mows.round(2))

0    12.0
Name: weekly_sales, dtype: float64


### minimum and maximum

In [18]:
# Largest value in the weekly_sales column
maxws = dataset.loc[:, "weekly_sales"].max()
print(maxws)

293966.05


In [20]:
# Smallest value in the weekly_sales column
minws = dataset.loc[:, "weekly_sales"].min()
print(minws)

-1098.0


### .var() - .std() - .sum() - .quantile()

In [26]:
# Sample variance of weekly_sales (ddof=1 is the pandas default, spelled out here)
vws = dataset.loc[:, "weekly_sales"].var(ddof=1)
print(round(vws, ndigits=2))

913271824.09


In [28]:
# Sample standard deviation of weekly_sales (ddof=1 is the pandas default, spelled out here)
sws = dataset.loc[:, "weekly_sales"].std(ddof=1)
print(round(sws, ndigits=2))

30220.39


In [31]:
# Total of all weekly_sales values, displayed with two decimals
sumws = dataset.loc[:, "weekly_sales"].sum()
print(round(sumws, ndigits=2))

256894718.9


In [32]:
# Quantile of weekly_sales. With no argument, .quantile() uses q=0.5,
# i.e. the median; the default is written out to make that visible.
qws = dataset.loc[:, "weekly_sales"].quantile(q=0.5)
print(round(qws, ndigits=2))

12049.06


## Summarizing dates

In [33]:
# Max date - most recent date
# read_csv was called without parse_dates, so "date" is presumably held as
# text. A max over strings compares them lexicographically, which is only
# correct for zero-padded ISO dates, so parse to real datetimes first.
# .date() drops the time component so the printed value is a plain date.
print(pd.to_datetime(dataset["date"]).max().date())

2012-10-26


In [34]:
# Min date - oldest (earliest) date
# read_csv was called without parse_dates, so "date" is presumably held as
# text. A min over strings compares them lexicographically, which is only
# correct for zero-padded ISO dates, so parse to real datetimes first.
# .date() drops the time component so the printed value is a plain date.
print(pd.to_datetime(dataset["date"]).min().date())

2010-02-05


## Efficient summaries

In [41]:
# Custom aggregation: the inter-quartile range (IQR) measures spread as the
# distance between the 75th and the 25th percentile. It is an alternative to
# the standard deviation that is helpful when the data contains outliers.
def iqr(column):
    """Return the inter-quartile range of ``column``.

    Parameters
    ----------
    column : object exposing ``quantile(q)`` (e.g. a pandas Series)
        Values to summarize.

    Returns
    -------
    The 0.75 quantile of ``column`` minus its 0.25 quantile.
    """
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    return third_quartile - first_quartile


# IQR of the temperature_c column, computed by passing the custom function to .agg()
iqr_1 = dataset.loc[:, "temperature_c"].agg(iqr)
print(round(iqr_1, ndigits=2))

16.58


In [44]:
# IQR of temperature_c, fuel_price_usd_per_l and unemployment in one call:
# .agg() applies iqr to each selected column and returns one value per column.
iqr_2 = dataset.loc[
    :, ["temperature_c", "fuel_price_usd_per_l", "unemployment"]
].agg(iqr)
print(iqr_2.round(2))

temperature_c           16.58
fuel_price_usd_per_l     0.07
unemployment             0.56
dtype: float64


In [46]:
# IQR and median of temperature_c, fuel_price_usd_per_l and unemployment:
# passing a list to .agg() yields one row per aggregation and one column per variable.
iqr_3 = dataset.loc[
    :, ["temperature_c", "fuel_price_usd_per_l", "unemployment"]
].agg([iqr, "median"])
print(iqr_3.round(2))

        temperature_c  fuel_price_usd_per_l  unemployment
iqr             16.58                  0.07          0.56
median          16.97                  0.74          8.10


## Cumulative statistics
* .cumsum()
* .cummax()