In [None]:
import pandas as pd

# Summarizing and reorganizing DataFrames

Load the state CO<sub>2</sub> data

In [None]:
# Read the state CO2 workbook straight from its GitHub URL
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/co2_data.xlsx'
state_co2 = pd.read_excel(io=url)
# Preview the first five rows
state_co2.head(n=5)

## Grouping and summarizing data

Grouping data allows us to pull out particular group members and summarize data by group.

In [None]:
# Group rows by the State column. A plain column name (rather than a
# one-element list) keeps the group keys scalar, so a state name can be
# passed directly to get_group() on every pandas version.
co2_state_grouped = state_co2.groupby('State')
# Pull out all rows belonging to one state
co2_state_grouped.get_group('Texas')

In [None]:
# Calculate sum of all sectors by state.
# numeric_only=True skips text columns (such as Sector), which pandas 2.0+
# would otherwise "sum" by concatenating the strings.
co2_state_grouped.sum(numeric_only=True)

In [None]:
# Group rows by the Sector column. A plain column name (rather than a
# one-element list) keeps the group keys scalar, so a sector name can be
# passed directly to get_group() on every pandas version.
co2_sector_grouped = state_co2.groupby('Sector')
# Show the first rows of a single sector's group
co2_sector_grouped.get_group('Industrial').head()

In [None]:
# Calculate sum of all states by sector.
# numeric_only=True skips text columns (such as State), which pandas 2.0+
# would otherwise "sum" by concatenating the strings; keeping the result
# purely numeric also lets it be transposed and aggregated afterwards.
total_co2_sector = co2_sector_grouped.sum(numeric_only=True)
total_co2_sector

In [None]:
# Flip rows and columns so each sector becomes a column, then total each column
print(total_co2_sector.transpose())
total_co2_sector.transpose().sum()
# The totals form a one-dimensional result, so pandas returns a Series rather than a DataFrame.

In [None]:
# Same per-column calculation on the transposed table, averaging rather than adding
total_co2_sector.transpose().mean()

## Changing the DataFrame organization

In [None]:
# Look again at the original layout before reorganizing it
state_co2.head(n=5)

In [None]:
# Use Sector and State together as a two-level row index.
# set_index() already returns a new DataFrame and leaves state_co2 untouched,
# so no explicit .copy() is needed first.
double_label = state_co2.set_index(['Sector', 'State'])
double_label.head()

In [None]:
# Add across each row (axis 1 = columns) to get one total per index entry
year_total = double_label.sum(axis=1)
year_total.head(n=12)
# One value per row means a Series comes back, still carrying both index labels

In [None]:
# Move the Sector index level out into columns
column_df = year_total.unstack(level='Sector')
column_df.head(n=5)

## Changing between wide and long formats

Data organized as above is said to be in "wide" format. Wide tables are often easier to edit and review. But many statistical analyses and visualizations require the data to be in "long" (or "tall") format. Putting data in long format is also part of making data "tidy" (see [this page](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html) for more).

Pandas has built-in functions and methods to transform DataFrames between "long" and "wide" forms.

The `pd.melt()` function can be used to transform wide data into long. Data to be used as *grouping variables* (a.k.a. *factors* or "group indicators") must be in columns rather than index labels.

In [None]:
# Turn the State index level back into an ordinary column
wide = column_df.reset_index(level='State')
wide.head(n=5)

The arguments of the `pd.melt()` function are the DataFrame name followed by a list of columns to be used as grouping variables. Remaining columns are data values for the combinations of grouping variables.

In [None]:
# Keep State as the grouping (id) column; every other column is melted into rows
long = pd.melt(wide, id_vars=['State'])
long

To transform a long DataFrame to a wide one, we can use the `.pivot()` method. The `index` and `columns` arguments name the columns whose values become the row and column labels. The `values` argument is the column to be used as the data values.

Any long table can be turned wide in two ways.

In [None]:
# Wide form with one row per state and one column per sector.
# pivot() arguments are passed by keyword: pandas 2.0+ no longer accepts
# them positionally and raises TypeError otherwise.
state_wide = long.pivot(index='State', columns='Sector', values='value')
state_wide.head()

In [None]:
# Wide form the other way round: one row per sector and one column per state.
# pivot() arguments are passed by keyword: pandas 2.0+ no longer accepts
# them positionally and raises TypeError otherwise.
sector_wide = long.pivot(index='Sector', columns='State', values='value')
sector_wide.head()