In [None]:
import pandas as pd

# Basic DataFrame manipulation

Data source information is [here](https://github.com/HeardLibrary/digital-scholarship/tree/master/data/codegraf)

Load Excel spreadsheet into DataFrame

In [None]:
# Download the Excel workbook at this URL and load it into a DataFrame.
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/co2_state_2016_sector.xlsx'
state_co2_sector = pd.read_excel(url)

Examine contents of DataFrame

In [None]:
# Show the first few rows to check what was loaded.
state_co2_sector.head()

In [None]:
# Show the last few rows of the table.
state_co2_sector.tail()

## Setting the row index

The `.set_index()` method changes one of the columns into the row index. 

The `.reset_index()` method changes a row index into a regular column.

In [None]:
# Set the State column as the index.
# The result is only displayed here: it is neither assigned to a name nor applied in place.
state_co2_sector.set_index('State')

In [None]:
# What happened to the index ???
# The earlier set_index() call was not assigned and did not use inplace=True,
# so the source DataFrame still has its original index.
state_co2_sector.tail()

In [None]:
# set_index() returns a new DataFrame; the source DataFrame keeps its original index.
new_df = state_co2_sector.set_index('State')
# Print both tables, separated by a blank line, to compare their indexes.
print(new_df.head(), state_co2_sector.head(), sep='\n\n')

Use the `inplace=True` argument to modify the source DataFrame directly (no assignment needed)

In [None]:
# inplace=True changes state_co2_sector itself, so no assignment is needed.
state_co2_sector.set_index('State', inplace=True)
state_co2_sector.head()

## Removing rows and columns

`.drop()` defaults to rows

In [None]:
# Look at the last rows before dropping anything.
state_co2_sector.tail()

In [None]:
# Drop the row labeled 'Total'; without inplace=True the source table is not changed.
state_co2_sector.drop('Total').tail()

In [None]:
# .drop() also accepts a list of row labels to remove several rows at once
state_co2_sector.drop(['Virginia', 'West Virginia', 'Wyoming']).tail()

In [None]:
# Use inplace argument to change the source table
# (after this runs, the 'Total' row is gone from state_co2_sector itself)
state_co2_sector.drop('Total', inplace=True)
state_co2_sector.tail()

Use `axis` argument to drop columns

In [None]:
# axis='columns' makes .drop() look for 'Total' among the column labels instead of the row labels.
state_co2_sector.drop('Total', axis='columns').head()

## Dealing with missing data



In [None]:
# Load the schools CSV file at this URL into a DataFrame and preview it.
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/gis/wg/Metro_Nashville_Schools.csv'
schools = pd.read_csv(url)
schools.head()

In some cases, cells were empty because the group wasn't represented (i.e. there were zero students). In that case, those `NaN` values should be zeros.

The first argument of the `.fillna()` method can be a single value if it applies to the entire table, or a dictionary if it applies only to certain columns.

In [None]:
# The dictionary maps a column name to its fill value, so only that column's NaN values become 0.
schools.fillna({'Native Hawaiian or Other Pacific Islander': 0}, inplace=True)
schools.head()

In other cases, cells were empty because that column didn't apply to that kind of school (e.g. high schools don't have PreK students). The `.dropna()` method can be used to skip rows with any `NaN` values, but that won't work if you only care about certain columns. In that case, we can filter rows using the `.notnull()` method. The `.isnull()` method can be used to select only rows that have `NaN` values for a column.

In [None]:
# Keep only the rows where the 'Grade PreK 3yrs' column has a value (is not NaN).
schools[schools['Grade PreK 3yrs'].notnull()]

## Sorting rows

Load state CO2 emissions by fuel spreadsheet

In [None]:
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/co2_state_2016_fuel.xlsx'
# Read the workbook and make the State column the row index in one chained step
state_co2_fuel = pd.read_excel(url).set_index('State')
state_co2_fuel.tail()

In [None]:
# Remove the total row (in place) so it is not sorted or sliced along with the states
state_co2_fuel.drop('Total', inplace=True)
state_co2_fuel.tail()

In [None]:
# Sort ascending (the default order) by the 'Total mmt' column; the source table is not changed
state_co2_fuel.sort_values(by='Total mmt').head()

In [None]:
# Sort descending, do inplace to modify source table
# (later cells that take the "top" rows rely on this ordering)
state_co2_fuel.sort_values(by='Total mmt', ascending=False, inplace=True)
state_co2_fuel.head()

## Slicing columns and rows

To slice using labels, we need to use the `.loc[]` indexer (note the square brackets: it is not a method call). To slice columns, we need to specify both indices, with "all rows" (`:`) selected as the first index.

Recall that slicing with labels is inclusive of last label selected.

In [None]:
# Create a slice with only the fractions:
# all rows (:), and the columns from 'Coal fraction' through 'Natural Gas fraction' inclusive
state_co2_fuel_fractions = state_co2_fuel.loc[:, 'Coal fraction': 'Natural Gas fraction']
state_co2_fuel_fractions.head()

To slice rows, only the first index needs to be specified. For integer positions, use the `.iloc[]` indexer.

In [None]:
# Create a slice with only the top four states
# (the table was sorted by 'Total mmt' in descending order in an earlier cell)
top_state_co2_fuel = state_co2_fuel.iloc[:4]
# Note that included rows are 0, 1, 2, and 3 (but not 4).
top_state_co2_fuel

Combine both slicing operations at once.

In [None]:
# Pick the fraction columns by label, then keep the first four rows by position
top_state_co2_fuel_fraction = state_co2_fuel.loc[:, 'Coal fraction':'Natural Gas fraction'].iloc[:4]
top_state_co2_fuel_fraction

# Selecting data

Units are million metric tons

In [None]:
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/co2_data.xlsx'
# Load the workbook into a DataFrame and show its first 15 rows.
state_co2 = pd.read_excel(url)
state_co2.head(15)

Performing a boolean operation on a column generates a series of booleans whose index matches the DataFrame rows

In [None]:
# Compare every value in the State column to 'Alabama'; the result has one boolean per row.
state_co2.State=='Alabama'

The boolean series can be used to filter a subset of rows in the DataFrame.

Notice that the indices for the rows carry through in the selection.

In [None]:
# Use the boolean series as a filter: only rows where State is 'Alaska' are kept.
state_co2[state_co2.State=='Alaska']

In [None]:
# Same kind of filter, using bracket notation for the column; show the first matching rows.
state_co2[state_co2['Sector']=='Industrial'].head()

You can assign the selection to a variable (but remember that the original row indices are maintained).

In [None]:
# Keep the rows whose Sector is 'Industrial', selected through .loc with a boolean mask
state_co2_industrial = state_co2.loc[state_co2['Sector'] == 'Industrial']
state_co2_industrial.head()

# Rearranging data

## Transposing a data frame

Use the `.transpose()` or short form `.T` to switch rows and columns.

In [None]:
# Display the table before transposing, for comparison.
top_state_co2_fuel_fraction

In [None]:
# .T swaps rows and columns; the result is only displayed, not stored.
top_state_co2_fuel_fraction.T

## Grouping data

Grouping data allows us to pull out particular group members and summarize data by group.

In [None]:
# Reminder of the table layout before grouping.
state_co2.head()

In [None]:
# Group the rows by state. The key is passed as a plain column name (not a
# one-element list) so that each group is identified by the plain state name;
# recent pandas versions expect a tuple key when the grouper is a list.
co2_state_grouped = state_co2.groupby('State')
# Pull out all rows that belong to one group.
co2_state_grouped.get_group('Texas')

In [None]:
# Calculate sum of all sectors by state.
# numeric_only=True leaves text columns (such as Sector) out of the sum; since
# pandas 2.0 they would otherwise be concatenated as strings.
co2_state_grouped.sum(numeric_only=True)

In [None]:
# Group the rows by sector. The key is passed as a plain column name (not a
# one-element list) so that each group is identified by the plain sector name;
# recent pandas versions expect a tuple key when the grouper is a list.
co2_sector_grouped = state_co2.groupby('Sector')
# Pull out one group and show its first rows.
co2_sector_grouped.get_group('Industrial').head()

In [None]:
# Calculate sum of all states by sector.
# numeric_only=True leaves text columns (such as State) out of the sum; since
# pandas 2.0 they would otherwise be concatenated as strings, which would
# break numeric calculations on this table.
total_co2_sector = co2_sector_grouped.sum(numeric_only=True)
total_co2_sector

In [None]:
# Transpose, then sum
# After transposing, the sectors are the columns, so .sum() adds up each sector's column.
print(total_co2_sector.T)
total_co2_sector.T.sum()
# Notice that since the sum is one-dimensional, the output is a series, not a DataFrame.

In [None]:
# Calculate average instead of sum (one value per sector, as in the previous cell)
total_co2_sector.T.mean()

## Changing the DataFrame organization

In [None]:
# Reminder of the table layout before changing its organization.
state_co2.head()

In [None]:
# Build a two-level row index (Sector, then State); set_index() returns a new
# DataFrame, so state_co2 itself is left untouched.
double_label = state_co2.set_index(['Sector', 'State'])
double_label.head()

In [None]:
# Add up the values across each row (axis 1 is the column axis).
# Each row collapses to one number, so the result is a series carrying both index labels.
year_total = double_label.sum(axis=1)
year_total.head(12)

In [None]:
# Move the Sector index level out of the rows and into the columns
column_df = year_total.unstack(level='Sector')
column_df.head()

## Changing between wide and long formats

Data organized as above is said to be in "wide" format. Wide tables are often easier to edit and review. But many statistical analyses and visualizations require the data to be in "long" (or "tall") format. Data in long format are also part of making data "tidy" (see [this page](https://cran.r-project.org/web/packages/tidyr/vignettes/tidy-data.html) for more). 

Pandas has built-in functions and methods to transform DataFrames between "long" and "wide" forms.

The `pd.melt()` function can be used to transform wide data into long. Data to be used as *grouping variables* (a.k.a. *factors* or "group indicators") must be in columns rather than index labels.

In [None]:
# Turn the State index level back into an ordinary column.
wide = column_df.reset_index(level='State')
wide.head()

The arguments of the `pd.melt()` function are the DataFrame name followed by a list of columns to be used as grouping variables. Remaining columns are data values for the combinations of grouping variables.

In [None]:
# State is the grouping variable; every other column is melted into variable/value pairs
long = pd.melt(wide, id_vars=['State'])
long

To transform a long DataFrame to a wide one, we can use the `.pivot()` method. Its `index` and `columns` arguments name the columns to be used as the row and column indices. Its `values` argument names the column to be used as the data values.

Any long table can be turned wide in two ways.

In [None]:
# One row per state, one column per sector.
# Arguments are given by keyword because pandas 2.0 and later no longer accept
# them positionally.
state_wide = long.pivot(index='State', columns='Sector', values='value')
state_wide.head()

In [None]:
# One row per sector, one column per state.
# Arguments are given by keyword because pandas 2.0 and later no longer accept
# them positionally.
sector_wide = long.pivot(index='Sector', columns='State', values='value')
sector_wide.head()

# Joins

Read in the two files that can be joined by the `State` key columns.

In [None]:
# Reload the sector workbook so that State is a regular column again (needed as the join key).
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/co2_state_2016_sector.xlsx'
state_co2_sector = pd.read_excel(url)
state_co2_sector.head()

In [None]:
# Load the population CSV file at this URL into a DataFrame and preview it.
url = 'https://github.com/HeardLibrary/digital-scholarship/raw/master/data/codegraf/census_by_state_2000.csv'
state_population = pd.read_csv(url)
state_population.head()

Notice that the population DataFrame has leading spaces in its `State` column. This needs to be fixed or the strings will not match when we do the join.

In [None]:
# Show the raw string stored for the first state in the CO2 table.
state_co2_sector.State[0]

In [None]:
# Show the raw string stored for the first state in the population table, for comparison.
state_population.State[0]

In [None]:
# The .strip() method removes leading and trailing whitespace
# (.str applies it to every value in the column)
state_population['State'] = state_population['State'].str.strip()
state_population.State[0]

Notice that the `Population` column values in the population DataFrame are strings, not numbers. That's because they contain commas.

In [None]:
# Show the first Population value as stored.
state_population.Population[0]

In [None]:
# Get rid of the commas by replacing them with nothing. Then change the type from string to integer.
# regex=False makes the replacement a literal one regardless of the pandas
# version (the default for this argument changed in pandas 2.0).
state_population['Population'] = state_population['Population'].str.replace(',', '', regex=False).astype(int)
state_population.Population[0]

Notice the difference between the outer and inner joins at the bottom of the table (Total row).

In [None]:
# Outer join on State (keeps rows found in either table), then use State as the row index
state_data_outer = state_co2_sector.merge(state_population, on=['State'], how='outer').set_index('State')
state_data_outer.tail()

In [None]:
# Inner join on State (keeps only rows found in both tables), then use State as the row index
state_data = state_co2_sector.merge(state_population, on=['State'], how='inner').set_index('State')
state_data.tail()

It is now easy to carry out calculations involving data from the two original tables. We can also save the merged table as a spreadsheet if we want.

In [None]:
# Calculate the per capita metric tons of residential-sector CO2 emitted for each state
# (assumes the Residential column is in million metric tons, hence the factor of 1000000 - TODO confirm)
# NOTE(review): the population file name refers to the 2000 census while the
# emissions file name refers to 2016 - confirm this pairing is intended.
state_data.Residential * 1000000 / state_data.Population

In [None]:
# Write data to an Excel file in the current working directory
# (State is the row index, so it is written out as well)
state_data.to_excel('state_data.xlsx')