# Review Session

This is an optional assignment that will be included in your average if it improves your grade.

In [3]:
# DO NOT MODIFY THIS CELL
# Core imports needed for grading
import matplotcheck.notebook as nb
from matplotcheck.base import PlotTester
import matplotcheck.autograde as ag
import matplotlib
import numpy as np

## Import packages

Import the packages you will need to:
* build cross-platform paths
* find files using a pattern
* download data using earthpy
* create plots
* work with numpy arrays
* work with pandas DataFrames
* use the seaborn plot options (use alias sns)

**BONUS: set the plot theme to the seaborn default**

In [6]:
import os
from glob import glob

import earthpy as et
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [7]:
# Test package imports - DO NOT MODIFY THIS CELL!
import_answer_points = 0

try:
    os.getcwd()
    print("\u2705 Great work! The os module has imported correctly!")
    import_answer_points += 1
except NameError:
    print("\u274C Oops make sure that the os package is imported.")
    
try:
    files = glob('~')
    print("\u2705 Great work! The glob module has imported correctly!")
    import_answer_points += 1
except NameError:
    print("\u274C Oops make sure that the glob package is imported.")

try:
    data = et.io
    print("\u2705 Great work! The earthpy package has imported correctly!")
    import_answer_points += 1
except NameError:
    print(("\u274C Oops make sure that the earthpy package is imported "
           "using the alias et."))

try:
    plt.show()
    print("\u2705 Nice! matplotlib.pyplot has been imported as plt!")
    import_answer_points += 1
except NameError:
    print(("matplotlib.pyplot has not been imported as plt, "
           "please make sure to import is properly."))
    
try:
    np.nan
    print("\u2705 Score! Numpy has been imported as a np!")
    import_answer_points += 1
except NameError:
    print(("\u274C Numpy has not been imported as a np, "
           "please make sure to import is properly."))

try:
    no_data = pd.NA
    print("\u2705 Score! Pandas has been imported as a pd!")
    import_answer_points += 1
except NameError:
    print(("\u274C Pandas has not been imported as a pd, "
           "please make sure to import is properly."))

try:
    sns.set_theme()
    print("\u2705 Score! Seaborn has been imported as sns!")
    import_answer_points += 1
except NameError:
    print(("\u274C Seaborn has not been imported as sns, "
           "please make sure to import is properly."))
    
print("\n \u27A1 You received {} out of 6 points.".format(
    import_answer_points))

import_answer_points

✅ Great work! The os module has imported correctly!
✅ Great work! The glob module has imported correctly!
✅ Great work! The earthpy package has imported correctly!
✅ Nice! matplotlib.pyplot has been imported as plt!
✅ Score! Numpy has been imported as a np!
✅ Score! Pandas has been imported as a pd!
✅ Score! Seaborn has been imported as sns!

 ➡ You received 7 out of 6 points.


7

## Set up data directory and path and download ca-fires-yearly dataset

We are using the ca-fires-yearly data for this set of exercises. Below, get set up to use this dataset by:
* Creating the earth-analtics/data directory if it does not exist
* Setting the data directory as the working directory
* Downloading the ca-fires-yearly data

Download url: https://ndownloader.figshare.com/files/25033508

In [11]:
eadata_path = os.path.join(et.io.HOME,
                          'earth-analytics',
                           'data')
#eadata_path

#Creating the earth-analtics/data directory if it does not exist
if not os.path.exists(eadata_path):
    print('{} does not exist - creating'.format(eadata_path))
    os.makedirs(eadata_path)
else:
    print('{} exists'.format(eadata_path))
    
#set as working directory
os.chdir(eadata_path)

#Downloading the ca-fires-yearly data
ca_fires_url = 'https://ndownloader.figshare.com/files/25033508'
et.data.get_data(url=ca_fires_url)


/Users/leahmanak/earth-analytics/data exists


'/Users/leahmanak/earth-analytics/data/earthpy-downloads/ca-fires-yearly'

In [12]:
# DO NOT MODIFY THIS CELL

# Tests that the working directory is set to earth-analytics/data
# And that the data download directory exists

path = os.path.normpath(os.getcwd())
student_wd_parts = path.split(os.sep)

wd_points = 0

if student_wd_parts[-2:] == ['earth-analytics', 'data']:
    print(("\u2705 Great - it looks like your working directory is set "
           "correctly to .../earth-analytics/data"))
    wd_points += 3
else:
    print(("\u274C Oops, the autograder will not run unless your working "
           "directory is set to earth-analytics/data"))

# Tests that California Fires Dataset is downloaded
ca_fires_yearly_path = os.path.join(
    "earthpy-downloads", 
    "ca-fires-yearly")

if os.path.exists(ca_fires_yearly_path):
    print(("\u2705 Great - it looks like you successfully downloaded the "
           "ca-fires-yearly dataset to .../earthpy-downloads/ca-fires-yearly"))
    wd_points += 2
else:
    print(("\u274C Oops, you still need to download the ca-fires-yearly data"))
    
print(("\n \u27A1 You received {} out of 5 points for setting your working "
       "directory.").format(wd_points))

wd_points

✅ Great - it looks like your working directory is set correctly to .../earth-analytics/data
✅ Great - it looks like you successfully downloaded the ca-fires-yearly dataset to .../earthpy-downloads/ca-fires-yearly

 ➡ You received 5 out of 5 points for setting your working directory.


5

In [None]:
# YOUR CODE HERE
raise NotImplementedError()

## Challenge 1: Calculate the number of fires between 1992-2015 greater than 100 acres

Simple loop: go through files in the 1992-2015-gt-100-acres directory in your CA yearly fires directory and calculate the number of total fires by adding up the number of fires (rows) in each file

If you are going to calculate a value in a loop - remember to set up a variable and initialize it.

**Call your variable with the number of total fires at the end of the cell**

In [90]:
# define a data path
fire_92_15_path = os.path.join(
    'earthpy-downloads',
    'ca-fires-yearly',
    '1992-2015-gt-100-acres')
print(os.path.exists(fire_92_15_path))
print(os.listdir('.'))

# get a list of files
fire_92_15_files = glob(os.path.join(fire_92_15_path, '*.csv'))
#print(fire_92_15_files)

# set up a variale and initialize it
total_92_15_fires = 0
 
# go through files in directory 
for file in fire_92_15_files: 
  # add total number of fires 
    df = pd.read_csv(file)
    total_92_15_fires += len(df)
    
#call your varible with the num. of total fires
total_92_15_fires

True
['.DS_Store', 'earthpy-downloads', 'spatial-vector-lidar', 'colorado-flood']


4101

In [91]:
# DO NOT MODIFY THIS CELL
# Testing to see if the number of fires is an integer

student_num_fires = _

if isinstance(student_num_fires, int):
    print("\u2705 Result is an integer, good job!")
else:
    print("\u274C Result should be an integer.")

✅ Result is an integer, good job!


In [None]:
# DO NOT MODIFY THIS CELL

## Challenge 2:  Concatenate annual files into single dataframe

Process a series of dataframes and extract the year value from the file name, storing as an **integer column** in the dataframe. **Set the DataFrame Index as the fire unique id (fd_unq_id), and sort by the index.**

In [215]:
# process Data Frames
fire_years_df = []
year = []

# extract the year from the file name/ store as an integer column
for i in glob(os.path.join(fire_92_15_path, "*.csv")):
    df = pd.read_csv(i, index_col=None, header=0)
    df['year']= i[-8:-4]
    df['year'] = df['year'].astype(int)
    fire_years_df.append(df)
    ca_fire_df = pd.concat(fire_years_df, axis=0)
    


# set the DataFrame Index as the fire unique id (fd_unq_id)   
ca_fire_df.set_index('fd_unq_id', inplace=True)

# sort by index
ca_fire_df.sort_index()


#ca_fire_df.info()



Unnamed: 0_level_0,source_reporting_unit_name,fire_name,month,month_num,cause,fire_size,fire_size_class,state,county,year
fd_unq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
17,Eldorado National Forest,POWER,October,10,Equipment Use,16823.0,G,CA,Amador,2004
18,Eldorado National Forest,FREDS,October,10,Equipment Use,7700.0,G,CA,El Dorado,2004
556,Sequoia National Forest,NINE,July,7,Miscellaneous,1149.0,F,CA,,2005
1514,Cleveland National Forest,BARRETT,August,8,Campfire,124.0,D,CA,,2005
2740,Mendocino National Forest,CLOVER,September,9,Miscellaneous,775.0,E,CA,,2005
...,...,...,...,...,...,...,...,...,...,...
300345098,Santa Clara Unit,EAGLE,August,8,Miscellaneous,157.0,D,CA,,2013
300345499,CDF - Merced-Mariposa Unit,RED,June,6,Debris Burning,134.0,D,CA,,2015
300346248,Lassen-Modoc Unit,POPCORN,June,6,Lightning,3000.0,F,CA,,2008
300347461,CDF - Merced-Mariposa Unit,VALLEY,June,6,Equipment Use,480.0,E,CA,,2014


In [216]:
# DO NOT MODIFY THIS CELL
# Testing to make sure the result is a dataframe
# With the fire id as the index
# And an integer year column

student_dataframe_all_fires = _


if isinstance(student_dataframe_all_fires, pd.DataFrame):
    print("\u2705 Object created is a dataframe, good job!")
else:
    print("\u274C Object created is not a dataframe.")

if student_dataframe_all_fires.index.name == 'fd_unq_id':
    print("\u2705 Successfully read in the fire id column as the index!")
else:
    print("\u274C The index is not set to the fire id column.")
    
if pd.api.types.is_integer_dtype(student_dataframe_all_fires.year):
    print("\u2705 Successfully created an integer year column!")
else:
    print("\u274C The year is not an integer column")

✅ Object created is a dataframe, good job!
✅ Successfully read in the fire id column as the index!
✅ Successfully created an integer year column!


In [217]:
# DO NOT MODIFY THIS CELL

In the cell below, answer the following in a **numbered list**:
1. What are the advantages of using the unique id as an index instead of the year as you did in previous assignments?
2. What are the advantages of using sorted year as the index?

1. Using a unique I.D as the index might be more advantageous vs. using the year in that there are multiple fires that occured each year, and by doing it this way we are creating this dataframe to be more versitile and specific in respect to each unique fire. 
2. By using the sorted year as the index, it is easier to create time-sensitive figures. This could be useful if you are trying to see your data in respect to the year that the fires occured. 

## Challenge 3: Group and aggregate

Use the pandas DataFrame method `.groupby` to take the **yearly maximum** and **monthly mean** fire size, and then call both dataframes, e.g.:

max_fire_size_yearly_df, mean_fire_size_monthly_df

**Use the month_num column to take the monthly mean so that the DataFrame will be in the right order**

In [218]:
# max fire size
max_fire_size_yearly_df = ca_fire_df.groupby(
    ["year"])[["fire_size"]].max()

# mean fire size
mean_fire_size_monthly_df = ca_fire_df.groupby(
    ["month_num"])[["fire_size"]].mean()

# print out dataframes
max_fire_size_yearly_df, mean_fire_size_monthly_df

(      fire_size
 year           
 1992    64000.0
 1993    43201.0
 1994    48851.0
 1995    23455.0
 1996   106668.0
 1997    49490.0
 1998    28164.0
 1999   124898.0
 2000    75000.0
 2001    67792.0
 2002   150696.0
 2003   280059.0
 2004    39138.0
 2005    63436.0
 2006   162702.0
 2007   240207.0
 2008   162818.0
 2009   160371.0
 2010    16442.0
 2011    25577.0
 2012   315578.8
 2013   255858.0
 2014   115279.2
 2015   151623.0,
              fire_size
 month_num             
 1          1099.714286
 2          2163.863636
 3           705.638889
 4          1196.712247
 5          1247.652663
 6          2274.558100
 7          3022.692924
 8          3717.205347
 9          2816.627351
 10         6879.094182
 11         1130.758537
 12         1941.062500)

In [219]:
# DO NOT MODIFY THIS CELL

# Tests if the results are DataFrames
# Makes sure that the yearly and monthly results are the right length

student_max_yearly, student_mean_monthly = _

if isinstance(student_max_yearly, pd.DataFrame):
    print("\u2705 First object created is a dataframe, good job!")
else:
    print("\u274C First object created is not a dataframe.")
    

if len(student_max_yearly)==24:
    print("\u2705 First object created is a yearly summary, good job!")
else:
    print("\u274C First object created is not a yearly summary.") 
    
if isinstance(student_mean_monthly, pd.DataFrame):
    print("\u2705 Second object created is a dataframe, good job!")
else:
    print("\u274C Second object created is not a dataframe.")
    
if len(student_mean_monthly)==12:
    print("\u2705 Second object created is a monthly summary, good job!")
else:
    print("\u274C Second object created is not a monthly summary.") 

✅ First object created is a dataframe, good job!
✅ First object created is a yearly summary, good job!
✅ Second object created is a dataframe, good job!
✅ Second object created is a monthly summary, good job!


In [220]:
# DO NOT MODIFY THIS CELL

## Challenge 4: Add human ignition flag and season fields

Add two columns to your dataframe:
* A boolean `human_ignition` column that is true if the cause is one of Arson, Smoking, Equipment Use, Campfire, Powerline, Railroad, or Debris Burning
* An ordered Categorical `season` column using MAM/JJA/SON/DJF seasons call `pd.Categorical()` on the resulting column. Spring should be the first season

BONUS: Try to create the `season` column by writing your own `month_to_season` function and applying it to the month or month_num column AND/OR create a DateTime column and use the dt.season attribute.

**Call your modified dataframe at the end of the cell**

In [301]:

human_caused = ["Arson", "Smoking", "Equipment Use", "Campfire", 
                "Powerline","Railroad"]


# Create a new column human_ignition and create a condition whether human-caused or not
ca_fire_df['human_ignition'] = (ca_fire_df['cause'].isin(human_caused))

# create season variable
season = ["Spring", "Summer", "Fall", "Winter"]

# create categorical season column by writing month_to_season function
def month_to_season(month):
    season_conversion = {
            'March': 'Spring',
            'April': 'Spring',
            'May': 'Spring',
            'June': 'Summer',
            'July': 'Summer',
            'August': 'Summer',
            'September': 'Fall',
            'October': 'Fall',
            'November': 'Fall',
            'December': 'Winter',
            'January': 'Winter',
            'February': 'Winter'                
    }
    return season_conversion.get(month)

#apply month_to_season to month 
#ca_fire_df['month'].apply(month_to_season)


ca_fire_df['season'] = pd.Categorical(
    ca_fire_df['month'].apply(month_to_season), categories=season,
                                     ordered=True)


ca_fire_df


Unnamed: 0_level_0,source_reporting_unit_name,fire_name,month,month_num,cause,fire_size,fire_size_class,state,county,year,human_ignition,season
fd_unq_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
15000792,CDF - Riverside Unit,RIVER,February,2,Missing/Undefined,122.0,D,CA,Riverside,2004,False,Winter
15001254,San Bernardino National Forest,UPPER SANTA ANA BURN ESCAPE,March,3,Missing/Undefined,350.0,E,CA,San Bernardino,2004,False,Spring
317618,Bakersfield District,FORT,April,4,Arson,150.0,D,CA,Inyo,2004,True,Spring
1311320,CDF - Merced-Mariposa Unit,RIMS FIRE,April,4,Miscellaneous,200.0,D,CA,,2004,False,Spring
1311367,CDF - Merced-Mariposa Unit,210,April,4,Equipment Use,115.0,D,CA,,2004,True,Spring
...,...,...,...,...,...,...,...,...,...,...,...,...
1300492,Dunnigan Fire Protection District,SIX,October,10,Missing/Undefined,1235.0,F,CA,Yolo,2009,False,Fall
300341742,San Mateo-Santa Cruz Unit,LOMA IC,October,10,Missing/Undefined,490.5,E,CA,,2009,False,Fall
1332172,Humboldt-Del Norte Unit,IAQUA,October,10,Equipment Use,160.0,D,CA,,2009,True,Fall
300331819,Humboldt-Del Norte Unit,CCSO,October,10,Missing/Undefined,250.0,D,CA,,2009,False,Fall


In [302]:
# DO NOT MODIFY THIS CELL

# Tests if the result is a DataFrame with human_ignition and season columns

student_add_columns_df = _

if isinstance(student_add_columns_df, pd.DataFrame):
    print("\u2705 Object is a dataframe, good job!")
else:
    print("\u274C Object is not a dataframe.")
    
if 'human_ignition' in student_add_columns_df.columns:
    print("\u2705 Dataframe has a human_ignition column, good job!")
else:
    print("\u274C Dataframe is missing a human_ignition column.")
    
if 'season' in student_add_columns_df.columns:
    print("\u2705 Dataframe has a season column, good job!")
else:
    print("\u274C Dataframe is missing a season column.")

✅ Object is a dataframe, good job!
✅ Dataframe has a human_ignition column, good job!
✅ Dataframe has a season column, good job!


In [None]:
# DO NOT MODIFY THIS CELL


## Challenge 5: Making a copy of a dataframe versus a slice

If you want to make changes to the structure of your dataframe and you are only working with a slice, Pandas will give you a warning message. If this is what you truly want to do, you can use the copy command to make a new copy of the subset of rows/columns you are working with. Here we just want to work on a the subset of the fires that are human-caused. Because we created a copy, we can now link in other table data and manipulate it. You can see that this copy is smaller, but it is also a completely new copy of the data. Any changes to human fires will not affect the original dataframe.

Try this out by writing code to do the following:
* Make a copy of just the rows flagged as human_ignition
* Convert fire_name to title case
* Call the first 5 rows of the fire_name column for both the original and the copy and note the difference

In [None]:
# YOUR CODE HERE
raise NotImplementedError()

## Challenge 6: looping through nested directories

Loop through nested directories to create a data catalog of the "ca-fires-yearly" dataset with the following specifications:
* The data catalog should be a DataFrame with the following columns:
  * The `dataset` column should contain the name of the directory you are searching, such as "monthly-mean-size"
  * The `file_name` column should contain strings of the basename only of each file in alphabetical (and chronological) order
  * The `year` column should be an integer column of the year extracted from the file name
  * The `file_rows` column should contain the number of rows in the corresponding file
  * The `file_columns` columns should contain the number of columns in the corresponding file
  
HINT: There are a couple of ways to create DataFrames from scratch. [Check out the pandas documentation for examples.](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)

**Call your data catalog dataframe at the end of the cell

In [222]:
def create_dataset_catalog(ds_path):
   
    # get a list of directories
    dataset_dirs = glob(os.path.join(ds_path, '*'))
    # initialize variables 
    name = []
    file_name = []
    year = []
    file_rows = []
    file_columns = []

    #loop through dataset directories
    for dataset in dataset_dirs: 
        #Get files 
        dataset_files = glob(os.path.join(dataset, '*.csv'))
        #Loop through files in alphaetical and chronological order
        for file in sorted(dataset_files):
            df = pd.read_csv(file)
            # the name of the directory you are searching
            name.append(os.path.basename(dataset))
            # file_name column should contain strings of the basename of each file
            file_name.append(os.path.basename(file))
            # the year column should be an integer column extracted from the file name
            year.append(file[-8:-4])
            # the file_rows column - the number of rows
            file_rows.append(df.shape[0])
            # the file_columns - number of rows
            file_columns.append(df.shape[1])

    # construct dataframe
    catalog_df = pd.DataFrame({
        'dataset': name,
        'file_name': file_name,
        'year': year,
        'file_rows': file_rows,
        'file_columns': file_columns
    })

    return catalog_df

# path of "ca-fires-yearly" dataset
ca_fires_path = os.path.join(
    'earthpy-downloads',
    'ca-fires-yearly')

ca_fires_catalog_df = create_dataset_catalog(ca_fires_path)

# call your data catalog dataframe at the end of the cell
ca_fires_catalog_df

Unnamed: 0,dataset,file_name,year,file_rows,file_columns
0,monthly-mean-size,monthly-mean-size-1992.csv,1992,1,12
1,monthly-mean-size,monthly-mean-size-1993.csv,1993,1,12
2,monthly-mean-size,monthly-mean-size-1994.csv,1994,1,12
3,monthly-mean-size,monthly-mean-size-1995.csv,1995,1,12
4,monthly-mean-size,monthly-mean-size-1996.csv,1996,1,12
...,...,...,...,...,...
67,1992-2015-gt-100-acres,gt-100-acres-2011.csv,2011,139,10
68,1992-2015-gt-100-acres,gt-100-acres-2012.csv,2012,127,10
69,1992-2015-gt-100-acres,gt-100-acres-2013.csv,2013,122,10
70,1992-2015-gt-100-acres,gt-100-acres-2014.csv,2014,90,10


In [223]:
# DO NOT MODIFY THIS CELL

# Tests if the result is a DataFrame with the correct columns

student_catalog_df = _

if isinstance(student_catalog_df, pd.DataFrame):
    print("\u2705 Object is a dataframe, good job!")
else:
    print("\u274C Object is not a dataframe.")
    
if all([col in student_catalog_df.columns.values
        for col 
        in ['dataset', 'file_name', 'year', 'file_rows', 'file_columns']]):
    print("\u2705 Dataframe has all the right columns, good job!")
else:
    print("\u274C Dataframe is missing required columns.")

✅ Object is a dataframe, good job!
✅ Dataframe has all the right columns, good job!


In [None]:
# DO NOT MODIFY THIS CELL

## Challenge 7: Convert challenge 5 into a function

**Make sure to include a numpy-style docstring**

In [None]:
# YOUR CODE HERE
raise NotImplementedError()

## Challenge 8: Call create_dataset_catalog

Call the function you just created

In [None]:
# YOUR CODE HERE
raise NotImplementedError()

## Challenge 9: Call help for catalog function

Call help for the function we just created

In [None]:
# YOUR CODE HERE
raise NotImplementedError()

## Challenge 10: plot fires by cause (homework 8 challenge 10 revisited

Create a plot using a for loop and conditional statment to the following specifications:
* Filter data to size class 'G' and 2010 or later
* Scatter plot of fire size (y-axis) vs. season (x-axis) labeled by cause
* Colored human causes 'red' and non-human causes 'blue'
* Adjust the size and layout so that labels do not overlap

In [None]:
# YOUR CODE HERE
raise NotImplementedError()

## Challenge 11: Try plotting using the seaborn stripplot instead
This will let you see overlapping data. Set the hue to human_ignition and dodge to True. You won't need a for loop for this plot.

[Check out the documentation for some differences from matplotlib, including the data parameter](http://seaborn.pydata.org/generated/seaborn.stripplot.html#seaborn.stripplot)

In [None]:
# YOUR CODE HERE
raise NotImplementedError()