# Data Analysis and Visualization in Python
## Data workflows and automation
Questions
* Can I automate operations in Python?
* What are functions and why should I use them?

Objectives
* Employ `for` loops to automate data analysis.
* Write unique filenames in Python.
* Build reusable code in Python.
* Write functions using conditional statements (`if`, `then`, `else`).

## Loading our Data

In [2]:
import pandas as pd

# Load the data
surveys_df = pd.read_csv("data/surveys.csv")
species_df = pd.read_csv("data/species.csv")

## Automating data processing using For Loops

In [3]:
import os

In [4]:
folder_years = "yearly_files"
os.mkdir(folder_years)

In [5]:
os.listdir('.')

['.git',
 '.gitignore',
 '00-about.ipynb',
 '01-data.ipynb',
 '02-selection.ipynb',
 '03-combining.ipynb',
 '04-workflows.ipynb',
 '05-plotnine.ipynb',
 '06-matplotlib.ipynb',
 'README.md',
 'data',
 'extras',
 'solutions',
 '.ipynb_checkpoints',
 'surveys_complete.csv',
 'surveys_sub.csv',
 'yearly_files']

In [12]:
os.listdir('./yearly_files')

['surveys_1977.csv',
 'surveys_1978.csv',
 'surveys_1979.csv',
 'surveys_1980.csv',
 'surveys_1981.csv',
 'surveys_1982.csv',
 'surveys_1983.csv',
 'surveys_1984.csv',
 'surveys_1985.csv',
 'surveys_1986.csv',
 'surveys_1987.csv',
 'surveys_1988.csv',
 'surveys_1989.csv',
 'surveys_1990.csv',
 'surveys_1991.csv',
 'surveys_1992.csv',
 'surveys_1993.csv',
 'surveys_1994.csv',
 'surveys_1995.csv',
 'surveys_1996.csv',
 'surveys_1997.csv',
 'surveys_1998.csv',
 'surveys_1999.csv',
 'surveys_2000.csv',
 'surveys_2001.csv',
 'surveys_2002.csv']

In [11]:
for year in surveys_df['year'].unique():
    # Create a unique filename for each year
    filename = os.path.join(folder_years, "surveys_" + str(year) + ".csv")
    print(filename)

    # Select data for the year
    surveys_year = surveys_df[surveys_df['year'] == year]
    surveys_year.to_csv(filename, index=False)

os.listdir(folder_years)

yearly_files/surveys_1977.csv
yearly_files/surveys_1978.csv
yearly_files/surveys_1979.csv
yearly_files/surveys_1980.csv
yearly_files/surveys_1981.csv
yearly_files/surveys_1982.csv
yearly_files/surveys_1983.csv
yearly_files/surveys_1984.csv
yearly_files/surveys_1985.csv
yearly_files/surveys_1986.csv
yearly_files/surveys_1987.csv
yearly_files/surveys_1988.csv
yearly_files/surveys_1989.csv
yearly_files/surveys_1990.csv
yearly_files/surveys_1991.csv
yearly_files/surveys_1992.csv
yearly_files/surveys_1993.csv
yearly_files/surveys_1994.csv
yearly_files/surveys_1995.csv
yearly_files/surveys_1996.csv
yearly_files/surveys_1997.csv
yearly_files/surveys_1998.csv
yearly_files/surveys_1999.csv
yearly_files/surveys_2000.csv
yearly_files/surveys_2001.csv
yearly_files/surveys_2002.csv


['surveys_1977.csv',
 'surveys_1978.csv',
 'surveys_1979.csv',
 'surveys_1980.csv',
 'surveys_1981.csv',
 'surveys_1982.csv',
 'surveys_1983.csv',
 'surveys_1984.csv',
 'surveys_1985.csv',
 'surveys_1986.csv',
 'surveys_1987.csv',
 'surveys_1988.csv',
 'surveys_1989.csv',
 'surveys_1990.csv',
 'surveys_1991.csv',
 'surveys_1992.csv',
 'surveys_1993.csv',
 'surveys_1994.csv',
 'surveys_1995.csv',
 'surveys_1996.csv',
 'surveys_1997.csv',
 'surveys_1998.csv',
 'surveys_1999.csv',
 'surveys_2000.csv',
 'surveys_2001.csv',
 'surveys_2002.csv']

### Exercises - Creating multiple CSV files
Instead of splitting out the data by years, a colleague wants to analyse each species separately. How would you write a unique csv file for each species?

In [13]:
folder_species = "species_files"
os.mkdir(folder_species)  # Create the directory

In [17]:
merged_left = pd.merge(left=surveys_df, right=species_df, how='left', on="species_id")

for species in merged_left['species'].unique():
    # Create a unique filename for each species
    filename = os.path.join(folder_species, "surveys_" + str(species) + ".csv")
    print(filename)

    # Select data for the current species
    merged_left_species = merged_left[merged_left['species'] == species].dropna()
    merged_left_species.to_csv(filename, index=False)

os.listdir(folder_species)

species_files/surveys_nan.csv
species_files/surveys_merriami.csv
species_files/surveys_flavus.csv
species_files/surveys_eremicus.csv
species_files/surveys_spectabilis.csv
species_files/surveys_penicillatus.csv
species_files/surveys_hispidus.csv
species_files/surveys_torridus.csv
species_files/surveys_ordii.csv
species_files/surveys_sp..csv
species_files/surveys_spilosoma.csv
species_files/surveys_leucogaster.csv
species_files/surveys_megalotis.csv
species_files/surveys_albigula.csv
species_files/surveys_audubonii.csv
species_files/surveys_maniculatus.csv
species_files/surveys_harrisi.csv
species_files/surveys_bilineata.csv
species_files/surveys_brunneicapillus.csv
species_files/surveys_melanocorys.csv
species_files/surveys_squamata.csv
species_files/surveys_fulvescens.csv
species_files/surveys_chlorurus.csv
species_files/surveys_gramineus.csv
species_files/surveys_fuscus.csv
species_files/surveys_viridis.csv
species_files/surveys_leucophrys.csv
species_files/surveys_scutalatus.csv
spec

['surveys_nan.csv',
 'surveys_merriami.csv',
 'surveys_flavus.csv',
 'surveys_eremicus.csv',
 'surveys_spectabilis.csv',
 'surveys_penicillatus.csv',
 'surveys_hispidus.csv',
 'surveys_torridus.csv',
 'surveys_ordii.csv',
 'surveys_sp..csv',
 'surveys_spilosoma.csv',
 'surveys_leucogaster.csv',
 'surveys_megalotis.csv',
 'surveys_albigula.csv',
 'surveys_audubonii.csv',
 'surveys_maniculatus.csv',
 'surveys_harrisi.csv',
 'surveys_bilineata.csv',
 'surveys_brunneicapillus.csv',
 'surveys_melanocorys.csv',
 'surveys_squamata.csv',
 'surveys_fulvescens.csv',
 'surveys_chlorurus.csv',
 'surveys_gramineus.csv',
 'surveys_fuscus.csv',
 'surveys_viridis.csv',
 'surveys_leucophrys.csv',
 'surveys_scutalatus.csv',
 'surveys_clarki.csv',
 'surveys_taylori.csv',
 'surveys_fulviventer.csv',
 'surveys_montanus.csv',
 'surveys_savannarum.csv',
 'surveys_ochrognathus.csv',
 'surveys_intermedius.csv',
 'surveys_tereticaudus.csv',
 'surveys_uniparens.csv',
 'surveys_undulatus.csv',
 'surveys_baileyi.c

## Building reusable and modular code with functions
* Automatically create the `folder_to_save` if it does not exist.
* Use `None` as default `start_year` and `end_year`.
* Make the second function return a list of generated files.

In [18]:
def one_year_csv_writer(all_data, folder_to_save, prefix, this_year):
    """
    Writes a csv file for data from a given year. Returns the filename.

    all_data --- DataFrame with multi-year data
    folder_to_save --- folder to save the data files
    prefix --- prefix for the CSV file name
    this_year --- year for which data is extracted
    """

    # Create a unique filename for each year
    filename = os.path.join(folder_to_save, prefix + str(this_year) + ".csv")

    # Select data for the year
    data_for_year = all_data[all_data['year'] == this_year]
    data_for_year.to_csv(filename, index=False)

    return filename

In [19]:
def yearly_data_csv_writer(all_data, folder_to_save, prefix,
                           start_year = None, end_year = None):
    """
    Modified from yearly_data_csv_writer to test default argument values!

    all_data --- DataFrame with multi-year data
    folder_to_save --- folder to save the data files
    prefix --- prefix for the CSV file name
    start_year --- the first year of data we want --- default: None - check all_data
    end_year --- the last year of data we want --- default: None - check all_data
    """

    if folder_to_save in os.listdir('.'):
        print('Processed directory exists')
    else:
        os.mkdir(folder_to_save)
        print('Processed directory created')

    if not start_year:
        start_year = min(all_data['year'])

    if not end_year:
        end_year = max(all_data['year'])

    filenames = []

    # "end_year" is the last year of data we want to pull, so we loop to end_year+1
    for year in range(start_year, end_year + 1):
        filenames.append(one_year_csv_writer(all_data, folder_to_save, prefix, year))

    return filenames

In [20]:
yearly_data_csv_writer(surveys_df, 'final', 'results_', 1995, 1998)

Processed directory created


['final/results_1995.csv',
 'final/results_1996.csv',
 'final/results_1997.csv',
 'final/results_1998.csv']

In [21]:
os.listdir("final")

['results_1995.csv',
 'results_1996.csv',
 'results_1997.csv',
 'results_1998.csv',
 '.ipynb_checkpoints']

In [22]:
yearly_data_csv_writer(surveys_df, 'finalall', 'results_all')

Processed directory created


['finalall/results_all1977.csv',
 'finalall/results_all1978.csv',
 'finalall/results_all1979.csv',
 'finalall/results_all1980.csv',
 'finalall/results_all1981.csv',
 'finalall/results_all1982.csv',
 'finalall/results_all1983.csv',
 'finalall/results_all1984.csv',
 'finalall/results_all1985.csv',
 'finalall/results_all1986.csv',
 'finalall/results_all1987.csv',
 'finalall/results_all1988.csv',
 'finalall/results_all1989.csv',
 'finalall/results_all1990.csv',
 'finalall/results_all1991.csv',
 'finalall/results_all1992.csv',
 'finalall/results_all1993.csv',
 'finalall/results_all1994.csv',
 'finalall/results_all1995.csv',
 'finalall/results_all1996.csv',
 'finalall/results_all1997.csv',
 'finalall/results_all1998.csv',
 'finalall/results_all1999.csv',
 'finalall/results_all2000.csv',
 'finalall/results_all2001.csv',
 'finalall/results_all2002.csv']

In [23]:
os.rmdir('finalall')

OSError: [Errno 39] Directory not empty: 'finalall'

In [24]:
help(os.rmdir)

Help on built-in function rmdir in module posix:

rmdir(path, *, dir_fd=None)
    Remove a directory.
    
    If dir_fd is not None, it should be a file descriptor open to a directory,
      and path should be relative; path will then be relative to that directory.
    dir_fd may not be implemented on your platform.
      If it is unavailable, using it will raise a NotImplementedError.

