# Looping Over Data Sets



#### Objectives:
- Be able to read and write globbing expressions that match sets of files.
- Use glob to create lists of files.
- Write for loops to perform operations on files given their names in a list.


In [1]:
import glob
import pandas as pd

# Use a for loop to process files given a list of their names.
for filename in ['../data/2011_circ.csv', '../data/2012_circ.csv']:
    data = pd.read_csv(filename)
    print(f'{filename}: Max YTD:', data['ytd'].max())

../data/2011_circ.csv: Max YTD: 966720
../data/2012_circ.csv: Max YTD: 937649


In [5]:
# Use glob to find sets of files whose names match a pattern.
print(f"All CSV files in the data directory: {glob.glob('../data/*.csv')}")


All CSV files in the data directory: ['../data/2011_circ.csv', '../data/total_circulation_by_branch.csv', '../data/2016_circ.csv', '../data/2017_circ.csv', '../data/pklhigh_usage.csv', '../data/2022_circ.csv', '../data/2018_circ.csv', '../data/2019_circ.csv', '../data/2012_circ.csv', '../data/2013_circ.csv', '../data/2021_circ.csv', '../data/2020_circ.csv', '../data/2015_circ.csv', '../data/2014_circ.csv']


In [6]:
import glob
import pandas as pd

# Use glob and for loop to process batches of files and handle KeyError
for csv in sorted(glob.glob('../data/*.csv')):
    data = pd.read_csv(csv)
    
    # Print column names to check for consistency across files
    print(f'Processing file: {csv}')
    print(f'Columns: {data.columns}')
    
    # Check if 'ytd' column exists before trying to access it
    if 'ytd' in data.columns:
        print(f'{csv}: Max YTD:', data['ytd'].max())
    else:
        print(f'{csv}: Missing "ytd" column')


Processing file: ../data/2011_circ.csv
Columns: Index(['branch', 'address', 'city', 'zip code', 'january', 'february', 'march',
       'april', 'may', 'june', 'july', 'august', 'september', 'october',
       'november', 'december', 'ytd'],
      dtype='object')
../data/2011_circ.csv: Max YTD: 966720
Processing file: ../data/2012_circ.csv
Columns: Index(['branch', 'address', 'city', 'zip code', 'january', 'february', 'march',
       'april', 'may', 'june', 'july', 'august', 'september', 'october',
       'november', 'december', 'ytd'],
      dtype='object')
../data/2012_circ.csv: Max YTD: 937649
Processing file: ../data/2013_circ.csv
Columns: Index(['branch', 'address', 'city', 'zip code', 'january', 'february', 'march',
       'april', 'may', 'june', 'july', 'august', 'september', 'october',
       'november', 'december', 'ytd'],
      dtype='object')
../data/2013_circ.csv: Max YTD: 821749
Processing file: ../data/2014_circ.csv
Columns: Index(['branch', 'address', 'city', 'zip code', '

In [7]:

# Appending DataFrames to a list and adding 'year' column
dfs = []
counter = 1

for csv in sorted(glob.glob('../data/*.csv')):
    year = csv[8:12]  # Capture the year from the file name
    data = pd.read_csv(csv)
    data['year'] = year
    print(f'{counter}: Saving {len(data)} rows from {csv}')
    dfs.append(data)
    counter += 1

# Check the first DataFrame in the list
dfs[0].head(2)


1: Saving 80 rows from ../data/2011_circ.csv
2: Saving 79 rows from ../data/2012_circ.csv
3: Saving 80 rows from ../data/2013_circ.csv
4: Saving 80 rows from ../data/2014_circ.csv
5: Saving 80 rows from ../data/2015_circ.csv
6: Saving 80 rows from ../data/2016_circ.csv
7: Saving 80 rows from ../data/2017_circ.csv
8: Saving 80 rows from ../data/2018_circ.csv
9: Saving 81 rows from ../data/2019_circ.csv
10: Saving 81 rows from ../data/2020_circ.csv
11: Saving 81 rows from ../data/2021_circ.csv
12: Saving 81 rows from ../data/2022_circ.csv
13: Saving 81 rows from ../data/pklhigh_usage.csv
14: Saving 82 rows from ../data/total_circulation_by_branch.csv


Unnamed: 0,branch,address,city,zip code,january,february,march,april,may,june,july,august,september,october,november,december,ytd,year
0,Albany Park,5150 N. Kimball Ave.,Chicago,60625.0,8427,7023,9702,9344,8865,11650,11778,11306,10466,10997,10567,9934,120059,2011
1,Altgeld,13281 S. Corliss Ave.,Chicago,60827.0,1258,708,854,804,816,870,713,480,702,927,787,692,9611,2011


In [8]:
# Concatenate DataFrames together into one
df = pd.concat(dfs, ignore_index=True)
f'Number of rows in the concatenated DataFrame: {len(df)}'

'Number of rows in the concatenated DataFrame: 1126'

In [9]:

# Challenge 1: Determining Matches
# Only file 1 is matched by the glob pattern.
# glob.glob('../data/*circ.csv') matches:
# 1. ../data/2011_circ.csv


In [11]:
import glob
import pandas as pd

# Use glob and for loop to process batches of files and handle KeyError
for csv in sorted(glob.glob('../data/*.csv')):
    data = pd.read_csv(csv)
    
    # Print column names to check for consistency across files
    print(f'Processing file: {csv}')
    print(f'Columns: {data.columns}')
    
    # Check if 'ytd' column exists before trying to access it
    if 'ytd' in data.columns:
        print(f'{csv}: Min YTD:', data['ytd'].min())
    else:
        print(f'{csv}: Missing "ytd" column')


Processing file: ../data/2011_circ.csv
Columns: Index(['branch', 'address', 'city', 'zip code', 'january', 'february', 'march',
       'april', 'may', 'june', 'july', 'august', 'september', 'october',
       'november', 'december', 'ytd'],
      dtype='object')
../data/2011_circ.csv: Min YTD: 9218
Processing file: ../data/2012_circ.csv
Columns: Index(['branch', 'address', 'city', 'zip code', 'january', 'february', 'march',
       'april', 'may', 'june', 'july', 'august', 'september', 'october',
       'november', 'december', 'ytd'],
      dtype='object')
../data/2012_circ.csv: Min YTD: 10010
Processing file: ../data/2013_circ.csv
Columns: Index(['branch', 'address', 'city', 'zip code', 'january', 'february', 'march',
       'april', 'may', 'june', 'july', 'august', 'september', 'october',
       'november', 'december', 'ytd'],
      dtype='object')
../data/2013_circ.csv: Min YTD: 572
Processing file: ../data/2014_circ.csv
Columns: Index(['branch', 'address', 'city', 'zip code', 'januar

In [15]:
import numpy as np 

# Bonus Challenge: Compile CSVs into one DataFrame
dfs = []
for csv in sorted(glob.glob('../data/*.csv')):
    data = pd.read_csv(csv)
    dfs.append(data)


# Concatenate the DataFrames into one
new_df = pd.concat(dfs, ignore_index=True)



np.shape(new_df)

import numpy as np

# Bonus Challenge: Compile CSVs into one DataFrame
dfs = []
for csv in sorted(glob.glob('../data/*.csv')):
    data = pd.read_csv(csv)
    dfs.append(data)

# Concatenate the DataFrames into one
new_df = pd.concat(dfs, ignore_index=True)

# Display first few rows of the new DataFrame
print(new_df.head())

np.shape(new_df)

           branch                  address     city  zip code  january  \
0     Albany Park     5150 N. Kimball Ave.  Chicago   60625.0   8427.0   
1         Altgeld    13281 S. Corliss Ave.  Chicago   60827.0   1258.0   
2  Archer Heights      5055 S. Archer Ave.  Chicago   60632.0   8104.0   
3          Austin        5615 W. Race Ave.  Chicago   60644.0   1755.0   
4   Austin-Irving  6100 W. Irving Park Rd.  Chicago   60634.0  12593.0   

   february    march    april      may     june     july   august  september  \
0    7023.0   9702.0   9344.0   8865.0  11650.0  11778.0  11306.0    10466.0   
1     708.0    854.0    804.0    816.0    870.0    713.0    480.0      702.0   
2    6899.0   9329.0   9124.0   7472.0   8314.0   8116.0   9177.0     9033.0   
3    1316.0   1942.0   2200.0   2133.0   2359.0   2080.0   2405.0     2417.0   
4   11791.0  14807.0  14382.0  11754.0  14402.0  14605.0  15164.0    14306.0   

   october  november  december       ytd year  Unnamed: 0  circulation  
0

(1126, 20)