# Combining Data from Multiple Spreadsheets

In [1]:
# python -m pip install pandas openpyxl

### Import installed packages

In [2]:
import os # for performing operating system operations, such as accessing file directories

import glob  # used for finding files matching a particular pattern

import pandas as pd

from openpyxl import load_workbook

### Specify the folder where the files are held and the file type

In [3]:
path = './data' # i.e. the data folder within the current directory (represented by the ".")

# The "glob" function in the "glob" library looks for any files that match the pattern of the file path + any .xlsx file
# where the "*" is a wildcard for any string (i.e. any file name). The "os" library is used to join the file path to the
# file names.
#
# Two safeguards are applied to the raw glob result:
#   * names starting with "~$" are skipped - these are the temporary lock files that Excel creates next to a
#     workbook that is currently open, and they cannot be read as workbooks;
#   * the list is sorted, because glob returns files in arbitrary order and we want the same order on every run.
file_list = sorted(
    file_path
    for file_path in glob.glob(os.path.join(path, '*.xlsx')) # get all the .xlsx files in the folder
    if not os.path.basename(file_path).startswith('~$')
)

### Let's count how many files that relates to:

In [4]:
# number of workbooks found by the glob search above
file_count = len(file_list)
print(file_count)

36


### Nested functions to extract data from specific cell ranges and iterate over multiple files

This reuses the function for lifting data from a formatted Excel file, placing it inside a function that can be applied to each of multiple files in turn.

The top-level function "extract_multiple()" will process the data from each file into neat columns.

The nested function "extract_data()" builds mini data frames from the cell ranges in the worksheets and passes those to the main function for processing.

The main function "extract_multiple()" will then get applied to each of the files in the "data" folder (using a list comprehension), producing a dataframe for each. These dataframes then get concatenated (UNION ALLed, in SQL-speak) into one dataframe, which becomes the one to be exported to .csv

In [None]:
def extract_multiple(file, sheet_name='5.2'):
    """Extract the inpatient figures from one workbook as a tidy dataframe.

    Reads four fixed cell ranges from the chosen worksheet:

    * ``S8``       - the month label for the file
    * ``S11:T11``  - the figures for all inpatients
    * ``A16:A57``  - the list of organisation codes
    * ``S16:T57``  - the figures for each of those organisations

    Args:
        file: path of the .xlsx workbook to open.
        sheet_name: name of the worksheet holding the data (defaults to
            '5.2', the sheet this notebook was written for).

    Returns:
        A dataframe with the columns ``month_year``, ``org_code``,
        ``adult_inpatients`` and ``rate_per_million``. The first row is the
        all-inpatients row (``org_code`` = 'All'), followed by one row per
        cell in the organisation code range.

    Raises:
        KeyError: if the workbook has no worksheet called ``sheet_name``.
    """
    # Firstly, specify which file to open, namely the one passed to the "file" argument.
    # "data_only=True" returns the cached results of formulas rather than the formulas themselves.
    workbook = load_workbook(filename=file, read_only=True, data_only=True)

    try:
        # Then, specify the worksheet within the workbook from which to extract the data.
        worksheet = workbook[sheet_name]

        ###################################################################################
        # A nested function that creates individual dataframes of the data that we want to
        # extract from each worksheet.

        def extract_data(first, last): # i.e. reference for first cell and last cell in table
            return pd.DataFrame(
                [[cell.value for cell in row] for row in worksheet[first:last]]
            )
        ###################################################################################

        # Apply the nested function to the ranges that we want to extract.
        # All reading has to happen before the workbook is closed below.
        current_month = extract_data('S8', 'S8').iloc[0, 0] # single value rather than a dataframe
        all_inpatients_data = extract_data('S11', 'T11')
        icb_list = extract_data('A16', 'A57')
        icbs_data = extract_data('S16', 'T57')
    finally:
        # A workbook opened in read-only mode keeps the file open until it is
        # explicitly closed, so release it even if something above failed.
        workbook.close()

    # name of columns for our data extract
    cols = ['month_year',
            'org_code',
            'adult_inpatients',
            'rate_per_million',
            ]

    # give all_inpatients_data column names
    all_inpatients_data = all_inpatients_data.rename(
        columns={all_inpatients_data.columns[0]: cols[2],
                 all_inpatients_data.columns[1]: cols[3],
                 }
    )

    # add org_code to all_inpatients_data
    all_inpatients_data[cols[1]] = 'All'

    # bring "org_code" to the front by matching the order in "cols" (without "month_year")
    all_inpatients_data = all_inpatients_data[cols[1:]]

    # give icbs_data columns names
    icbs_data = icbs_data.rename(
        columns={icbs_data.columns[0]: cols[2],
                 icbs_data.columns[1]: cols[3],
                 }
    )

    # give icb_list a column name
    icb_list = icb_list.rename(columns={icb_list.columns[0]: cols[1]})

    # concatenate "icb_list" and "icbs_data" side by side into one table
    icbs_data = pd.concat([icb_list, icbs_data], axis=1)

    # create consolidated data; "ignore_index=True" renumbers the rows from 0
    # so no leftover "index" column has to be dropped afterwards
    consolidated_data = pd.concat([all_inpatients_data, icbs_data], axis=0, ignore_index=True)

    # add the month description and bring it to the front
    consolidated_data[cols[0]] = current_month
    consolidated_data = consolidated_data[cols]

    return consolidated_data

### Apply the top-level "extract_multiple()" function and concatenate the results into one data table

In [None]:
# Run extract_multiple() on every workbook in "file_list", which gives one dataframe per file,
# then stack those dataframes on top of each other into a single dataframe called "data_extract".

monthly_frames = [extract_multiple(workbook_path) for workbook_path in file_list]
data_extract = pd.concat(monthly_frames, ignore_index=True)

# We can examine this dataframe variable. It appears to contain data for ICBs outside of our five BI customer ICBs.

data_extract.head(10)

Unnamed: 0,month_year,org_code,adult_inpatients,rate_per_million
0,April 2021,All,1825,42
1,April 2021,QOX,25,36
2,April 2021,QHG,20,30
3,April 2021,QHL,50,59
4,April 2021,QUY,35,48
5,April 2021,QU9,45,33
6,April 2021,QUE,30,41
7,April 2021,QYG,80,40
8,April 2021,QT6,25,55
9,April 2021,QWU,40,55


### Extract the data for the five BI customer ICBs, plus all-England for comparison 

In [None]:
# org codes to keep: the all-England row plus the five BI customer ICBs
relevant = [
    'All',
    'QRL', # HIOW
    'QNQ', # Frimley
    'QU9', # BOB
    'QNX', # Sussex
    'QSL', # Somerset
]

# build a True/False marker per row, then keep only the rows marked True
is_relevant = data_extract['org_code'].isin(relevant)
data_extract = data_extract.loc[is_relevant]

data_extract.head(10)

Unnamed: 0,month_year,org_code,adult_inpatients,rate_per_million
0,April 2021,All,1825,42
5,April 2021,QU9,45,33
14,April 2021,QNQ,10,18
17,April 2021,QRL,50,33
34,April 2021,QSL,10,18
40,April 2021,QNX,50,38
43,May 2021,All,1845,42
48,May 2021,QU9,45,35
57,May 2021,QNQ,10,21
60,May 2021,QRL,50,36


### Export the final extract to .csv

In [10]:
output_file = 'output/ld_inpatients_per_million.csv'

# to_csv() does not create missing folders, so make sure the "output" folder exists first.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

# "index=False" leaves the dataframe's row numbers out of the file.
data_extract.to_csv(output_file, index=False)