In [1]:
import pandas as pd
import os

- Input the data
- We want to stack the tables on top of one another, since every sheet has the same fields. We can do this in one of two ways:
    - Drag each table into the canvas and use a union step to stack them on top of one another
    - Use a wildcard union in the input step of one of the tables
- Some of the fields aren't matching up as we'd expect, due to differences in spelling. Merge these fields together
- Make a Joining Date field based on the Joining Day, Table Names and the year 2023
- Now we want to reshape our data so we have a field for each demographic, for each new customer 
- Make sure all the data types are correct for each field
- Remove duplicates 
- If a customer appears multiple times, take their earliest joining date
- Output the data

In [2]:
# Define the directory where your Excel workbooks are located
workbook_dir = os.getcwd()

# Get the list of Excel files in the directory.
# - sorted(): os.listdir returns names in arbitrary order; sorting makes the
#   row order of the combined dataframe reproducible between runs.
# - names starting with '~$' are the lock files Excel writes next to an open
#   workbook; they end in .xlsx but are not readable workbooks, so skip them.
workbooks = sorted(
    file
    for file in os.listdir(workbook_dir)
    if file.endswith('.xlsx') and not file.startswith('~$')
)

# One dataframe per sheet, collected across every workbook
dfs = []

for workbook in workbooks:
    # Open the workbook through a context manager so its file handle is
    # released as soon as all of its sheets have been read.
    with pd.ExcelFile(os.path.join(workbook_dir, workbook)) as xls:
        for sheet_name in xls.sheet_names:
            # Tag every row with the name of the sheet it came from
            # (the sheets are expected to be named after months).
            df = pd.read_excel(xls, sheet_name=sheet_name).assign(Month=sheet_name)
            dfs.append(df)

# Concatenate all the dataframes into a single dataframe with a fresh index.
# pd.concat raises ValueError if no workbook/sheet was found.
combined_df = pd.concat(dfs, ignore_index=True)

# Display the combined dataframe
combined_df


Unnamed: 0,ID,Joining Day,Demographic,Value,Month,Demographiic,Demagraphic
0,490910,3,Ethnicity,White,January,,
1,490910,3,Date of Birth,5/23/1981,January,,
2,490910,3,Account Type,Basic,January,,
3,369221,18,Ethnicity,Black,January,,
4,369221,18,Date of Birth,3/4/2019,January,,
...,...,...,...,...,...,...,...
2965,174699,2,Date of Birth,3/13/1989,December,,
2966,174699,2,Account Type,Gold,December,,
2967,514598,28,Ethnicity,Other,December,,
2968,514598,28,Date of Birth,10/10/1971,December,,


In [3]:
# Merge the misspelt 'Demographiic' / 'Demagraphic' columns into 'Demographic'.
# Only the misspelt columns that are still present are processed, and the
# result is rebound rather than mutated in place, so re-running this cell is a
# no-op instead of raising KeyError on the already-dropped columns.
misspelt_cols = [col for col in ('Demographiic', 'Demagraphic') if col in combined_df.columns]

# Fill gaps in 'Demographic' from each misspelt column in turn
demographic = combined_df['Demographic']
for col in misspelt_cols:
    demographic = demographic.fillna(combined_df[col])

# Store the merged column and discard the misspelt ones
combined_df = combined_df.assign(Demographic=demographic).drop(columns=misspelt_cols)

In [5]:
# Build a "<Month> <day>, 2023" string per row, then parse it into a datetime
# stored in the new 'Joining Date' column.
joining_day_text = combined_df['Joining Day'].astype(str)
joining_date_text = combined_df['Month'] + ' ' + joining_day_text + ', 2023'
combined_df['Joining Date'] = pd.to_datetime(joining_date_text)


In [8]:
# Reshape: one row per (ID, Joining Date) with one column per Demographic value.
# aggfunc='min' collapses repeated entries for the same cell to a single value.
demographics_wide = combined_df.pivot_table(
    values='Value',
    index=['ID', 'Joining Date'],
    columns='Demographic',
    aggfunc='min',
)

# Sort by joining date and keep the first row per ID, i.e. the earliest
# joining date for customers that appear more than once.
df_out = (
    demographics_wide
    .reset_index()
    .sort_values(by='Joining Date')
    .drop_duplicates(subset='ID', keep='first')
)

# Change data types: parse 'Date of Birth' into a datetime column
df_out['Date of Birth'] = pd.to_datetime(df_out['Date of Birth'])

In [9]:
# Display the reshaped, de-duplicated customer table (one row per ID)
df_out

Demographic,ID,Joining Date,Account Type,Date of Birth,Ethnicity
871,893948,2023-01-01,Gold,2013-09-01,White
837,863243,2023-01-01,Platinum,2007-12-30,Other
847,871764,2023-01-02,Gold,1967-10-29,White
725,761244,2023-01-02,Platinum,1942-12-26,White
393,471554,2023-01-02,Basic,1990-04-22,White
...,...,...,...,...,...
660,704688,2023-12-28,Basic,2017-01-02,Other
627,679893,2023-12-28,Basic,1951-07-02,Asian
447,514598,2023-12-28,Platinum,1971-10-10,Other
560,618817,2023-12-30,Gold,1950-12-22,Asian
