# Merge Revelio Labs and Audit Analytics data, and build the remaining variables
* Jie XIA, SUSTech
* 2025-02-14

# 1. Import dataset
* Panel dataset built from Revelio Labs
  * `revelio` is the sample that includes auditors who have ever worked in multiple offices or places
  * `revelio_filtered` is the sample that excludes auditors who have ever worked in multiple offices or places
* Panel dataset built from Audit Analytics

In [47]:
import pandas as pd

# Locations of the three office-year input panels: Revelio (full sample),
# Revelio (filtered sample) and Audit Analytics.
revelio_path = r"E:\USA auditor turnover data\turnover data\office-year_auditor_turnover_us.csv"
revelio_filtered_path = r"E:\USA auditor turnover data\turnover data\office-year_auditor_turnover_us_filtered.csv"
aa_path = r"e:\USA auditor turnover data\Audit Analytics data\office-year us auditor variables from AA data.csv"

print("Start to load in csv file")
# Read the files in the order listed above, each with the pyarrow CSV engine.
df_revelio, df_revelio_filtered, df_aa = (
    pd.read_csv(csv_path, engine='pyarrow')
    for csv_path in (revelio_path, revelio_filtered_path, aa_path)
)
print("Files loaded successfully")

Start to load in csv file
Files loaded successfully


In [48]:
#print(df_revelio.columns)
#print(df_aa.columns)

In [49]:
#print(df_revelio.describe())

In [50]:
#print(df_aa.describe())

# 2. Merge Revelio and AA variables
* Merge dataset based on `office_key_location`

In [51]:
# Normalise the merge keys on the Revelio side: office key as string first,
# then the year as integer, for both the full and the filtered sample.
for _panel in (df_revelio, df_revelio_filtered):
    _panel['office_key_location'] = _panel['office_key_location'].astype(str)
for _panel in (df_revelio, df_revelio_filtered):
    _panel['year'] = _panel['year'].astype(int)

# Same normalisation on the AA side, where the year column is FY_IC_OP.
df_aa['office_key_location'] = df_aa['office_key_location'].astype(str)
df_aa['FY_IC_OP'] = df_aa['FY_IC_OP'].astype(int)

# Report how many distinct offices each source contains.
print(f"There are {df_revelio['office_key_location'].nunique()} office in revelio")
print(f"There are {df_revelio_filtered['office_key_location'].nunique()} office in revelio_filtered")
print(f"There are {df_aa['office_key_location'].nunique()} office in aa")

There are 6904 office in revelio
There are 5420 office in revelio_filtered
There are 1348 office in aa


* For the variables in the AA data, create each office's values for 2001–2003 from its 2004 values whenever those years do not already exist.
* Create an indicator column `copy_2004_indicator` that marks these copied office-year rows.

In [52]:
import pandas as pd

# Add a 'copy_2004_indicator' column: original records are marked 0,
# rows copied from 2004 (created below) are marked 1.
df_aa['copy_2004_indicator'] = 0

# Years that are back-filled from the 2004 record when an office has no row for them.
target_years = [2001, 2002, 2003]
key_cols = ['office_key_location', 'FY_IC_OP']

# One 2004 record per office (the first one if an office appears more than once).
df_aa_2004 = df_aa[df_aa['FY_IC_OP'] == 2004].drop_duplicates(subset='office_key_location')

# Candidate rows: every office's 2004 record re-labelled with each target year.
df_candidates = pd.concat(
    [df_aa_2004.assign(FY_IC_OP=yr, copy_2004_indicator=1) for yr in target_years],
    ignore_index=True,
)

# Anti-join: keep only candidates whose (office, year) pair is absent from the
# original data. This replaces a per-office scan of the whole frame inside an
# iterrows() loop with a single vectorised membership test.
existing_keys = pd.MultiIndex.from_frame(df_aa[key_cols])
candidate_keys = pd.MultiIndex.from_frame(df_candidates[key_cols])
df_add = df_candidates[~candidate_keys.isin(existing_keys)]

# Append the copied rows, or just copy the original frame when nothing is missing.
if df_add.empty:
    df_aa_extended = df_aa.copy()
else:
    df_aa_extended = pd.concat([df_aa, df_add], ignore_index=True)

# Sort the extended DataFrame by office_key_location and FY_IC_OP, then reset the index.
df_aa_extended = df_aa_extended.sort_values(by=['office_key_location', 'FY_IC_OP']).reset_index(drop=True)

# Output statistics.
num_offices_2004 = df_aa[df_aa['FY_IC_OP'] == 2004]['office_key_location'].nunique()
print(f"There are {num_offices_2004} office samples in 2004")
print("Number of rows in the extended df_aa:", df_aa_extended.shape[0])

# Display a sample of the extended data.
print(df_aa_extended.head(10))


There are 458 office samples in 2004
Number of rows in the extended df_aa: 12676
   office_key_location  FY_IC_OP  OFFICE_SIZE  TOTAL_RESTATEMENT  \
0  1, Albany, New York      2001          5.0                0.0   
1  1, Albany, New York      2002          5.0                0.0   
2  1, Albany, New York      2003          5.0                0.0   
3  1, Albany, New York      2004          5.0                0.0   
4  1, Albany, New York      2005          6.0                0.0   
5  1, Albany, New York      2006          7.0                0.0   
6  1, Albany, New York      2007          3.0                0.0   
7  1, Albany, New York      2008          3.0                0.0   
8  1, Albany, New York      2009          3.0                0.0   
9  1, Albany, New York      2010          4.0                0.0   

   TOTAL_AUDIT_FEE  TOTAL_COMBINED_IC  TOTAL_REPORTS  COMBINED_OP_INDICATOR  \
0        4920460.0                4.0            5.0                    1.0   
1        492

* Merge the Revelio and AA data on `office_key_location` and `year` (`FY_IC_OP` in the AA data)
* Print a summary of the merge result

In [53]:
from pandasgui import show

def merge_dataframe(df_revelio, df_aa):
    """Join a Revelio office-year panel to the AA office-year panel.

    The two frames are outer-joined on ``office_key_location`` and the year
    (``year`` on the Revelio side, ``FY_IC_OP`` on the AA side) so that the
    match quality can be printed, and then only rows found in both sources
    are returned.

    Parameters:
        df_revelio: DataFrame with at least ``office_key_location`` and ``year``.
        df_aa: DataFrame with at least ``office_key_location`` and ``FY_IC_OP``.

    Returns:
        DataFrame of matched rows, without the ``_merge`` indicator and
        without the redundant ``FY_IC_OP`` column.
    """
    # Diagnostic: Revelio rows per year, restricted to 1990 onwards.
    recent_rows = df_revelio.loc[df_revelio['year'] >= 1990]
    revelio_year_counts = recent_rows.groupby('year').size().reset_index(name='revelio_count')
    print("Revelio row counts for years from 1990 onwards:")
    print(revelio_year_counts)

    # Diagnostic: AA rows per fiscal year.
    aa_year_counts = df_aa.groupby('FY_IC_OP').size().reset_index(name='aa_count')
    print("AA row counts per year:")
    print(aa_year_counts)

    # Outer join with an indicator column so unmatched rows on either side can be counted.
    outer = df_revelio.merge(
        df_aa,
        how='outer',
        left_on=['office_key_location', 'year'],
        right_on=['office_key_location', 'FY_IC_OP'],
        indicator=True,
    )

    print("Overall merge status:")
    print(outer['_merge'].value_counts())

    # Rows present in both sources, and how many of them fall in each year.
    matched = outer.loc[outer['_merge'] == 'both']
    yearly_both = matched.groupby('year').size().reset_index(name='both_count')
    print("Merged (both) record counts per year:")
    print(yearly_both)

    # Only matched rows are returned; the helper columns are dropped.
    return matched.drop(columns=['_merge', 'FY_IC_OP'])

if __name__ == "__main__":
    # Full sample: merge with the extended AA data and keep a copy on disk.
    print("Start to merge Revelio with AA data")
    merged_output_path = r"E:\USA auditor turnover data\result data\revelio_aa_merge.csv"
    df_office_year = merge_dataframe(df_revelio, df_aa_extended)
    df_office_year.to_csv(merged_output_path, index=False)

    # Filtered sample: merged in memory only; it is not written to disk here.
    print("Start to merge filtered Revelio with AA data")
    df_office_year_filtered = merge_dataframe(df_revelio_filtered, df_aa_extended)


Start to merge Revelio with AA data
Revelio row counts for years from 1990 onwards:
    year  revelio_count
0   1990           1217
1   1991           1232
2   1992           1257
3   1993           1296
4   1994           1325
5   1995           1359
6   1996           1381
7   1997           1470
8   1998           1562
9   1999           1602
10  2000           1630
11  2001           1624
12  2002           1610
13  2003           1639
14  2004           1728
15  2005           1820
16  2006           1891
17  2007           1968
18  2008           1964
19  2009           1896
20  2010           1803
21  2011           1704
22  2012           1658
23  2013           1652
24  2014           1628
25  2015           1640
26  2016           1670
27  2017           1685
28  2018           1735
29  2019           1767
30  2020           1836
31  2021           1917
32  2022           2109
33  2023           1951
34  2024           1723
AA row counts per year:
    FY_IC_OP  aa_count
0    

* Reset the office-level gap indicators (`gap_indicator_revelio` and `gap_indicator_aa`)
  * These indicators were originally generated over the whole year range. The merged panel now covers a shorter range starting in 2001, so the office-level gap flags must be recomputed from the rows that remain.

In [54]:
# (row-level flag, office-level flag) pairs that are recomputed below.
_GAP_COLUMN_PAIRS = (
    ('gap_indicator_revelio_row', 'gap_indicator_revelio'),
    ('gap_indicator_aa_row', 'gap_indicator_aa'),
)
_gap_panels = (df_office_year, df_office_year_filtered)

# Step 1: set every office-level indicator back to 0 in both panels.
for _row_col, _office_col in _GAP_COLUMN_PAIRS:
    for _panel in _gap_panels:
        _panel[_office_col] = 0

# Step 2: an office is flagged (all of its rows get 1) as soon as any of its
# remaining rows carries a row-level flag equal to 1.
for _row_col, _office_col in _GAP_COLUMN_PAIRS:
    for _panel in _gap_panels:
        _office_has_gap = _panel.groupby('office_id')[_row_col].transform(
            lambda flags: (flags == 1).any()
        )
        _panel.loc[_office_has_gap, _office_col] = 1



* Manually check the result

In [55]:
# Write the full-sample panel, ordered by office and year, to a CSV for manual inspection.
path = r"E:\USA auditor turnover data\result data\office_year_panel_for_check_gap_reset.csv"

df_office_year_check = df_office_year.sort_values(by=['office_id', 'year'])
df_office_year_check.to_csv(path, index=False)

In [56]:
def _offices_flagged_without_gap_rows(panel, row_col, office_col):
    """Return all rows of offices whose office-level flag is 1 although every row-level flag is 0."""
    def _is_inconsistent(office_rows):
        return (office_rows[row_col] == 0).all() and (office_rows[office_col].iloc[0] == 1)

    return panel.groupby('office_id').filter(_is_inconsistent)


# Offices with a wrong `gap_indicator_revelio` / `gap_indicator_aa` (full sample only).
incorrect_revelio = _offices_flagged_without_gap_rows(
    df_office_year, 'gap_indicator_revelio_row', 'gap_indicator_revelio'
)
incorrect_aa = _offices_flagged_without_gap_rows(
    df_office_year, 'gap_indicator_aa_row', 'gap_indicator_aa'
)

# Number of offices affected by each inconsistency.
print(f"Number of incorrect `gap_indicator_revelio`: {len(incorrect_revelio['office_id'].unique())}")
print(f"Number of incorrect `gap_indicator_aa`: {len(incorrect_aa['office_id'].unique())}")

# A few example rows for debugging.
print("Incorrect `gap_indicator_revelio` examples:")
print(incorrect_revelio[['office_id', 'year', 'gap_indicator_revelio_row', 'gap_indicator_revelio']].drop_duplicates().head(10))

print("Incorrect `gap_indicator_aa` examples:")
print(incorrect_aa[['office_id', 'year', 'gap_indicator_aa_row', 'gap_indicator_aa']].drop_duplicates().head(10))


Number of incorrect `gap_indicator_revelio`: 0
Number of incorrect `gap_indicator_aa`: 0
Incorrect `gap_indicator_revelio` examples:
Empty DataFrame
Columns: [office_id, year, gap_indicator_revelio_row, gap_indicator_revelio]
Index: []
Incorrect `gap_indicator_aa` examples:
Empty DataFrame
Columns: [office_id, year, gap_indicator_aa_row, gap_indicator_aa]
Index: []


## 3.1 Build office-level variables

1. Compute LARGE_OFFICE Indicator:

   * Calculate the median office size (using OFFICE_SIZE) for each metro_id and year.
   * Create a new indicator, LARGE_OFFICE, which is set to 1 if an office's OFFICE_SIZE is greater than the computed median, and 0 otherwise.
2. Compute MARKET_SHARE Indicator:

   * For each metro_id and year, sum the OFFICE_SIZE of all offices flagged as Big 4 (BIG4_flag==1) to determine the total number of Big 4 clients (total_big4_clients).
   * Merge this total back into the main DataFrame, filling missing values with 0.
   * Compute MARKET_SHARE as the ratio of an office's OFFICE_SIZE to total_big4_clients, defaulting to 0 if no Big 4 clients are present.
3. Return the Updated DataFrame:

   * The function returns the DataFrame with the newly computed indicators, which are then applied to both full and filtered samples, and the first few rows are printed for verification.


* **Variables definitions:**
  * $\text{LARGE\_OFFICE} = \text{1 if an audit office's size is greater than the median office size in its MSA in that year, and 0 otherwise}$
  * $\text{MARKET\_SHARE} = \cfrac{\text{Total number of audit clients in an office in a year}}{\text{Total number of audit clients for all Big 4 offices in the same MSA in a year}}$

In [57]:
def compute_indicators(df_merge):
    """Add the LARGE_OFFICE and MARKET_SHARE indicators to an office-year panel.

    Four columns are appended, in this order:
      * ``median_office_size``: median ``OFFICE_SIZE`` within the office's
        ``metro_id`` and ``year``.
      * ``LARGE_OFFICE``: 1 if ``OFFICE_SIZE`` is strictly greater than that
        median, else 0 (a missing ``OFFICE_SIZE`` therefore yields 0).
      * ``total_big4_clients``: sum of ``OFFICE_SIZE`` over rows with
        ``BIG4_flag == 1`` in the same ``metro_id`` and ``year``; 0 when there
        is no such row.
      * ``MARKET_SHARE``: ``OFFICE_SIZE / total_big4_clients``, or 0 when
        ``total_big4_clients`` is 0.

    Parameters:
        df_merge: DataFrame with the columns ``metro_id``, ``year``,
            ``OFFICE_SIZE`` and ``BIG4_flag``. It is not modified.

    Returns:
        A new DataFrame holding the input columns plus the four columns above.
    """
    group_keys = ['metro_id', 'year']

    # Work on a copy so the caller's DataFrame does not silently gain columns.
    df_out = df_merge.copy()

    # ---------------------------
    # LARGE_OFFICE Calculation
    # ---------------------------
    df_out['median_office_size'] = df_out.groupby(group_keys)['OFFICE_SIZE'].transform('median')
    df_out['LARGE_OFFICE'] = (df_out['OFFICE_SIZE'] > df_out['median_office_size']).astype(int)

    # ---------------------------
    # MARKET_SHARE Calculation
    # ---------------------------
    # Total OFFICE_SIZE of the Big 4 offices in each metro_id-year.
    big4_totals = (
        df_out.loc[df_out['BIG4_flag'] == 1]
        .groupby(group_keys)['OFFICE_SIZE']
        .sum()
        .reset_index()
        .rename(columns={'OFFICE_SIZE': 'total_big4_clients'})
    )

    # Attach the totals; a metro_id-year without any Big 4 office gets 0.
    df_out = df_out.merge(big4_totals, on=group_keys, how='left')
    df_out['total_big4_clients'] = df_out['total_big4_clients'].fillna(0)

    # Vectorised ratio; rows whose denominator is 0 are set to 0 instead of inf/NaN.
    big4_total = df_out['total_big4_clients']
    df_out['MARKET_SHARE'] = (df_out['OFFICE_SIZE'] / big4_total).where(big4_total != 0, 0)

    return df_out

if __name__ == "__main__":

    # Full sample: add the indicators and preview the first rows.
    df_office_year = compute_indicators(df_office_year)
    print("Merged DataFrame with new indicators (full sample):", df_office_year.head(), sep="\n")

    # Filtered sample: same treatment.
    df_office_year_filtered = compute_indicators(df_office_year_filtered)
    print("Merged DataFrame with new indicators (filtered sample):", df_office_year_filtered.head(), sep="\n")

Merged DataFrame with new indicators (full sample):
     year  office_id                              office_fullname  \
0  2001.0      416.0  Pricewaterhousecoopers Llp Albany, New York   
1  2002.0      416.0  Pricewaterhousecoopers Llp Albany, New York   
2  2003.0      416.0  Pricewaterhousecoopers Llp Albany, New York   
3  2004.0      416.0  Pricewaterhousecoopers Llp Albany, New York   
4  2005.0      416.0  Pricewaterhousecoopers Llp Albany, New York   

   office_key_location  metro_id                metro_area  total_auditors  \
0  1, Albany, New York      76.0  Albany Metropolitan Area             3.0   
1  1, Albany, New York      76.0  Albany Metropolitan Area             1.0   
2  1, Albany, New York      76.0  Albany Metropolitan Area             1.0   
3  1, Albany, New York      76.0  Albany Metropolitan Area             1.0   
4  1, Albany, New York      76.0  Albany Metropolitan Area             1.0   

   flow_in  flow_out  net_flow  ...  OFFICE_GROWTH_NUMBERS  \
0 

In [58]:
# Show the column labels of the full-sample panel at this point of the notebook.
print(df_office_year.columns)

Index(['year', 'office_id', 'office_fullname', 'office_key_location',
       'metro_id', 'metro_area', 'total_auditors', 'flow_in', 'flow_out',
       'net_flow', 'flow_in_rate', 'flow_out_rate', 'net_flow_rate',
       'gap_indicator_revelio_row', 'total_employees', 'flow_in_rate_employee',
       'flow_out_rate_employee', 'net_flow_rate_employee',
       'gap_indicator_revelio', 'state', 'office_city_cleaned',
       'office_avg_salary', 'metro_median_salary', 'above_median_salary',
       'office_count_in_MSA', 'OFFICE_SIZE', 'TOTAL_RESTATEMENT',
       'TOTAL_AUDIT_FEE', 'TOTAL_COMBINED_IC', 'TOTAL_REPORTS',
       'COMBINED_OP_INDICATOR', 'COMBINED_OP_RATE', 'gap_indicator_aa_row',
       'gap_indicator_aa', 'OFFICE_GROWTH_NUMBERS', 'OFFICE_GROWTH_FEES',
       'RESTATE_PERC', 'HIGH_GROWTH', 'BIG4_flag', 'copy_2004_indicator',
       'median_office_size', 'LARGE_OFFICE', 'total_big4_clients',
       'MARKET_SHARE'],
      dtype='object')


* Rename and reorder the columns

In [59]:
### Rename and reorder the columns
def rename_and_reorder_df(df):
    """Rename the Revelio flow columns and move the main variables to the front.

    Parameters:
        df: office-year DataFrame that still carries the original column
            labels (``flow_in_rate``, ``total_auditors``, ...). It is not
            modified.

    Returns:
        A new DataFrame with the renamed labels, the key variables first (in
        the order listed below) and every other column after them in its
        existing order.

    Raises:
        KeyError: if one of the leading columns is missing after renaming.
    """
    column_renames = {
        'flow_in_rate': 'IN_FLOW_RATE',
        'flow_out_rate': 'OUT_FLOW_RATE',
        'net_flow_rate': 'NET_FLOW_RATE',
        'office_city_cleaned': 'city',
        'above_median_salary': 'HIGH_SALARY',
        'office_count_in_MSA': 'MSA_OFFICES',
        'total_auditors': 'HIRED_AUDITORS_NUM',
        'flow_in_rate_employee': 'IN_FLOW_RATE_EMP',
        'flow_out_rate_employee': 'OUT_FLOW_RATE_EMP',
        'net_flow_rate_employee': 'NET_FLOW_RATE_EMP',
    }

    # Columns placed at the beginning of the frame, in this order.
    cols_to_move = [
        'office_key_location',
        'office_fullname',
        'year',
        'HIRED_AUDITORS_NUM',
        'IN_FLOW_RATE',
        'OUT_FLOW_RATE',
        'NET_FLOW_RATE',
        'OFFICE_SIZE',
        'LARGE_OFFICE',
        'MARKET_SHARE',
        'HIGH_SALARY',
        'RESTATE_PERC',
        'MSA_OFFICES',
        'OFFICE_GROWTH_NUMBERS',
        'OFFICE_GROWTH_FEES',
        'HIGH_GROWTH',
        'metro_area',
        'city',
        'state',
    ]

    # rename() without inplace=True returns a new frame, so the caller's
    # DataFrame keeps its original labels.
    renamed = df.rename(columns=column_renames)

    # Everything not listed above keeps its relative order behind the leading columns.
    remaining_cols = [col for col in renamed.columns if col not in cols_to_move]

    return renamed[cols_to_move + remaining_cols]

if __name__ == "__main__":
    # Apply the same renaming and column order to the full and the filtered sample.
    df_office_year, df_office_year_filtered = (
        rename_and_reorder_df(panel)
        for panel in (df_office_year, df_office_year_filtered)
    )


In [60]:
#show(df_office_year)

## 3.2 Build state-level variables
* **Data sources:**
  * GDP: Bureau of Economic Analysis (BEA)
    * https://apps.bea.gov/itable/?ReqID=70&step=1&_gl=1*1acat52*_ga*MTk2NDA1OTAzMi4xNzM5NTI0MzQy*_ga_J4698JNNFT*MTczOTUzMDM1Mi4yLjEuMTczOTUzMDczNy42MC4wLjA.
  * Unemployment rate: Bureau of Labor Statistics (BLS)
    * https://www.bls.gov/lau/rdscnp16.htm
  
* **Variables definitions:**
  
  * $\text{GDP\_GROWTH = The year-over-year change in the state's GDP, expressed as a fraction (e.g. 0.0368 = 3.68\%)}$
  * $\text{UNEMPLOYED = The average annual unemployment rate in the state}$

### 3.2.1 Import and prepare data

In [61]:
import pandas as pd
from pandasgui import show

# 1. Import data
# State GDP comes as a CSV, the state unemployment rate as an Excel workbook.
gdp_path = r"E:\USA auditor turnover data\usa state-level gdp and unemployment rate\State GDP\1997-2023_USA_STATE_GDP.csv"
unemployment_path = r"E:\USA auditor turnover data\usa state-level gdp and unemployment rate\state unemployment rate\1995-2023_USA_ANNUAL_STATE_UNEMPLOYMENT_RATE_RAW.xlsx"

df_gdp, df_unemployment = pd.read_csv(gdp_path), pd.read_excel(unemployment_path)

df_gdp.head()

Unnamed: 0,state,1997,1998,1999,2000,2001,2002,2003,2004,2005,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Alabama,154700.0,160396.4,166531.8,168695.3,168448.3,172430.4,178040.0,189913.7,197246.5,...,206070.0,208950.3,212862.8,216615.5,220808.8,225272.8,222288.8,233726.6,238556.5,245354.7
1,Alaska,41071.0,40263.7,39783.1,38428.1,40014.4,41904.7,41235.0,42880.1,44406.8,...,53303.3,53681.1,53463.9,53550.9,52479.6,52377.5,50332.8,51454.1,50767.1,54059.7
2,Arizona,180293.9,197492.1,214293.9,224729.3,230885.6,238257.1,253743.1,264990.0,283806.2,...,301721.2,308582.8,319008.4,333099.0,346398.3,359576.7,365027.7,395035.9,410228.4,422399.6
3,Arkansas,87180.3,89905.3,94756.9,95509.8,95528.7,98556.6,102948.0,108581.8,112451.1,...,121409.4,121532.5,123034.9,123882.6,126371.2,127220.0,128340.9,137463.6,139560.7,142860.6
4,California,1441226.4,1538612.3,1655438.5,1784320.6,1784567.7,1821509.5,1895287.6,1956745.5,2042141.0,...,2428675.7,2545979.5,2623711.7,2740550.3,2850970.3,2969609.0,2933320.2,3154188.6,3184007.8,3248656.6


In [62]:
# Preview the first rows of the state unemployment table.
df_unemployment.head()

Unnamed: 0,state,year,unemployment_rate
0,Alabama,1976,6.7
1,Alaska,1976,7.6
2,Arizona,1976,9.8
3,Arkansas,1976,6.9
4,California,1976,9.2


### 3.2.1 Build `GDP_GROWTH` variable
* Reshape df_gdp to build panel data
* Calculate the year-over-year percentage change in GDP for each state

In [63]:
### Reshape df_gdp into a state-year panel and derive GDP_GROWTH
# 1. Standardize the state names.
# NOTE(review): str.title() capitalises every word, e.g. "District of Columbia"
# becomes "District Of Columbia" - confirm this matches the state spelling used
# in the office-year panel before merging on it.
df_gdp['state'] = df_gdp['state'].str.title()

# 2-3. Turn the year columns into rows (one row per state-year), cast the year
#      to integer, and order the panel by state and year.
df_gdp_panel = (
    df_gdp.melt(id_vars=['state'], var_name='year', value_name='gdp')
    .astype({'year': int})
    .sort_values(by=['state', 'year'])
    .reset_index(drop=True)
)

# 4. GDP_GROWTH is the change relative to the previous row of the same state,
#    as a fraction; the first year of every state is therefore missing.
df_gdp_panel['GDP_GROWTH'] = df_gdp_panel.groupby('state')['gdp'].pct_change()
df_gdp_panel.head()

Unnamed: 0,state,year,gdp,GDP_GROWTH
0,Alabama,1997,154700.0,
1,Alabama,1998,160396.4,0.036822
2,Alabama,1999,166531.8,0.038251
3,Alabama,2000,168695.3,0.012992
4,Alabama,2001,168448.3,-0.001464


### 3.2.2 Build `UNEMPLOYED` variable
* Sort df_unemployment to build panel data
* Notes:
  * The original data provided by the BLS are already the average annual unemployment rate for each state

In [64]:
# Build the state-year unemployment panel: order by state and year, cast the
# year to integer and rename the rate column to UNEMPLOYED.
df_unemployment_sorted = (
    df_unemployment
    .sort_values(by=['state', 'year'])
    .reset_index(drop=True)
    .astype({'year': int})
    .rename(columns={'unemployment_rate': 'UNEMPLOYED'})
)

# Keep 1997 onwards, then re-sort so the index is consecutive again.
df_unemployment_sorted = (
    df_unemployment_sorted
    .loc[df_unemployment_sorted['year'] >= 1997]
    .sort_values(by=['state', 'year'])
    .reset_index(drop=True)
)

df_unemployment_sorted.head(100)

Unnamed: 0,state,year,UNEMPLOYED
0,Alabama,1997,4.9
1,Alabama,1998,4.5
2,Alabama,1999,4.7
3,Alabama,2000,4.6
4,Alabama,2001,5.2
...,...,...,...
95,Arkansas,2011,7.9
96,Arkansas,2012,7.3
97,Arkansas,2013,7.1
98,Arkansas,2014,5.9


### 3.2.3 Append GDP and unemployment variables into office-year panel, and reorder variables

In [65]:
def merge_dataframe(df_left, df_right):
    """Join two frames on ``state`` and ``year`` and keep the matched rows.

    Both inputs are modified in place: ``state`` is cast to ``str`` and
    ``year`` to ``int`` before joining. The join is done as an outer join so
    that the number of matched and unmatched rows can be printed; only rows
    found in both frames are returned.

    Parameters:
        df_left: DataFrame with ``state`` and ``year`` columns.
        df_right: DataFrame with ``state`` and ``year`` columns.

    Returns:
        DataFrame of the matched rows, without the ``_merge`` indicator.
    """
    join_keys = ['state', 'year']

    # Align the key dtypes on both sides (this writes back into the inputs).
    for key, key_type in (('state', str), ('year', int)):
        df_left[key] = df_left[key].astype(key_type)
        df_right[key] = df_right[key].astype(key_type)

    outer = df_left.merge(df_right, on=join_keys, how='outer', indicator=True)

    # Report how many rows matched / exist only on one side.
    print(outer['_merge'].value_counts())

    matched = outer.loc[outer['_merge'] == 'both']
    return matched.drop(columns=['_merge'])

def reorder_dataframe(df):
    """Return ``df`` with the main analysis variables moved to the front.

    The columns listed below come first, in that order; all other columns
    follow in their existing order. A ``KeyError`` is raised by pandas if any
    listed column is missing from ``df``.
    """
    leading = (
        # identifiers
        'office_key_location', 'office_fullname', 'year',
        # auditor flows
        'HIRED_AUDITORS_NUM',
        'IN_FLOW_RATE', 'OUT_FLOW_RATE', 'NET_FLOW_RATE',
        'IN_FLOW_RATE_EMP', 'OUT_FLOW_RATE_EMP', 'NET_FLOW_RATE_EMP',
        # office-level variables
        'OFFICE_SIZE', 'LARGE_OFFICE', 'MARKET_SHARE', 'HIGH_SALARY',
        'RESTATE_PERC', 'MSA_OFFICES',
        'OFFICE_GROWTH_NUMBERS', 'OFFICE_GROWTH_FEES', 'HIGH_GROWTH',
        # state-level variables
        'UNEMPLOYED', 'GDP_GROWTH',
        'COMBINED_OP_INDICATOR', 'COMBINED_OP_RATE',
        # data-quality indicators
        'gap_indicator_revelio', 'gap_indicator_aa', 'copy_2004_indicator',
        # location
        'metro_area', 'city', 'state',
    )
    leading_lookup = frozenset(leading)

    # Start with the leading columns, then append everything else as it comes.
    ordered = list(leading)
    for col in df.columns:
        if col not in leading_lookup:
            ordered.append(col)

    return df[ordered]
if __name__ == "__main__":

    # Full sample: attach GDP first, then unemployment, then restore the column order.
    print("Start to merge office-year")
    df_office_year = reorder_dataframe(
        merge_dataframe(
            merge_dataframe(df_office_year, df_gdp_panel),
            df_unemployment_sorted,
        )
    )

    # Filtered sample: same sequence.
    print("start to merge office-year filtered")
    df_office_year_filtered = reorder_dataframe(
        merge_dataframe(
            merge_dataframe(df_office_year_filtered, df_gdp_panel),
            df_unemployment_sorted,
        )
    )

Start to merge office-year
_merge
both          6402
right_only     619
left_only      119
Name: count, dtype: int64
_merge
both          6402
right_only     457
left_only        0
Name: count, dtype: int64
start to merge office-year filtered


_merge
both          5497
right_only     644
left_only      104
Name: count, dtype: int64
_merge
both          5497
right_only     482
left_only        0
Name: count, dtype: int64


# 4 Save the office-year panel dataset

In [66]:
# Show the column labels of both panels after the state-level merge and reordering.
print(df_office_year.columns)
print(df_office_year_filtered.columns)

Index(['office_key_location', 'office_fullname', 'year', 'HIRED_AUDITORS_NUM',
       'IN_FLOW_RATE', 'OUT_FLOW_RATE', 'NET_FLOW_RATE', 'IN_FLOW_RATE_EMP',
       'OUT_FLOW_RATE_EMP', 'NET_FLOW_RATE_EMP', 'OFFICE_SIZE', 'LARGE_OFFICE',
       'MARKET_SHARE', 'HIGH_SALARY', 'RESTATE_PERC', 'MSA_OFFICES',
       'OFFICE_GROWTH_NUMBERS', 'OFFICE_GROWTH_FEES', 'HIGH_GROWTH',
       'UNEMPLOYED', 'GDP_GROWTH', 'COMBINED_OP_INDICATOR', 'COMBINED_OP_RATE',
       'gap_indicator_revelio', 'gap_indicator_aa', 'copy_2004_indicator',
       'metro_area', 'city', 'state', 'office_id', 'metro_id', 'flow_in',
       'flow_out', 'net_flow', 'gap_indicator_revelio_row', 'total_employees',
       'office_avg_salary', 'metro_median_salary', 'TOTAL_RESTATEMENT',
       'TOTAL_AUDIT_FEE', 'TOTAL_COMBINED_IC', 'TOTAL_REPORTS',
       'gap_indicator_aa_row', 'BIG4_flag', 'median_office_size',
       'total_big4_clients', 'gdp'],
      dtype='object')
Index(['office_key_location', 'office_fullname', 'year', 

## 4.1 Marking Raw Records in the Dataset
* `rev_raw_office`: set to 1 if an audit office has no discontinued records (i.e., both `gap_indicator_revelio` and `gap_indicator_aa` equal 0); otherwise set to 0.
* `rev_raw_row`: set to 1 if a row is not inferred (i.e., `gap_indicator_revelio_row`, `gap_indicator_aa_row`, and `copy_2004_indicator` are all 0); otherwise set to 0.

In [67]:
# Each flag is 1 only where all of its listed indicator columns equal 0:
#   1. 'rev_raw_office' - office-level gap indicators
#   2. 'rev_raw_row'    - row-level gap indicators plus the 2004-copy marker
_RAW_FLAG_RULES = (
    ('rev_raw_office', ['gap_indicator_revelio', 'gap_indicator_aa']),
    ('rev_raw_row', ['gap_indicator_revelio_row', 'gap_indicator_aa_row', 'copy_2004_indicator']),
)

for _flag_name, _zero_cols in _RAW_FLAG_RULES:
    for _panel in (df_office_year, df_office_year_filtered):
        _panel[_flag_name] = 0  # Initialize the column with 0
        mask = (_panel[_zero_cols] == 0).all(axis=1)
        _panel.loc[mask, _flag_name] = 1

# 3. Reorder the columns
# 3.1 Define the reorder function
def reorder_dataframe(df):
    """Return ``df`` with the analysis variables and raw-record flags first.

    The columns listed below come first, in that order; all other columns
    follow in their existing order. A ``KeyError`` is raised by pandas if any
    listed column is missing from ``df``.
    """
    leading = (
        # identifiers
        'office_key_location', 'office_fullname', 'year',
        # auditor flows
        'HIRED_AUDITORS_NUM',
        'IN_FLOW_RATE', 'OUT_FLOW_RATE', 'NET_FLOW_RATE',
        'IN_FLOW_RATE_EMP', 'OUT_FLOW_RATE_EMP', 'NET_FLOW_RATE_EMP',
        # office-level variables
        'OFFICE_SIZE', 'LARGE_OFFICE', 'MARKET_SHARE', 'HIGH_SALARY',
        'RESTATE_PERC', 'MSA_OFFICES',
        'OFFICE_GROWTH_NUMBERS', 'OFFICE_GROWTH_FEES', 'HIGH_GROWTH',
        # state-level variables
        'UNEMPLOYED', 'GDP_GROWTH',
        'COMBINED_OP_INDICATOR', 'COMBINED_OP_RATE',
        # raw-record flags and the indicators they are derived from
        'rev_raw_office', 'rev_raw_row',
        'gap_indicator_revelio', 'gap_indicator_revelio_row',
        'gap_indicator_aa', 'gap_indicator_aa_row',
        'copy_2004_indicator',
        # location
        'metro_area', 'city', 'state',
    )
    leading_lookup = frozenset(leading)

    # Start with the leading columns, then append everything else as it comes.
    ordered = list(leading)
    for col in df.columns:
        if col not in leading_lookup:
            ordered.append(col)

    return df[ordered]

# 3.2 Apply the reorder function to the full and the filtered sample
df_office_year, df_office_year_filtered = (
    reorder_dataframe(panel)
    for panel in (df_office_year, df_office_year_filtered)
)

In [68]:
#show(df_office_year)

## 4.2 Save the office-year panel data

In [69]:
# Output files for the full-sample and the filtered-sample panel.
save_path1 = r"E:\USA auditor turnover data\result data\office-year panel date_final.csv"
save_path2 = r"E:\USA auditor turnover data\result data\office-year panel date_filtered_final.csv"

# Order each panel by office and year, then write it out without the index.
_sort_keys = ['office_key_location', 'year']

df_office_year = df_office_year.sort_values(by=_sort_keys)
df_office_year.to_csv(save_path1, index=False)

df_office_year_filtered = df_office_year_filtered.sort_values(by=_sort_keys)
df_office_year_filtered.to_csv(save_path2, index=False)