# Build office-level variables from the Audit Analytics dataset
* Jie Xia, SUSTech
* 2025-02-10

# 1. Cleaning data
* Drop rows that have missing values
* Standardize city names

## 1.1 Drop NaN and standardize city expression

In [None]:
import pandas as pd
from pandasgui import show
import re

### Cleaning Data

# 1. Load the Audit Analytics (AA) extract from its Stata file
input_path = r"e:\USA auditor turnover data\Audit Analytics data\AuditAnalytics_us_cleaned_for_python.dta"
df_aa = pd.read_stata(input_path)

# Keep only complete rows (a missing value in ANY column removes the row)
# and report how many rows were lost.
original_length = len(df_aa)
df_aa = df_aa.dropna()
dropped_rows = original_length - len(df_aa)
print(f"{dropped_rows} rows have been dropped due to missing values")

# Standardize city names: a bare "Washington" (ignoring case and surrounding
# whitespace) becomes "Washington DC" in both the auditor-city and the
# business-city column.
for city_col in ('AUD_CITY', 'BUS_CITY_TITLE'):
    mask = df_aa[city_col].str.strip().str.lower() == 'washington'
    df_aa.loc[mask, city_col] = 'Washington DC'

def clean_text(text):
    """
    Normalize a text value: punctuation becomes spaces, whitespace runs collapse.

    Missing values (anything ``pd.isna`` reports as missing) are returned
    unchanged. Otherwise every character that is neither a word character nor
    whitespace is replaced by a space, runs of whitespace are squeezed into a
    single space, and leading/trailing whitespace is removed.
    """
    if pd.isna(text):
        return text
    # Underscores survive this step: the regex class \w treats "_" as a word character.
    without_punctuation = re.sub(r'[^\w\s]', ' ', text.strip())
    return re.sub(r'\s+', ' ', without_punctuation.strip()).strip()

# Clean every auditor city name element by element with clean_text
df_aa['AUD_CITY'] = df_aa['AUD_CITY'].map(clean_text)


In [None]:
# List the column names of the cleaned AA DataFrame for a quick visual check.
print(df_aa.columns)

## 1.2 Generate indicator and drop duplicates
1. Create Office-City Identifier:

   * Convert key numeric columns to integer.
   * Combine AUDITOR_FKEY, AUD_CITY, and AUD_STATE_NAME (all title-cased) into a single identifier office_key_location.
  
2. Reorder Columns:

   * Move important columns (e.g., office_key_location, OP_AUD_NAME, etc.) to the beginning of the DataFrame.
  
3. Sort Data:

   * Sort observations by office_key_location, year (FY_IC_OP), and client (COMPANY_FKEY).

4. Identify BIG 4 Firms:

   * Create an indicator (BIG4_flag) that marks a record as 1 if the office name contains any of the BIG 4 firm names.
  
5. Remove Duplicate Client Records:

   * Within each office and year, sort by IS_NTH_RESTATE (in descending order) and drop duplicates for the same client.
   * This ensures that if multiple records exist, the one with a non-zero restatement is retained (if available).
  
6. Resort the Cleaned DataFrame:

   * Finally, re-sort the cleaned DataFrame by office_key_location, FY_IC_OP, and COMPANY_FKEY.

In [None]:
# 2. Create office-city level identifier 'office_key_location' (combination of AUDITOR_FKEY, city, state)
# Cast the key numeric columns to integers first.
for int_col in ('AUDITOR_FKEY', 'FY_IC_OP', 'IS_NTH_RESTATE', 'AUDIT_FEES'):
    df_aa[int_col] = df_aa[int_col].astype(int)

# The identifier reads "<AUDITOR_FKEY>, <City>, <State>"; every part is title-cased
# and the parts are joined with ", ".
auditor_part = df_aa['AUDITOR_FKEY'].astype(str).str.title()
city_part = df_aa['AUD_CITY'].str.title()
state_part = df_aa['AUD_STATE_NAME'].str.title()
df_aa['office_key_location'] = auditor_part + ', ' + city_part + ', ' + state_part

# 3. Reorder the columns: the columns listed here lead, all other columns
#    follow in their existing relative order.
cols_to_move = [
    'office_key_location',
    "OP_AUD_NAME",
    "FY_IC_OP",
    "NAME",
    "IS_NTH_RESTATE",
    "AUDIT_FEES",
]
remaining_cols = [col for col in df_aa.columns if col not in cols_to_move]
df_aa = df_aa[[*cols_to_move, *remaining_cols]]

# 4. Sort observations by office_key_location, year (FY_IC_OP), and client (COMPANY_FKEY)
df_aa = df_aa.sort_values(by=['office_key_location', 'FY_IC_OP', 'COMPANY_FKEY'])

# 5. Build an indicator for BIG 4 accounting firms
big4_names = ["PricewaterhouseCoopers", "Ernst & Young", "Deloitte & Touche", "KPMG"]

def is_big4(office_name):
    """
    Flag whether an auditor name belongs to a BIG 4 firm.

    Returns 1 when any entry of ``big4_names`` occurs in ``office_name`` as a
    case-insensitive substring; returns 0 otherwise, including for missing values.
    """
    if pd.isna(office_name):
        return 0
    lowered = office_name.lower()
    return int(any(firm.lower() in lowered for firm in big4_names))

df_aa['BIG4_flag'] = df_aa['OP_AUD_NAME'].apply(is_big4)

# 6. Keep one record per office / year / client.
#    Rows are first ordered by IS_NTH_RESTATE from high to low, so when a client
#    appears several times within the same office and year, the first row that
#    drop_duplicates keeps is one with the highest restatement value
#    (i.e. a non-zero restatement wins whenever one exists).
client_year_key = ['office_key_location', 'FY_IC_OP', 'COMPANY_FKEY']
df_aa = df_aa.sort_values(by='IS_NTH_RESTATE', ascending=False)

# 7. De-duplicate, then restore the office / year / client ordering
df_aa_clean = (
    df_aa
    .drop_duplicates(subset=client_year_key)
    .sort_values(by=client_year_key)
)


## 1.3 Check the AA dataset

### 1.3.1 Describe BIG4 distribution in dataset

In [None]:
# Count offices whose records carry BIG4_flag == 1 versus all offices
# in the de-duplicated data.
big4_df = df_aa_clean[df_aa_clean['BIG4_flag'] == 1]

big4_number = big4_df['office_key_location'].nunique()
total_number = df_aa_clean['office_key_location'].nunique()

# The share is formatted as a real percentage: printing the raw ratio
# (e.g. 0.25) next to a "%" sign would understate it by a factor of 100.
# An empty dataset yields NaN instead of a ZeroDivisionError.
big4_share = big4_number / total_number if total_number else float('nan')

print(f"{big4_number} offices are big4\n"
      f"{total_number - big4_number} offices are non-big4\n"
      f"{big4_share:.2%} of offices are big4")

### 1.3.2 Check change of COMBINED_IC_OP in office-client-year

1. Convert Year Column:

   * Ensure the FY_IC_OP column (representing the year) is of integer type.
2. Define Window Check Function:

   * The function check_window is applied to each group (grouped by office_key_location and COMPANY_FKEY).
   * For each record within a group, it looks at a window of years (current year ± window) and checks if COMBINED_IC_OP remains consistent (i.e., has only one unique value).
   * A new flag column (e.g., flag_2_year, flag_3_year, or flag_5_year) is created, where 1 indicates consistency and 0 indicates inconsistency.
3. Apply Window Check for Various Windows:

   * The check_window function is applied for window lengths of 2, 3, and 5 years using a groupby-apply operation.
4. Assess Overall Client Consistency:

   * The function client_consistency aggregates the window flag for each group (client within an office) over the entire sample period.
   * If the flag is 1 for all records within the group for a given window, the overall flag is set to 1; otherwise, it is 0.
5. Compute and Display Client Consistency:

   * For each window (2, 3, 5 years), the overall consistency is computed for each client group.
   * The code then prints out the counts of clients that maintained consistency within each window.

In [None]:
def reorder_dataframe(df):
    """
    Return ``df`` with the office / client / year / opinion columns first.

    The four key columns are placed at the front in a fixed order; every other
    column follows in its existing relative order. The input is not modified.
    """
    leading = ['office_key_location', 'COMPANY_FKEY', 'FY_IC_OP', 'COMBINED_IC_OP']
    trailing = [name for name in df.columns if name not in leading]
    return df[[*leading, *trailing]]

# Order the rows by office, client, then year, and bring the key columns to the front.
# NOTE(review): this starts from df_aa (before duplicate removal), not df_aa_clean — confirm that is intended.
df_check_aa = reorder_dataframe(
    df_aa.sort_values(by=['office_key_location', 'COMPANY_FKEY', 'FY_IC_OP'])
)


In [None]:
import pandas as pd

# FY_IC_OP is the year column; make sure it is stored as an integer
df_check_aa = df_check_aa.assign(FY_IC_OP=df_check_aa['FY_IC_OP'].astype(int))

def check_window(group, window):
    """
    Flag, row by row, whether COMBINED_IC_OP is stable around each year.

    ``group`` holds the records of one audit office-client combination. For
    each record, all records of the group whose FY_IC_OP lies within
    (that record's year ± ``window``), bounds included, are inspected. The
    record gets 1 when those records share exactly one distinct COMBINED_IC_OP
    value, otherwise 0.

    Returns a copy of ``group`` sorted by FY_IC_OP with the result stored in a
    column named ``flag_<window>_year`` (e.g. 'flag_2_year').
    """
    ordered = group.sort_values('FY_IC_OP').copy()
    years = ordered['FY_IC_OP']
    opinions = ordered['COMBINED_IC_OP']

    flags = []
    for year in years:
        in_window = (years >= year - window) & (years <= year + window)
        flags.append(1 if opinions[in_window].nunique() == 1 else 0)

    ordered[f'flag_{window}_year'] = flags
    return ordered

# Apply the window check for different window lengths (2, 3, and 5 years).
# The groups are iterated explicitly instead of going through
# groupby(...).apply(...): every pass groups on 'office_key_location' and
# 'COMPANY_FKEY' again, so both columns must survive in the result.
# Passing the grouping columns to the applied function is deprecated in
# DataFrameGroupBy.apply since pandas 2.2, whereas iterating a groupby always
# yields the complete group, key columns included.
for window in [2, 3, 5]:
    df_check_aa = pd.concat(
        [
            check_window(client_rows, window)
            for _, client_rows in df_check_aa.groupby(
                ['office_key_location', 'COMPANY_FKEY']
            )
        ]
    )

def client_consistency(group, window):
    """
    Collapse the per-record window flags of one office-client group into one value.

    Reads the column ``flag_<window>_year`` of ``group``. The overall flag is 1
    when the smallest value in that column equals 1 (i.e. every record is
    flagged 1), otherwise 0.

    Returns a one-element Series keyed by the flag column name.
    """
    flag_column = f'flag_{window}_year'
    lowest_flag = group[flag_column].min()
    return pd.Series({flag_column: int(lowest_flag == 1)})

# Compute overall client consistency for each window length:
# one DataFrame per window, keyed by the name of its flag column.
client_flags = {
    f'flag_{span}_year': df_check_aa.groupby(
        ['office_key_location', 'COMPANY_FKEY']
    ).apply(lambda g, span=span: client_consistency(g, span)).reset_index()
    for span in [2, 3, 5]
}

# Display the results: number of clients with consistent COMBINED_IC_OP within each window
for window, (flag_column, per_client) in zip([2, 3, 5], client_flags.items()):
    print(f"Number of clients with consistent COMBINED_IC_OP within a ±{window}-year window:")
    counts = per_client[flag_column].value_counts()
    print(counts)


* Group the data by office_key_location and COMPANY_FKEY, flag each group as 1 if its COMBINED_IC_OP values are consistent (or 0 otherwise)

In [None]:
# For every office-client pair, count the distinct COMBINED_IC_OP values it shows
# over all of its records; flag = 1 when there is exactly one, otherwise 0.
distinct_opinions = df_check_aa.groupby(
    ['office_key_location', 'COMPANY_FKEY']
)['COMBINED_IC_OP'].nunique()
client_flags = distinct_opinions.eq(1).astype('int64').reset_index(name='flag')

# Output the flag results for the first few groups
print(client_flags.head())

# Count the number of clients with each flag (1 and 0)
flag_counts = client_flags['flag'].value_counts()
print("Number of clients with flag 1 and 0:")
print(flag_counts)

# Print the ratio 1431/10845 (as an example)
# NOTE(review): both numbers are hard-coded, presumably copied from an earlier
# run's flag_counts — confirm they still match the current data.
print(1431 / 10845)

# 2. Build office-level variables panel data

* Variables definitions:
  * **Office Size variables**
  
    * $\text{OFFICE\_SIZE} = \text{Number of audit clients in an audit office in a year}$
  
    * $\text{LARGE\_OFFICE} = \text{1 if an audit office’s size is greater than the sample median in an MSA, 0 otherwise}$
  
    * $\text{MARKET\_RATE} = \cfrac{\text{Total number of audit clients in an office in a year}}{\text{Total number of audit clients for all Big 4 offices in the MSA in a year}}$
  
    * $\text{OFFICE\_GROWTH\_NUMBERS} = \text{The percentage change in the number of audit clients in an audit office from year}_{t-1} \text{ to year}_{t}$
  
  * **Restatement variables:**
  
    * $\text{RESTATE\_PERC} = \cfrac{\text{Number of restatement announcements for clients}}{\text{Total number of audit clients}}\  \text{in office-year}$
  
  * **Audit Fees variables:**
  
    * $\text{OFFICE\_GROWTH\_FEES} = \text{The percentage change in total audit fees in an audit office from year}_{t-1} \text{ to year}_{t}$

    * $\text{HIGH\_GROWTH} = \text{1 if an office’s growth in audit fees is greater than the sample median in a year, 0 otherwise}$

  * **Combined internal control variables:**

    * $\text{COMBINED\_OP\_INDICATOR} = \text{1 if an audit office published any financial statement report integrated with an internal control opinion in a year, 0 otherwise}$

    * $\text{COMBINED\_OP\_RATE} = \cfrac{\text{Total number of financial statement reports with internal control opinion published by an audit office in a year}}{\text{Total number of financial statement report published by an audit office in a year}}$ 
  
  * **Gap indicator variables:**
    * $\text{gap\_indicator\_aa} = \text{1 for every row of an audit office that has at least one missing year in its record, 0 otherwise}$
    * $\text{gap\_indicator\_aa\_row} = \text{1 for the specific gap-filled rows, 0 otherwise}$

## 2.1 Calculate the variables
1. Aggregate Office-Year Metrics:

   * Group data by office_key_location and FY_IC_OP to compute:
     *  the number of unique clients (OFFICE_SIZE)
     *  total restatement (TOTAL_RESTATEMENT)
     *  total audit fees (TOTAL_AUDIT_FEE)
     *  the ratio of reports with internal control opinion to total reports(COMBINED_OP_RATE)
     *  an indicator set to 1 if the office published any report with an internal control opinion, otherwise 0 (COMBINED_OP_INDICATOR)
2. Fill Missing Years and Create Gap Indicator(`gap_indicator_aa`/`gap_indicator_aa_row`):

   * For each office, generate a complete sequence of years between its minimum and maximum year, merge with the aggregated data (filling missing metrics with 0), and create a `gap_indicator_aa` if any years are missing(i.e., total client == 0 in any year).
   * For an office that has ever experienced a discontinuous client history (i.e., total client == 0 in any year), mark its specific gap‐filled rows with 1; otherwise, 0, using `gap_indicator_aa_row`.
3. Compute Growth Rates and Restatement Percentage:

   * Calculate growth rates for both OFFICE_SIZE (as OFFICE_GROWTH_NUMBERS) and TOTAL_AUDIT_FEE (as OFFICE_GROWTH_FEES) between consecutive years, and compute RESTATE_PERC as the ratio of TOTAL_RESTATEMENT to OFFICE_SIZE when applicable.
4. Determine High Growth Offices:

   * For each year, compute the median growth rate of TOTAL_AUDIT_FEE and flag an office as HIGH_GROWTH if its growth rate exceeds the median.
5. Merge BIG4 Flag:

   * Integrate the office-level BIG4_flag from the original cleaned dataset into the aggregated DataFrame.

In [None]:
import numpy as np

# -------------------------------
# 1. Office-year aggregates, grouped on office_key_location and FY_IC_OP:
#    - OFFICE_SIZE: number of distinct COMPANY_FKEY values (clients).
#    - TOTAL_RESTATEMENT: sum of IS_NTH_RESTATE.
#    - TOTAL_AUDIT_FEE: sum of AUDIT_FEES.
#    - TOTAL_COMBINED_IC: sum of COMBINED_IC_OP (reports with an internal control opinion).
#    - TOTAL_REPORTS: count of COMBINED_IC_OP values (assuming each row is one report).
# -------------------------------
office_year_keys = ['office_key_location', 'FY_IC_OP']
agg = (
    df_aa_clean
    .groupby(office_year_keys)
    .agg(
        OFFICE_SIZE=('COMPANY_FKEY', 'nunique'),
        TOTAL_RESTATEMENT=('IS_NTH_RESTATE', 'sum'),
        TOTAL_AUDIT_FEE=('AUDIT_FEES', 'sum'),
        TOTAL_COMBINED_IC=('COMBINED_IC_OP', 'sum'),
        TOTAL_REPORTS=('COMBINED_IC_OP', 'count'),
    )
    .reset_index()
)

# -------------------------------
# 1.1 Combined internal control indicators:
#     - COMBINED_OP_INDICATOR: 1 when TOTAL_COMBINED_IC > 0, otherwise 0.
#     - COMBINED_OP_RATE: TOTAL_COMBINED_IC / TOTAL_REPORTS, NaN when there are no reports.
# -------------------------------
has_ic_opinion = agg['TOTAL_COMBINED_IC'] > 0
agg['COMBINED_OP_INDICATOR'] = np.where(has_ic_opinion, 1, 0)

has_reports = agg['TOTAL_REPORTS'] > 0
agg['COMBINED_OP_RATE'] = np.where(
    has_reports,
    agg['TOTAL_COMBINED_IC'] / agg['TOTAL_REPORTS'],
    np.nan,
)

# -------------------------------
# 2. Fill in missing years per office and create the gap indicators:
#    every office gets one row for each year between its first and last year;
#    years without data receive 0 for all metric columns and
#    gap_indicator_aa_row = 1. Offices with fewer actual years than
#    (max - min + 1) get gap_indicator_aa = 1 on all of their rows.
# -------------------------------
# 2.1 First and last year of each office
office_years = (
    agg.groupby('office_key_location')['FY_IC_OP']
    .agg(['min', 'max'])
    .reset_index()
)

# 2.2 One (office, year) record for every year in the office's range, bounds included
full_rows = [
    {'office_key_location': office, 'FY_IC_OP': year}
    for office, first_year, last_year in zip(
        office_years['office_key_location'], office_years['min'], office_years['max']
    )
    for year in range(int(first_year), int(last_year) + 1)
]
full_df = pd.DataFrame(full_rows)

# 2.3 Attach the aggregated metrics. Rows that exist only in the complete
#     year grid ('left_only') are the gap-filled ones.
agg_full = full_df.merge(agg, on=office_year_keys, how='left', indicator='merge_flag')
is_gap_row = agg_full['merge_flag'] == 'left_only'
agg_full['gap_indicator_aa_row'] = np.where(is_gap_row, 1, 0)
agg_full = agg_full.drop(columns=['merge_flag'])

metric_cols = [
    'OFFICE_SIZE', 'TOTAL_RESTATEMENT', 'TOTAL_AUDIT_FEE',
    'TOTAL_COMBINED_IC', 'TOTAL_REPORTS', 'COMBINED_OP_INDICATOR',
    'COMBINED_OP_RATE',
]
agg_full[metric_cols] = agg_full[metric_cols].fillna(0)

# 2.4 Office-level gap indicator: 1 when the office has fewer observed years
#     than the length of its year range, otherwise 0.
office_years['expected_years'] = office_years['max'] - office_years['min'] + 1
actual_counts = (
    agg.groupby('office_key_location')
    .size()
    .reset_index(name='actual_years')
)
office_years = office_years.merge(actual_counts, on='office_key_location', how='left')
has_gap = office_years['actual_years'] < office_years['expected_years']
office_years['gap_indicator_aa'] = np.where(has_gap, 1, 0)
agg_full = agg_full.merge(
    office_years[['office_key_location', 'gap_indicator_aa']],
    on='office_key_location',
    how='left',
)
# -------------------------------
# 3. Calculate Derived Metrics: Growth Rates and Ratio Indicators
# -------------------------------
def compute_growth(df, col, new_col):
    """
    Add the year-over-year growth rate of ``col`` to one office's rows.

    The rows are sorted by FY_IC_OP and each row is compared with the row
    before it. Growth is (current - previous) / previous, and is NaN when the
    previous row is not exactly one year earlier or its value is 0. The first
    row therefore always gets NaN.

    Returns a sorted copy of ``df`` with the growth rate stored in ``new_col``.
    """
    ordered = df.sort_values('FY_IC_OP').copy()
    previous_year = ordered['FY_IC_OP'].shift(1)
    previous_value = ordered[col].shift(1)

    is_consecutive = (ordered['FY_IC_OP'] - previous_year) == 1
    has_nonzero_base = previous_value != 0

    ordered[new_col] = np.where(
        is_consecutive & has_nonzero_base,
        (ordered[col] - previous_value) / previous_value,
        np.nan,
    )
    return ordered

# Calculate the growth rates of OFFICE_SIZE and TOTAL_AUDIT_FEE office by office.
# The groups are iterated explicitly instead of going through
# groupby(...).apply(...): 'office_key_location' must stay in the result as a
# column, and passing the grouping column to the applied function is deprecated
# in DataFrameGroupBy.apply since pandas 2.2. Iterating a groupby always yields
# the complete group, key column included.
# Groups come out in sorted key order and compute_growth sorts each one by
# year, so the result stays ordered by office and year with a fresh 0..n-1 index.
agg_full = agg_full.sort_values(['office_key_location', 'FY_IC_OP'])
for source_col, growth_col in (
    ('OFFICE_SIZE', 'OFFICE_GROWTH_NUMBERS'),
    ('TOTAL_AUDIT_FEE', 'OFFICE_GROWTH_FEES'),
):
    agg_full = pd.concat(
        [
            compute_growth(office_rows, source_col, growth_col)
            for _, office_rows in agg_full.groupby('office_key_location')
        ],
        ignore_index=True,
    )

# Calculate RESTATE_PERC: TOTAL_RESTATEMENT / OFFICE_SIZE where OFFICE_SIZE > 0, NaN elsewhere.
has_clients = agg_full['OFFICE_SIZE'] > 0
agg_full['RESTATE_PERC'] = np.where(
    has_clients,
    agg_full['TOTAL_RESTATEMENT'] / agg_full['OFFICE_SIZE'],
    np.nan,
)

# -------------------------------
# 4. HIGH_GROWTH indicator:
#    Per year, take the median of OFFICE_GROWTH_FEES across all offices.
#    An office-year is HIGH_GROWTH = 1 when its growth rate is present and
#    strictly above that median; otherwise 0.
# -------------------------------
median_growth = (
    agg_full.groupby('FY_IC_OP')['OFFICE_GROWTH_FEES']
    .median()
    .rename('MEDIAN_GROWTH_FEES')
    .reset_index()
)
agg_full = agg_full.merge(median_growth, on='FY_IC_OP', how='left')

has_growth = agg_full['OFFICE_GROWTH_FEES'].notna()
above_median = agg_full['OFFICE_GROWTH_FEES'] > agg_full['MEDIAN_GROWTH_FEES']
agg_full['HIGH_GROWTH'] = np.where(has_growth & above_median, 1, 0)

# The yearly median was only needed for the comparison
agg_full = agg_full.drop(columns=['MEDIAN_GROWTH_FEES'])

# -------------------------------
# 5. Merge BIG4_flag (office-level indicator):
#    Take the first BIG4_flag seen for each office_key_location in the
#    de-duplicated data and attach it to every row of that office.
# -------------------------------
big4_offices = (
    df_aa_clean[['office_key_location', 'BIG4_flag']]
    .drop_duplicates(subset=['office_key_location'])
)
agg_full = agg_full.merge(big4_offices, on='office_key_location', how='left')

# The final DataFrame agg_full now contains the following key variables:
# office_key_location, FY_IC_OP, OFFICE_SIZE, TOTAL_RESTATEMENT, TOTAL_AUDIT_FEE, gap_indicator_aa,
# OFFICE_GROWTH_NUMBERS, OFFICE_GROWTH_FEES, RESTATE_PERC, HIGH_GROWTH, BIG4_flag,
# COMBINED_OP_INDICATOR, COMBINED_OP_RATE, and gap_indicator_aa_row.


In [None]:
# Show the result
# show(agg_full)

## 2.2 Manually check the result by office in a year

In [None]:
from pandasgui import show

def show_office_by_year(df, office_id, year):
    """
    Display the rows of ``df`` that belong to one office in one year.

    Rows are kept when FY_IC_OP equals ``year`` and office_key_location equals
    ``office_id``; the selection is passed to pandasgui's ``show``.
    """
    in_year = df['FY_IC_OP'] == year
    in_office = df['office_key_location'] == office_id
    show(df[in_year & in_office])

# show_office_by_year(df_aa_clean, "1, Dallas, Texas", 2006)

# 3. Save the constructed AA panel data

In [None]:
# Destination CSV file for the office-year panel held in agg_full.
output_path = r"E:\USA auditor turnover data\Audit Analytics data\office-year us auditor variables from AA data.csv"

# The two prints bracket the write so progress is visible while the file is saved.
print("Start to save csv")
# index=False: the DataFrame index is not written as a column of the file.
agg_full.to_csv(output_path, index=False)
print("Csv file is saved")