### Creating a single file of all PPP loans up to 150,000 by merging the 12 original files, keeping only rows whose processing method is PPP, and dropping unnecessary columns

In [1]:
#dependencies
import pandas as pd
import os

In [2]:
# Folder that holds the 12 original CSV files and receives the merged output file.
# Set the PPP_SOURCE_FOLDER environment variable to point the notebook at a
# different location without editing code; when it is unset, the original
# hard-coded path is used, so existing runs behave exactly as before.
source_folder = os.environ.get(
    "PPP_SOURCE_FOLDER",
    r'F:\FAU PhD\DISSERTATION\DATABASES\DataTESTS\2023.08.27_Test',
)

In [3]:
# Column names removed from every original file before merging.
# The entries are grouped by the prefix/theme of the column name; the order of
# the list is otherwise irrelevant to the drop operation.
columns_to_drop = [
    # loan identification / approval
    "LoanNumber", "DateApproved", "SBAOfficeCode",
    # borrower address
    "BorrowerAddress", "BorrowerCity", "BorrowerState", "BorrowerZip",
    # loan status and terms
    "LoanStatusDate", "LoanStatus", "Term", "SBAGuarantyPercentage",
    "FranchiseName",
    # servicing lender
    "ServicingLenderLocationID", "ServicingLenderName",
    "ServicingLenderAddress", "ServicingLenderCity",
    "ServicingLenderState", "ServicingLenderZip",
    "RuralUrbanIndicator",
    # project location
    "ProjectCity", "ProjectCountyName", "ProjectState", "ProjectZip", "CD",
    "Race", "Ethnicity",
    # *_PROCEED columns
    "UTILITIES_PROCEED", "PAYROLL_PROCEED", "MORTGAGE_INTEREST_PROCEED",
    "RENT_PROCEED", "REFINANCE_EIDL_PROCEED", "HEALTH_CARE_PROCEED",
    "DEBT_INTEREST_PROCEED",
    "BusinessType",
    # originating lender (OriginatingLender itself is kept)
    "OriginatingLenderLocationID", "OriginatingLenderCity",
    "OriginatingLenderState",
    "Gender", "Veteran", "NonProfit",
]


In [4]:
# Accumulator for the per-file DataFrames (one entry is appended per source
# CSV once it has been filtered and trimmed).
dfs = list()

In [5]:
# Load each of the 12 original CSV files, keep only the PPP rows, and collect
# the results in `dfs` for concatenation in the next step.

# Start from an empty accumulator so that re-running this cell does not append
# the same 12 files a second time (which would duplicate rows in the merge).
dfs.clear()

# Set lookup for the per-column check performed by `usecols` below.
drop_set = set(columns_to_drop)

for i in range(1, 13):
    file_name = f"public_up_to_150k_{i}_230101.csv"
    file_path = os.path.join(source_folder, file_name)

    # Read the CSV file into a DataFrame.
    # - usecols: skip the unwanted columns while parsing instead of loading
    #   them and dropping them afterwards; names in columns_to_drop that are
    #   absent from a file are simply never matched (same effect as the
    #   previous drop(..., errors='ignore')).
    # - low_memory=False: infer each column's dtype from the whole file rather
    #   than chunk by chunk, avoiding mixed-type columns.
    #   NOTE(review): the saved stderr output of this cell looks like the tail
    #   of a pandas DtypeWarning — confirm it no longer appears.
    df = pd.read_csv(
        file_path,
        usecols=lambda column: column not in drop_set,
        low_memory=False,
    )

    # Keep only the rows where 'ProcessingMethod' is 'PPP'
    trimmed_df = df[df['ProcessingMethod'] == 'PPP']

    # Append the filtered, trimmed DataFrame to the list
    dfs.append(trimmed_df)

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
# Stack the per-file DataFrames row-wise into one DataFrame, discarding the
# original per-file row labels in favour of a fresh 0..n-1 index.
merged_df = pd.concat(objs=dfs, axis=0, ignore_index=True)

# Show the first five rows of the merged DataFrame
merged_df.head(5)

Unnamed: 0,ProcessingMethod,BorrowerName,InitialApprovalAmount,CurrentApprovalAmount,UndisbursedAmount,HubzoneIndicator,LMIIndicator,BusinessAgeDescription,JobsReported,NAICSCode,OriginatingLender,ForgivenessAmount,ForgivenessDate
0,PPP,NOT AVAILABLE,148440.0,148440.0,0.0,N,N,New Business or 2 years or less,12.0,339114.0,"Bank of America, National Association",150083.01,06/11/2021
1,PPP,NORTH CHARLESTON HOSPITALITY GROUP LLC,140081.78,140081.78,0.0,N,N,Existing or more than 2 years old,3.0,,Synovus Bank,141920.11,08/25/2021
2,PPP,Q AND J SERVICES LLC,136520.96,136520.96,0.0,Y,Y,Existing or more than 2 years old,170.0,541990.0,Synovus Bank,137747.78,03/29/2021
3,PPP,Exemption 6,130600.0,130600.0,0.0,N,N,Unanswered,14.0,624190.0,,131876.98,04/27/2021
4,PPP,OPTIMIZED PROCESS SOLUTIONS DBA AAA INDUSTRIES,126798.0,126798.0,0.0,Y,N,New Business or 2 years or less,20.0,,"Bank of America, National Association",127836.7,02/25/2021


In [8]:
# Dimensions of the merged DataFrame as (row count, column count)
merged_df.shape

(7935860, 13)

In [7]:
# Write the merged DataFrame to a single CSV file inside the source folder.
# index=False leaves the DataFrame's row index out of the file.
output_file_path = os.path.join(source_folder, "ALL_PPP_up_to_150k.csv")
merged_df.to_csv(path_or_buf=output_file_path, index=False)

# Confirm where the file was written
print(f"Merged file saved at {output_file_path}")

Merged file saved at F:\FAU PhD\DISSERTATION\DATABASES\DataTESTS\2023.08.27_Test\ALL_PPP_up_to_150k.csv
