In [1]:
import pandas as pd 
import pandas_profiling
from pandas_profiling import ProfileReport

In [2]:
# Remote CSV holding the HMDA records used by every aggregation below.
url = "https://github.com/SebastianStoneham/seb/raw/main/lei_3Y4U8VZURTYWI1W2K376-QOT5WN9RBKQTFRVKEV31_state_CA.csv"
# Read the whole file into a single DataFrame; later cells group it by lei and census_tract.
hmda_df = pd.read_csv(filepath_or_buffer=url)

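### Aggregate median family income (FFIEC)

Aggregates the FFIEC median family income (`ffiec_msa_md_median_family_income`) by grouping on the `lei` and `census_tract` columns. It calculates the sum, mean and median of this value for each group.
- `ffiec_msa_md_median_family_income`: FFIEC median family income in dollars for the MSA/MD in which the tract is located (adjusted annually by FFIEC). Note that this is an area-level figure, not the income reported by the individual applicant.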

In [3]:
# Sum, mean and median of the FFIEC MSA/MD median family income
# for every (lei, census_tract) pair.
income_agg = (
    hmda_df
    .groupby(['lei', 'census_tract'])['ffiec_msa_md_median_family_income']
    .agg(income_sum='sum', income_mean='mean', income_median='median')
)
print(income_agg)

                                   income_sum  income_mean  income_median
lei                  census_tract                                        
3Y4U8VZURTYWI1W2K376 6001400100        125600     125600.0       125600.0
                     6001400700        125600     125600.0       125600.0
                     6001401000        125600     125600.0       125600.0
                     6001403800        125600     125600.0       125600.0
                     6001403900        125600     125600.0       125600.0
...                                       ...          ...            ...
QOT5WN9RBKQTFRVKEV31 6115040400         66800      66800.0        66800.0
                     6115040500         66800      66800.0        66800.0
                     6115040600        200400      66800.0        66800.0
                     6115040700        334000      66800.0        66800.0
                     6115041100         66800      66800.0        66800.0

[5839 rows x 3 columns]


### Aggregate applicant_ethnicity
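Aggregates the applicant ethnicity data (`applicant_ethnicity-1`) by grouping on the `lei` and `census_tract` columns. For each group it calculates the percentage of applicants that fall in each ethnicity category (for example code 2, "Not Hispanic or Latino").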


In [13]:
# Share (in percent) of applications per (lei, census_tract) that carry each
# applicant_ethnicity-1 code. HMDA code values:
#    1 Hispanic or Latino        11 Mexican         12 Puerto Rican
#   13 Cuban                     14 Other Hispanic or Latino
#    2 Not Hispanic or Latino     3 Information not provided
#    4 Not applicable
# Rows with a missing ethnicity compare unequal to every code, so they count
# in the denominator but in none of the numerators.
ethnicity_agg = (
    hmda_df
    .groupby(['lei', 'census_tract'])['applicant_ethnicity-1']
    .agg(
        hispanic_percent=lambda x: (x == 1).mean() * 100,
        mexican_percent=lambda x: (x == 11).mean() * 100,
        puerto_percent=lambda x: (x == 12).mean() * 100,
        cuban_percent=lambda x: (x == 13).mean() * 100,
        otherhisp_percent=lambda x: (x == 14).mean() * 100,
        nothisp_percent=lambda x: (x == 2).mean() * 100,
        noinfo_percent=lambda x: (x == 3).mean() * 100,
        # "Not applicable" is code 4; comparing against 2 here would simply
        # repeat nothisp_percent.
        NA_percent=lambda x: (x == 4).mean() * 100,
    )
)
print(ethnicity_agg)

                                   hispanic_percent  mexican_percent  \
lei                  census_tract                                      
3Y4U8VZURTYWI1W2K376 6001400100                 0.0              0.0   
                     6001400700                 0.0              0.0   
                     6001401000                 0.0              0.0   
                     6001403800                 0.0              0.0   
                     6001403900                 0.0              0.0   
...                                             ...              ...   
QOT5WN9RBKQTFRVKEV31 6115040400                 0.0              0.0   
                     6115040500                 0.0            100.0   
                     6115040600                 0.0              0.0   
                     6115040700                 0.0              0.0   
                     6115041100                 0.0              0.0   

                                   puerto_percent  cuban_percen

### Aggregate property_type
Aggregates the property type data by grouping on the `lei` and `census_tract` columns. For each group it calculates the percentage of properties of each type (1, 2 and 3).
- Descriptions: manufactured_home_secured_property_type
1. Manufactured home and land
2. Manufactured home and not land
3. Not applicable


In [11]:
# Percentage of records per (lei, census_tract) for each
# manufactured_home_secured_property_type code:
#   1 = manufactured home and land, 2 = manufactured home and not land,
#   3 = not applicable.
property_type = (
    hmda_df
    .groupby(['lei', 'census_tract'])['manufactured_home_secured_property_type']
    .agg(
        property_percent1=lambda codes: (codes == 1).mean() * 100,
        property_percent2=lambda codes: (codes == 2).mean() * 100,
        property_percent3=lambda codes: (codes == 3).mean() * 100,
    )
)
print(property_type)

                                   property_percent1  property_percent2  \
lei                  census_tract                                         
3Y4U8VZURTYWI1W2K376 6001400100                  0.0                0.0   
                     6001400700                  0.0                0.0   
                     6001401000                  0.0                0.0   
                     6001403800                  0.0                0.0   
                     6001403900                  0.0                0.0   
...                                              ...                ...   
QOT5WN9RBKQTFRVKEV31 6115040400                  0.0                0.0   
                     6115040500                  0.0                0.0   
                     6115040600                  0.0                0.0   
                     6115040700                  0.0                0.0   
                     6115041100                  0.0                0.0   

                        

### Aggregate loan_purpose
Aggregates the loan purpose data by grouping on the `lei` and `census_tract` columns. It calculates the percentage of loans that are for purpose 1 (home purchase).
- Descriptions: loan_purpose
  - 1: Home purchase
  - 2: Home improvement
  - 31: Refinancing
  - 32: Cash-out refinancing
  - 4: Other purpose
  - 5: Not applicable

In [8]:
# Percentage of loans per (lei, census_tract) whose loan_purpose is 1 (home purchase).
purpose_agg = (
    hmda_df
    .groupby(['lei', 'census_tract'])['loan_purpose']
    .agg(purpose_percent=lambda purposes: (purposes == 1).mean() * 100)
)
print(purpose_agg)

                                   purpose_percent
lei                  census_tract                 
3Y4U8VZURTYWI1W2K376 6001400100              100.0
                     6001400700                0.0
                     6001401000              100.0
                     6001403800                0.0
                     6001403900                0.0
...                                            ...
QOT5WN9RBKQTFRVKEV31 6115040400              100.0
                     6115040500              100.0
                     6115040600              100.0
                     6115040700               20.0
                     6115041100                0.0

[5839 rows x 1 columns]
