In [6]:
import pandas as pd 
import pandas_profiling
from pandas_profiling import ProfileReport

In [2]:
url = "https://github.com/SebastianStoneham/seb/raw/main/lei_3Y4U8VZURTYWI1W2K376-QOT5WN9RBKQTFRVKEV31_state_CA.csv"
hmda_df = pd.read_csv(url)  

### applicant income variable

aggregates the applicant income data by grouping on the lei and census_tract columns. It calculates the sum, mean and median of the applicant income.
- FFIEC Median family income in dollars for the MSA/MD in which the tract is located (adjusted annually by FFIEC)

In [10]:
income_agg = hmda_df.groupby(['lei', 'census_tract'])['ffiec_msa_md_median_family_income'].agg(
income_sum='sum',
income_mean='mean',
income_median='median'
)
print(income_agg)

                                   income_sum  income_mean  income_median
lei                  census_tract                                        
3Y4U8VZURTYWI1W2K376 6001400100        125600     125600.0       125600.0
                     6001400700        125600     125600.0       125600.0
                     6001401000        125600     125600.0       125600.0
                     6001403800        125600     125600.0       125600.0
                     6001403900        125600     125600.0       125600.0
...                                       ...          ...            ...
QOT5WN9RBKQTFRVKEV31 6115040400         66800      66800.0        66800.0
                     6115040500         66800      66800.0        66800.0
                     6115040600        200400      66800.0        66800.0
                     6115040700        334000      66800.0        66800.0
                     6115041100         66800      66800.0        66800.0

[5839 rows x 3 columns]


### Aggregate applicant_ethnicity
aggregates the applicant ethnicity data by grouping on the lei and census_tract columns. It calculates the percentage of applicants who are of ethnicity 2.


In [17]:
ethnicity_agg = hmda_df.groupby(['lei', 'census_tract'])['applicant_ethnicity-1'].agg(
ethnicity_percent=lambda x: (x == 2).mean() * 100
)
print(ethnicity_agg)

                                   ethnicity_percent
lei                  census_tract                   
3Y4U8VZURTYWI1W2K376 6001400100             0.000000
                     6001400700             0.000000
                     6001401000             0.000000
                     6001403800             0.000000
                     6001403900           100.000000
...                                              ...
QOT5WN9RBKQTFRVKEV31 6115040400             0.000000
                     6115040500             0.000000
                     6115040600            66.666667
                     6115040700            60.000000
                     6115041100           100.000000

[5839 rows x 1 columns]


### Aggregate property_type
aggregates the property type data by grouping on the lei and census_tract columns. It calculates the percentage of properties that are of type 1.
- Descriptions: manufactured_home_secured_property_type
1. Manufactured home and land
2. Manufactured home and not land
3. Not applicable


In [23]:
property_agg = hmda_df.groupby(['lei', 'census_tract'])['manufactured_home_secured_property_type'].agg(
property_percent=lambda x: (x == 1).mean() * 100
)
print(property_agg)

                                   property_percent
lei                  census_tract                  
3Y4U8VZURTYWI1W2K376 6001400100                 0.0
                     6001400700                 0.0
                     6001401000                 0.0
                     6001403800                 0.0
                     6001403900                 0.0
...                                             ...
QOT5WN9RBKQTFRVKEV31 6115040400                 0.0
                     6115040500                 0.0
                     6115040600                 0.0
                     6115040700                 0.0
                     6115041100                 0.0

[5839 rows x 1 columns]


### Aggregate loan_purpose
aggregates the loan purpose data by grouping on the lei and census_tract columns. It calculates the percentage of loans that are for purpose 1.

In [None]:
purpose_agg = hmda_df.groupby(['lei', 'census_tract'])['loan_purpose'].agg(
purpose_percent=lambda x: (x == 1).mean() * 100
)

### Combine aggregated dataframes
created by merging the above dataframes together using a left join.


In [14]:
result = loan_agg.merge(minority_agg, left_index=True, right_index=True)
.merge(denial_agg, left_index=True, right_index=True)
.merge(fico_agg, left_index=True, right_index=True)
.merge(income_agg, left_index=True, right_index=True)
.merge(ethnicity_agg, left_index=True, right_index=True)
.merge(property_agg, left_index=True, right_index=True)
.merge(purpose_agg, left_index=True, right_index=True) describe what these are

SyntaxError: invalid syntax (1356198276.py, line 2)