In [1]:
# ECS289G_Term_Project/data_transform.ipynb

# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Step 1: Load the cleaned dataset
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs unchanged on the original author's machine. The directory is spelled
# "processsed" (three s's); the export cell at the end of the notebook uses the
# same spelling, so it presumably matches the real folder name -- confirm before
# "correcting" it.
file_path = r"/Users/harshil/Developer/GitHub_Repos/ECS_289G/data/processsed/cleaned_dataset_yrs-23.csv"  # Update with the actual file path
# low_memory=False makes pandas parse the file in one pass instead of internal
# chunks, avoiding mixed-dtype inference within a column at the cost of more
# memory while parsing.
cleaned_dataset = pd.read_csv(file_path, low_memory=False)

# Display dataset information
# Prints the (rows, columns) tuple, then the per-column non-null counts and dtypes.
print(f"Dataset Shape: {cleaned_dataset.shape}")
cleaned_dataset.info()

Dataset Shape: (1602362, 35)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1602362 entries, 0 to 1602361
Data columns (total 35 columns):
 #   Column                             Non-Null Count    Dtype  
---  ------                             --------------    -----  
 0   tract_to_msa_income_percentage     1602362 non-null  float64
 1   ffiec_msa_md_median_family_income  1602362 non-null  int64  
 2   tract_minority_population_percent  1602362 non-null  float64
 3   interest_rate                      1602362 non-null  float64
 4   action_taken                       1602362 non-null  int64  
 5   race_0                             1602362 non-null  bool   
 6   race_1                             1602362 non-null  bool   
 7   race_2                             1602362 non-null  bool   
 8   race_3                             1602362 non-null  bool   
 9   race_4                             1602362 non-null  bool   
 10  race_5                             1602362 non-null  bool   


In [2]:
# Step 2: Generate SES Groups
def assign_ses_group(row):
    """Map one record to an SES band based on its tract-to-MSA income percentage.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key 'tract_to_msa_income_percentage'
        (e.g. a DataFrame row passed by ``DataFrame.apply(axis=1)`` or a dict).

    Returns
    -------
    str
        'Low' when the value is below 80, 'Middle' when it lies in the closed
        interval [80, 120], and 'High' for everything else. Values that fail
        both comparisons (such as NaN) therefore also end up as 'High'.
    """
    income_pct = row['tract_to_msa_income_percentage']

    # Guard clauses, checked from the lowest band upwards.
    if income_pct < 80:
        return 'Low'
    # Reaching this point means the value is not below 80, so testing the
    # upper bound alone is enough to identify the middle band.
    if income_pct <= 120:
        return 'Middle'
    return 'High'

# Build the SES_group column with one vectorized pass instead of
# DataFrame.apply(axis=1), which calls a Python function once per row and is
# very slow on a frame of this size (~1.6M rows).
#
# The thresholds mirror assign_ses_group exactly. np.select takes the first
# matching condition, so `<= 120` only applies to values that are not `< 80`,
# i.e. the closed interval [80, 120]. Anything matching neither condition
# (above 120, or NaN) falls through to the default 'High', as in the
# row-wise function.
income_pct = cleaned_dataset['tract_to_msa_income_percentage'].to_numpy()
cleaned_dataset['SES_group'] = np.select(
    [income_pct < 80, income_pct <= 120],
    ['Low', 'Middle'],
    default='High',
)
print("SES Groups Assigned")
# Show how many records landed in each band as a quick sanity check.
print(cleaned_dataset['SES_group'].value_counts())

SES Groups Assigned
SES_group
High      623962
Middle    612631
Low       365769
Name: count, dtype: int64


In [3]:
# Step 3: Rescale the continuous features to the [0, 1] range (min-max scaling)
#
# NOTE(review): the scaler is fit on the entire dataset. train_test_split is
# imported at the top of the notebook but no split is visible here; if a split
# happens downstream, the min/max learned here include the test rows -- confirm
# this is intended.
# NOTE(review): this overwrites the raw values in cleaned_dataset, so the SES
# grouping step (which thresholds the raw tract_to_msa_income_percentage at
# 80/120) must not be re-run after this cell without reloading the data.
numeric_columns = [
    'tract_to_msa_income_percentage',
    'ffiec_msa_md_median_family_income',
    'tract_minority_population_percent',
    'interest_rate',
]

# Learn each column's min/max first, then apply the transform. The result is
# written back into the same columns of cleaned_dataset.
scaler = MinMaxScaler().fit(cleaned_dataset[numeric_columns])
cleaned_dataset[numeric_columns] = scaler.transform(cleaned_dataset[numeric_columns])

print("Numeric Features Scaled")
# Summary statistics; every scaled column should report min 0 and max 1.
print(cleaned_dataset[numeric_columns].describe())

Numeric Features Scaled
       tract_to_msa_income_percentage  ffiec_msa_md_median_family_income  \
count                    1.602362e+06                       1.602362e+06   
mean                     3.466439e-01                       6.176792e-01   
std                      1.402902e-01                       1.432159e-01   
min                      0.000000e+00                       0.000000e+00   
25%                      2.522043e-01                       5.296656e-01   
50%                      3.302106e-01                       6.143474e-01   
75%                      4.214426e-01                       6.893204e-01   
max                      1.000000e+00                       1.000000e+00   

       tract_minority_population_percent  interest_rate  
count                       1.602362e+06   1.602362e+06  
mean                        4.910712e-01   3.775641e-01  
std                         2.611486e-01   8.960323e-02  
min                         0.000000e+00   0.000000e+00  
2

In [4]:
# Bind a second name to the same DataFrame object. This is an alias, not a
# copy: an in-place change made through either name is visible through both.
transformed_data = cleaned_dataset

In [5]:
# Re-inspect column dtypes and non-null counts now that the SES_group column
# has been added and the numeric features have been scaled.
transformed_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1602362 entries, 0 to 1602361
Data columns (total 36 columns):
 #   Column                             Non-Null Count    Dtype  
---  ------                             --------------    -----  
 0   tract_to_msa_income_percentage     1602362 non-null  float64
 1   ffiec_msa_md_median_family_income  1602362 non-null  float64
 2   tract_minority_population_percent  1602362 non-null  float64
 3   interest_rate                      1602362 non-null  float64
 4   action_taken                       1602362 non-null  int64  
 5   race_0                             1602362 non-null  bool   
 6   race_1                             1602362 non-null  bool   
 7   race_2                             1602362 non-null  bool   
 8   race_3                             1602362 non-null  bool   
 9   race_4                             1602362 non-null  bool   
 10  race_5                             1602362 non-null  bool   
 11  race_6                  

In [6]:
# Per-column count of missing values, printed as a sanity check on the
# transformed frame.
print(transformed_data.isnull().sum())

tract_to_msa_income_percentage       0
ffiec_msa_md_median_family_income    0
tract_minority_population_percent    0
interest_rate                        0
action_taken                         0
race_0                               0
race_1                               0
race_2                               0
race_3                               0
race_4                               0
race_5                               0
race_6                               0
race_7                               0
race_8                               0
gender_0                             0
gender_1                             0
gender_2                             0
gender_3                             0
ethnicity_0                          0
ethnicity_1                          0
ethnicity_2                          0
ethnicity_3                          0
ethnicity_4                          0
loan_type_2                          0
loan_type_3                          0
loan_type_4              

In [7]:
# Write the transformed frame to CSV. index=False leaves the row index out of
# the file, so no extra index column appears when it is read back.
# NOTE(review): absolute, machine-specific path. The "transformed/" directory
# presumably already exists, since to_csv is not asked to create it here --
# confirm before running on another machine.
transformed_data.to_csv(r"/Users/harshil/Developer/GitHub_Repos/ECS_289G/data/processsed/transformed/transformed_dataset-yrs-23.csv", index=False)