In [1]:
# Add any additional libraries or submodules below

# Data libraries
import pandas as pd
import numpy as np

# Plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Plotting defaults
plt.rcParams['figure.figsize'] = (8,5)
plt.rcParams['figure.dpi'] = 80

# sklearn modules
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import RocCurveDisplay, roc_auc_score
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.tree import export_graphviz, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# imblearn modules
from imblearn.pipeline import Pipeline as ImPipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

In [3]:
# Load the loan-level dataset from freddiemac.csv.
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): the recorded output under this cell looks like the tail of a pandas
# warning echoing the read_csv line (possibly a mixed-dtype warning) - confirm, and
# consider passing explicit dtypes if so.
DATA_PATH = "freddiemac.csv"
d = pd.read_csv(DATA_PATH)

  d = pd.read_csv("freddiemac.csv")


## 数据清理

### 删除项处理 Data cleaning

In [7]:
# --- Drop rows whose main features are missing -------------------------------
# Missing values in these columns are encoded as sentinel numbers (e.g. fico == 9999).
# The four conditions are combined into one mask, and .copy() makes the filtered
# frame independent so the column assignments below never write into a slice of
# the raw frame (which is what triggers pandas' SettingWithCopyWarning).
has_core_fields = (
    (d['fico'] != 9999)
    & (d['ltv'] != 999)
    & (d['dti'] != 999)
    & (d['int_rt'] != 0)
)
d = d[has_core_fields].copy()

# --- Less important features: make "missing" an explicit value ----------------
NOT_AVAILABLE = 'Not_Available'

# String-valued columns
d['flag_sc'] = d['flag_sc'].fillna(NOT_AVAILABLE)
for col in ('id_loan_rr', 'rr_ind'):
    # Cover both a literal 'NA' string and a parsed NaN.
    d[col] = d[col].replace('NA', NOT_AVAILABLE).fillna(NOT_AVAILABLE)
d['program_ind'] = d['program_ind'].replace('9', NOT_AVAILABLE)
d['mi_cancel_ind'] = d['mi_cancel_ind'].replace({'7': NOT_AVAILABLE, '9': NOT_AVAILABLE})

# Use the loan identifier as the index so it is not treated as a feature.
d = d.set_index('id_loan')

# Numeric columns: missing / sentinel codes are mapped to 0.
d['cd_msa'] = d['cd_msa'].fillna(0)
d['mi_pct'] = d['mi_pct'].replace(999, 0)
d['cltv'] = d['cltv'].replace(999, 0)
d['property_val'] = d['property_val'].replace(9, 0)

# Cast count/code columns to object so the later dtype-based column selection
# treats them as categorical rather than numeric.
d = d.astype({'cnt_units': 'object', 'cnt_borr': 'object', 'property_val': 'object'})

# Keep only the two outcomes being modelled: prepaid vs. default.
d = d[d["loan_status"].isin(["prepaid", "default"])]

# Drop the time-related features.
# seller_name is kept as a feature (its drop was left disabled in the original cell).
d = d.drop(columns=['dt_first_pi', 'dt_matr'])
d.head()

Unnamed: 0_level_0,fico,flag_fthb,cd_msa,mi_pct,cnt_units,occpy_sts,cltv,dti,orig_upb,ltv,...,seller_name,servicer_name,flag_sc,id_loan_rr,program_ind,rr_ind,property_val,io_ind,mi_cancel_ind,loan_status
id_loan,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F17Q10000002,809,N,0.0,0,1,P,75,38,195000,75,...,Other sellers,SPECIALIZED LOAN SERVICING LLC,Not_Available,Not_Available,Not_Available,Not_Available,2,N,Not_Available,prepaid
F17Q10000064,792,N,0.0,0,1,S,60,36,87000,60,...,Other sellers,Other servicers,Not_Available,Not_Available,Not_Available,Not_Available,2,N,Not_Available,prepaid
F17Q10000065,776,N,0.0,0,1,S,80,18,106000,80,...,Other sellers,Other servicers,Not_Available,Not_Available,Not_Available,Not_Available,2,N,Not_Available,prepaid
F17Q10000176,687,N,46520.0,0,1,P,80,45,619000,75,...,Other sellers,Other servicers,Not_Available,Not_Available,Not_Available,Not_Available,2,N,Not_Available,prepaid
F17Q10000410,725,N,48300.0,0,1,P,14,10,100000,14,...,Other sellers,Other servicers,Not_Available,Not_Available,Not_Available,Not_Available,2,N,Not_Available,prepaid


In [8]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from imblearn.pipeline import Pipeline


In [11]:
# Categorical features: every object-typed column except the target.
object_columns = d.select_dtypes(include=['object'])
categorical = list(object_columns.drop(columns=['loan_status']).columns)

# Numerical features: every int64 / float64 column.
numeric_columns = d.select_dtypes(include=['int64', 'float64'])
numerical = list(numeric_columns.columns)


In [13]:
# Separate the feature matrix from the target column.
X = d.drop(columns='loan_status')

# Encode the target as integers. LabelEncoder numbers classes in sorted label
# order, so with the two statuses kept during cleaning this gives
# 'default' -> 0 and 'prepaid' -> 1.
y = LabelEncoder().fit_transform(d['loan_status'])

# 80/20 split, stratified on the target so both parts keep a similar
# default/prepaid ratio; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=0,
)

In [15]:
# Class balance of the target over the full cleaned frame (before the train/test split).
loan_status_counts = d["loan_status"].value_counts()
loan_status_counts

loan_status
prepaid    124669
default       706
Name: count, dtype: int64

In [49]:
# Numeric features are standardised.
numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])

# Categorical features are one-hot encoded; categories not seen during fit are
# ignored at transform time instead of raising an error.
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column list to its transformer. verbose_feature_names_out=False keeps
# the output feature names free of the 'num__' / 'cat__' transformer prefixes.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical),
        ('cat', categorical_transformer, categorical)
    ],
    verbose_feature_names_out=False
)

# Wrap the preprocessor in a pipeline under the step name 'preprocess'.
# (The original cell read pipeline_preprocess.named_steps[...] *before* this
# definition, which raises NameError on a fresh kernel; that line was redundant
# and has been removed.)
pipeline_preprocess = Pipeline([
    ('preprocess', preprocessor)
])


In [55]:
# Thresholds for the two feature-selection criteria used below.
CORR_THRESHOLD = 0.05           # minimum |Pearson correlation| with the target
RF_IMPORTANCE_THRESHOLD = 0.01  # minimum random-forest feature importance

# 1) Fit the preprocessing pipeline on the training split and transform it.
X_train_preprocessed = pipeline_preprocess.fit_transform(X_train, y_train)

# 2) Recover the output feature names from the fitted ColumnTransformer.
preprocessor = pipeline_preprocess.named_steps['preprocess']
feature_names = preprocessor.get_feature_names_out()

# 3) Turn the transformed matrix into a DataFrame with column names.
#    ColumnTransformer returns either a sparse matrix or a dense array depending
#    on the density of its output, so handle both instead of assuming sparse.
if hasattr(X_train_preprocessed, 'toarray'):
    X_train_dense = X_train_preprocessed.toarray()
else:
    X_train_dense = np.asarray(X_train_preprocessed)
X_train_preprocessed_df = pd.DataFrame(X_train_dense, columns=feature_names)

# 4) Oversample the minority class with SMOTE (training data only).
# NOTE(review): SMOTE interpolates between neighbouring samples, so the one-hot
# columns of synthetic rows can take fractional values; SMOTENC is the variant
# designed for mixed numeric/categorical data - consider whether it fits better.
sm = SMOTE(random_state=0)
X_train_resampled_df, y_train_resampled = sm.fit_resample(X_train_preprocessed_df, y_train)
# y_train_resampled comes back as a numpy.ndarray.

print("X_train_resampled_df shape =", X_train_resampled_df.shape)
print("columns =", X_train_resampled_df.columns)

# 5) Feature selection: correlation filter + random-forest importance.
y_train_resampled_ser = pd.Series(y_train_resampled, name='target')

# 5a) Join X and y and keep features whose absolute correlation with the target
#     exceeds the threshold (the target's own entry is excluded).
df_for_corr = pd.concat([X_train_resampled_df, y_train_resampled_ser], axis=1)
corr_matrix = df_for_corr.corr()
corr_with_target = corr_matrix['target'].abs()
selected_filter = corr_with_target[corr_with_target > CORR_THRESHOLD].index.tolist()
if 'target' in selected_filter:
    selected_filter.remove('target')

# 5b) Keep features whose random-forest importance exceeds the threshold.
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train_resampled_df, y_train_resampled_ser)
importances = pd.Series(rf.feature_importances_, index=X_train_resampled_df.columns)
selected_rf = importances[importances > RF_IMPORTANCE_THRESHOLD].index.tolist()

# 5c) Intersect the two selections. Iterating over selected_filter (instead of
#     list(set & set)) keeps the original column order, so the result is the same
#     on every kernel run rather than depending on string-hash ordering.
selected_rf_lookup = set(selected_rf)
final_selected_features = [name for name in selected_filter if name in selected_rf_lookup]
print("相关性法选出的特征：", selected_filter)
print("随机森林选出的特征：", selected_rf)
print("最终交集：", final_selected_features)

# 6) Slice the resampled training data down to the selected features.
X_train_final = X_train_resampled_df[final_selected_features]


X_train_resampled_df shape = (199470, 200)
columns = Index(['fico', 'cd_msa', 'mi_pct', 'cltv', 'dti', 'orig_upb', 'ltv', 'int_rt',
       'zipcode', 'orig_loan_term',
       ...
       'program_ind_Not_Available', 'rr_ind_Not_Available', 'property_val_0',
       'property_val_1', 'property_val_2', 'property_val_3', 'io_ind_N',
       'mi_cancel_ind_N', 'mi_cancel_ind_Not_Available', 'mi_cancel_ind_Y'],
      dtype='object', length=200)
相关性法选出的特征： ['fico', 'mi_pct', 'cltv', 'dti', 'orig_upb', 'ltv', 'int_rt', 'zipcode', 'flag_fthb_N', 'flag_fthb_Y', 'occpy_sts_I', 'st_AZ', 'st_CO', 'st_FL', 'st_LA', 'st_NY', 'st_VA', 'st_WA', 'prop_type_MH', 'prop_type_PU', 'prop_type_SF', 'loan_purpose_C', 'loan_purpose_N', 'cnt_borr_1', 'cnt_borr_2', 'seller_name_FLAGSTAR BANK, FSB', 'seller_name_GUARANTEED RATE, INC.', 'seller_name_LOANDEPOT.COM, LLC', 'seller_name_Other sellers', 'seller_name_PROVIDENT FUNDING ASSOCIATES, L.P.', 'seller_name_QUICKEN LOANS INC.', 'seller_name_QUICKEN LOANS, LLC', 's