In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import PowerTransformer, OrdinalEncoder, StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from itertools import combinations
import warnings

In [2]:
warnings.filterwarnings('ignore')

df = pd.read_csv("/kaggle/input/mai-ml-lab-1-308/train.csv")
df = df.drop_duplicates()
df = df.loc[(df['RiskScore'] >= 0) & (df['RiskScore'] <= 100)].copy()
df = df.loc[(df['Experience'].notna()) & (df['Experience'] >= 0) & (df['Experience'] <= 45)].copy()
df = df.loc[(df['Age'].notna()) & (df['Age'] >= 0) & (df['Age'] <= 75)].copy()

X_full_dataset = df.drop('RiskScore', axis=1)
y_full_dataset = df['RiskScore']

In [3]:
pt = PowerTransformer(method='yeo-johnson')
education_order = ['High School', 'Associate', 'Bachelor', 'Master', 'Doctorate']
ordinal_encoder = OrdinalEncoder(categories=[education_order], handle_unknown='use_encoded_value', unknown_value=-1)
onehot_cols = ['MaritalStatus', 'HomeOwnershipStatus', 'LoanPurpose', 'EmploymentStatus']
median_date, first_date, mode_vals = None, None, None

def feature_engineer_enhanced(df_input):
    df = df_input.copy()
    for col, mode in mode_vals.items():
        if col in df.columns: df.loc[:, col] = df[col].fillna(mode)
    
    log_cols = ['AnnualIncome', 'LoanAmount', 'MonthlyIncome', 'TotalAssets', 'TotalLiabilities', 'CheckingAccountBalance', 'SavingsAccountBalance', 'MonthlyDebtPayments', 'LengthOfCreditHistory', 'NumberOfOpenCreditLines']
    for col in log_cols:
        if col in df.columns: df[f'{col}_log'] = np.log1p(df[col])
    
    sqrt_cols = ['CreditCardUtilizationRate', 'TotalAssets', 'MonthlyDebtPayments', 'AnnualIncome']
    for col in sqrt_cols:
        if col in df.columns: df[f'{col}_sqrt'] = np.sqrt(df[col] + 1)
    
    try:
        df['CreditScore_trans'] = pt.transform(df[['CreditScore']])
    except Exception:
        pass
        
    numeric_features = df.select_dtypes(include=np.number).columns.tolist()
    if 'ID' in numeric_features: numeric_features.remove('ID')
    
    poly_4_features = ['CreditScore', 'Age', 'AnnualIncome', 'LoanAmount', 'InterestRate', 'DebtToIncomeRatio', 'MonthlyIncome', 'PaymentHistory', 'Experience', 'LengthOfCreditHistory']
    for feature in poly_4_features:
        if feature in df.columns:
            df[f'{feature}_sq'] = df[feature] ** 2
            df[f'{feature}_cb'] = df[feature] ** 3
            df[f'{feature}_p4'] = df[feature] ** 4
    
    other_numeric = [f for f in numeric_features if f not in poly_4_features]
    for feature in other_numeric:
        if feature in df.columns:
            df[f'{feature}_sq'] = df[feature] ** 2
            df[f'{feature}_cb'] = df[feature] ** 3
    
    for f1, f2 in combinations(numeric_features, 2):
        if f1 in df.columns and f2 in df.columns:
            df[f'{f1}_x_{f2}'] = df[f1] * df[f2]
    
    key_interactions = [('CreditScore', 'PaymentHistory'), ('CreditScore', 'DebtToIncomeRatio'), ('AnnualIncome', 'LoanAmount'), ('InterestRate', 'LoanAmount'), ('MonthlyIncome', 'MonthlyDebtPayments'), ('Age', 'Experience'), ('LengthOfCreditHistory', 'PreviousLoanDefaults')]
    for f1, f2 in key_interactions:
        if f1 in df.columns and f2 in df.columns:
            df[f'{f1}_x_{f2}_sq'] = (df[f1] * df[f2]) ** 2
            df[f'{f1}_div_{f2}'] = df[f1] / (df[f2] + 1)
    
    df['Debt_to_Monthly_Income'] = df['MonthlyDebtPayments'] / (df['MonthlyIncome'] + 1)
    df['Debt_to_Monthly_Income_sq'] = (df['MonthlyDebtPayments'] / (df['MonthlyIncome'] + 1)) ** 2
    df['Loan_to_Annual_Income'] = df['LoanAmount'] / (df['AnnualIncome'] + 1)
    df['Loan_to_Annual_Income_sq'] = (df['LoanAmount'] / (df['AnnualIncome'] + 1)) ** 2
    df['Savings_to_Debt'] = df['SavingsAccountBalance'] / (df['TotalLiabilities'] + 1)
    df['CreditHistory_to_Age'] = df['LengthOfCreditHistory'] / (df['Age'] * 12 + 1)
    df['Disposable_Income'] = df['MonthlyIncome'] - df['MonthlyDebtPayments']
    df['Disposable_Income_sq'] = (df['MonthlyIncome'] - df['MonthlyDebtPayments']) ** 2
    df['Net_Worth_to_Income'] = df['NetWorth'] / (df['AnnualIncome'] + 1)
    df['DSR'] = df['MonthlyLoanPayment'] / (df['MonthlyIncome'] + 1)
    df['DSR_sq'] = (df['MonthlyLoanPayment'] / (df['MonthlyIncome'] + 1)) ** 2
    df['Liabilities_to_Assets'] = df['TotalLiabilities'] / (df['TotalAssets'] + 1)
    df['Income_minus_Loan'] = df['AnnualIncome'] - df['LoanAmount']
    
    df['ApplicationDate'] = pd.to_datetime(df['ApplicationDate'], errors='coerce').fillna(median_date)
    df['DaysSinceStart'] = (df['ApplicationDate'] - first_date).dt.days
    df['ApplicationMonth'] = df['ApplicationDate'].dt.month
    df['ApplicationQuarter'] = df['ApplicationDate'].dt.quarter
    df['ApplicationDayOfYear'] = df['ApplicationDate'].dt.dayofyear
    df['ApplicationWeekOfYear'] = df['ApplicationDate'].dt.isocalendar().week.astype('Int64')
    df['Is_Month_End'] = df['ApplicationDate'].dt.is_month_end.astype(int)
    df['Is_Month_Start'] = df['ApplicationDate'].dt.is_month_start.astype(int)
    
    df['EducationLevel'] = ordinal_encoder.transform(df[['EducationLevel']])
    df = pd.get_dummies(df, columns=onehot_cols, drop_first=True)
    df = df.drop(columns=['ID', 'ApplicationDate'], errors='ignore')
    return df

In [4]:
pt.fit(X_full_dataset[['CreditScore']])
ordinal_encoder.fit(X_full_dataset[['EducationLevel']])
temp_dates = pd.to_datetime(X_full_dataset['ApplicationDate'], errors='coerce')
median_ts = temp_dates.dropna().astype('int64').median()
median_date = pd.to_datetime(median_ts)
first_date = temp_dates.min()
mode_vals = {col: X_full_dataset[col].mode()[0] for col in ['EducationLevel'] + onehot_cols}

X_processed = feature_engineer_enhanced(X_full_dataset)
y_processed = y_full_dataset

In [5]:
initial_features_base = ['CreditScore_trans_cb', 'CreditScore_x_AnnualIncome_sqrt', 'MonthlyIncome_log_sq', 'MonthlyIncome_p4', 'MonthlyIncome_x_TotalAssets_sqrt', 'AnnualIncome_x_TotalAssets', 'TotalAssets_x_MonthlyIncome', 'NetWorth_x_AnnualIncome_sqrt', 'LengthOfCreditHistory_x_TotalAssets_sqrt', 'TotalAssets_sqrt_x_AnnualIncome_sqrt', 'Disposable_Income_sq', 'AnnualIncome_x_TotalAssets_sqrt', 'AnnualIncome_x_LengthOfCreditHistory', 'LengthOfCreditHistory_x_MonthlyIncome', 'CreditScore_x_TotalAssets_sqrt', 'LengthOfCreditHistory_log_x_TotalAssets_sqrt', 'LengthOfCreditHistory_x_AnnualIncome_sqrt', 'TotalAssets_sqrt', 'AnnualIncome_sqrt_x_CreditScore_trans', 'MonthlyIncome_sq', 'TotalAssets_x_AnnualIncome_sqrt', 'NumberOfOpenCreditLines_log_x_TotalAssets_sqrt', 'AnnualIncome_x_LengthOfCreditHistory_log', 'MonthlyIncome_x_LengthOfCreditHistory_log', 'MonthlyIncome_x_LoanAmount_log', 'TotalAssets_x_CreditScore_trans', 'LoanDuration_x_MonthlyIncome', 'AnnualIncome_x_LoanAmount', 'AnnualIncome_x_CreditScore_trans', 'LoanAmount_log_x_TotalAssets_log', 'TotalAssets_sqrt_x_CreditScore_trans', 'UtilityBillsPaymentHistory', 'MonthlyIncome_x_InterestRate', 'MonthlyIncome_log_x_TotalAssets_sqrt', 'EmploymentStatus_Unemployed', 'NumberOfDependents_x_NetWorth', 'LengthOfCreditHistory_log_x_AnnualIncome_sqrt', 'AnnualIncome_x_LoanAmount_log', 'MonthlyIncome_log_cb', 'NumberOfDependents_x_TotalAssets', 'CreditScore_x_CreditScore_trans', 'LoanAmount_log_x_AnnualIncome_sqrt', 'NetWorth_x_CreditScore_trans', 'AnnualIncome_log_x_TotalAssets_sqrt', 'Is_Month_Start', 'CreditScore_x_LengthOfCreditHistory_log', 'InterestRate_x_MonthlyIncome_log', 'LoanAmount_x_AnnualIncome_sqrt', 'DebtToIncomeRatio_sq', 'InterestRate_cb', 'Loan_to_Annual_Income', 'Loan_to_Annual_Income_sq', 'TotalDebtToIncomeRatio_sq', 'Debt_to_Monthly_Income', 'AnnualIncome_x_InterestRate', 'InterestRate_x_AnnualIncome_sqrt', 'InterestRate_x_AnnualIncome_log', 'TotalAssets_log_x_AnnualIncome_sqrt', 'CreditScore_div_DebtToIncomeRatio', 'BankruptcyHistory_x_LoanAmount_log', 'AnnualIncome_log_x_LengthOfCreditHistory_log', 'MonthlyDebtPayments_log_x_CreditScore_trans', 'LoanAmount_log_x_CreditCardUtilizationRate_sqrt', 'BankruptcyHistory_x_SavingsAccountBalance_log', 'CreditScore_x_BaseInterestRate', 'MonthlyIncome_x_TotalDebtToIncomeRatio', 'InterestRate_x_TotalLiabilities_log', 'InterestRate_x_TotalDebtToIncomeRatio', 'LoanDuration_x_TotalAssets_sqrt', 'LoanDuration_x_TotalDebtToIncomeRatio', 'InterestRate_x_TotalAssets_sqrt', 'TotalAssets_sq', 'MonthlyIncome_x_BaseInterestRate', 'AnnualIncome_log_sq', 'DebtToIncomeRatio_x_InterestRate', 'CreditScore_trans_sq', 'EmploymentStatus_Self-Employed', 'TotalDebtToIncomeRatio_x_CreditScore_trans', 'CreditScore_x_TotalAssets_log', 'InterestRate_x_CreditCardUtilizationRate_sqrt', 'MonthlyIncome_log_x_TotalAssets_log', 'BankruptcyHistory_x_CreditScore_trans', 'BankruptcyHistory_x_BaseInterestRate', 'MonthlyIncome_x_NetWorth', 'DebtToIncomeRatio_x_CreditCardUtilizationRate_sqrt', 'NetWorth_x_LengthOfCreditHistory_log', 'CreditCardUtilizationRate_x_BankruptcyHistory', 'NetWorth_x_CheckingAccountBalance_log', 'Income_minus_Loan', 'LengthOfCreditHistory_x_NetWorth', 'MonthlyIncome_x_TotalAssets_log', 'NetWorth_x_AnnualIncome_sqrt', 'BankruptcyHistory_x_InterestRate', 'CreditScore_x_NetWorth', 'MonthlyDebtPayments_x_BankruptcyHistory', 'MaritalStatus_Married', 'MaritalStatus_Widowed', 'DSR', 'DSR_sq', 'PaymentHistory', 'PreviousLoanDefaults', 'BankruptcyHistory', 'LoanAmount', 'InterestRate', 'DebtToIncomeRatio', 'TotalAssets', 'LengthOfCreditHistory', 'TotalLiabilities', 'CreditCardUtilizationRate', 'TotalDebtToIncomeRatio', 'MonthlyLoanPayment', 'NumberOfOpenCreditLines', 'CheckingAccountBalance', 'SavingsAccountBalance', 'TotalLiabilities_x_CreditScore_trans', 'UtilityBillsPaymentHistory_sq', 'AnnualIncome_log_cb', 'MonthlyIncome_cb', 'TotalAssets_cb', 'NumberOfOpenCreditLines_log', 'CreditScore_trans_p4', 'LoanAmount_x_TotalAssets', 'LoanAmount_x_MonthlyIncome', 'CreditScore_x_PaymentHistory', 'PaymentHistory_sq', 'PaymentHistory_cb', 'LoanAmount_sq', 'LoanAmount_cb', 'LoanAmount_p4', 'InterestRate_sq', 'InterestRate_p4', 'DebtToIncomeRatio_cb', 'DebtToIncomeRatio_p4', 'Experience_sq', 'Experience_cb', 'Experience_p4', 'Age_sq', 'Age_cb', 'Age_p4', 'LengthOfCreditHistory_sq', 'LengthOfCreditHistory_cb', 'LengthOfCreditHistory_p4', 'AnnualIncome_sq', 'AnnualIncome_cb', 'AnnualIncome_p4', 'MonthlyIncome_log_p4', 'AnnualIncome_log_p4', 'TotalAssets_log_sq', 'TotalAssets_log_cb', 'TotalAssets_log_p4', 'LoanAmount_log_sq', 'LoanAmount_log_cb', 'LoanAmount_log_p4', 'InterestRate_log_sq', 'InterestRate_log_cb', 'InterestRate_log_p4', 'NumberOfCreditInquiries', 'NumberOfCreditInquiries_sq', 'NumberOfCreditInquiries_cb', 'LoanDuration', 'LoanDuration_sq', 'LoanDuration_cb', 'JobTenure', 'JobTenure_sq', 'JobTenure_cb', 'NumberOfDependents', 'NumberOfDependents_sq', 'NumberOfDependents_cb', 'ApplicationMonth', 'ApplicationQuarter', 'ApplicationDayOfYear', 'DaysSinceStart', 'Is_Month_End', 'EducationLevel', 'LoanPurpose_Education', 'LoanPurpose_Home', 'LoanPurpose_Debt Consolidation', 'HomeOwnershipStatus_Own', 'HomeOwnershipStatus_Rent', 'MaritalStatus_Divorced', 'MaritalStatus_Single', 'TotalLiabilities_log', 'CheckingAccountBalance_log', 'SavingsAccountBalance_log', 'MonthlyDebtPayments_log', 'LengthOfCreditHistory_log', 'TotalAssets_sqrt_sq', 'MonthlyDebtPayments_sqrt_sq', 'CreditCardUtilizationRate_sqrt_sq', 'AnnualIncome_sqrt_sq', 'Liabilities_to_Assets', 'Net_Worth_to_Income', 'Savings_to_Debt', 'CreditHistory_to_Age', 'Debt_to_Monthly_Income_sq']
features_to_remove_collinearity = ['MonthlyIncome', 'Age', 'NetWorth', 'BaseInterestRate', 'AnnualIncome_log', 'CreditScore', 'CreditScore_sq']
initial_features = [f for f in initial_features_base if f in X_processed.columns and f not in features_to_remove_collinearity]
X_initial = X_processed[initial_features]

temp_pipe = Pipeline([
    ('imputer', KNNImputer(n_neighbors=13)),
    ('scaler', StandardScaler()),
    ('model', Ridge(alpha=3.3, random_state=42))
])
temp_pipe.fit(X_initial, y_processed)

importances = np.abs(temp_pipe.named_steps['model'].coef_)
feature_importances = pd.DataFrame({'feature': initial_features, 'importance': importances})
feature_importances = feature_importances.sort_values('importance', ascending=False)

final_features = feature_importances.head(160)['feature'].tolist()

X_final = X_initial[final_features]

In [7]:
print("Grid Search...")
pipeline = Pipeline([
    ('imputer', KNNImputer()),
    ('scaler', StandardScaler()),
    ('model', Ridge(random_state=42))
])

param_grid = {
    'imputer__n_neighbors': [12, 13],
    'model__alpha': [3.3, 3.4]
}
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='neg_mean_squared_error', cv=cv_strategy, n_jobs=-1, verbose=1)
grid_search.fit(X_final, y_processed)

best_model = grid_search.best_estimator_
best_mse = -grid_search.best_score_

print(f"Лучшие параметры: {grid_search.best_params_}")
print(f"MSE на кросс-валидации: {best_mse:.4f}")

Grid Search...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Лучшие параметры: {'imputer__n_neighbors': 13, 'model__alpha': 3.4}
MSE на кросс-валидации: 26.2538
