In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold , StratifiedKFold, cross_val_score
import missingno as msno
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import optuna

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file in it.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the competition training set and preview its first rows.
train_csv_path = '/kaggle/input/mai-ml-contest-1/train.csv'
train = pd.read_csv(train_csv_path)
train.head()

# EDA
Проверим пропущенные значения.

In [None]:
# Draw missingno's nullity matrix for the raw training frame to see where values are missing.
msno.matrix(train)

Видим, что пропуски во всех столбцах приходятся на одни и те же строки, так что можно просто удалить строки с пропущенными значениями.
Заодно удалим столбец с датами и повторяющиеся строчки.

In [None]:
# Remove rows with missing values and the date column, then report how the
# shape changed relative to the raw training frame.
df_without_na = train.dropna().drop(columns=['ApplicationDate'])
print(train.shape, df_without_na.shape)

# Remove exact duplicate rows (compared after the date column is gone).
df = df_without_na.drop_duplicates()
print(train.shape, df.shape)

In [None]:
# Split column names by dtype. The target is not a feature, so it is taken
# out of the numeric list.
numeric_columns = df.select_dtypes(include=np.number).columns
num_features = numeric_columns.drop('RiskScore').tolist()
cat_features = list(df.select_dtypes(include='object').columns)

print(num_features)
print(cat_features)

Рассмотрим данные обо всех столбцах. Заметим выбросы в таргете и почистим их.

In [None]:
# Lift the column display limit so the summary below shows every column.
pd.options.display.max_columns = None
df.describe()

In [None]:
# Keep only rows whose target lies strictly between its 1st and 99th percentiles.
target = df['RiskScore']
q_low, q_hi = target.quantile([0.01, 0.99])
inside_range = target.gt(q_low) & target.lt(q_hi)
df = df[inside_range]
df['RiskScore']

Построим матрицу корреляций и выбросим сильно коррелирующие признаки.

In [None]:
# Pairwise correlations between all numeric columns, drawn as a heatmap.
corr_matrix = df.select_dtypes(include=np.number).corr()

fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm", vmin=-1, vmax=1, ax=ax)
ax.set_title("Корреляционная матрица")
plt.show()

In [None]:
# List every strongly correlated pair of columns exactly once.
# The absolute value is used so strong negative correlations are reported too.
# Keeping only the strict upper triangle removes the diagonal and the mirrored
# duplicates without comparing floats to 1.0, so perfectly correlated distinct
# columns are no longer hidden.
strong = corr_matrix.abs() >= 0.8
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
# dropna() makes the result independent of whether stack() drops NaN itself.
high_corr = corr_matrix.where(strong & upper_triangle).stack().dropna()
print(high_corr)

In [None]:
# NOTE(review): drop() returns a new frame and the result is not assigned, so `df`
# still contains these five columns afterwards; this cell only displays the reduced frame.
# TODO(review): if the columns are meant to be removed, assign the result and mirror the
# removal wherever features are prepared later (feature lists, test-set preprocessing).
df.drop(columns=['Experience', 'MonthlyIncome', 'MonthlyLoanPayment', 'NetWorth', 'BaseInterestRate'])

In [None]:
# Columns whose correlation with the target is essentially zero (|r| < 0.01).
risk_corr = corr_matrix['RiskScore']
risk_corr[risk_corr.abs() < 0.01]
# A drop of 'SavingsAccountBalance' was left disabled here; this cell removes no column.

Нормализуем данные: применим к числовым признакам преобразование log(x + 1).

In [None]:
# Replace every numeric feature column with log(x + 1).
# NOTE(review): values <= -1 would become NaN/-inf — confirm the features are non-negative.
df[num_features] = df[num_features].add(1).pipe(np.log)

Закодируем категориальные признаки:

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of
# each column. `df` is rebound, so the original categorical columns are gone afterwards.
df = pd.get_dummies(df, columns=cat_features, drop_first=True)

# Обучение модели
Используем линейную регрессию с регуляризацией Ridge. Используем StandardScaler для масштабирования признаков.
С помощью библиотеки Optuna подберем оптимальные гиперпараметры модели.

In [None]:
# Separate the feature matrix from the target.
X_train = df.drop(columns=['RiskScore'])
Y_train = df['RiskScore']

# Standardise every feature. The fitted scaler is kept so the same
# transformation can be applied to the test set later.
scaler = StandardScaler()
scaled_X = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    # Keep the original row labels: Y_train still carries the filtered index, and a
    # fresh RangeIndex here would misalign the two in any label-based operation.
    index=X_train.index,
)

In [None]:
def objective(trial, X=scaled_X, Y=Y_train):
    """Optuna objective: hold-out MSE of a Ridge model for one sampled configuration.

    Args:
        trial: Optuna trial used to sample ``alpha`` and ``tol``.
        X: Feature matrix; defaults to the scaled training features.
        Y: Target values; defaults to the training target.

    Returns:
        Mean squared error on a 10% hold-out split made with ``random_state=42``.
    """
    X_fit, X_val, Y_fit, Y_val = train_test_split(X, Y, test_size=0.1, random_state=42)

    # Sample in a fixed order: alpha first, then tol.
    alpha = trial.suggest_float('alpha', 0.0, 1.0)
    tol = trial.suggest_float('tol', 1e-6, 1e-3)

    ridge = Ridge(alpha=alpha, tol=tol)
    ridge.fit(X_fit, Y_fit)
    return mean_squared_error(Y_val, ridge.predict(X_val))

In [None]:
# Minimise the hold-out MSE returned by `objective`.
# The sampler is seeded so that Restart & Run All reproduces the same search
# and therefore the same best hyperparameters.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize the objective function
study.optimize(objective, n_trials=100)

In [None]:
# Report the best configuration found and its hold-out MSE.
print(f"Best hyperparameters: {study.best_params}")
print(f"Best value: {study.best_value}")

Визуализируем поиск гиперпараметров.

In [None]:
import optuna.visualization as vis

# Plot the optimisation history of the study; the figure is the cell output.
history_fig = vis.plot_optimization_history(study)
history_fig

In [None]:
# Slice plot of the study: objective value against each sampled hyperparameter.
vis.plot_slice(study)

Обучим нашу модель с этими гиперпараметрами.

In [None]:
# Refit Ridge on the full scaled training set with the best parameters found,
# then predict on that same training data.
best_params = study.best_params
sk_reg = Ridge(**best_params).fit(scaled_X, Y_train)
pred = sk_reg.predict(scaled_X)
pred

In [None]:
# MSE on the training data. Arguments follow the documented (y_true, y_pred)
# order; MSE is symmetric, so the printed value is unchanged.
print(mean_squared_error(Y_train, pred))

Сформируем файл с предсказаниями для отправки (сабмит).

In [None]:
# Load the test set and repeat the preprocessing applied to the training data.
test = pd.read_csv('/kaggle/input/mai-ml-contest-1/test.csv')
# NOTE(review): rows with missing values are dropped, which reduces the number of
# predictions; confirm the test file has no gaps, otherwise the submission is short.
test = test.dropna()

num_features = test.select_dtypes(include=np.number).columns.tolist()
cat_features = test.select_dtypes(include='object').columns.tolist()
cat_features.remove('ApplicationDate')
test[num_features] = np.log(test[num_features] + 1)
test = test.drop(columns=['ID', 'ApplicationDate'])
test = pd.get_dummies(test, columns=cat_features, drop_first=True)

# Use exactly the columns, in the same order, that the scaler and the model were
# fitted on. Dummy columns absent from the test data are added as zeros; columns
# unseen during training are discarded. This replaces an unassigned drop() call
# that had no effect.
test = test.reindex(columns=X_train.columns, fill_value=0)

scaled_pred = pd.DataFrame(scaler.transform(test), columns=test.columns)
test_pred = sk_reg.predict(scaled_pred)
test_pred

In [None]:
# Write the predictions to submission.csv; the frame's default index is saved
# as the 'ID' column.
submission = pd.Series(test_pred, name='RiskScore').to_frame()
submission.to_csv('submission.csv', index_label='ID')