In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# data analysis and wrangling
# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import os
from ipywidgets import interact

# machine learning 
from scipy.stats import chi2_contingency
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.metrics import confusion_matrix,roc_curve,accuracy_score,classification_report,make_scorer,f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.utils.class_weight import compute_class_weight
import MyClassAndFun as MCF
random_state=1

In [2]:
df=pd.read_csv('transformed__dumm_train.csv')
df.drop(['id','customer_id','month','ssn_aaa','monthly_inhand_salary','delay_from_due_date'], axis=1, inplace=True)
X=df.drop('credit_score', axis=1)
y=df['credit_score']
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.1, random_state=1,stratify=y)
from sklearn.preprocessing import PowerTransformer
pt=PowerTransformer(method='yeo-johnson', standardize=True, copy=True)
transform_var_list=['age','annual_income','changed_credit_limit',#'delay_from_due_date',,'interest_rate'
                    'credit_utilization_ratio','credit_history_monts',#'outstanding_debt',
                    'num_of_delayed_payment','total_emi_per_month','amount_invested_monthly','monthly_balance',]
for i in transform_var_list:
    X_train[i]=pt.fit_transform(X_train[[i]])
    X_test[i]=pt.transform(X_test[[i]])
X_train_df=pd.DataFrame(X_train,columns=X.columns)
X_test_df=pd.DataFrame(X_test,columns=X.columns)
combined_df = pd.concat([X_train_df, y_train], axis=1)
test_results = MCF.perform_tests(combined_df)
# print(test_results.sort_values(by="t-statistic", ascending=False))
X_test_df=X_test_df[list(test_results.loc[test_results['p-value']==0].column)]
X_train_df=X_train_df[list(test_results.loc[test_results['p-value']==0].column)]
class_labels = np.unique(y)
class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train)
unique, counts = np.unique(y, return_counts=True)
class_counts = dict(zip(unique, counts))

In [3]:
combined_df = pd.concat([X_train_df, y_train], axis=1)

In [4]:


def perform_tests(df):
    # Непрерывные переменные
    transform_var_list = ['age', 'annual_income', 'interest_rate', 'delay_from_due_date', 'changed_credit_limit', 'outstanding_debt', 'credit_utilization_ratio', 'credit_history_monts', 'num_of_delayed_payment', 'total_emi_per_month', 'amount_invested_monthly', 'monthly_balance']

    summary = []

    # Хи-квадрат тест для категориальных переменных
    for col in df.columns[:-1]:
        if col in transform_var_list:
            t_stat, pvalue = f_classif(df[[col]], df["credit_score"])
            summary.append([col, t_stat[0], pvalue[0]])
        else :
            cross = pd.crosstab(index=df[col], columns=df["credit_score"])
            t_stat, pvalue, *_ = chi2_contingency(cross)
            summary.append([col, t_stat, pvalue])

    return pd.DataFrame(
        data=summary,
        columns=["column", 't-statistic', "p-value"]
    )

# Применяем функцию
test_results = perform_tests(combined_df)
print(test_results.sort_values(by="t-statistic", ascending=False))
X_test_df=X_test_df[list(test_results.loc[test_results['p-value']==0].column)]
X_train_df=X_train_df[list(test_results.loc[test_results['p-value']==0].column)]

                       column   t-statistic  p-value
22                 credit_mix  36462.749947      0.0
3             num_credit_card  21314.732369      0.0
8        num_credit_inquiries  20599.566619      0.0
23  payment_of_min_amount_Yes  20212.760068      0.0
2           num_bank_accounts  18848.271651      0.0
5                 num_of_loan  14252.801416      0.0
4               interest_rate  13933.995271      0.0
9            outstanding_debt   8204.431397      0.0
21       credit_history_monts   7839.681872      0.0
6      num_of_delayed_payment   4934.652965      0.0
7        changed_credit_limit   2567.962690      0.0
1               annual_income   2173.643977      0.0
12                   AutoLoan   2103.899571      0.0
15               PersonalLoan   1834.697863      0.0
13         Credit-BuilderLoan   1830.130358      0.0
16                 PaydayLoan   1827.474665      0.0
17               MortgageLoan   1821.542435      0.0
19      DebtConsolidationLoan   1789.858870   

In [5]:
chi2_summary = test_results.loc[test_results['p-value']==0].sort_values(by="t-statistic", ascending=False)
chi2_summary.style.bar("t-statistic").background_gradient(
    "Blues", subset="t-statistic")

Unnamed: 0,column,t-statistic,p-value
22,credit_mix,36462.749947,0.0
3,num_credit_card,21314.732369,0.0
8,num_credit_inquiries,20599.566619,0.0
23,payment_of_min_amount_Yes,20212.760068,0.0
2,num_bank_accounts,18848.271651,0.0
5,num_of_loan,14252.801416,0.0
4,interest_rate,13933.995271,0.0
9,outstanding_debt,8204.431397,0.0
21,credit_history_monts,7839.681872,0.0
6,num_of_delayed_payment,4934.652965,0.0


In [6]:
models = {
    "Random Forest": RandomForestClassifier(random_state=1,class_weight=dict(enumerate(class_weights))),
    "Decision Tree": DecisionTreeClassifier(random_state=1,class_weight=dict(enumerate(class_weights))),
    "Gradient Boosting": GradientBoostingClassifier(random_state=1),
    "XGBoost": XGBClassifier(random_state=1, class_weight=dict(enumerate(class_weights))),
    "CatBoost": CatBoostClassifier(verbose=False,random_state=1, class_weights=dict(enumerate(class_weights))),
    "AdaBoost": AdaBoostClassifier(random_state=1),
    "K-Nearest Neighbors": KNeighborsClassifier(),
}

params = {
    "Random Forest": {
        'n_estimators': [x for x in range(200,2000,200)],
        'max_features': ['log2', 'sqrt'],
        'max_depth':[None,10.0,20.0,30.0,40.0,50.0,
                     60.0,70.0,80.0,
                     90.0,100.0,110.0],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
    },
    "Decision Tree": {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    "Gradient Boosting": {
        'loss':['log_loss','exponential'],
        'criterion':['friedman_mse', 'squared_error'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'learning_rate': [0.1, 0.01, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
        'max_depth': [3, 4, 5],
        'subsample': [0.6, 0.7, 0.8, 0.9],
    },
    "XGBoost": {
        'learning_rate': [0.1, 0.01, 0.05, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
        'max_depth': [3, 4, 5],
        'subsample': [0.6, 0.7, 0.8, 0.9],
    },
    "CatBoost": {
        'depth': [6, 8, 10],
        'learning_rate': [0.01, 0.05, 0.1],
        'iterations': [30, 50, 100],
    },
    "AdaBoost": {
        'learning_rate': [0.1, 0.01, 0.5, 0.001],
        'n_estimators': [8, 16, 32, 64, 128, 256],
    },
    "K-Nearest Neighbors": {
        'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12,15,20],
        'weights': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2],
    },
}


f1_scorer = make_scorer(f1_score, average='weighted')

In [7]:
grid_searches = {}
for model_name, model in models.items():
    grid_search = GridSearchCV(
        model,
        params[model_name],
        cv=3,  # Use the number of desired cross-validation folds
        scoring=f1_scorer,
        n_jobs=-1,  # Use all available CPU cores
        verbose=2,
    )
    grid_searches[model_name] = grid_search

In [None]:
best_models = {}

for model_name, grid_search in grid_searches.items():
    grid_search.fit(X_train_df, y_train)  # X_train and y_train are your training data
    best_models[model_name] = grid_search.best_estimator_

Fitting 3 folds for each of 3888 candidates, totalling 11664 fits


10692 fits failed out of a total of 11664.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
972 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\elper\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\elper\anaconda3\lib\site-packages\sklearn\base.py", line 1145, in wrapper
    estimator._validate_params()
  File "C:\Users\elper\anaconda3\lib\site-packages\sklearn\base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\elper\anaconda3\lib\site-packages\sklearn\utils\_param_validation.py", line 96, in validate_parameter_constraints
    raise InvalidPa

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Fitting 3 folds for each of 7776 candidates, totalling 23328 fits


In [None]:
best_models

In [None]:
best_f1_score = -1  # Initialize with a low value
best_model = None

for model_name, grid_search in grid_searches.items():
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best F1-score for {model_name}: {grid_search.best_score_}")
    print("=="*25,"\n")

    if grid_search.best_score_ > best_f1_score:
        best_f1_score = grid_search.best_score_
        best_model = grid_search.best_estimator_

    if best_model is not None:
        print("Best model based on F1-score:")
        print(best_model)
        print(f"Best F1-score: {best_f1_score}")