In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy as np
from scipy import stats

from sklearn.preprocessing import MinMaxScaler, StandardScaler




In [143]:
#load dataset

df = pd.read_csv('dataset/main/credit_train.csv')

In [144]:
# Data Preprocessing
# Handle missing values
df.fillna(0, inplace=True)  # Replace missing values with 0 for simplicity

def convert_credit_sum(value):
    try:
        return float(value.replace(',', '.'))
    except:
        return 0
    
# Convert incorrect data types to numerical
df['credit_sum'] = df['credit_sum'].apply(convert_credit_sum)
df['score_shk'] = df['score_shk'].str.replace(',', '.').astype(float)




In [145]:
# Select the numerical columns for scaling
numerical_cols = ['age', 'credit_sum', 'credit_month', 'score_shk', 'monthly_income', 'credit_count', 'overdue_credit_count']

# Initialize Min-Max Scaler
minmax_scaler = MinMaxScaler()

# Apply Min-Max Scaling to selected columns
df[numerical_cols] = minmax_scaler.fit_transform(df[numerical_cols])

# Initialize Standard Scaler
standard_scaler = StandardScaler()

# Apply Standard Scaling to selected columns
df[numerical_cols] = standard_scaler.fit_transform(df[numerical_cols])


In [146]:
# Convert categorical variables to numerical using one-hot encoding
categorical_features = ['gender', 'marital_status', 'job_position', 'tariff_id', 'education', 'living_region']
df_encoded = pd.get_dummies(df, columns=categorical_features)


In [111]:
df_encoded['age_credit_interaction'] = df_encoded['age'] * df_encoded['credit_month']
df_encoded = df_encoded.drop(columns=['age', 'credit_month'])


#Result: did not improve the model ac:  0.825475841874085 -> 0.8231698389458272

In [139]:
# Apply Box-Cox transformation to the selected columns
for col in ['credit_sum','monthly_income']:
    if np.min(df_encoded[col]) > 0:  # Box-Cox transformation requires positive values
        transformed_data, _ = stats.boxcox(df_encoded[col])
        df_encoded[col] = transformed_data

In [121]:
# Define the bin edges and labels for age groups
bin_edges = [0, 25, 40, 60, float('inf')]
bin_labels = ['young', 'adult', 'senior', 'elderly']

# Bin the 'age' feature into age groups
df_encoded['age_group'] = pd.cut(df_encoded['age'], bins=bin_edges, labels=bin_labels, right=False)

# Drop the original 'age' column if desired
df_encoded = df_encoded.drop(columns=['age'])
df_encoded.head()

df_encoded = pd.get_dummies(df_encoded, columns=['age_group'])

# Result: did not improve the model :0.825475841874085 -> 0.8245607613469985

In [101]:
mean_target_encoding = df_encoded.groupby('marital_status')['open_account_flg'].mean()
df_encoded['marital_status_mean_encoded'] = df_encoded['marital_status'].map(mean_target_encoding)
df_encoded = df_encoded.drop(columns=['marital_status'])
# Result: did not improve the model :0.825475841874085 -> 0.8248169838945827

In [122]:
df_encoded.head()

Unnamed: 0,client_id,credit_sum,credit_month,score_shk,monthly_income,credit_count,overdue_credit_count,open_account_flg,gender_F,gender_M,...,living_region_ЧУВАШСКАЯ РЕСПУБЛИКА,living_region_ЧУВАШСКАЯ РЕСПУБЛИКА - ЧУВАШИЯ,living_region_ЧУКОТСКИЙ АО,living_region_ЯМАЛО-НЕНЕЦКИЙ АО,living_region_ЯРОСЛАВСКАЯ ОБЛ,living_region_ЯРОСЛАВСКАЯ ОБЛАСТЬ,age_group_young,age_group_adult,age_group_senior,age_group_elderly
0,52372,-0.858123,-0.276914,-1.298542,-0.604258,-1.111251,-0.212496,0,False,True,...,False,False,False,False,False,False,False,True,False,False
1,75213,-0.515116,-0.276914,0.230904,-0.604258,1.119015,-0.212496,0,True,False,...,False,False,False,False,False,False,False,True,False,False
2,119931,0.023782,-0.276914,-0.302584,-0.604258,-1.111251,-0.212496,0,False,True,...,False,False,False,False,False,False,True,False,False,False
3,134365,-0.35248,0.28891,2.427188,0.393404,-1.111251,-0.212496,1,False,True,...,False,False,False,False,False,False,True,False,False,False
4,138695,-0.867298,-0.276914,1.191403,-0.883603,-1.111251,-0.212496,0,True,False,...,False,False,False,False,False,False,False,False,True,False


In [140]:
# Split the data into features (X) and target (y)
X = df_encoded.drop('open_account_flg', axis=1)
y = df_encoded['open_account_flg']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [147]:
# Model Training
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
y_pred_proba= model.predict_proba(X_test)
for i in range(len(y_test)):
    predicted_class = y_pred[i]
    confidence_scores = y_pred_proba[i]
    print(f"Sample {i+1}: Predicted class: {predicted_class}, Confidence scores: {confidence_scores}")

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Sample 1: Predicted class: 0, Confidence scores: [0.6 0.4]
Sample 2: Predicted class: 0, Confidence scores: [0.79 0.21]
Sample 3: Predicted class: 0, Confidence scores: [0.97 0.03]
Sample 4: Predicted class: 1, Confidence scores: [0.46 0.54]
Sample 5: Predicted class: 0, Confidence scores: [0.71 0.29]
Sample 6: Predicted class: 0, Confidence scores: [0.62 0.38]
Sample 7: Predicted class: 0, Confidence scores: [0.95 0.05]
Sample 8: Predicted class: 0, Confidence scores: [0.95 0.05]
Sample 9: Predicted class: 0, Confidence scores: [0.8 0.2]
Sample 10: Predicted class: 0, Confidence scores: [0.97 0.03]
Sample 11: Predicted class: 0, Confidence scores: [1. 0.]
Sample 12: Predicted class: 0, Confidence scores: [0.79 0.21]
Sample 13: Predicted class: 0, Confidence scores: [1. 0.]
Sample 14: Predicted class: 0, Confidence scores: [0.94 0.06]
Sample 15: Predicted class: 0, Confidence scores: [0.8 0.2]
Sample 16: Predicted class: 0, Confidence scores: [0.88 0.12]
Sample 17: Predicted class: 1, 

In [125]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
# Model Training
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Model Evaluation
y_pred_gb = gb_model.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Gradient Boosting Accuracy:", accuracy_gb)

Gradient Boosting Accuracy: 0.8260248901903368


In [126]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

# Model Training
adaboost_model = AdaBoostClassifier(random_state=42)
adaboost_model.fit(X_train, y_train)

# Model Evaluation
y_pred_adaboost = adaboost_model.predict(X_test)
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
print("AdaBoost Accuracy:", accuracy_adaboost)

AdaBoost Accuracy: 0.8221449487554905


In [127]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Model Training
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Model Evaluation
y_pred_xgb = xgb_model.predict(X_test)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy:", accuracy_xgb)


XGBoost Accuracy: 0.8264275256222547
