In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from scipy.stats import mstats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('data/scaled_data/standardized/dataset_filled_combined_standardized.csv')
columns_to_check = ['MonthlyIncome', 'TotalAssets']
def remove_outliers_for_selected_columns(data, columns, threshold=3):
    while True:
        z_scores = np.abs((data[columns] - data[columns].mean()) / data[columns].std())
        outliers = (z_scores > threshold).sum().sum()
        if outliers == 0:
            break
        data = data[(z_scores <= threshold).all(axis=1)]
        
    return data
filtered_df = remove_outliers_for_selected_columns(df, columns_to_check)
print(f"Liczba wierszy przed usunięciem wartości odstających: {len(df)}")
print(f"Liczba wierszy po usunięciu wartości odstających: {len(filtered_df)}")
filtered_df.to_csv('data/scaled_data/standardized/filtered_outliers/dataset_filtered_selected_outliers.csv', index=False)
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
z_scores = np.abs((df[numerical_columns] - df[numerical_columns].mean()) / df[numerical_columns].std())
outliers = (z_scores > 3).sum()
print("Liczba wartości odstających w każdej kolumnie:")


print(outliers[outliers > 0])
numerical_columns = filtered_df.select_dtypes(include=['float64', 'int64']).columns
z_scores = np.abs((filtered_df[numerical_columns] - filtered_df[numerical_columns].mean()) / filtered_df[numerical_columns].std())
outliers = (z_scores > 3).sum()
print("Liczba wartości odstających w każdej kolumnie(po usuwaniu ):")
print(outliers[outliers > 0])

Liczba wierszy przed usunięciem wartości odstających: 20000
Liczba wierszy po usunięciu wartości odstających: 16794
Liczba wartości odstających w każdej kolumnie:
Age                             54
AnnualIncome                   381
CreditScore                    141
EmploymentStatus              1239
Experience                      66
LoanAmount                     426
LoanDuration                   444
MonthlyDebtPayments            348
CreditCardUtilizationRate      113
NumberOfOpenCreditLines        216
NumberOfCreditInquiries        312
DebtToIncomeRatio              122
BankruptcyHistory              913
PreviousLoanDefaults          1776
PaymentHistory                  76
SavingsAccountBalance          339
CheckingAccountBalance         384
TotalAssets                    354
TotalLiabilities               381
MonthlyIncome                  409
UtilityBillsPaymentHistory     181
JobTenure                       97
NetWorth                       367
BaseInterestRate               1

In [3]:
filtered_df = pd.read_csv('dataset_filtered_selected_outliers.csv')
z_scores = np.abs((filtered_df[columns_to_check] - filtered_df[columns_to_check].mean()) / filtered_df[columns_to_check].std())
outliers_after = (z_scores > 3).sum()
print("Liczba wartości odstających po usunięciu:")
print(outliers_after)

Liczba wartości odstających po usunięciu:
MonthlyIncome    0
TotalAssets      0
dtype: int64


In [4]:
df = pd.read_csv('dataset_filled_combined_standardized.csv')
filtered_df = pd.read_csv('dataset_filtered_selected_outliers.csv')

target_column = 'LoanApproved'
features = df.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(features, df[target_column], test_size=0.2, random_state=42)

model_df = LinearRegression()
model_df.fit(X_train, y_train)
y_pred_df = model_df.predict(X_test)
mse_df = mean_squared_error(y_test, y_pred_df)
r2_df = r2_score(y_test, y_pred_df)

print("Wyniki dla zbioru `df`:")
print(f"Mean Squared Error (MSE): {mse_df}")
print(f"R² Score: {r2_df:.2f}")
X_train_f, X_test_f, y_train_f, y_test_f = train_test_split(filtered_df.drop(columns=[target_column]), filtered_df[target_column], test_size=0.2, random_state=42)
model_filtered = LinearRegression()
model_filtered.fit(X_train_f, y_train_f)
y_pred_filtered = model_filtered.predict(X_test_f)
mse_filtered = mean_squared_error(y_test_f, y_pred_filtered)
r2_filtered = r2_score(y_test_f, y_pred_filtered)

print("\nWyniki dla zbioru `filtered_df`:")
print(f"Mean Squared Error (MSE): {mse_filtered}")
print(f"R² Score: {r2_filtered:.2f}")
print("\nPorównanie wyników:")
print(f"Różnica MSE: {mse_df - mse_filtered}")
print(f"Różnica R²: {r2_df - r2_filtered:.2f}")

Wyniki dla zbioru `df`:
Mean Squared Error (MSE): 0.06605877921177307
R² Score: 0.62

Wyniki dla zbioru `filtered_df`:
Mean Squared Error (MSE): 0.054512081704656705
R² Score: 0.60

Porównanie wyników:
Różnica MSE: 0.011546697507116364
Różnica R²: 0.03


In [5]:
filtered_df = pd.read_csv('dataset_filtered_selected_outliers.csv')

target_column = 'LoanApproved'
features = df.drop(columns=[target_column])
target = df[target_column]

# X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

test_size = 0.2  
random_state = 42  

np.random.seed(random_state)
indices = np.arange(features.shape[0])
np.random.shuffle(indices)

split_index = int(features.shape[0] * (1 - test_size))

train_indices = indices[:split_index]
test_indices = indices[split_index:]

X_train = features.iloc[train_indices]
X_test = features.iloc[test_indices]
y_train = target.iloc[train_indices]
y_test = target.iloc[test_indices]

def manual_mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def manual_r2(y_true, y_pred):
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    return 1 - (ss_residual / ss_total)


## Linear Regression

In [6]:
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
y_pred_lr = model_lr.predict(X_test)

mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print(f"MSE (Linear Regression): {mse_lr}")
print(f"R² (Linear Regression): {r2_lr}")

MSE (Linear Regression): 0.05925508469248257
R² (Linear Regression): 0.643138263745725


In [7]:
manual_mse_lr = manual_mse(y_test, y_pred_lr)
manual_r2_lr = manual_r2(y_test, y_pred_lr)

print(f"Manual MSE (Linear Regression): {manual_mse_lr}")
print(f"Manual R² (Linear Regression): {manual_r2_lr}")

Manual MSE (Linear Regression): 0.05925508469248257
Manual R² (Linear Regression): 0.643138263745725


## Random Forest Regressor

In [8]:
from sklearn.ensemble import RandomForestRegressor

model_rf = RandomForestRegressor(random_state=42)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"MSE (Random Forest Regressor): {mse_rf}")
print(f"R² (Random Forest Regressor): {r2_rf}")

MSE (Random Forest Regressor): 0.03506247500000001
R² (Random Forest Regressor): 0.7888374344445159


In [9]:
manual_mse_rf = manual_mse(y_test, y_pred_rf)
manual_r2_rf = manual_r2(y_test, y_pred_rf)

print(f"Manual MSE (Random Forest Regressor): {manual_mse_rf}")
print(f"Manual R² (Random Forest Regressor): {manual_r2_rf}")

Manual MSE (Random Forest Regressor): 0.035062475000000336
Manual R² (Random Forest Regressor): 0.7888374344445159


## Gradient Boosting Regressor

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

model_gb = GradientBoostingRegressor(random_state=42)
model_gb.fit(X_train, y_train)
y_pred_gb = model_gb.predict(X_test)

mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f"MSE (Gradient Boosting Regressor): {mse_gb}")
print(f"R² (Gradient Boosting Regressor): {r2_gb}")


MSE (Gradient Boosting Regressor): 0.03557092647938056
R² (Gradient Boosting Regressor): 0.7857753026684083


In [11]:
manual_mse_gb = manual_mse(y_test, y_pred_gb)
manual_r2_gb = manual_r2(y_test, y_pred_gb)

print(f"Manual MSE (Gradient Boosting Regressor): {manual_mse_gb}")
print(f"Manual R² (Gradient Boosting Regressor): {manual_r2_gb}")

Manual MSE (Gradient Boosting Regressor): 0.03557092647938053
Manual R² (Gradient Boosting Regressor): 0.7857753026684083


## XGBoost Regressor 

In [12]:
from xgboost import XGBRegressor

model_xgb = XGBRegressor(random_state=42, use_label_encoder=False, eval_metric='rmse')
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"MSE (XGBoost Regressor): {mse_xgb}")
print(f"R² (XGBoost Regressor): {r2_xgb}")


Parameters: { "use_label_encoder" } are not used.



MSE (XGBoost Regressor): 0.03593787325357557
R² (XGBoost Regressor): 0.7835653781761605


In [13]:
manual_mse_xgb = manual_mse(y_test, y_pred_xgb)
manual_r2_xgb = manual_r2(y_test, y_pred_xgb)

print(f"Manual MSE (XGBoost Regressor): {manual_mse_xgb}")
print(f"Manual R² (XGBoost Regressor): {manual_r2_xgb}")

Manual MSE (XGBoost Regressor): 0.035937873253575434
Manual R² (XGBoost Regressor): 0.7835653781761605


## CatBoost Regressor

In [14]:
from catboost import CatBoostRegressor

model_cb = CatBoostRegressor(random_state=42, verbose=0)  
model_cb.fit(X_train, y_train)
y_pred_cb = model_cb.predict(X_test)

mse_cb = mean_squared_error(y_test, y_pred_cb)
r2_cb = r2_score(y_test, y_pred_cb)

print(f"MSE (CatBoost Regressor): {mse_cb}")
print(f"R² (CatBoost Regressor): {r2_cb}")


MSE (CatBoost Regressor): 0.03152092915107106
R² (CatBoost Regressor): 0.8101662741083506


In [15]:
manual_mse_cb = manual_mse(y_test, y_pred_cb)
manual_r2_cb = manual_r2(y_test, y_pred_cb)

print(f"Manual MSE (CatBoost Regressor): {manual_mse_cb}")
print(f"Manual R² (CatBoost Regressor): {manual_r2_cb}")

Manual MSE (CatBoost Regressor): 0.031520929151071034
Manual R² (CatBoost Regressor): 0.8101662741083506


In [16]:
results = pd.DataFrame({
    'Model': ['Linear Regression', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'CatBoost'],
    'MSE': [mse_lr, mse_rf, mse_gb, mse_xgb, mse_cb],
    'R²': [r2_lr, r2_rf, r2_gb, r2_xgb, r2_cb]
})

print(results)

               Model       MSE        R²
0  Linear Regression  0.059255  0.643138
1      Random Forest  0.035062  0.788837
2  Gradient Boosting  0.035571  0.785775
3            XGBoost  0.035938  0.783565
4           CatBoost  0.031521  0.810166


## Klasyfikator głosujący + stacking
## Walidacja krzyżowa (ręczna implementacja)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.base import clone
voting_regressor = VotingRegressor(
    estimators=[
        ('catboost', model_cb),
        ('xgboost', model_xgb),
        ('gradient_boosting', model_gb),
        ('random_forest', model_rf),
        ('linear_regression', model_lr)
    ],
)

base_models = [
    ('catboost', model_cb),
    ('xgboost', model_xgb),
    ('gradient_boosting', model_gb),
    ('random_forest', model_rf)
]

stacking_regressor = StackingRegressor(
    estimators=base_models,

)
def manual_cross_val(X, y, model, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    mse_values = []
    r2_values = []
    
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model_clone = clone(model) 
        
        model_clone.fit(X_train, y_train)
        y_pred = model_clone.predict(X_test)
        mse = manual_mse(y_test, y_pred)
        r2 = manual_r2(y_test, y_pred)
        
        mse_values.append(mse)
        r2_values.append(r2)
    
    print(f"Average MSE: {np.mean(mse_values):.4f}")
    print(f"Average R²: {np.mean(r2_values):.4f}")
X = features.values  
y = df[target_column].values
print("Evaluating Voting Regressor:")
manual_cross_val(X, y, voting_regressor, k=2)
print("\nEvaluating Stacking Regressor:")
manual_cross_val(X, y, stacking_regressor, k=2)


Evaluating Voting Regressor:


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Average MSE: 0.0377
Average R²: 0.7756

Evaluating Stacking Regressor:


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Average MSE: 0.0358
Average R²: 0.7873


In [19]:
import numpy as np

from sklearn.model_selection import KFold
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.base import clone
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier


model_cb = CatBoostClassifier(random_state=42, verbose=0)  
model_xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
model_gb = GradientBoostingClassifier(random_state=42)
model_rf = RandomForestClassifier(random_state=42)
model_lr = LogisticRegression()

voting_classifier = VotingClassifier(
    estimators=[ 
        ('catboost', model_cb),
        ('xgboost', model_xgb),
        ('gradient_boosting', model_gb),
        ('random_forest', model_rf),
        ('linear_regression', model_lr)
    ],
    voting='soft' 
)
base_models = [
    ('catboost', model_cb),
    ('xgboost', model_xgb),
    ('gradient_boosting', model_gb),
    ('random_forest', model_rf)
]

stacking_classifier = StackingClassifier(
    estimators=base_models,
)
def manual_cross_val(X, y, model, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    mse_values = []
    r2_values = []
    
    for train_idx, test_idx in kf.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
        
        model_clone = clone(model) 
        
        model_clone.fit(X_train, y_train)
        y_pred = model_clone.predict(X_test)
        mse = manual_mse(y_test, y_pred)
        r2 = manual_r2(y_test, y_pred)
        
        mse_values.append(mse)
        r2_values.append(r2)
    
    print(f"Average MSE: {np.mean(mse_values):.4f}")
    print(f"Average R²: {np.mean(r2_values):.4f}")

def manual_mse(y_true, y_pred):
    return np.mean((y_true - y_pred) ** 2)

def manual_r2(y_true, y_pred):
    ss_total = np.sum((y_true - np.mean(y_true)) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    return 1 - (ss_residual / ss_total)
X = features.values  
y = df[target_column].values

print("Evaluating Voting Classifier:")
manual_cross_val(X, y, voting_classifier, k=5)
print("\nEvaluating Stacking Classifier:")
manual_cross_val(X, y, stacking_classifier, k=5)


Evaluating Voting Classifier:


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Average MSE: 0.0394
Average R²: 0.7653

Evaluating Stacking Classifier:


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Average MSE: 0.0389
Average R²: 0.7681
