In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from scipy.stats import mstats
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [43]:
# Load the standardized dataset; the path is relative to the notebook's
# working directory.
df = pd.read_csv(
    'data/scaled_data/standardized/dataset_filled_combined_standardized.csv'
)
# Only these columns are screened for outliers by the filtering step below.
columns_to_check = [
    'MonthlyIncome',
    'TotalAssets',
]
def remove_outliers_for_selected_columns(data, columns, threshold=3):
    """Iteratively drop rows that are z-score outliers in the given columns.

    On every pass the z-scores of ``columns`` are recomputed from the rows that
    are still present (sample standard deviation, pandas default), and every
    row with ``|z| > threshold`` in at least one of those columns is removed.
    The passes repeat until no such row is left, so the result can be
    considerably smaller than after a single z-score cut.

    Undefined z-scores (NaN, e.g. from a missing value or from a column whose
    standard deviation is 0) are never treated as outliers. This keeps the
    row filter consistent with the stopping condition; previously such rows
    were silently dropped whenever any real outlier existed.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; filtering produces new frames.
    columns : str or list-like of str
        Column name(s) to screen for outliers.
    threshold : float, default 3
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        ``data`` restricted to the rows that survived every pass (the very
        same object when nothing had to be removed).
    """
    # Accept a single column name as well as a collection of names.
    columns = [columns] if isinstance(columns, str) else list(columns)

    while True:
        selected = data[columns]
        z_scores = np.abs((selected - selected.mean()) / selected.std())
        # NaN > threshold evaluates to False, so undefined z-scores never
        # mark a row for removal.
        outlier_rows = (z_scores > threshold).any(axis=1)
        if not outlier_rows.any():
            break
        data = data[~outlier_rows]

    return data
# Iteratively remove rows that are z-score outliers in the selected columns.
filtered_df = remove_outliers_for_selected_columns(df, columns_to_check)

# Report the row counts before and after filtering.
print(f"Liczba wierszy przed usunięciem wartości odstających: {len(df)}")
print(f"Liczba wierszy po usunięciu wartości odstających: {len(filtered_df)}")

# Persist the filtered frame (without the index) for later cells.
filtered_df.to_csv(
    'data/scaled_data/standardized/filtered_outliers/dataset_filtered_selected_outliers.csv',
    index=False,
)

# Per-column count of |z| > 3 values in the unfiltered data
# (float64 / int64 columns only).
numerical_columns = df.select_dtypes(include=['float64', 'int64']).columns
z_scores = (
    df[numerical_columns]
    .sub(df[numerical_columns].mean())
    .div(df[numerical_columns].std())
    .abs()
)
outliers = z_scores.gt(3).sum()
print("Liczba wartości odstających w każdej kolumnie:")


# Show only the columns that contain at least one outlier.
print(outliers.loc[outliers > 0])

# Same count on the filtered data, with mean/std recomputed on that frame.
numerical_columns = filtered_df.select_dtypes(include=['float64', 'int64']).columns
z_scores = (
    filtered_df[numerical_columns]
    .sub(filtered_df[numerical_columns].mean())
    .div(filtered_df[numerical_columns].std())
    .abs()
)
outliers = z_scores.gt(3).sum()
print("Liczba wartości odstających w każdej kolumnie(po usuwaniu ):")
print(outliers.loc[outliers > 0])

Liczba wierszy przed usunięciem wartości odstających: 20000
Liczba wierszy po usunięciu wartości odstających: 16794
Liczba wartości odstających w każdej kolumnie:
Age                             54
AnnualIncome                   381
CreditScore                    141
EmploymentStatus              1239
Experience                      66
LoanAmount                     426
LoanDuration                   444
MonthlyDebtPayments            348
CreditCardUtilizationRate      113
NumberOfOpenCreditLines        216
NumberOfCreditInquiries        312
DebtToIncomeRatio              122
BankruptcyHistory              913
PreviousLoanDefaults          1776
PaymentHistory                  76
SavingsAccountBalance          339
CheckingAccountBalance         384
TotalAssets                    354
TotalLiabilities               381
MonthlyIncome                  409
UtilityBillsPaymentHistory     181
JobTenure                       97
NetWorth                       367
BaseInterestRate               1

In [38]:
# Re-load the filtered dataset from the location the filtering cell writes it
# to. The previous bare file name pointed at the working directory, so a fresh
# run would either fail or silently check a stale copy.
filtered_df = pd.read_csv('data/scaled_data/standardized/filtered_outliers/dataset_filtered_selected_outliers.csv')

# Recompute z-scores of the screened columns on the filtered data and count the
# values that still exceed |z| > 3 (expected: none).
z_scores = np.abs((filtered_df[columns_to_check] - filtered_df[columns_to_check].mean()) / filtered_df[columns_to_check].std())
outliers_after = (z_scores > 3).sum()
print("Liczba wartości odstających po usunięciu:")
print(outliers_after)

Liczba wartości odstających po usunięciu:
MonthlyIncome    0
TotalAssets      0
dtype: int64


In [40]:
# Read both datasets from the same locations the outlier-filtering cell uses
# (it loads the first and writes the second). The previous bare file names
# pointed at the working directory and could pick up stale or missing files.
df = pd.read_csv('data/scaled_data/standardized/dataset_filled_combined_standardized.csv')
filtered_df = pd.read_csv('data/scaled_data/standardized/filtered_outliers/dataset_filtered_selected_outliers.csv')
target_column = 'LoanApproved'
# NOTE(review): the target name suggests a 0/1 label; if so, a classifier with
# classification metrics may be a more natural comparison — confirm.


def fit_and_score_linear_regression(features, target, test_size=0.2, random_state=42):
    """Split the data, fit a LinearRegression and score it on the held-out part.

    Parameters
    ----------
    features : pandas.DataFrame
        Predictor columns.
    target : pandas.Series
        Values to predict, aligned with ``features``.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``((X_train, X_test, y_train, y_test), model, y_pred, mse, r2)`` where
        ``y_pred`` are the predictions for ``X_test`` and ``mse`` / ``r2`` are
        computed against ``y_test``.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    model = LinearRegression()
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    mse = mean_squared_error(y_te, y_pred)
    r2 = r2_score(y_te, y_pred)
    return (X_tr, X_te, y_tr, y_te), model, y_pred, mse, r2


# Model trained on the full (unfiltered) dataset.
features = df.drop(columns=[target_column])
(X_train, X_test, y_train, y_test), model_df, y_pred_df, mse_df, r2_df = (
    fit_and_score_linear_regression(features, df[target_column])
)

print("Wyniki dla zbioru `df`:")
print(f"Mean Squared Error (MSE): {mse_df:.2f}")
print(f"R² Score: {r2_df:.2f}")

# Model trained on the outlier-filtered dataset. Its test split comes from the
# filtered data, so the two scores are measured on different test sets.
(X_train_f, X_test_f, y_train_f, y_test_f), model_filtered, y_pred_filtered, mse_filtered, r2_filtered = (
    fit_and_score_linear_regression(
        filtered_df.drop(columns=[target_column]), filtered_df[target_column]
    )
)

print("\nWyniki dla zbioru `filtered_df`:")
print(f"Mean Squared Error (MSE): {mse_filtered:.2f}")
print(f"R² Score: {r2_filtered:.2f}")

# Differences are computed from the unrounded metrics, so they may not equal
# the difference of the rounded values printed above.
print("\nPorównanie wyników:")
print(f"Różnica MSE: {mse_df - mse_filtered:.2f}")
print(f"Różnica R²: {r2_df - r2_filtered:.2f}")

Wyniki dla zbioru `df`:
Mean Squared Error (MSE): 0.07
R² Score: 0.62

Wyniki dla zbioru `filtered_df`:
Mean Squared Error (MSE): 0.05
R² Score: 0.60

Porównanie wyników:
Różnica MSE: 0.01
Różnica R²: 0.03
