In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [4]:
train = pd.read_csv('cs-training.csv',index_col='Unnamed: 0')

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         120269 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    146076 non-null  float64
dtypes: fl

In [6]:
    def col_groupage(age):
        if age < 41:
            return 0
        elif 41 <= age <= 52:
            return 1
        elif 52 <= age <= 63:
            return 2
        else:
            return 3

In [7]:
train['group_age'] = train['age'].apply(col_groupage)

In [8]:
    def replace_MonthlyIncome(group_age):
        if group_age == 0:
            return 4980
        if group_age == 1:
            return 7490
        if group_age == 2:
            return 7590
        else:
            return 6400

In [9]:
train['MonthlyIncome'] = train['MonthlyIncome'].fillna(train['MonthlyIncome'].apply(replace_MonthlyIncome))

In [10]:
train['NumberOfDependents'] = train['NumberOfDependents'].fillna(0)

In [11]:
x_train = train.drop('SeriousDlqin2yrs',axis=1)
y_train = train['SeriousDlqin2yrs']

In [12]:
from sklearn.model_selection import train_test_split
x_train_ , x_test_, y_train_, y_test_ = train_test_split(x_train, y_train,test_size=0.2, random_state=42)

In [13]:
class DF(Dataset):

    def __init__(self,x,y):

        features = x.values
        labels = y.values

        self.x = torch.tensor(features,dtype=torch.float32)
        self.y = torch.tensor(labels,dtype=torch.float32)

    def __len__(self):
        return len(self.y)
    
    def __getitem__(self,idx):
        return self.x[idx], self.y[idx]

In [14]:
dataset_ = DF(x_train_,y_train_)

In [15]:
batch_size = 16

In [16]:
class Model(nn.Module):
    def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
        super (Model,self).__init__()

        self.input_to_hidden = nn.Linear(input_dim, hidden_dim)
        self.hidden_layer_1 = nn.Linear(hidden_dim, hidden_dim)
        # self.hidden_layer_2 = nn.Linear(hidden_dim, hidden_dim)
        self.hidden_to_output = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:

    # no activation and no softmax at the end
        x = self.relu(self.input_to_hidden(x))
        x = self.relu(self.hidden_layer_1(x))
        # x = self.relu(self.hidden_layer_2(x))
        x = self.hidden_to_output(x)
        return x



In [17]:
input_dim = train.shape[1] - 1
# number of hidden layers
hidden_layers = 50
# output dimension is 1 because of linear regression
output_dim = 1
# initiate the linear regression model
model = Model(input_dim, hidden_layers, output_dim)

In [18]:
criterion = nn.MSELoss()

optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [19]:
num_epochs = 10

In [20]:

train_dataset = DF(x_train_, y_train_)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = DF(x_test_, y_test_)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [21]:
for epoch in range(num_epochs):
    for inputs, labels in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.unsqueeze(1))
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 0.0587
Epoch [2/10], Loss: 0.0012
Epoch [3/10], Loss: 0.0587
Epoch [4/10], Loss: 0.0588
Epoch [5/10], Loss: 0.0024
Epoch [6/10], Loss: 0.1147
Epoch [7/10], Loss: 0.0596
Epoch [8/10], Loss: 0.0251
Epoch [9/10], Loss: 0.0593
Epoch [10/10], Loss: 0.0613


In [45]:
correct = 0
total = 0
train_loss_values = []
train_accuracy_values = []
with torch.no_grad():
    predictions = model(test_dataset.x)  
    for inputs, labels in test_dataloader:
        outputs = model(inputs)
        predicted = outputs.round()
        total += labels.size(0)
        correct += (predicted == labels.unsqueeze(1)).sum().item()
        train_loss_values.append(loss.item())
        train_accuracy_values.append(accuracy)

accuracy = correct / total
print(f'Accuracy: {accuracy:.2%}')
   

mse = F.mse_loss(predictions, test_dataset.y)  
print(f'MSE: {mse.item()}')

r2_ = r2_score(y_true=test_dataset.y, y_pred=predictions)
print(f'R-squared: {r2_}')


Accuracy: 93.48%


  mse = F.mse_loss(predictions, test_dataset.y)


MSE: 0.062065884470939636
R-squared: -0.01832583543345323


In [24]:
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=10, random_state=42, oob_score=True)
rf_model.fit(x_train_,y_train_)

  warn(


In [25]:
from sklearn.metrics import mean_squared_error, r2_score

predictions = rf_model.predict(x_test_)

mse = mean_squared_error(y_test_, predictions)
print(f'MSE: {mse}')

r2 = r2_score(y_true=y_test_, y_pred=predictions)
print(f'R-squared: {r2}')


MSE: 0.05584198387340322
R-squared: 0.0837910298485286


In [27]:
from catboost import CatBoostClassifier
cat_model = CatBoostClassifier()
cat_model.fit(x_train_,y_train_)

Learning rate set to 0.079569
0:	learn: 0.5790058	total: 208ms	remaining: 3m 27s
1:	learn: 0.4953058	total: 247ms	remaining: 2m 3s
2:	learn: 0.4339010	total: 273ms	remaining: 1m 30s
3:	learn: 0.3838282	total: 293ms	remaining: 1m 12s
4:	learn: 0.3429819	total: 308ms	remaining: 1m 1s
5:	learn: 0.3095903	total: 323ms	remaining: 53.5s
6:	learn: 0.2836166	total: 335ms	remaining: 47.5s
7:	learn: 0.2636081	total: 352ms	remaining: 43.6s
8:	learn: 0.2485296	total: 368ms	remaining: 40.6s
9:	learn: 0.2364655	total: 385ms	remaining: 38.1s
10:	learn: 0.2269857	total: 402ms	remaining: 36.1s
11:	learn: 0.2190885	total: 418ms	remaining: 34.4s
12:	learn: 0.2126106	total: 435ms	remaining: 33.1s
13:	learn: 0.2073337	total: 453ms	remaining: 31.9s
14:	learn: 0.2032236	total: 468ms	remaining: 30.7s
15:	learn: 0.1996275	total: 482ms	remaining: 29.7s
16:	learn: 0.1964326	total: 496ms	remaining: 28.7s
17:	learn: 0.1938784	total: 509ms	remaining: 27.8s
18:	learn: 0.1922036	total: 524ms	remaining: 27s
19:	learn:

<catboost.core.CatBoostClassifier at 0x1e46cb7be50>

In [28]:
from sklearn.metrics import accuracy_score
predictions_cat = cat_model.predict(x_test_)
mse = mean_squared_error(y_test_, predictions_cat)
print(f'MSE: {mse}')

r2 = r2_score(y_true=y_test_, y_pred=predictions_cat)
print(f'R-squared: {r2}')
accuracy = accuracy_score(y_true=y_test_, y_pred=predictions_cat)
print(f'accuracy: {accuracy*100}%')

MSE: 0.0625
R-squared: -0.025448178279005917
accuracy: 93.75%


In [30]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

Вывод:
Torch model:
Accuracy: 93.48%
MSE: 0.062065884470939636
R-squared: -0.01832583543345323

RF model:
MSE: 0.05584198387340322
R-squared: 0.0837910298485286

catboost model:
accuracy: 93.75%
MSE: 0.0625
R-squared: -0.025448178279005917


RF показало лучшее метрики, чем остальные модели