Карпеченко Дмитрий, дз 30 v.1

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

%matplotlib inline
from matplotlib import pyplot as plt
plt.style.use(['seaborn-darkgrid'])
plt.rcParams['font.family'] = 'DejaVu Sans'

from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import validation_curve, learning_curve

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
from torch.nn import ReLU

RANDOM_STATE = 17

In [2]:
def make_submission(predictions, fname):
    out = pd.DataFrame(data=predictions, columns=['Probability'])
    out.index += 1
    out.to_csv(fname, index_label='id')

In [3]:
df = pd.read_csv(r'D:\TeachMeSkills\DZ\dz18\cs-training.csv', index_col=0)

In [4]:
feature_names = df.columns[1:]
df.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [5]:
print(df.shape)
df.describe(include = "all").T

(150000, 11)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SeriousDlqin2yrs,150000.0,0.06684,0.249746,0.0,0.0,0.0,0.0,1.0
RevolvingUtilizationOfUnsecuredLines,150000.0,6.048438,249.755371,0.0,0.029867,0.154181,0.559046,50708.0
age,150000.0,52.295207,14.771866,0.0,41.0,52.0,63.0,109.0
NumberOfTime30-59DaysPastDueNotWorse,150000.0,0.421033,4.192781,0.0,0.0,0.0,0.0,98.0
DebtRatio,150000.0,353.005076,2037.818523,0.0,0.175074,0.366508,0.868254,329664.0
MonthlyIncome,120269.0,6670.221237,14384.674215,0.0,3400.0,5400.0,8249.0,3008750.0
NumberOfOpenCreditLinesAndLoans,150000.0,8.45276,5.145951,0.0,5.0,8.0,11.0,58.0
NumberOfTimes90DaysLate,150000.0,0.265973,4.169304,0.0,0.0,0.0,0.0,98.0
NumberRealEstateLoansOrLines,150000.0,1.01824,1.129771,0.0,0.0,1.0,2.0,54.0
NumberOfTime60-89DaysPastDueNotWorse,150000.0,0.240387,4.155179,0.0,0.0,0.0,0.0,98.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150000 entries, 1 to 150000
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         120269 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    146076 non-null  float64
dtype

In [7]:
df.isnull().sum()

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

Удалим записи, где возраст равен 0. Такая запись одна)

In [8]:
df=df[(df['age']!=0)]

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149999 entries, 1 to 150000
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      149999 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  149999 non-null  float64
 2   age                                   149999 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  149999 non-null  int64  
 4   DebtRatio                             149999 non-null  float64
 5   MonthlyIncome                         120268 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       149999 non-null  int64  
 7   NumberOfTimes90DaysLate               149999 non-null  int64  
 8   NumberRealEstateLoansOrLines          149999 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  149999 non-null  int64  
 10  NumberOfDependents                    146075 non-null  float64
dtype

Пропуски в столбце MonthlyIncome заменим медианными значениями

In [10]:
df['MonthlyIncome']=df['MonthlyIncome'].fillna(df['MonthlyIncome'].median())

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149999 entries, 1 to 150000
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      149999 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  149999 non-null  float64
 2   age                                   149999 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  149999 non-null  int64  
 4   DebtRatio                             149999 non-null  float64
 5   MonthlyIncome                         149999 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       149999 non-null  int64  
 7   NumberOfTimes90DaysLate               149999 non-null  int64  
 8   NumberRealEstateLoansOrLines          149999 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  149999 non-null  int64  
 10  NumberOfDependents                    146075 non-null  float64
dtype

Пропуски в столбце NumberOfDependents заменим на 0, как самый распростаненный

In [12]:
df['NumberOfDependents']=df['NumberOfDependents'].fillna(0)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149999 entries, 1 to 150000
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      149999 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  149999 non-null  float64
 2   age                                   149999 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  149999 non-null  int64  
 4   DebtRatio                             149999 non-null  float64
 5   MonthlyIncome                         149999 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       149999 non-null  int64  
 7   NumberOfTimes90DaysLate               149999 non-null  int64  
 8   NumberRealEstateLoansOrLines          149999 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  149999 non-null  int64  
 10  NumberOfDependents                    149999 non-null  float64
dtype

Для столбца NumberOfTime60-89DaysPastDueNotWorse удалим выбивающиеся значения 98 и 96

In [14]:
df['NumberOfTime60-89DaysPastDueNotWorse'].value_counts()

0     142395
1       5731
2       1118
3        318
98       264
4        105
5         34
6         16
7          9
96         5
8          2
11         1
9          1
Name: NumberOfTime60-89DaysPastDueNotWorse, dtype: int64

In [15]:
df = df.loc[df['NumberOfTime60-89DaysPastDueNotWorse'] != 98]

In [16]:
df = df.loc[df['NumberOfTime60-89DaysPastDueNotWorse'] != 96]

Вместо столбцов NumberOfTime30-59DaysPastDueNotWorse, NumberOfTime60-89DaysPastDueNotWorse, NumberOfTimes90DaysLate введем новый столбец в котором будет среднее количество просрочек за период от 30 до бесконечности дней

In [17]:
t=(df['NumberOfTime60-89DaysPastDueNotWorse']+df['NumberOfTime30-59DaysPastDueNotWorse']+df['NumberOfTimes90DaysLate'])/3
df.insert(2,'MeanPastDueNotWorse',t)
df.drop(labels=['NumberOfTime60-89DaysPastDueNotWorse','NumberOfTime30-59DaysPastDueNotWorse','NumberOfTimes90DaysLate'], axis=1, inplace=True)

In [18]:
df.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,MeanPastDueNotWorse,age,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberRealEstateLoansOrLines,NumberOfDependents
count,149730.0,149730.0,149730.0,149730.0,149730.0,149730.0,149730.0,149730.0,149730.0
mean,0.065979,6.057542,0.13369,52.327984,353.631316,6423.097,8.467949,1.020063,0.738142
std,0.248246,249.980364,0.367383,14.754371,2039.601344,12901.12,5.138107,1.129961,1.107373
min,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029771,0.0,41.0,0.175994,3915.0,5.0,0.0,0.0
50%,0.0,0.153488,0.0,52.0,0.367119,5400.0,8.0,1.0,0.0
75%,0.0,0.555592,0.0,63.0,0.870023,7400.0,11.0,2.0,1.0
max,1.0,50708.0,6.333333,109.0,329664.0,3008750.0,58.0,54.0,20.0


In [19]:
x=df.copy()
y=df.SeriousDlqin2yrs
x.drop(labels=['SeriousDlqin2yrs'],axis=1, inplace=True)

In [20]:
x=x.to_numpy()
y=y.to_numpy()

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y,random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.2, random_state=RANDOM_STATE, stratify=y_train)

In [22]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
print(X_val.shape)
print(y_val.shape)

(89837, 8)
(89837,)
(37433, 8)
(37433,)
(22460, 8)
(22460,)


In [23]:
from torch.utils.data import Dataset

In [24]:
class Data(Dataset):

    def __init__(self, X, y):

        self.X = torch.tensor(X, dtype = torch.float32)
        self.y = torch.tensor(y, dtype = torch.float32)
          
    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return len(self.X)

In [25]:
train_data = Data(X_train, y_train)

In [26]:
test_data = Data(X_test, y_test)

In [27]:
val_data = Data(X_val, y_val)

In [28]:
class SimpleNet(nn.Module):

    def __init__(self):
        super(SimpleNet, self).__init__()
        self.fc1 = nn.Linear(8,100)
        self.fc2 = nn.Linear(100,50)
        self.fc3 = nn.Linear(50,1)
        
        self.relu = ReLU()
        
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [29]:
def binary_acc(y_pred, y_true):
    y_pred_val = torch.round(torch.sigmoid(y_pred))
    correct_results_sum = torch.eq(y_pred_val, y_true).sum().float()
    acc = correct_results_sum/y_true.shape[0]

    return acc

In [30]:
simplenet = SimpleNet()

In [31]:
simplenet

SimpleNet(
  (fc1): Linear(in_features=8, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=1, bias=True)
  (relu): ReLU()
)

In [32]:
optimizer = optim.Adam(simplenet.parameters(), lr=0.001)

In [33]:
if torch.cuda.is_available():
    device = torch.device("cuda:0") 
else:
    device = torch.device("cpu")

simplenet.to(device)

SimpleNet(
  (fc1): Linear(in_features=8, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=50, bias=True)
  (fc3): Linear(in_features=50, out_features=1, bias=True)
  (relu): ReLU()
)

In [34]:
torch.cuda.is_available()

True

In [35]:
def train(model, optimizer, loss_fn, train_loader, val_loader, epochs=20,device="cuda:0"):
    for epoch in range(1, epochs+1):
        training_loss = 0.0
        train_acc = 0.0
        val_loss = 0.0
        val_acc = 0.0
        
        model.train()
        
        
        for batch in train_loader:
            optimizer.zero_grad()      
            X, y = batch
            X = X.to(device)
            y = y.to(device)
           
            y_pred = model(X)
        
            loss = loss_fn(y_pred, y.unsqueeze(1))
            acc = binary_acc(y_pred, y.unsqueeze(1))
            
            loss.backward()
            optimizer.step()
            
            training_loss += loss.item()
            train_acc += acc.item()
            
        training_loss /= len(train_loader)
        train_acc /= len(train_loader)
        
        model.eval()
                
        for batch in val_loader:
          
           
            X, y = batch
            X = X.to(device)
            y = y.to(device)
           
            y_pred = model(X)
        
            loss = loss_fn(y_pred, y.unsqueeze(1))
            
            acc = binary_acc(y_pred, y.unsqueeze(1))
            
            val_loss += loss.item()
  
         
            val_acc += acc.item()
            
        val_loss /= len(val_loader)
        val_acc /= len(val_loader)
        
        print('Epoch: {}, Training Loss: {:.2f}, accuracy = {:.5f}, Validation Loss: {:.2f}, accuracy = {:.5f}'.format(epoch, training_loss, train_acc, val_loss, val_acc))

from torch.utils.data.sampler import WeightedRandomSampler
from torch.utils.data import DataLoader

count=np.array([(y_train == 0).sum(),(y_train == 1).sum()])
weights = 1/count
print(f'Веса классов в обучающем наборе: {weights}')

sampler = WeightedRandomSampler(weights, num_samples=len(weights))
train_loader = DataLoader(train_data, batch_size=64, sampler=sampler)

In [36]:
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64)

In [37]:
val_loader = torch.utils.data.DataLoader(val_data, batch_size=64)

In [38]:
loss_fn = torch.nn.BCEWithLogitsLoss()

In [39]:
train(simplenet,optimizer,loss_fn,train_loader,val_loader, epochs=10)

Epoch: 1, Training Loss: 5.74, accuracy = 0.88489, Validation Loss: 1.57, accuracy = 0.93401
Epoch: 2, Training Loss: 4.01, accuracy = 0.89653, Validation Loss: 1.16, accuracy = 0.93401
Epoch: 3, Training Loss: 1.26, accuracy = 0.90708, Validation Loss: 0.96, accuracy = 0.93401
Epoch: 4, Training Loss: 0.91, accuracy = 0.90599, Validation Loss: 0.40, accuracy = 0.91554
Epoch: 5, Training Loss: 0.54, accuracy = 0.92062, Validation Loss: 0.32, accuracy = 0.93491
Epoch: 6, Training Loss: 0.41, accuracy = 0.92210, Validation Loss: 0.28, accuracy = 0.93446
Epoch: 7, Training Loss: 0.40, accuracy = 0.92846, Validation Loss: 0.24, accuracy = 0.93482
Epoch: 8, Training Loss: 0.33, accuracy = 0.92980, Validation Loss: 0.21, accuracy = 0.93486
Epoch: 9, Training Loss: 0.31, accuracy = 0.93245, Validation Loss: 0.23, accuracy = 0.93548
Epoch: 10, Training Loss: 0.25, accuracy = 0.93383, Validation Loss: 0.26, accuracy = 0.93415


In [40]:
test_loader = torch.utils.data.DataLoader(test_data, batch_size=64)

In [41]:
test_loss = 0.0
test_acc = 0.0
model_test = simplenet.eval()
loss_fn = torch.nn.BCEWithLogitsLoss()

In [42]:
for batch in test_loader:
    optimizer.zero_grad()      
    X, y = batch
    X = X.to(device)
    y = y.to(device)

    y_pred = model_test(X)
        
    loss = loss_fn(y_pred, y.unsqueeze(1))
    acc = binary_acc(y_pred, y.unsqueeze(1))
            
    loss.backward()
    optimizer.step()
            
    test_loss += loss.item()
    test_acc += acc.item()
            
test_loss /= len(test_loader)
test_acc /= len(test_loader)
print('Test Loss: {:.2f}, Test accuracy = {:.5f}'.format(test_loss, test_acc))

Test Loss: 0.25, Test accuracy = 0.93429
