In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc

version='v0.1'
app_tr_path="..\\data\\home-credit-default-risk\\application_train.csv"
app_te_path="..\\data\\home-credit-default-risk\\application_test.csv"

app_train = pd.read_csv(app_tr_path)
app_test = pd.read_csv(app_te_path)

In [124]:
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns.sort_values(
    '% of Total Values', ascending=False).round(1)
    return mis_val_table_ren_columns

missing_values = missing_values_table(app_train)
missing_values.head()

Unnamed: 0,Missing Values,% of Total Values
COMMONAREA_MEDI,214865,69.9
COMMONAREA_AVG,214865,69.9
COMMONAREA_MODE,214865,69.9
NONLIVINGAPARTMENTS_MODE,213514,69.4
NONLIVINGAPARTMENTS_AVG,213514,69.4


In [125]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le_count = 0
for col in app_train:
    if app_train[col].dtype == 'object':
        if len(list(app_train[col].unique())) <= 2:
            le.fit(app_train[col])
            app_train[col] = le.transform(app_train[col])
            app_test[col] = le.transform(app_test[col])
            le_count += 1

app_train = pd.get_dummies(app_train)
app_test = pd.get_dummies(app_test)

train_labels = app_train['TARGET']
app_train, app_test = app_train.align(app_test, join = 'inner', axis = 1)
app_train['TARGET'] = train_labels

In [126]:
def agg_numeric(df, parent_var, df_name):
    
    for col in df:
        if col != parent_var and 'SK_ID' in col:
            df = df.drop(columns = col)
            
    parent_ids = df[parent_var].copy()
    numeric_df = df.select_dtypes('number').copy()
    numeric_df[parent_var] = parent_ids
    agg = numeric_df.groupby(parent_var).agg(['count', 'mean', 'max', 'min', 'sum'])

    columns = []
    for var in agg.columns.levels[0]:
        if var != parent_var:
            for stat in agg.columns.levels[1]:
                columns.append('%s_%s_%s' % (df_name, var, stat))
    agg.columns = columns
    
    _, idx = np.unique(agg, axis = 1, return_index=True)
    agg = agg.iloc[:, idx]
    
    return agg

def agg_categorical(df, parent_var, df_name):
    
    categorical = pd.get_dummies(df.select_dtypes('category'))
    categorical[parent_var] = df[parent_var]
    categorical = categorical.groupby(parent_var).agg(['sum', 'count', 'mean'])
    
    column_names = []
    for var in categorical.columns.levels[0]:
        for stat in ['sum', 'count', 'mean']:
            column_names.append('%s_%s_%s' % (df_name, var, stat))
    categorical.columns = column_names
    
    _, idx = np.unique(categorical, axis = 1, return_index = True)
    categorical = categorical.iloc[:, idx]
    
    return categorical

In [127]:
def kde_target(var_name, df):
    
    corr = df['TARGET'].corr(df[var_name])
    avg_repaid = df.loc[df['TARGET'] == 0, var_name].median()
    avg_not_repaid = df.loc[df['TARGET'] == 1, var_name].median()
    
    plt.figure(figsize = (12, 6))
    sns.kdeplot(df.loc[df['TARGET'] == 0, var_name], label = 'TARGET == 0')
    sns.kdeplot(df.loc[df['TARGET'] == 1, var_name], label = 'TARGET == 1')

    plt.xlabel(var_name); plt.ylabel('Density'); plt.title('%s Distribution' % var_name)
    plt.legend() 
    
    print('The correlation between %s and the TARGET is %0.4f' % (var_name, corr))
    print('Median value for loan that was not repaid = %0.4f' % avg_not_repaid)
    print('Median value for loan that was repaid =     %0.4f' % avg_repaid)

In [128]:
import sys

def return_size(df):
    return round(sys.getsizeof(df) / 1e9, 2)

def convert_types(df, print_info = False):
    
    original_memory = df.memory_usage().sum()
    
    for c in df:
        if ('SK_ID' in c):
            df[c] = df[c].fillna(0).astype(np.int32)
        elif (df[c].dtype == 'object') and (df[c].nunique() < df.shape[0]):
            df[c] = df[c].astype('category')
        elif list(df[c].unique()) == [1, 0]:
            df[c] = df[c].astype(bool)
        elif df[c].dtype == float:
            df[c] = df[c].astype(np.float32)
        elif df[c].dtype == int:
            df[c] = df[c].astype(np.int32)
        
    new_memory = df.memory_usage().sum()
    
    if print_info:
        print(f'Original Memory Usage: {round(original_memory / 1e9, 2)} gb.')
        print(f'New Memory Usage: {round(new_memory / 1e9, 2)} gb.')
        
    return df


In [129]:
previous = pd.read_csv('../data/home-credit-default-risk/previous_application.csv')
previous = convert_types(previous, print_info=True)
previous_agg = agg_numeric(previous, 'SK_ID_CURR', 'previous')
previous_counts = agg_categorical(previous, 'SK_ID_CURR', 'previous')

train = app_train
train = convert_types(train)
test = app_test
test = convert_types(test)

train = train.merge(previous_counts, on ='SK_ID_CURR', how = 'left')
train = train.merge(previous_agg, on = 'SK_ID_CURR', how = 'left')

test = test.merge(previous_counts, on ='SK_ID_CURR', how = 'left')
test = test.merge(previous_agg, on = 'SK_ID_CURR', how = 'left')

gc.enable()
del previous, previous_agg, previous_counts
gc.collect()

Original Memory Usage: 0.49 gb.
New Memory Usage: 0.18 gb.


2919

In [130]:
train.shape

(307511, 605)

In [131]:
def remove_missing_columns(train, test, threshold = 75):

    train_miss = train.isnull().sum()
    test_miss = test.isnull().sum()

    train_index_list = train_miss.index.tolist()
    missing_train_columns=[]
    for i in range(train_miss.shape[0]) :
        if 100 * train_miss[i] / len(train)>75:
            missing_train_columns.append(train_index_list[i])

    test_index_list = test_miss.index.tolist()
    missing_test_columns=[]
    for i in range(test_miss.shape[0]) :
        if 100 * test_miss[i] / len(test)>75:
            missing_test_columns.append(test_index_list[i])
    
    missing_columns = list(set(missing_train_columns + missing_test_columns))

    train = train.drop(columns = missing_columns)
    test = test.drop(columns = missing_columns)
    
    return train, test

train, test = remove_missing_columns(train, test)
train.shape

(307511, 599)

In [132]:
threshold = 0.9

corr_matrix = train.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool_))
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

train = train.drop(columns = to_drop)
test = test.drop(columns = to_drop)

In [133]:
train.shape

(307511, 508)

In [134]:

import xgboost
train_t=train.copy()
train_t = train_t.drop(columns = ['TARGET'])
x = train_t.values
y = train['TARGET'].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2 , random_state = 5)
model = xgboost.XGBClassifier(n_estimatores = 200, max_depth = 8, subsample = 0.8, colsample_bytree = 0.8, 
                              min_child_weight = 50, random_state=27).fit(x_train, y_train)
#生成train数据放进模型后的答案
ans = model.predict_proba(x_test)

ans_test = model.predict_proba(test.values)
print(ans_test)



Parameters: { "n_estimatores" } are not used.

[[0.9538178  0.04618224]
 [0.86457473 0.13542528]
 [0.97555155 0.02444843]
 ...
 [0.9697569  0.03024312]
 [0.9441286  0.05587142]
 [0.7804461  0.21955387]]


In [135]:
#生成答案
index=test['SK_ID_CURR']
ans=pd.DataFrame(ans_test[:,1],columns = ['TARGET'])
result=pd.concat([index,ans],axis=1)
result.to_csv('../result/'+version+'/submission_t.csv',encoding = 'utf-8',index = 0)
result.shape

(48744, 2)

In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc

df = pd.read_csv('../data/app_tr.csv')
df_te = pd.read_csv('../data/app_te.csv')


In [51]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
for col in df:
    if df[col].dtype == 'object':
        if len(list(df[col].unique())) <= 2:
            le.fit(df[col])
            df[col] = le.transform(df[col])
            df_te[col] = le.transform(df_te[col])

df = pd.get_dummies(df)
df_te = pd.get_dummies(df_te)

train_labels = df['TARGET']
df, df_te = df.align(df_te, join = 'inner', axis = 1)
df['TARGET'] = train_labels


In [52]:
import torch


df_te= np.array(df_te)
x_test = df_te[:, 2:]

In [53]:
from torch.utils.data import DataLoader, Dataset
import torch.utils.data as data

class MyDataset(Dataset):
    def __init__(self, data_root, data_label):
        super(MyDataset, self).__init__()
        self.data = data_root
        self.label = data_label

    def __getitem__(self, index):
        data = self.data[index]
        labels = self.label[index]
        return data, labels

    def __len__(self):
        return len(self.data)


    def get_dataset(x,y):
        dataset = MyDataset(x, y)
        datas = DataLoader(dataset, batch_size=256, shuffle=False, sampler=None, \
                        batch_sampler=None, num_workers=0, collate_fn=None, pin_memory=False, \
                        drop_last=False, timeout=0, worker_init_fn=None, multiprocessing_context=None)
        return datas

dataset = MyDataset.get_dataset(x_train,y_train)
train_size = int(len(dataset)*0.7)
val_size = len(dataset)-train_size
train, val = data.random_split(dataset, [train_size, val_size])
x_train.shape

torch.Size([307511, 237])

In [58]:
import lightning as L
import torch.nn.functional as F
import torch.nn as nn

class MLP(L.LightningModule):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(237, 128), nn.ReLU(),
                                     nn.Linear(128, 64), nn.ReLU(),
                                     nn.Linear(64,2))

    def forward(self, x):
        # in lightning, forward defines the prediction/inference actions
        y_pre = self.encoder(x)
        return out

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop. It is independent of forward
        x, y = batch
        y_pre = F.log_softmax(self.encoder(x), dim=1)
        loss = F.nll_loss(y_pre, y)
        self.log("train_loss", loss,on_step=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer


autoencoder = MLP()
trainer = L.Trainer(max_epochs=20)
trainer.fit(autoencoder, dataset)








GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 38.9 K
---------------------------------------
38.9 K    Trainable params
0         Non-trainable params
38.9 K    Total params
0.155     Total estimated model params size (MB)


Epoch 19: 100%|██████████| 1202/1202 [00:08<00:00, 141.48it/s, v_num=3]

`Trainer.fit` stopped: `max_epochs=20` reached.


Epoch 19: 100%|██████████| 1202/1202 [00:08<00:00, 141.36it/s, v_num=3]


In [57]:
model = autoencoder.eval().cuda(device=0)
x=torch.from_numpy(x_test[0]).float().unsqueeze(0).cuda(0)
res=model(x).data.cpu().numpy()
print(res)


[[0]]


In [None]:
res=res[0]
target=[]
for i in res:
    if i[0]<0 | i[1]<0 :
        
    i=i/i.sum()
    target.append(i)
type(target)
target=np.array(target)
type(target)
#生成答案
test = pd.read_csv('../data/app_te.csv')
index=test['SK_ID_CURR']
ans=pd.DataFrame(target[:,1],columns = ['TARGET'])
result=pd.concat([index,ans],axis=1)
result.to_csv('../result/'+version+'/submission_n.csv',encoding = 'utf-8',index = 0)
result.shape
