In [83]:
import numpy as np
import pandas as pd
import datetime
import gc

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.autograd import Variable

import warnings
warnings.filterwarnings('ignore')
np.random.seed(4590)

pd.options.display.max_rows = 500

In [84]:
df_train = pd.read_csv("../input/20181216_train.csv")
df_test = pd.read_csv("../input/20181216_test.csv")

In [85]:
def merge_new_feature(path):
    df_new_feature = pd.read_csv(path)
    
    df_tr = pd.merge(df_train, df_new_feature, on="card_id", how="left")
    df_te = pd.merge(df_test, df_new_feature, on="card_id", how="left")
    
    return df_tr, df_te

In [86]:
df_train, df_test = merge_new_feature("../input/merchants_nmf.csv")
df_train, df_test = merge_new_feature("../input/new_merchants_nmf.csv")
df_train, df_test = merge_new_feature("../input/prosperity_merchants_latest.csv")
df_train, df_test = merge_new_feature("../input/merchants_numerical_features.csv")
df_train, df_test = merge_new_feature("../input/merchants_item_scale_feature.csv")
df_train, df_test = merge_new_feature("../input/purchase_pattern.csv")
df_train, df_test = merge_new_feature("../input/purchase_pettern_nmf.csv")
df_train, df_test = merge_new_feature("../input/purchase_amount_pettern_nmf.csv")

In [87]:
df_train["Null_count"] = df_train.isnull().sum(axis=1)
df_test["Null_count"] = df_test.isnull().sum(axis=1)

In [88]:
dont_use = [
    'card_id', 
    'first_active_month',
    'second_active_date',
    'diff_first_and_second',
    'target','outliers', 
    'category_1_Y_std', 
    'hist_purchase_date_max',
    'hist_purchase_date_min',
    'hist_category_2_mean_mean',
    'hist_category_3_mean_mean'
]

df_train_columns = [c for c in df_train.columns if c not in dont_use]
target = df_train['target']
target_outlier = df_train["outliers"]

In [89]:
for col in df_train_columns: 
    df_train[col], df_test[col] = df_train[col].replace(np.inf, np.nan).fillna(0.0), df_test[col].replace(np.inf, np.nan).fillna(0.0)

In [90]:
max_each_col = np.max(df_train[df_train_columns].values, axis=0)

for i, max_ in enumerate(max_each_col):
    if max_ >= 1e+7:
        df_train[df_train_columns[i]] = 1/(1+np.exp(-1*df_train[df_train_columns[i]].values))
        df_test[df_train_columns[i]] = 1/(1+np.exp(-1*df_test[df_train_columns[i]].values))

In [91]:
len(df_train_columns)

249

In [93]:
class MLPNet(nn.Module):
    def __init__(self):
        super(MLPNet, self).__init__()
        self.fc1 = nn.Linear(len(df_train_columns), 300)
        self.fc2 = nn.Linear(300, 300)
        self.fc3 = nn.Linear(300, 1)
        self.dropout1 = nn.Dropout2d(0.2)
        self.dropout2 = nn.Dropout2d(0.2)
        
    def forward(self, x):
#         x = F.relu(self.fc1(x))
#         x = self.dropout1(x)
#         x = F.relu(self.fc2(x))
#         x = self.dropout2(x)
        
        x = self.fc1(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.dropout2(x)
        return self.fc3(x)
    
device = 'cpu'
net = MLPNet().to(device)

In [94]:
batch_size = 500
num_epochs = 10
learning_rate = 0.01

In [95]:
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=learning_rate)#, momentum=0.9, weight_decay=5e-2)

In [96]:
net

MLPNet(
  (fc1): Linear(in_features=249, out_features=300, bias=True)
  (fc2): Linear(in_features=300, out_features=300, bias=True)
  (fc3): Linear(in_features=300, out_features=1, bias=True)
  (dropout1): Dropout2d(p=0.2)
  (dropout2): Dropout2d(p=0.2)
)

In [97]:
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1000) #4590
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):
    print("Fold {}".format(fold_))
    x_train, x_val = df_train[df_train_columns].iloc[trn_idx], df_train[df_train_columns].iloc[val_idx]
    y_train, y_val = target.iloc[trn_idx], target.iloc[val_idx]

    batch_no = x_train.shape[0] // batch_size 
    
# NN ==========================================================================
    for epoch in range(num_epochs):
            
        x_train2, y_train2 = shuffle(x_train, y_train, random_state=epoch)
        
        net.train()
        for i in range(batch_no):
            start = i * batch_size
            end = start + batch_size
            x_var = Variable(torch.FloatTensor(x_train2.values[start:end]))
            t_var = Variable(torch.FloatTensor(y_train2.values[start:end]))
            
            # Forward + Backward + Optimize
            optimizer.zero_grad()
            outputs = net(x_var)
            loss = criterion(outputs, t_var)
            loss.backward()
            optimizer.step()
        
        #評価
        net.eval()
        x_val_var = Variable(torch.FloatTensor(x_val.values), requires_grad=True)
        x_train_var = Variable(torch.FloatTensor(x_train.values), requires_grad=True)
        with torch.no_grad():
            x_train_outputs = net(x_train_var).numpy().reshape(x_train.shape[0])
            x_val_outputs = net(x_val_var).numpy().reshape(x_val.shape[0])
        train_rmse = np.sqrt(mean_squared_error(np.log2(y_train), np.log2(x_train_outputs)))
        val_rmse = np.sqrt(mean_squared_error(np.log2(y_val), np.log2(x_val_outputs)))
        
        print("[%d]     training's rmse: %.4f     valid_1's rmse: %.4f" %(epoch+1, train_rmse, val_rmse))
        
    
    x_val_var = Variable(torch.FloatTensor(x_val.values), requires_grad=True)
    with torch.no_grad():
        outputs = net(x_val_var).numpy().reshape(x_val.shape[0])
    oof[val_idx] = outputs
    
    rmse = np.sqrt(mean_squared_error(np.log2(outputs),np.log2( y_val.values)))
    print("valid's_loss: %.4f" %rmse)
    
    x_test_var = Variable(torch.FloatTensor(df_test[df_train_columns].values), requires_grad=True)
    with torch.no_grad():
        outputs = net(x_test_var)
    predictions += np.log2(outputs.numpy().reshape(df_test.shape[0])) / folds.n_splits
# NN ==========================================================================

print(np.sqrt(mean_squared_error(np.log2(target), np.log2(oof))))

Fold 0


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').