02で前処理をしたデータの読み込みとモデルの学習を行うためのnotebookです。  
ここで作成したモデルは **src/models/** フォルダに格納して推論の際に使うようにして下さい。

## 必要なライブラリのimport

In [1]:
import warnings
import time
import pickle

import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_columns', 500)

In [2]:
# データの読み込み
train = pd.read_csv('../data/processed/processed20240619_train.csv')
test = pd.read_csv('../data/processed/processed20240619_test.csv')

# 目的変数と説明変数の作成
target = train['target']
features = [c for c in train.columns if c not in ['card_id', 'first_active_month', 'target']]
categorical_feats = ['feature_2', 'feature_3']


In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
import time
import pickle

# 目的変数の正規化
scaler = MinMaxScaler()
target_array = target.to_numpy().reshape(-1, 1)
target = scaler.fit_transform(target_array).flatten()

# PyTorchのデータセットクラス
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, data, targets):
        self.data = data
        self.targets = targets

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx], dtype=torch.float32), torch.tensor(self.targets[idx], dtype=torch.float32)

# ニューラルネットワークの定義
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# デバイスの設定
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# KFoldで5分割して学習
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
start = time.time()

# 学習の設定
num_epochs = 100
batch_size = 32
learning_rate = 0.001

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target)):
    print(f"Fold {fold_+1}/5")

    trn_data = train.iloc[trn_idx][features].values
    trn_targets = target[trn_idx]
    val_data = train.iloc[val_idx][features].values
    val_targets = target[val_idx]

    train_dataset = CustomDataset(trn_data, trn_targets)
    val_dataset = CustomDataset(val_data, val_targets)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = SimpleNN(len(features)).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for data, targets in train_loader:
            data, targets = data.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(data).squeeze()
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * data.size(0)

        train_loss /= len(train_loader.dataset)

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for data, targets in val_loader:
                data, targets = data.to(device), targets.to(device)
                outputs = model(data).squeeze()
                loss = criterion(outputs, targets)
                val_loss += loss.item() * data.size(0)

        val_loss /= len(val_loader.dataset)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model = model.state_dict()

        if (epoch + 1) % 10 == 0:  # 10エポックごとにログを表示
            print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # ベストモデルを保存
    torch.save(best_model, f'../src/models/model_fold_{fold_}.pth')

Fold 1/5
Epoch [10/100], Train Loss: nan, Val Loss: nan
Epoch [20/100], Train Loss: nan, Val Loss: nan


KeyboardInterrupt: 

In [None]:
# テストデータに対する予測
predictions = np.zeros(len(test))
test_data = torch.tensor(test[features].values, dtype=torch.float32).to(device)

for fold_ in range(5):
    # モデルの読み込み
    model = SimpleNN(len(features)).to(device)
    model.load_state_dict(torch.load(f'../src/models/model_fold_{fold_}.pth'))
    model.eval()
    
    # テストデータに対する予測
    with torch.no_grad():
        fold_predictions = model(test_data).cpu().numpy().squeeze()
        predictions += fold_predictions

# 平均化
predictions /= 5

# 提出用のCSVファイルを作成
sub_df = pd.DataFrame({"card_id": test["card_id"].values})
sub_df["target"] = predictions
sub_df.to_csv("submit_original_NN.csv", index=False)

In [9]:
print(train[features].isnull().sum())
print(np.isfinite(train[features]).all())

feature_1                              0
feature_2                              0
feature_3                              0
elapsed_time                           0
hist_transactions_count            30959
                                   ...  
daily_count_y                          0
max_daily_count                    21931
min_daily_count                    21931
mean_month_lag_per_duration            0
new_mean_month_lag_per_duration    53665
Length: 172, dtype: int64
feature_1                           True
feature_2                           True
feature_3                           True
elapsed_time                        True
hist_transactions_count            False
                                   ...  
daily_count_y                       True
max_daily_count                    False
min_daily_count                    False
mean_month_lag_per_duration         True
new_mean_month_lag_per_duration    False
Length: 172, dtype: bool


In [10]:
import pandas as pd

# 前提: train データフレームが存在する

# NULL 値のある列を抽出
null_cols = train.columns[train.isnull().any()]

# 抽出した列を表示
print(null_cols)

Index(['hist_transactions_count', 'hist_category_1_sum',
       'hist_category_1_mean', 'hist_category_2_1.0_mean',
       'hist_category_2_2.0_mean', 'hist_category_2_3.0_mean',
       'hist_category_2_4.0_mean', 'hist_category_2_5.0_mean',
       'hist_category_3_A_mean', 'hist_category_3_B_mean',
       ...
       'category_1_installments_mean', 'category_1_installments_min',
       'category_1_installments_max', 'category_1_installments_std',
       'mean_lag_month', 'purchase_amount', 'last_purchase_amount',
       'max_daily_count', 'min_daily_count',
       'new_mean_month_lag_per_duration'],
      dtype='object', length=102)


In [11]:
null_cols.shape

(102,)