In [153]:
import sklearn
import os
import numpy as np
import pandas as pd

from glob import glob
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

In [154]:
dataset = {
    key: [] for key in ['train_x', 'train_y', 'test_x', 'test_y']
}

train_x_list = sorted(glob('./data/train_input/*.csv'))
train_y_list = sorted(glob('./data/train_target/*.csv'))
test_x_list = sorted(glob('./data/test_input/*.csv'))

print("Load train data")
for x_path, y_path in zip(train_x_list, train_y_list):
    x_df = pd.read_csv(x_path)
    y_df = pd.read_csv(y_path)
    
    x_df = x_df.drop(columns=['obs_time'])
    # x_df = x_df.drop(columns=['DAT'])
    x_df = x_df.fillna(0)
    
    for i in range(28):
        x = x_df[i * 24:(i+1) * 24].values
        x = np.array(x).flatten().tolist()
        dataset['train_x'].append(x)
    
    dataset['train_y'].extend(y_df['predicted_weight_g'])
print(len(dataset['train_y']))
print(len(dataset['train_x'][0]))
print("Done.")

print("Load test data")
for x_path in test_x_list:
    x_df = pd.read_csv(x_path)
    
    x_df = x_df.drop(columns=['obs_time'])
    # x_df = x_df.drop(columns=['DAT'])
    x_df = x_df.fillna(0)
    
    for i in range(28):
        x = x_df[i * 24:(i+1) * 24].values
        x = np.array(x).flatten().tolist()
        dataset['test_x'].append(x)
print(len(dataset['test_x']))
print(len(dataset['test_x'][0]))
print("Done.")

Load train data
784
360
Done.
Load test data
140
360
Done.


In [155]:
ddff = pd.DataFrame(dataset['train_x'])
ddff['target'] = dataset['train_y']
print(ddff.shape)
ddff.to_csv('./data/train.csv', index=False)

(784, 361)


In [156]:
ddff = pd.DataFrame(dataset['test_x'])
print(ddff.shape)
ddff.to_csv('./data/test.csv', index=False)

(140, 360)


# XGBoost

In [157]:
xgb = XGBRegressor(n_estimators=3000, learning_rate=0.1, subsample=0.5, max_depth=10, 
                   gamma=500, reg_lambda=500, colsample_bytree=0.5)
xgb.fit(dataset['train_x'], dataset['train_y'])

xgb.score(dataset['train_x'], dataset['train_y'])

0.940366957958488

# Inference

In [158]:
dataset['test_y'] = xgb.predict(dataset['test_x'])

In [159]:
test_y_list = sorted(glob('./data/test_target/*.csv'))

i = 0
for path in test_y_list:
    df = pd.read_csv(path)
    
    df['predicted_weight_g'] = dataset['test_y'][i * 28:(i+1) * 28]
    i += 1
    df.to_csv(path, index=False)

# submission

In [160]:
import zipfile

os.chdir("./data/test_target/")
submission = zipfile.ZipFile("../submission_xgboost.zip", 'w')

for path in test_y_list:
    path = path.split('/')[-1]
    submission.write(path)
    
submission.close()
os.chdir("../../")

# Best

- 1
    - xgb = XGBRegressor(n_estimators=3000, learning_rate=0.05, subsample=0.5, max_depth=10, 
                   gamma=500, reg_lambda=500, colsample_bytree=0.5)
- 2
    - xgb = XGBRegressor(n_estimators=3000, learning_rate=0.05, subsample=0.5, max_depth=13, 
                   gamma=500, reg_lambda=500, colsample_bytree=0.5)
- 3
    - xgb = XGBRegressor(n_estimators=5000, learning_rate=0.05, subsample=0.5, max_depth=6, 
                   gamma=500, reg_lambda=500, colsample_bytree=0.5)
- 4
- 5

# RandomForest

In [161]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

rf = RandomForestRegressor(n_estimators=1, max_depth=6, max_samples=0.7)

rf.fit(dataset['train_x'], dataset['train_y'])

# Inference

In [162]:
dataset['test_y'] = rf.predict(dataset['test_x'])

In [163]:
test_y_list = sorted(glob('./data/test_target/*.csv'))

i = 0
for path in test_y_list:
    df = pd.read_csv(path)
    
    df['predicted_weight_g'] = dataset['test_y'][i * 28:(i+1) * 28]
    i += 1
    df.to_csv(path, index=False)

# submission

In [164]:
import zipfile

os.chdir("./data/test_target/")
submission = zipfile.ZipFile("../submission_rf.zip", 'w')

for path in test_y_list:
    path = path.split('/')[-1]
    submission.write(path)
    
submission.close()
os.chdir("../../")

# Deep Learning

In [165]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

from sklearn.metrics import mean_squared_error

In [166]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [167]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.FloatTensor(X)
        self.y = torch.FloatTensor(y)
        self.len = len(self.y)
        
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
    
    def __len__(self):
        return self.len

In [168]:
class Regressor(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(360, 120)
        self.fc2 = nn.Linear(120, 1)
        
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.fc2(x)
        
        return x

In [169]:
trainsets = CustomDataset(dataset['train_x'], dataset['train_y'])
trainloader = DataLoader(trainsets, batch_size=28, shuffle=False)

model = Regressor()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

In [170]:
model.to(device)
loss = []
n = len(trainloader)

for epoch in range(1):
    for i, data in enumerate(trainloader, 0):
        X, y = data
        
        optimizer.zero_grad()
        
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument mat1 in method wrapper_addmm)

In [None]:
test_y_list = glob('./data/test_target/*.csv')

for i in range(5):
    testsets = CustomDataset(dataset['test_x'][i*28:(i+1)*28], dataset['test_y'][i*28:(i+1)*28])
    testloader = DataLoader(testsets, batch_size=28, shuffle=False)
    arr = []

    with torch.no_grad():
        model.eval()

        for data in testloader:
            X, y = data

            outputs = model(X)
            arr.append(outputs)
    
    submit_df = pd.read_csv(test_y_list[i])
    submit_df['predicted_weight_g'] = arr[0]
    submit_df.to_csv(test_y_list[i], index=False)

# Submission

In [None]:
import zipfile
import os

os.chdir("./data/test_target/")
submission = zipfile.ZipFile("../submission_DL.zip", 'w')
for path in test_y_list:
    path = path.split('/')[-1]
    submission.write(path)
submission.close()
os.chdir('../../')
os.getcwd()