# GPU / RAM / Drive setup

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri Nov 25 09:37:44 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   40C    P8     9W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from google.colab import drive
import pandas as pd
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
import gc
def csv_to_parquet(csv_path, save_name):
    """Read a CSV file and re-save it as ``./<save_name>.parquet``.

    Parameters
    ----------
    csv_path : str
        Path of the CSV file to read.
    save_name : str
        Base name (without extension) of the Parquet file written to the
        current working directory.

    Prints ``<save_name> Done.`` once the file has been written.
    """
    parquet_path = f'./{save_name}.parquet'
    frame = pd.read_csv(csv_path)
    frame.to_parquet(parquet_path)
    # Drop the only reference to the frame and force a collection pass
    # before reporting completion.
    del frame
    gc.collect()
    print(save_name, 'Done.')

Mounted at /content/drive


In [4]:
from psutil import virtual_memory

# Total memory reported by psutil, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off this cell uses to label a runtime as "high-RAM".
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


# Imports

In [7]:
import random
import pandas as pd
import numpy as np
import os
import glob
import warnings
warnings.filterwarnings('ignore')

# Modeling

In [5]:
def make_dataset(input_path, target_path, n_days=28, hours_per_day=24):
    """Flatten paired hourly-input / daily-target CSVs into one row per day.

    Input and target files are paired by position after sorting both glob
    results. For every input file the first column is dropped, ``obs_time``
    is truncated to its first two characters, and the rows are cut
    positionally into ``n_days`` consecutive slices of ``hours_per_day``
    rows. Each slice becomes a single row whose columns are named
    ``<column><obs_time>`` and hold the first reading of that column for that
    ``obs_time`` value within the slice. The row is then joined (on index)
    with the row at the same position in the target file.

    Parameters
    ----------
    input_path : str
        Glob pattern matching the input CSVs. Each file needs a string
        ``obs_time`` column, expected to be the second column of the file.
    target_path : str
        Glob pattern matching the target CSVs (one row per day).
    n_days : int, default 28
        Number of day slices taken from each input file.
    hours_per_day : int, default 24
        Number of consecutive input rows that make up one day slice.

    Returns
    -------
    pandas.DataFrame
        One row per (file, day) with the flattened features followed by the
        target columns, indexed 0..n-1. An empty frame when no file pair is
        found.
    """
    all_input_list = sorted(glob.glob(input_path))
    all_target_list = sorted(glob.glob(target_path))

    # Collect the per-day frames and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies all earlier rows each time.
    daily_frames = []
    for input_file, target_file in zip(all_input_list, all_target_list):
        x = pd.read_csv(input_file)
        y = pd.read_csv(target_file)
        x['obs_time'] = x['obs_time'].str[:2]
        x = x.iloc[:, 1:]
        # Everything after the leading obs_time column is a feature column.
        col_list = x.columns[1:]
        for day_idx in range(n_days):
            day = x.iloc[hours_per_day * day_idx:hours_per_day * (day_idx + 1)]
            # Keep the first row per obs_time value, in order of appearance.
            first_per_hour = day.drop_duplicates(subset='obs_time', keep='first')
            hours = first_per_hour['obs_time'].tolist()
            # Build the day's features in a plain dict rather than writing
            # broadcast columns back onto `x`: each day is then independent
            # of the previous one and no fixed column offset is needed to
            # separate raw columns from derived ones.
            features = {}
            for col in col_list:
                for hour, value in zip(hours, first_per_hour[col].tolist()):
                    features[col + hour] = value
            nx = pd.DataFrame([features])
            ny = y.iloc[day_idx:day_idx + 1].reset_index(drop=True)
            daily_frames.append(
                pd.merge(nx, ny, left_index=True, right_index=True)
            )

    if not daily_frames:
        # pd.concat raises on an empty list; match the "nothing found" result.
        return pd.DataFrame()
    return pd.concat(daily_frames).reset_index(drop=True)

In [17]:
# Glob patterns for the paired input / target CSVs on the mounted drive.
train_input_path = 'drive/MyDrive/dacon/data/train_input/*.csv'
train_target_path = 'drive/MyDrive/dacon/data/train_target/*.csv'
test_input_path = 'drive/MyDrive/dacon/data/test_input/*.csv'
test_target_path = 'drive/MyDrive/dacon/data/test_target/*.csv'

# Build the one-row-per-day tables for both splits.
train = make_dataset(train_input_path, train_target_path)
test = make_dataset(test_input_path, test_target_path)

In [18]:
# Work on copies so later edits leave `train` / `test` unchanged.
train_test, test_test = train.copy(), test.copy()

In [26]:
# Rescale DAT from the untouched `train` frame rather than from the column
# being overwritten, so re-running this cell always yields the same values
# instead of compounding the transform.
train_test['DAT'] = (train['DAT'] - 1) * 10
train_test['DAT']

0        0
1       10
2       20
3       30
4       40
      ... 
779    230
780    240
781    250
782    260
783    270
Name: DAT, Length: 784, dtype: int64

In [27]:
# Rescale DAT from the untouched `test` frame rather than from the column
# being overwritten, so re-running this cell always yields the same values
# instead of compounding the transform.
test_test['DAT'] = (test['DAT'] - 1) * 10
test_test['DAT']

0        0
1       10
2       20
3       30
4       40
      ... 
135    230
136    240
137    250
138    260
139    270
Name: DAT, Length: 140, dtype: int64

In [12]:
import xgboost

In [40]:
# Features are every column but the last; the target is the last column,
# sliced with `-1:` so it stays a one-column DataFrame.
X, y = train_test.iloc[:, :-1], train_test.iloc[:, -1:]
testX, testy = test_test.iloc[:, :-1], test_test.iloc[:, -1:]

In [41]:
# Fit an XGBoost regressor, constructed with no arguments, on every row of X / y.
xgb_model = xgboost.XGBRegressor()
xgb_model.fit(X, y)



XGBRegressor()

In [42]:
# Test-set predictions; the submission cell later slices `y_pred` 28 rows per target file.
y_pred = xgb_model.predict(testX)

In [43]:
y_pred[:28] # original (baseline; presumably DAT left unscaled - confirm)

array([ 3.314386 ,  2.873422 ,  3.0523722,  3.691428 ,  4.943701 ,
        4.2387676,  4.9655294,  5.451253 ,  6.681161 ,  7.442542 ,
        8.6249075, 11.074293 , 23.12757  , 28.255405 , 18.212914 ,
       25.777475 , 28.320366 , 19.346506 , 18.880136 , 24.509848 ,
       52.697346 , 53.73388  , 64.404015 , 76.070755 , 69.60325  ,
       81.6572   , 99.0856   , 83.30005  ], dtype=float32)

In [16]:
y_pred[:28] # *2 (presumably DAT scaled by 2 - the scaling step for this run is not shown; confirm)

array([ 3.314386 ,  2.873422 ,  3.0523722,  3.691428 ,  4.943701 ,
        4.2387676,  4.9655294,  5.451253 ,  6.681161 ,  7.442542 ,
        8.6249075, 11.074293 , 23.12757  , 28.255405 , 18.212914 ,
       25.777475 , 28.320366 , 19.346506 , 18.880136 , 24.509848 ,
       52.697346 , 53.73388  , 64.404015 , 76.070755 , 69.60325  ,
       81.6572   , 99.0856   , 83.30005  ], dtype=float32)

In [31]:
y_pred[:28] # *10 (presumably DAT scaled by 10, as in the rescaling cells above - confirm)

array([ 3.314386 ,  2.873422 ,  3.0523722,  3.691428 ,  4.943701 ,
        4.2387676,  4.9655294,  5.451253 ,  6.681161 ,  7.442542 ,
        8.6249075, 11.074293 , 23.12757  , 28.255405 , 18.212914 ,
       25.777475 , 28.320366 , 19.346506 , 18.880136 , 24.509848 ,
       52.697346 , 53.73388  , 64.404015 , 76.070755 , 69.60325  ,
       81.6572   , 99.0856   , 83.30005  ], dtype=float32)

In [32]:
from sklearn.model_selection import train_test_split

# Rebind X / y from `train` (not the rescaled `train_test` copy), then hold
# out 10% of the rows with a fixed seed.
X, y = train.iloc[:, :-1], train.iloc[:, -1:]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [34]:
# Refit a fresh regressor on the training split only; this rebinds
# `xgb_model`, replacing the model fitted earlier in the notebook.
xgb_model = xgboost.XGBRegressor()
xgb_model.fit(X_train, y_train)



XGBRegressor()

In [35]:
y_pred = xgb_model.predict(X_test)

In [36]:
from sklearn.metrics import mean_squared_error

RMSE = mean_squared_error(y_test, y_pred)**0.5

In [37]:
# Display the hold-out RMSE.
RMSE

12.219482311851545

In [44]:
# Test target CSVs in sorted order (the same ordering make_dataset applies to its target glob).
all_target_list = sorted(glob.glob(test_target_path))

In [45]:
# Write the predictions back into the test target CSVs, 28 rows per file,
# walking the files in the same sorted order used to build the test rows.
rows_per_file = 28
expected_rows = rows_per_file * len(all_target_list)
# Validate before touching any file: the CSVs are overwritten in place, so a
# `y_pred` of the wrong length (e.g. left over from another cell) would
# otherwise corrupt the first files before the mismatch finally raised.
if len(y_pred) != expected_rows:
    raise ValueError(
        f'y_pred has {len(y_pred)} rows but {expected_rows} are required '
        f'({rows_per_file} per target file); re-run the test-set prediction cell.'
    )
for i, test_path in enumerate(all_target_list):
    submit_df = pd.read_csv(test_path)
    submit_df['predicted_weight_g'] = y_pred[i * rows_per_file:(i + 1) * rows_per_file]
    submit_df.to_csv(test_path, index=False)