# Import

In [5]:
import pandas as pd
import numpy as np
import random
import os

# Fixed RandomSeed

In [6]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

# Data Load

In [7]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [8]:
train.head()

Unnamed: 0,ID,월,일,측정 시간대,섭씨 온도(°⁣C),절대 온도(K),이슬점 온도(°C),상대 습도 (%),대기압(mbar),포화 증기압(mbar),실제 증기압(mbar),증기압 부족량(mbar),수증기 함량 (g/kg),공기 밀도 (g/m**3),풍향 (deg),풍속 (m/s)
0,TRAIN_00000,7,2,저녁,13.97,287.78,9.84,76.1,992.08,15.98,12.16,3.82,7.66,1198.06,155.6,1.61
1,TRAIN_00001,8,21,오전,16.94,290.85,12.14,73.3,991.07,19.33,14.17,5.16,8.94,1183.67,177.0,1.68
2,TRAIN_00002,11,1,저녁,9.76,283.84,5.4,74.2,988.71,12.1,8.98,3.12,5.67,1213.22,146.2,0.73
3,TRAIN_00003,12,28,오전,5.27,277.3,2.71,83.5,1014.25,8.89,7.43,1.47,4.57,1265.48,264.5,2.71
4,TRAIN_00004,9,26,오후,17.35,290.86,12.68,74.0,995.77,19.84,14.68,5.16,9.22,1187.4,19.34,1.0


# Analysis

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36581 entries, 0 to 36580
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              36581 non-null  object 
 1   월               36581 non-null  int64  
 2   일               36581 non-null  int64  
 3   측정 시간대          36581 non-null  object 
 4   섭씨 온도(°⁣C)      36581 non-null  float64
 5   절대 온도(K)        36581 non-null  float64
 6   이슬점 온도(°C)      36581 non-null  float64
 7   상대 습도 (%)       36581 non-null  float64
 8   대기압(mbar)       36581 non-null  float64
 9   포화 증기압(mbar)    36581 non-null  float64
 10  실제 증기압(mbar)    36581 non-null  float64
 11  증기압 부족량(mbar)   36581 non-null  float64
 12  수증기 함량 (g/kg)   36581 non-null  float64
 13  공기 밀도 (g/m**3)  36581 non-null  float64
 14  풍향 (deg)        36581 non-null  float64
 15  풍속 (m/s)        36581 non-null  float64
dtypes: float64(12), int64(2), object(2)
memory usage: 4.5+ MB


# Feature / Target split

In [10]:
# train_x는 독립변수이므로 종속변수(풍속 (m/s))를 제거합니다. 
# 또한 target 이외의 분석에 활용하지 않는 데이터(id)를 제거합니다.
train_x = train.drop(columns=['ID', '풍속 (m/s)'], axis = 1)

# train_y는 종속변수로 값을 설정합니다.
train_y = train['풍속 (m/s)']

In [11]:
# train에서와 마찬가지로 분석에 활용하지 않는 데이터(id)를 제거합니다.
test_x = test.drop(columns=['ID'])

# Data Preprocessing

In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le = le.fit(train_x['측정 시간대'])
train_x['측정 시간대'] = le.transform(train_x['측정 시간대'])
test_x['측정 시간대'] = le.transform(test_x['측정 시간대'])

# Define Evaluate Function

In [25]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, cross_validate

def test_func(test_model, X_train, y_train):
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    nmse = cross_validate(test_model, X_train_scaled, y_train, scoring=['neg_mean_squared_error'], return_train_score=True, cv=5, n_jobs=-1)
    mse = -1 * nmse['test_neg_mean_squared_error']
    avg_mse = round(np.mean(mse), 4)
    return print(f"평균 MSE :{avg_mse :.4f}")

# Define Regression Model and Evaluation

In [55]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [27]:
test_func(DecisionTreeRegressor(random_state=1004), train_x, train_y)

평균 MSE :0.7636


In [29]:
test_func(RandomForestRegressor(random_state=1004), train_x, train_y)

평균 MSE :0.3726


In [30]:
test_func(BaggingRegressor(random_state=1004), train_x, train_y)

평균 MSE :0.4230


In [31]:
test_func(AdaBoostRegressor(random_state=1004), train_x, train_y)

평균 MSE :1.8642


In [32]:
test_func(MLPRegressor(random_state=1004), train_x, train_y)

평균 MSE :0.9413


In [33]:
test_func(XGBRegressor(random_state=1004), train_x, train_y)

평균 MSE :0.5256


In [34]:
test_func(LGBMRegressor(random_state=1004), train_x, train_y)

평균 MSE :0.6268


In [35]:
test_func(ExtraTreesRegressor(random_state=1004), train_x, train_y)

평균 MSE :0.2826


In [37]:
test_func(SVR(), train_x, train_y)

평균 MSE :1.2035


In [51]:
test_func(LinearRegression(), train_x, train_y)

평균 MSE :1.8885


# Training and Prediction

In [38]:
# Define Model
model = ExtraTreesRegressor(random_state=1004)

# Normalization
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.transform(test_x)

# Training
model.fit(train_x_scaled , train_y)

# Prediction
preds = model.predict(test_x_scaled) 

# Submission

In [39]:
submission = pd.read_csv('./sample_submission.csv')

In [40]:
submission['풍속 (m/s)'] = preds
submission.head()

Unnamed: 0,ID,풍속 (m/s)
0,TEST_00000,1.5664
1,TEST_00001,1.0879
2,TEST_00002,2.0635
3,TEST_00003,1.0884
4,TEST_00004,1.2342


In [41]:
submission.to_csv('./submission_windspeed_0720_1.csv', index=False)

# AutoGluon test

In [42]:
from autogluon.tabular import TabularPredictor, TabularDataset

# Normalization
scaler = StandardScaler()
train_x_scaled = scaler.fit_transform(train_x)
test_x_scaled = scaler.transform(test_x)

# Define Dataset
train_df = pd.DataFrame(train_x_scaled)
train_df['TARGET'] = train_y

# autogluon 
predictor = TabularPredictor(label='TARGET', eval_metric='mse', problem_type='regression').fit(train_data=train_df, presets=['best_quality'])

No path specified. Models will be saved in: "AutogluonModels/ag-20230720_012636/"
Presets specified: ['best_quality']
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230720_012636/"
AutoGluon Version:  0.7.0
Python Version:     3.8.8
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #112-Ubuntu SMP Thu Feb 3 13:50:55 UTC 2022
Train Data Rows:    36581
Train Data Columns: 14
Label Column: TARGET
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    162686.98 MB
	Train Data (Original)  Memory Usage: 4.1 MB (0.0% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generato

Fitting model: XGBoost_BAG_L1 ...
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
2023-07-20 01:32:59,863	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): [36mray::_ray_fit()[39m (pid=2249087, ip=172.17.0.2)
  File "/root/anaconda3/lib/python3.8/ctypes/__init__.py", line 381, in __init__
    self._handle = _dlopen(self._name, mode)
OSError: /root/anaconda3/lib/python3.8/site-packages/torch/lib/../../nvidia/cublas/lib/libcublas.so.11: undefined symbol: cublasLtGetStatusString, version libcublasLt.so.11

During handling of the above exception, another exception occurred:

[36mray::_ray_fit()[39m (pid=2249087, ip=172.17.0.2)
  File "/root/anaconda3/lib/python3.8/site-packages/autogluon/core/models/ensemble/fold_fitting_strategy.py", line 374, in _ray_fit
    fold_model.fit(X=X_fold, y=y_fold, X_val=X_val_fold, y_val=y_val_fold,
  File "/root/anaconda3/lib/python3.8/site-packages/autogluon/core/models/abstract

Fitting model: LightGBMLarge_BAG_L1 ...
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
2023-07-20 01:34:02,444	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2023-07-20 01:34:02,451	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2023-07-20 01:34:02,453	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2023-07-20 01:34:02,454	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-

Fitting model: XGBoost_BAG_L2 ...
	Fitting 8 child models (S1F1 - S1F8) | Fitting with ParallelLocalFoldFittingStrategy
2023-07-20 01:37:10,285	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2023-07-20 01:37:10,287	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2023-07-20 01:37:10,288	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2023-07-20 01:37:10,290	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log 

2023-07-20 01:37:21,304	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2023-07-20 01:37:21,305	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2023-07-20 01:37:21,307	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2023-07-20 01:37:21,308	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2023-07-20 01:37:21,309	ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UN

In [45]:
# Prediction
test_df = pd.DataFrame(test_x_scaled)
preds = predictor.predict(test_df) 



In [46]:
preds

0        1.957653
1        1.172094
2        2.051428
3        1.137925
4        1.228405
           ...   
15673    4.633889
15674    2.137432
15675    1.159498
15676    1.976796
15677    0.745534
Name: TARGET, Length: 15678, dtype: float32

In [47]:
submission['풍속 (m/s)'] = preds
submission.head()

Unnamed: 0,ID,풍속 (m/s)
0,TEST_00000,1.957653
1,TEST_00001,1.172094
2,TEST_00002,2.051428
3,TEST_00003,1.137925
4,TEST_00004,1.228405


In [48]:
submission.to_csv('./submission_windspeed_0720_2.csv', index=False)