In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
from tensorflow import keras
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from easydict import EasyDict
from sklearn.metrics import accuracy_score

# 경로설정

In [4]:
# Central experiment configuration, exposed with attribute access (cfg.lr, ...).
# Only `dataset_path` is read by the cells shown in this notebook; the
# optimiser-style settings (epochs, lr, weight_decay, momentum) and `threshold`
# are not referenced by any visible cell.
cfg = EasyDict(
    dataset_path='./Monitoring/dataset/monitoring_dataset_distributed.npy',
    epochs=100,
    lr=0.1,
    weight_decay=0.0001,
    momentum=0.9,
    threshold=0.5,
)

In [5]:
# Load the dataset container from the path configured in `cfg`.
# NOTE(review): later cells index the result by string key ("train_x",
# "train_y", "test_x"), so the file is presumably an npz-style archive despite
# its ".npy" extension -- confirm. If so, np.load returns a lazily-read handle
# that stays open for the lifetime of `data`.
data = np.load(cfg.dataset_path)

In [6]:
# Display the training feature matrix (float32 according to the recorded output).
# NOTE(review): this renders the full array repr; `.shape` or a small slice
# would give a more compact sanity check.
data["train_x"]


array([[0.        , 0.5489832 , 0.        , ..., 0.12587614, 0.09326255,
        0.5441178 ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.17283219,
        0.10555376],
       [0.21344315, 0.09608716, 0.        , ..., 0.        , 0.03724245,
        0.        ],
       ...,
       [0.03493185, 0.4037363 , 0.        , ..., 0.        , 0.46157905,
        0.        ],
       [0.        , 0.        , 2.633469  , ..., 1.0653102 , 0.        ,
        0.18814471],
       [0.        , 1.3275399 , 0.        , ..., 0.11789932, 0.4255386 ,
        0.26578477]], dtype=float32)

In [18]:
# Display the training labels; the recorded output shows only 0/1 values,
# i.e. a binary classification target.
# NOTE(review): this prints every label; a class count (e.g. np.bincount)
# would summarise the balance more compactly.
data["train_y"]


array([1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0,

In [21]:
# Feature matrix for the submission set; it is passed to model.predict() in the
# final cell. No labels are loaded for it in this notebook.
test = data["test_x"]

# train 설정

In [9]:
# Labelled data used for fitting and validation: features in X, targets in y.
X, y = data["train_x"], data["train_y"]

In [10]:
# Hold out 10% of the labelled rows for validation. The fixed random_state
# makes the partition reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

# 랜덤포레스트

In [11]:
# Random-forest baseline: 100 trees with a fixed seed, fitted on the training
# split (fit() returns the estimator itself, so it can be chained).
rfc = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)

# Hard class predictions for the held-out validation rows.
y_pred = rfc.predict(X_valid)

In [12]:
# Report the random forest's validation accuracy. The tree count is read from
# the fitted estimator so the label cannot drift from the real configuration
# (the previous hard-coded text claimed 10 trees while the forest uses 100).
print('Model accuracy score with {0} decision-trees : {1:0.4f}'.format(
    rfc.n_estimators, accuracy_score(y_valid, y_pred)))

Model accuracy score with 10 decision-trees : 0.6500


# xgboost

In [13]:
import xgboost as xgb

# Gradient-boosted tree classifier, trained and validated on the
# X_train / X_valid split created earlier in the notebook (the identical
# re-split that used to live here was redundant: same inputs, same seed).
#
# `eval_metric` and `early_stopping_rounds` are configured on the estimator:
# passing them to fit() is deprecated as of xgboost 1.6 and rejected by later
# releases, while the constructor form works on the 1.7.1 used here and newer.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.2,
    max_depth=4,
    n_estimators=3000,          # upper bound only; early stopping picks the real count
    tree_method='gpu_hist',     # needs a CUDA-enabled xgboost build and a GPU
    reg_lambda=0.2,
    reg_alpha=0.9,
    eval_metric='logloss',
    early_stopping_rounds=100,  # stop after 100 rounds without logloss improvement
)

# NOTE: the validation split drives early stopping and is also used for the
# accuracy reported below, so that accuracy is an optimistic estimate.
model.fit(
    X_train, y_train,
    eval_set=[(X_valid, y_valid)],
    verbose=5,                  # log the validation metric every 5 boosting rounds
)

# Hard class predictions for the held-out validation rows.
y_pred = model.predict(X_valid)



[0]	validation_0-logloss:0.64935
[5]	validation_0-logloss:0.61634
[10]	validation_0-logloss:0.58411
[15]	validation_0-logloss:0.57673
[20]	validation_0-logloss:0.58816
[25]	validation_0-logloss:0.55926
[30]	validation_0-logloss:0.57137
[35]	validation_0-logloss:0.56968
[40]	validation_0-logloss:0.54871
[45]	validation_0-logloss:0.55330
[50]	validation_0-logloss:0.56565
[55]	validation_0-logloss:0.56994
[60]	validation_0-logloss:0.56581
[65]	validation_0-logloss:0.56739
[70]	validation_0-logloss:0.56739
[75]	validation_0-logloss:0.56739
[80]	validation_0-logloss:0.56739
[85]	validation_0-logloss:0.56739
[90]	validation_0-logloss:0.56739
[95]	validation_0-logloss:0.56739
[100]	validation_0-logloss:0.56739
[105]	validation_0-logloss:0.56739
[110]	validation_0-logloss:0.56739
[115]	validation_0-logloss:0.56739
[120]	validation_0-logloss:0.56739
[125]	validation_0-logloss:0.56739
[130]	validation_0-logloss:0.56739
[135]	validation_0-logloss:0.56739
[140]	validation_0-logloss:0.56739
[142]	v

In [14]:
# Fraction of validation rows the boosted model classified correctly.
accuracy_score(y_true=y_valid, y_pred=y_pred)

0.725

In [15]:
# Record the installed xgboost version alongside the results, since the
# estimator API (e.g. where early-stopping options are passed) differs
# between releases.
xgb.__version__

'1.7.1'

In [16]:
# Eyeball predictions against the true validation labels, then show how many
# validation rows were scored.
print(f"{y_pred} {y_valid}", len(y_pred), sep="\n")

[1 0 1 0 1 0 1 1 0 1 0 1 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 1 1 0
 0 0 1] [1 0 1 1 1 0 1 0 0 1 1 0 0 0 1 1 1 1 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 1 1 0 0
 0 0 1]
40


In [17]:
# Report the XGBoost model's validation accuracy. The previous label was copied
# from the random-forest cell and wrongly described this score as coming from
# "10 decision-trees".
print('XGBoost model accuracy score : {0:0.4f}'.format(accuracy_score(y_valid, y_pred)))

Model accuracy score with 10 decision-trees : 0.7250


In [26]:
# Score the submission feature matrix with the fitted model.
pred = model.predict(test)

# One-column frame of predictions; the positional row index is written out
# as the 'idx' column (header and index are emitted by default).
df = pd.Series(pred, name='pred').to_frame()
df.to_csv('Monitoring/monitoring-submission.csv', index_label='idx')