# Import

In [1]:
import pandas as pd
import numpy as np
import random
import os
from tqdm import tqdm

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# 데이터 읽어오기

In [16]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# 시드 고정

In [17]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

# 데이터 확인

In [18]:
train.head(3)

Unnamed: 0,ID,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,TRAIN_00000,3,0.42099,0.39479,0.4231,0.4058,0.55107,0.41532,0.10631,0.20542,...,0.83333,0.83333,0.83333,0.5129,0.40409,0.0,0.46583,0.0,0.0,5.69
1,TRAIN_00001,4,0.34401,0.22868,0.26533,0.16498,0.60467,0.65628,0.58338,0.07245,...,0.18003,0.20223,0.17768,0.6391,0.62743,0.16667,0.114,0.14434,0.09053,43.6
2,TRAIN_00002,4,0.34437,0.25134,0.2651,0.1635,0.60353,0.45025,0.59097,0.18672,...,0.21317,0.20223,0.2066,0.6391,0.50931,0.16667,0.16327,0.14434,0.14951,39.0


In [19]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12759 entries, 0 to 12758
Data columns (total 83 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               12759 non-null  object 
 1   number_of_elements               12759 non-null  int64  
 2   mean_atomic_mass                 12759 non-null  float64
 3   wtd_mean_atomic_mass             12759 non-null  float64
 4   gmean_atomic_mass                12759 non-null  float64
 5   wtd_gmean_atomic_mass            12759 non-null  float64
 6   entropy_atomic_mass              12759 non-null  float64
 7   wtd_entropy_atomic_mass          12759 non-null  float64
 8   range_atomic_mass                12759 non-null  float64
 9   wtd_range_atomic_mass            12759 non-null  float64
 10  std_atomic_mass                  12759 non-null  float64
 11  wtd_std_atomic_mass              12759 non-null  float64
 12  mean_fie          

In [20]:
train.describe()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
count,12759.0,12759.0,12759.0,12759.0,12759.0,12759.0,12759.0,12759.0,12759.0,12759.0,...,12759.0,12759.0,12759.0,12759.0,12759.0,12759.0,12759.0,12759.0,12759.0,12759.0
mean,4.116388,0.399213,0.328101,0.323766,0.27271,0.58757,0.543765,0.55664,0.161431,0.440387,...,0.358173,0.341746,0.342036,0.605014,0.539805,0.339865,0.212724,0.279486,0.223743,34.40876
std,1.439978,0.146245,0.164699,0.151425,0.176247,0.18422,0.204986,0.262763,0.130694,0.198738,...,0.19838,0.173722,0.195639,0.183635,0.1948,0.206801,0.140148,0.161251,0.151262,34.244239
min,1.0,0.0,0.00256,0.00179,0.00596,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00032
25%,3.0,0.32589,0.225855,0.26085,0.16096,0.48852,0.39593,0.37938,0.08148,0.3259,...,0.186025,0.21328,0.18182,0.49527,0.39876,0.16667,0.13202,0.15713,0.102715,5.32
50%,4.0,0.38649,0.26731,0.29993,0.18338,0.60467,0.58609,0.59097,0.13032,0.44846,...,0.26684,0.26922,0.23773,0.6391,0.59714,0.33333,0.15204,0.26667,0.16653,20.0
75%,5.0,0.46338,0.39283,0.35744,0.341705,0.72817,0.696455,0.74106,0.18677,0.58757,...,0.50417,0.44381,0.48581,0.74186,0.68135,0.5,0.27459,0.4,0.33993,63.0
max,9.0,1.0,1.0,1.0,1.0,1.0,0.99304,1.0,0.99217,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,185.0


In [21]:
# import matplotlib.pyplot as plt
# except_target = train.drop('ID', axis = 1)
# except_target.hist(figsize = (12,12))
# plt.show()

In [22]:
# train = train.drop(['ID'], axis=1)
import seaborn as sns
plt.figure(figsize = (20,20), dpi = 100)
sns.heatmap(train.corr(), annot = True, cmap = 'Blues')
plt.show()

ValueError: could not convert string to float: 'TRAIN_00000'

<Figure size 2000x2000 with 0 Axes>

# 독립변수, 종속변수 생성

In [28]:
X = train.drop(['ID', 'critical_temp'], axis = 1)
Y = train['critical_temp']

X_test = test.drop('ID', axis = 1)

# 모델학습 및 추론

In [33]:
from sklearn.svm import SVR
from sklearn.linear_model import Ridge

# 두 개의 모델 정의
rf = RandomForestRegressor()
gb = SVR()

# 5-Fold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 각 fold의 모델로부터의 예측을 저장할 리스트와 NMAE 점수 리스트
ensemble_predictions = []
nmae_scores = []

for train_idx, val_idx in tqdm(kf.split(X), total=5, desc="Processing folds"):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y[train_idx], Y[val_idx]
    
    # 두 모델 모두 학습
    rf.fit(X_train, y_train)
    gb.fit(X_train, y_train)
    
    # 각 모델로부터 Validation set에 대한 예측을 평균내어 앙상블 예측 생성
    ensemble_val_pred = (rf.predict(X_val) + gb.predict(X_val)) / 2
    
    # NMAE (Normalized MAE) 계산 후 저장
    mae = mean_absolute_error(y_val, ensemble_val_pred)
    nmae = mae / np.mean(abs(y_val))
    nmae_scores.append(nmae)
    
    # test 데이터셋에 대한 예측 수행 후 저장
    rf_pred = rf.predict(X_test)
    gb_pred = gb.predict(X_test)
    
    # 두 모델의 예측을 평균내어 앙상블 예측 생성
    ensemble_pred = (rf_pred + gb_pred) / 2
    ensemble_predictions.append(ensemble_pred)

# K-fold 모든 예측의 평균을 계산하여 최종 앙상블 예측 생성
final_predictions = np.mean(ensemble_predictions, axis=0)

# 각 fold에서의 NMAE와 전체 평균 NMAE 출력
print("NMAE scores for each fold:", nmae_scores)
print("Average NMAE:", np.mean(nmae_scores))

Processing folds:   0%|          | 0/5 [00:00<?, ?it/s]

Processing folds: 100%|██████████| 5/5 [15:55<00:00, 191.14s/it]

NMAE scores for each fold: [0.2519943557485278, 0.26020982878125093, 0.2589922873706145, 0.25759442123110915, 0.2478681896699405]
Average NMAE: 0.2553318165602886





In [34]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

# 두 개의 모델 정의
rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)

# 데이터 분할을 위한 K-Fold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 그리드 서치를 위한 파라미터 그리드 정의
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

param_grid_gb = {
    'n_estimators': [50, 100, 200,300,400,500,600],
    'learning_rate': [0.01, 0.1, 0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0],
    'max_depth': [3, 4, 5,8,10,12,15,18,21],
    'min_samples_split': [2, 5, 10,13,16,19,22,25,28,31],
    'min_samples_leaf': [1, 2, 4,6,8,10,12]
}

# 그리드 서치 객체 생성
grid_rf = GridSearchCV(rf, param_grid_rf, scoring='neg_mean_absolute_error', cv=kf)
grid_gb = GridSearchCV(gb, param_grid_gb, scoring='neg_mean_absolute_error', cv=kf)

# 각 fold의 모델로부터의 예측을 저장할 리스트와 NMAE 점수 리스트 초기화
ensemble_predictions = []
nmae_scores = []

# 각 fold 별로 모델 학습 및 예측 수행
for train_idx, val_idx in tqdm(kf.split(X), total=5, desc="Processing folds"):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y[train_idx], Y[val_idx]
    
    # Random Forest 모델에 대한 그리드 서치 수행
    grid_rf.fit(X_train, y_train)
    best_rf = grid_rf.best_estimator_
    
    # Gradient Boosting 모델에 대한 그리드 서치 수행
    grid_gb.fit(X_train, y_train)
    best_gb = grid_gb.best_estimator_
    
    # 각 모델로부터 Validation set에 대한 예측 생성
    rf_val_pred = best_rf.predict(X_val)
    gb_val_pred = best_gb.predict(X_val)
    
    # 두 모델의 예측을 평균내어 앙상블 예측 생성
    ensemble_val_pred = (rf_val_pred + gb_val_pred) / 2
    
    # NMAE (Normalized MAE) 계산 후 저장
    nmae = mean_absolute_error(y_val, ensemble_val_pred) / np.mean(abs(y_val))
    nmae_scores.append(nmae)
    
    # test 데이터셋에 대한 예측 수행
    rf_test_pred = best_rf.predict(X_test)
    gb_test_pred = best_gb.predict(X_test)
    
    # 두 모델의 예측을 평균내어 앙상블 예측 생성 및 저장
    ensemble_test_pred = (rf_test_pred + gb_test_pred) / 2
    ensemble_predictions.append(ensemble_test_pred)

# K-fold 모든 예측의 평균을 계산하여 최종 앙상블 예측 생성
final_predictions = np.mean(ensemble_predictions, axis=0)

# 각 fold에서의 NMAE와 전체 평균 NMAE 출력
print("NMAE scores for each fold:", nmae_scores)
print("Average NMAE:", np.mean(nmae_scores))


Processing folds:   0%|          | 0/5 [10:50:22<?, ?it/s]


KeyboardInterrupt: 

# 파일 저장

In [None]:
submission['critical_temp'] = final_predictions
submission.head(3)

Unnamed: 0,ID,critical_temp
0,TEST_00000,2.883359
1,TEST_00001,4.187488
2,TEST_00002,12.677138


In [None]:
submission.to_csv('E:/초전도체/초전도체/models/baseline_submit_02.csv', index=False)