# Import

In [8]:
import os
import random

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.svm import SVR
from tqdm import tqdm

# 데이터 읽어오기

In [9]:
# Read the three CSV files from the working directory into DataFrames:
# the training set, the test set, and the sample submission file.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# 시드 고정

In [10]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global RNG with ``seed`` and
    stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): setting ``PYTHONHASHSEED`` after the interpreter has started
    presumably only affects child processes, not hashing in this kernel -
    confirm against the Python documentation.

    Args:
        seed: Integer seed applied to every source above.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(42)  # fix the seed for the whole notebook

# 데이터 확인

In [11]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,ID,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,TRAIN_00000,3,0.42099,0.39479,0.4231,0.4058,0.55107,0.41532,0.10631,0.20542,...,0.83333,0.83333,0.83333,0.5129,0.40409,0.0,0.46583,0.0,0.0,5.69
1,TRAIN_00001,4,0.34401,0.22868,0.26533,0.16498,0.60467,0.65628,0.58338,0.07245,...,0.18003,0.20223,0.17768,0.6391,0.62743,0.16667,0.114,0.14434,0.09053,43.6
2,TRAIN_00002,4,0.34437,0.25134,0.2651,0.1635,0.60353,0.45025,0.59097,0.18672,...,0.21317,0.20223,0.2066,0.6391,0.50931,0.16667,0.16327,0.14434,0.14951,39.0


In [15]:
# Print the frame summary: index range, column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12759 entries, 0 to 12758
Data columns (total 83 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               12759 non-null  object 
 1   number_of_elements               12759 non-null  int64  
 2   mean_atomic_mass                 12759 non-null  float64
 3   wtd_mean_atomic_mass             12759 non-null  float64
 4   gmean_atomic_mass                12759 non-null  float64
 5   wtd_gmean_atomic_mass            12759 non-null  float64
 6   entropy_atomic_mass              12759 non-null  float64
 7   wtd_entropy_atomic_mass          12759 non-null  float64
 8   range_atomic_mass                12759 non-null  float64
 9   wtd_range_atomic_mass            12759 non-null  float64
 10  std_atomic_mass                  12759 non-null  float64
 11  wtd_std_atomic_mass              12759 non-null  float64
 12  mean_fie          

# 독립변수, 종속변수 생성

In [12]:
# Features: every training column except the row identifier and the target.
X = train.drop(columns=['ID', 'critical_temp'])
# Target: the 'critical_temp' column of the training frame.
Y = train['critical_temp']

# Test features: the test frame without the row identifier.
X_test = test.drop(columns='ID')

# 모델학습 및 추론

In [13]:
# TODO: run with 5 first, then with 20 (presumably the number of folds); afterwards also try an SVM model.

In [20]:
# K-fold ensemble of a random forest and a ridge regression.
#
# For every fold both models are fitted on the training split, their
# predictions are averaged, the averaged validation prediction is scored with
# NMAE (MAE divided by the mean absolute target of the validation split), and
# the averaged test prediction is stored. The final prediction is the mean of
# the per-fold test predictions.
#
# Ridge and SVR are imported in the import cell at the top of the notebook.

N_SPLITS = 5  # number of cross-validation folds
SEED = 42     # shared seed for the fold split and the random forest

# The two base models.
# random_state makes the forest reproducible when this cell is re-run;
# n_jobs=-1 fits the trees on all available cores (results are unchanged).
rf = RandomForestRegressor(random_state=SEED, n_jobs=-1)
# NOTE: despite its name, `gb` is a Ridge regressor, not gradient boosting.
# The name is kept so that any other cell referring to `gb` keeps working.
gb = Ridge()

# Shuffled K-fold split with a fixed seed.
kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Per-fold averaged test predictions and per-fold NMAE scores.
ensemble_predictions = []
nmae_scores = []

for train_idx, val_idx in tqdm(kf.split(X), total=kf.get_n_splits(), desc="Processing folds"):
    # kf.split yields positional indices, so both X and Y are sliced with
    # .iloc; plain Y[idx] would do a label lookup and is only correct while
    # Y has a default RangeIndex.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y.iloc[train_idx], Y.iloc[val_idx]

    # Fit both models on the training split of this fold.
    rf.fit(X_train, y_train)
    gb.fit(X_train, y_train)

    # Ensemble prediction on the validation split: plain average of both models.
    ensemble_val_pred = (rf.predict(X_val) + gb.predict(X_val)) / 2

    # NMAE (normalized MAE): MAE divided by the mean absolute target value.
    mae = mean_absolute_error(y_val, ensemble_val_pred)
    nmae = mae / np.mean(np.abs(y_val))
    nmae_scores.append(nmae)

    # Averaged prediction of both models on the test set for this fold.
    rf_pred = rf.predict(X_test)
    gb_pred = gb.predict(X_test)
    ensemble_pred = (rf_pred + gb_pred) / 2
    ensemble_predictions.append(ensemble_pred)

# Final prediction: mean over the per-fold test predictions.
final_predictions = np.mean(ensemble_predictions, axis=0)

# Report the per-fold NMAE and its average.
print("NMAE scores for each fold:", nmae_scores)
print("Average NMAE:", np.mean(nmae_scores))

Processing folds:   0%|          | 0/5 [00:00<?, ?it/s]

Processing folds: 100%|██████████| 5/5 [14:03<00:00, 168.60s/it]

NMAE scores for each fold: [0.2634013722835396, 0.27196318963104876, 0.2635456845605247, 0.2683748775832454, 0.25773911744992983]
Average NMAE: 0.2650048483016576





# 파일 저장

In [16]:
# Store the fold-averaged test predictions in the 'critical_temp' column of the
# submission frame (modifies `submission` in place).
submission['critical_temp'] = final_predictions
# Show the first three rows as a quick check of the result.
submission.head(3)

Unnamed: 0,ID,critical_temp
0,TEST_00000,2.897254
1,TEST_00001,4.060731
2,TEST_00002,12.672299


In [17]:
# Write the submission frame to CSV without the index column.
# The original absolute path is kept as the default so existing runs behave the
# same; set the SUBMISSION_PATH environment variable to write somewhere else
# (e.g. on another machine) without editing the notebook.
submission_path = os.environ.get('SUBMISSION_PATH', 'E:/초전도체/초전도체/models/baseline_submit.csv')
submission.to_csv(submission_path, index=False)