# Import

In [1]:
import pandas as pd
import numpy as np
import random
import os
from tqdm import tqdm

from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error

# 데이터 읽어오기

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

# 시드 고정

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

# 데이터 확인

In [4]:
train.head(3)

Unnamed: 0,ID,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,TRAIN_00000,3,0.42099,0.39479,0.4231,0.4058,0.55107,0.41532,0.10631,0.20542,...,0.83333,0.83333,0.83333,0.5129,0.40409,0.0,0.46583,0.0,0.0,5.69
1,TRAIN_00001,4,0.34401,0.22868,0.26533,0.16498,0.60467,0.65628,0.58338,0.07245,...,0.18003,0.20223,0.17768,0.6391,0.62743,0.16667,0.114,0.14434,0.09053,43.6
2,TRAIN_00002,4,0.34437,0.25134,0.2651,0.1635,0.60353,0.45025,0.59097,0.18672,...,0.21317,0.20223,0.2066,0.6391,0.50931,0.16667,0.16327,0.14434,0.14951,39.0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12759 entries, 0 to 12758
Data columns (total 83 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   ID                               12759 non-null  object 
 1   number_of_elements               12759 non-null  int64  
 2   mean_atomic_mass                 12759 non-null  float64
 3   wtd_mean_atomic_mass             12759 non-null  float64
 4   gmean_atomic_mass                12759 non-null  float64
 5   wtd_gmean_atomic_mass            12759 non-null  float64
 6   entropy_atomic_mass              12759 non-null  float64
 7   wtd_entropy_atomic_mass          12759 non-null  float64
 8   range_atomic_mass                12759 non-null  float64
 9   wtd_range_atomic_mass            12759 non-null  float64
 10  std_atomic_mass                  12759 non-null  float64
 11  wtd_std_atomic_mass              12759 non-null  float64
 12  mean_fie          

# 독립변수, 종속변수 생성

In [6]:
X = train.drop(['ID', 'critical_temp'], axis = 1)
Y = train['critical_temp']

X_test = test.drop('ID', axis = 1)

# 모델학습 및 추론

In [13]:
# 5로 진행 20으로 진행 이후 svm도 써보자 

In [None]:
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import numpy as np
from tqdm import tqdm

# 두 개의 모델 정의
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()

# 보팅 앙상블 모델 정의
ensemble_model = VotingRegressor(estimators=[('rf', rf), ('gb', gb)])

# 5-Fold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 각 fold의 NMAE 점수 리스트
nmae_scores = []

for train_idx, val_idx in tqdm(kf.split(X), total=5, desc="Processing folds"):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y[train_idx], Y[val_idx]
    
    # 보팅 앙상블 모델 학습
    ensemble_model.fit(X_train, y_train)
    
    # Validation set에 대한 예측 수행
    ensemble_val_pred = ensemble_model.predict(X_val)
    
    # NMAE (Normalized MAE) 계산 후 저장
    mae = mean_absolute_error(y_val, ensemble_val_pred)
    nmae = mae / np.mean(np.abs(y_val))
    nmae_scores.append(nmae)

# 각 fold에서의 NMAE와 전체 평균 NMAE 출력
print("NMAE scores for each fold:", nmae_scores)
print("Average NMAE:", np.mean(nmae_scores))


In [7]:
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import numpy as np
from tqdm import tqdm

# 두 개의 모델 정의
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()

# 보팅 앙상블 모델 정의
ensemble_model = VotingRegressor(estimators=[('rf', rf), ('gb', gb)])

# 5-Fold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# 각 fold의 NMAE 점수 리스트
nmae_scores = []

for train_idx, val_idx in tqdm(kf.split(X), total=5, desc="Processing folds"):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = Y[train_idx], Y[val_idx]
    
    # 보팅 앙상블 모델 학습
    ensemble_model.fit(X_train, y_train)
    
    # Validation set에 대한 예측 수행
    ensemble_val_pred = ensemble_model.predict(X_val)
    
    # NMAE (Normalized MAE) 계산 후 저장
    mae = mean_absolute_error(y_val, ensemble_val_pred)
    nmae = mae / np.mean(np.abs(y_val))
    nmae_scores.append(nmae)

# 각 fold에서의 NMAE와 전체 평균 NMAE 출력
print("NMAE scores for each fold:", nmae_scores)
print("Average NMAE:", np.mean(nmae_scores))


Processing folds: 100%|██████████| 5/5 [16:49<00:00, 201.91s/it]

NMAE scores for each fold: [0.1990950425973085, 0.2057902294533724, 0.20538151170759558, 0.2029954492599846, 0.19347082602751825]
Average NMAE: 0.20134661180915586





## 보팅

In [None]:
# from sklearn.ensemble import VotingRegressor
# from sklearn.model_selection import KFold
# from sklearn.metrics import mean_absolute_error
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# import numpy as np
# from tqdm import tqdm

# # 두 개의 모델 정의
# rf = RandomForestRegressor()
# gb = GradientBoostingRegressor()

# # 보팅 앙상블 모델 정의
# ensemble_model = VotingRegressor(estimators=[('rf', rf), ('gb', gb)])

# # ... (이하 코드는 이전 예시와 동일)


## 배깅

In [None]:
# from sklearn.ensemble import BaggingRegressor
# from sklearn.model_selection import KFold
# from sklearn.metrics import mean_absolute_error
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# import numpy as np
# from tqdm import tqdm

# # 두 개의 모델 정의
# rf = RandomForestRegressor()
# gb = GradientBoostingRegressor()

# # 배깅 앙상블 모델 정의
# ensemble_model = BaggingRegressor(base_estimator=rf, n_estimators=10)

# # ... (이하 코드는 이전 예시와 동일)


# 스태킹

In [None]:
# from sklearn.ensemble import StackingRegressor
# from sklearn.model_selection import KFold
# from sklearn.metrics import mean_absolute_error
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# import numpy as np
# from tqdm import tqdm

# # 두 개의 모델 정의
# rf = RandomForestRegressor()
# gb = GradientBoostingRegressor()

# # 스태킹 앙상블 모델 정의
# estimators = [('rf', rf), ('gb', gb)]
# final_model = RandomForestRegressor()
# ensemble_model = StackingRegressor(estimators=estimators, final_estimator=final_model)

# # ... (이하 코드는 이전 예시와 동일)


## 부스팅

In [None]:
# from sklearn.ensemble import AdaBoostRegressor
# from sklearn.model_selection import KFold
# from sklearn.metrics import mean_absolute_error
# from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# import numpy as np
# from tqdm import tqdm

# # 두 개의 모델 정의
# rf = RandomForestRegressor()
# gb = GradientBoostingRegressor()

# # 부스팅 앙상블 모델 정의
# ensemble_model = AdaBoostRegressor(base_estimator=gb, n_estimators=10, random_state=42)

# # 5-Fold 설정
# kf = KFold(n_splits=5, shuffle=True, random_state=42)

# # 각 fold의 NMAE 점수 리스트
# nmae_scores = []

# for train_idx, val_idx in tqdm(kf.split(X), total=5, desc="Processing folds"):
#     X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
#     y_train, y_val = Y[train_idx], Y[val_idx]
    
#     # 부스팅 앙상블 모델 학습
#     ensemble_model.fit(X_train, y_train)
    
#     # Validation set에 대한 예측 수행
#     ensemble_val_pred = ensemble_model.predict(X_val)
    
#     # NMAE (Normalized MAE) 계산 후 저장
#     mae = mean_absolute_error(y_val, ensemble_val_pred)
#     nmae = mae / np.mean(np.abs(y_val))
#     nmae_scores.append(nmae)

# # 각 fold에서의 NMAE와 전체 평균 NMAE 출력
# print("NMAE scores for each fold:", nmae_scores)
# print("Average NMAE:", np.mean(nmae_scores))

# # 'critical_temp' 열을 추가
# submission['critical_temp'] = ensemble_model.predict(X_test)

# # 결과 확인
# print(submission.head(3))


# 파일 저장

In [8]:
submission['critical_temp'] = ensemble_model.predict(X_test)
submission.head(3)

Unnamed: 0,ID,critical_temp
0,TEST_00000,2.507662
1,TEST_00001,4.287399
2,TEST_00002,13.076239


In [9]:
submission.to_csv('E:/초전도체/초전도체/models/baseline_submit_05.csv', index=False)