In [4]:
import random
import os
import numpy as np
import pandas as pd
import gc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, make_scorer, log_loss
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

# 결측값 처리
NaN_col = ['Origin_State', 'Destination_State', 'Airline', 'Estimated_Departure_Time', 'Estimated_Arrival_Time', 'Carrier_Code(IATA)', 'Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col].fillna(mode, inplace=True)

    if col in test.columns:
        test[col].fillna(mode, inplace=True)

print('결측값 처리 완료')

# 정성적 변수 인코딩
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']
combined = pd.concat([train[qual_col], test[qual_col]], axis=0)

for i in qual_col:
    le = LabelEncoder()
    le.fit(combined[i])
    train[i] = le.transform(train[i])
    test[i] = le.transform(test[i])

print('인코딩 완료')

# 레이블이 지정되지 않은 데이터 제거
train = train.dropna()

column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i

def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('레이블 처리 완료')

def correlation_based_feature_selection(train_df, target_col, threshold=0.1):
    corr_matrix = train_df.drop(['ID', 'Delay', 'Delay_num'], axis=1).corr()
    corr_with_target = corr_matrix[target_col].sort_values(ascending=False)
    selected_features = corr_with_target[corr_with_target > threshold].index.tolist()
    return selected_features

def select_k_best_feature_selection(train_x, train_y, k=10):
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(train_x, train_y)
    selected_features = train_x.columns[selector.get_support()].tolist()
    return selected_features

# Feature Selection을 수행합니다.
selected_features = correlation_based_feature_selection(train, 'Delay_num', threshold=0.1)
# 또는
# selected_features = select_k_best_feature_selection(train_x, train_y, k=10)

# 선택된 변수들을 사용하여 train, val, test 데이터를 변환합니다.
train_x = train[selected_features]
train_y = train['Delay_num']
test_x = test[selected_features]

train_x, val_x, train_y, val_y = train_test_split(train[selected_features], train['Delay_num'], test_size=0.2, random_state=42)

scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
val_x = scaler.transform(val_x)
test_x = scaler.transform(test[selected_features])

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=91)
model = XGBClassifier(random_state=91)

param_grid = {
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 7],
'n_estimators': [100, 200, 300],
'subsample': [0.8, 1],
'colsample_bytree': [0.8, 1],
'min_child_weight': [1, 3, 5]
}

random_search = RandomizedSearchCV(model,
param_distributions=param_grid,
n_iter=20,
cv=cv,
scoring='neg_log_loss',
n_jobs=-1,
verbose=1,
random_state=91)

random_search.fit(train_x, train_y)

best_model = random_search.best_estimator_

val_y_pred = best_model.predict(val_x)
val_y_pred_proba = best_model.predict_proba(val_x)
logloss = log_loss(val_y, val_y_pred_proba)

accuracy = accuracy_score(val_y, val_y_pred)
f1 = f1_score(val_y, val_y_pred, average='weighted')
precision = precision_score(val_y, val_y_pred, average='weighted')
recall = recall_score(val_y, val_y_pred, average='weighted')

print(f'LogLoss: {logloss}')
print(f'정확도: {accuracy}')
print(f'F1 점수: {f1}')
print(f'정밀도: {precision}')
print(f'재현율: {recall}')

y_pred = best_model.predict_proba(test_x)
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('enhanced_submission.csv', index=True)

print('예측 및 제출 파일 생성 완료')

결측값 처리 완료
인코딩 완료
레이블 처리 완료


KeyError: 'Delay_num'