In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

In [2]:
# Load the previously fitted scaler; it is used later to transform the test features.
# NOTE: joblib.load unpickles the file, so only load artifacts from a trusted source.
filename_scaler = '../2_저장된모델/fitted_scaler.pkl'
loaded_scaler = joblib.load(filename_scaler)

# Load the fitted model that is later passed to create_submission for prediction.
filename = '../2_저장된모델/voting_ensemble.joblib'
loaded_model = joblib.load(filename)

# Apply Model to Testset

Apply the same feature engineering that was used for Q1_train.csv.

In [3]:
# Extract relevant features from datetime
def categorize_hour(hour):
    '''
    Map an hour of the day to a coarse time-of-day label.

    'bedtime' : 0 <= hour < 7
    'day'     : 7 <= hour < 17
    'evening' : every other value (17-23 for valid clock hours)
    '''
    # Anything outside the [0, 17) window falls through to 'evening'.
    if not (0 <= hour < 17):
        return 'evening'
    return 'bedtime' if hour < 7 else 'day'
    
# Lag features: the columns listed here are passed to create_lag(), which adds
# a `<column>_lag<k>` copy of each one.
lag_var_list = ['nummsg3', 'erabaddatt', 'rlculbyte', 'bler_dl', 'attpaging', 'dltransmittedmcsavg']

def create_lag(df, var_list, num_stations, num_lag):
    """Append per-station lag features to ``df`` and return it.

    For every variable in ``var_list`` and every ``k`` in ``1..num_lag`` a
    column ``{var}_lag{k}`` is added that holds the value of ``var`` from
    ``k`` rows earlier within the same ``ru_id``. Rows without a predecessor
    (the first ``k`` rows of each station) are filled with the mean of that
    lag column over the rows sharing the same ``ru_id`` and hour of day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a DatetimeIndex, an ``'ru_id'`` column and every column in
        ``var_list``. Lags are positional, so rows are expected to already be
        in chronological order within each station. Modified in place.
    var_list : list of str
        Columns to create lagged copies of.
    num_stations : int
        Unused; kept so existing callers keep working.
    num_lag : int
        Number of lag steps to create per variable.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the lag columns appended.
    """
    # Group on the hour of day directly from the index instead of writing a
    # temporary 'hour' column, which would overwrite (and then drop) a column
    # of that name if the caller already had one.
    hour_of_day = df.index.hour.to_numpy()

    for lag in range(1, num_lag + 1):
        for var in var_list:
            lag_col = f'{var}_lag{lag}'
            # Shift by the actual lag distance; a fixed shift of 1 would make
            # every lag column identical to lag 1.
            df[lag_col] = df.groupby('ru_id')[var].shift(lag)
            df[lag_col] = (
                df.groupby(['ru_id', hour_of_day])[lag_col]
                .transform(lambda x: x.fillna(x.mean()))
            )

    return df

# Columns removed from the feature table before scaling and prediction.
# Per the original note these were dropped for having high VIFs
# (variance inflation factors). 'time_of_day_day' is the one-hot column for
# the 'day' label, so only the 'bedtime' and 'evening' dummies remain.
drop_cols = ['rachpreamblea', 'time_of_day_day', 'endcaddsucc', 'erabaddsucc', 'endcmodbymenbsucc', 'connestabsucc', 'handoversucc', 'endcmodbysgnbsucc', 'reestabsucc']


In [4]:
def _predict_station(station_rows, model, scaler):
    """Run feature engineering, scaling and prediction for one station's rows.

    Parameters
    ----------
    station_rows : pandas.DataFrame
        Raw test rows of a single station, including a ``'datetime'`` string
        column formatted as ``%Y-%m-%d %H:%M:%S``. Rows are expected to be in
        chronological order because the lag features are positional.
    model : object
        Fitted estimator exposing ``predict``.
    scaler : object
        Fitted transformer exposing ``transform``.

    Returns
    -------
    numpy.ndarray
        Predictions rounded to the nearest integer, one per input row.
    """
    time_of_day_levels = ['bedtime', 'day', 'evening']
    passthrough_cols = ['time_of_day_bedtime', 'time_of_day_evening']

    features = (
        station_rows
        # The saved index column is not always present in the CSV.
        .drop(columns=['Unnamed: 0'], errors='ignore')
        .assign(
            datetime=lambda d: pd.to_datetime(d['datetime'], format='%Y-%m-%d %H:%M:%S'),
            # A categorical with fixed levels makes get_dummies emit all three
            # dummy columns even when a label is absent from this subset.
            time_of_day=lambda d: pd.Categorical(
                d['datetime'].dt.hour.apply(categorize_hour),
                categories=time_of_day_levels,
            ),
        )
        .set_index('datetime')
    )

    # One-hot encode the time-of-day label, add lag features, drop unused columns.
    features = pd.get_dummies(features, columns=['time_of_day'])
    features = create_lag(features, lag_var_list, 1, 1)
    features = features.drop(columns=drop_cols)

    # Scale everything except the station id and the remaining dummy columns.
    feature_cols = [col for col in features.columns if col not in ['ru_id', *passthrough_cols]]
    scaled = pd.DataFrame(scaler.transform(features[feature_cols]), columns=feature_cols)

    # Dummy columns first, scaled features after, both on a fresh positional index.
    model_input = pd.concat([features[passthrough_cols].reset_index(drop=True), scaled], axis=1)

    return np.round(model.predict(model_input)).astype(int)


def create_submission(model_name, test_path='../Q1_data/Q1_test.csv',
                      stations=('BaseStationB', 'BaseStationJ'), scaler=None):
    """Predict for each station in the test file and assemble the submission table.

    Parameters
    ----------
    model_name : object
        Despite the name, this is the fitted model object itself; its
        ``predict`` method is called on the prepared features.
    test_path : str, optional
        CSV file with the test rows. Must contain ``'datetime'`` and ``'ru_id'``.
    stations : sequence of str, optional
        ``ru_id`` values to predict for; each becomes one output column.
    scaler : object, optional
        Fitted transformer exposing ``transform``. Defaults to the
        module-level ``loaded_scaler``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct timestamp with a ``'datetime'`` column followed
        by one integer prediction column per station.

    Raises
    ------
    ValueError
        If a station's timestamps are not exactly the distinct timestamps of
        the file, in the same order. Predictions are attached by position, so
        a mismatch would otherwise silently misalign them.
    """
    if scaler is None:
        scaler = loaded_scaler

    test_raw = pd.read_csv(test_path)

    # Distinct timestamps in order of first appearance form the submission rows.
    timestamps = test_raw['datetime'].drop_duplicates().reset_index(drop=True)
    expected_timestamps = timestamps.tolist()

    columns = {'datetime': timestamps}
    for station in stations:
        station_rows = test_raw[test_raw['ru_id'] == station]
        if station_rows['datetime'].tolist() != expected_timestamps:
            raise ValueError(
                f"Timestamps of station {station!r} do not match the distinct "
                f"timestamps in {test_path!r}; predictions cannot be aligned by position."
            )
        columns[station] = _predict_station(station_rows, model_name, scaler)

    submission = pd.DataFrame(columns)
    display(submission.head(6))
    return submission

In [5]:
# Build the submission table by passing the loaded model object to create_submission.
submission = create_submission(loaded_model)

Unnamed: 0,datetime,BaseStationB,BaseStationJ
0,2023-05-19 00:00:00,1,1
1,2023-05-19 00:05:00,2,1
2,2023-05-19 00:10:00,2,1
3,2023-05-19 00:15:00,2,1
4,2023-05-19 00:20:00,2,2
5,2023-05-19 00:25:00,1,1


In [6]:
def submitResult(pred, label_path='../Q1_data/Q1_label_sample.csv', output_path='../Q1_submitResult.csv'):
    """Validate a prediction table against the label sample and save it as CSV.

    The table is written only if both checks pass:
    1. its columns equal the label sample's columns (names and order);
    2. its ``'datetime'`` column has the same length and values, in the same
       order, as the label sample's.

    Progress, warnings and any caught error are printed; nothing is raised.

    Parameters
    ----------
    pred : pandas.DataFrame
        Prediction table to validate and save.
    label_path : str, optional
        CSV file holding the label sample to validate against.
    output_path : str, optional
        Destination CSV file for the validated predictions.

    Returns
    -------
    None
    """
    try:
        label = pd.read_csv(label_path)

        # 1. Column names and their order must match the label sample.
        if pred.columns.equals(label.columns):
            print("Check: 컬럼명과 순서가 동일합니다.")
        else:
            print(f"Warning: 컬럼명과 순서가 동일하지 않습니다.\n- 예측 데이터프레임 컬럼명: {pred.columns}\n- 레이블 데이터프레임 컬럼명: {label.columns}")
            return

        # 2. The datetime column must exist and match in length and values.
        # Compare positionally on the raw arrays: comparing the Series with
        # `==` raises when lengths or index labels differ, which would skip
        # the intended warning and surface as a generic error instead.
        datetime_matches = (
            'datetime' in label.columns
            and np.array_equal(label['datetime'].to_numpy(), pred['datetime'].to_numpy())
        )
        if datetime_matches:
            print("Check: datetime 순서와 샘플 수가 일치합니다.")
        else:
            print("Warning: 테스트 세트와 모델 예측의 datetime이 일치하지 않습니다.")
            return

        pred.to_csv(output_path, index=False)
        print(f'Done : {Path(output_path).name} 파일로 저장되었습니다.')
    except Exception as e:
        # Best-effort notebook helper: report the failure instead of raising.
        print("Error:", e)

# Validate the predictions against the label sample and, if both checks pass, write the submission CSV.
submitResult(submission)

Check: 컬럼명과 순서가 동일합니다.
Check: datetime 순서와 샘플 수가 일치합니다.
Done : Q1_submitResult.csv 파일로 저장되었습니다.
