In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

## SEED 설정

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

# SEED 고정
seed_everything(42)

## Csv to Parquet

- 메모리에 효율적인 데이터 유형을 사용하여 용량을 줄이고 빠른 작업이 가능함

In [5]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()    # Garbage Collector
    print(save_name, 'Done.')

In [6]:
csv_to_parquet('../data/train.csv', 'train')
csv_to_parquet('../data/test.csv', 'test')

train Done.
test Done.


## Data Load

In [16]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')
sample_submission = pd.read_csv('../data/sample_submission.csv', index_col=0)   # index_col : 데이터프레임의 인덱스로 사용할 열 지정

## Data Pre-Processing

In [30]:
# 각 feature 에서 결측치 확인
train.isnull().any()

ID                          False
Month                       False
Day_of_Month                False
Estimated_Departure_Time     True
Estimated_Arrival_Time       True
Cancelled                   False
Diverted                    False
Origin_Airport              False
Origin_Airport_ID           False
Origin_State                 True
Destination_Airport         False
Destination_Airport_ID      False
Destination_State            True
Distance                    False
Airline                      True
Carrier_Code(IATA)           True
Carrier_ID(DOT)              True
Tail_Number                 False
Delay                        True
dtype: bool

In [38]:
# Label 값인 'Delay'를 제외한 feature 중 결측값이 존재하는 값들의 학습 데이터의 최빈값으로 지정.
NaN_col = [i for i in train.columns if train[i].isnull().any() == True and i != 'Delay']

for col in NaN_col:
    mode = train[col].mode()[0] # mode() : 최빈값을 찾는 함수
    train[col] = train[col].fillna(mode)

    if col in test.columns:
        test[col] = test[col].fillna(mode)

print("Done")

Done


In [43]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 19 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   ID                        1000000 non-null  object 
 1   Month                     1000000 non-null  int64  
 2   Day_of_Month              1000000 non-null  int64  
 3   Estimated_Departure_Time  1000000 non-null  float64
 4   Estimated_Arrival_Time    1000000 non-null  float64
 5   Cancelled                 1000000 non-null  int64  
 6   Diverted                  1000000 non-null  int64  
 7   Origin_Airport            1000000 non-null  object 
 8   Origin_Airport_ID         1000000 non-null  int64  
 9   Origin_State              1000000 non-null  object 
 10  Destination_Airport       1000000 non-null  object 
 11  Destination_Airport_ID    1000000 non-null  int64  
 12  Destination_State         1000000 non-null  object 
 13  Distance                  10

In [47]:
#질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):    # np.unique() : 배열에서 중복값을 제외한 고유한 값들을 반환함
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


In [51]:
# 라벨이 없는 값 제외
train = train.dropna()

In [56]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i

def to_number(x, dic):
    return dic[x]

# 라벨 값 수치화
train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done')

Done


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))


In [58]:
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

## Classification Model fit

In [59]:
clf = RandomForestClassifier()
clf.fit(train_x, train_y)

## Inference

In [60]:
y_pred = clf.predict_proba(test_x)  # 각 클래스에 대해 예측 확률 반환

## Submit

In [65]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)

In [None]:
submission.to_csv('baseline_submission.csv', index=True)

## Evaluation with Train Dataset

In [None]:
import numpy as np
from sklearn.metrics import log_loss

# 실제 테스트 데이터의 정답(y_true)과 예측값(y_pred)을 불러옵니다
y_true = pd.read_csv('실제정답파일.csv')  # 실제 정답 파일
y_pred = pd.read_csv('예측파일.csv')

# 만약 인덱스 컬럼이 있다면 제외하고 값만 가져옵니다
if 'index' in y_true.columns:
    y_true = y_true.drop('index', axis=1)
if 'index' in y_pred.columns:
    y_pred = y_pred.drop('index', axis=1)

# LogLoss 계산
score = log_loss(
    y_true=y_true.values,  # 실제 값
    y_pred=y_pred.values,  # 예측 확률값
)

print(f'LogLoss Score: {score:.4f}')