In [1]:
import pandas as pd
import numpy as np
import random
import os
import gc

from catboost import CatBoostClassifier, Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
def seed_everything(seed):
    """Seed the random number generators this notebook relies on.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds both the
    standard-library ``random`` module and NumPy's global generator.

    Args:
        seed: Seed value applied to every generator.
    """
    # NOTE(review): the running interpreter reads PYTHONHASHSEED at start-up,
    # so this assignment presumably only matters for processes spawned later.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

In [3]:
def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into a Parquet file in the current directory.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Stem of the output file; the result is written to
            ``./<save_name>.parquet``.
    """
    target = f'./{save_name}.parquet'
    frame = pd.read_csv(csv_path)
    frame.to_parquet(target)
    # Release the frame before forcing a garbage-collection pass.
    del frame
    gc.collect()
    print(save_name, 'Done.')

In [4]:
# Convert both CSV files to Parquet.
for csv_file, parquet_stem in (('train.csv', 'train'), ('test.csv', 'test')):
    csv_to_parquet(csv_file, parquet_stem)

train Done.
test Done.


In [5]:
# Submission template: the first CSV column is used as the index.
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

# Train/test frames come from the Parquet copies written by csv_to_parquet.
train, test = pd.read_parquet('./train.parquet'), pd.read_parquet('./test.parquet')

In [None]:
# Columns other than the label (Delay) that contain missing values are filled
# with the most frequent value observed in the training data.
NaN_col = [
    'Origin_State',
    'Destination_State',
    'Airline',
    'Estimated_Departure_Time',
    'Estimated_Arrival_Time',
    'Carrier_Code(IATA)',
    'Carrier_ID(DOT)',
]

for column_name in NaN_col:
    most_frequent = train[column_name].mode()[0]
    print("mode = ", most_frequent)
    train[column_name] = train[column_name].fillna(most_frequent)

    # The test frame is filled with the *training* mode, and only for
    # columns it actually has.
    if column_name not in test.columns:
        continue
    test[column_name] = test[column_name].fillna(most_frequent)
print('Done.')

In [None]:
# Encode the qualitative (categorical) columns as integer codes.
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for col in qual_col:
    # The encoder is fitted on the training column only.
    le = LabelEncoder()
    le = le.fit(train[col])
    train[col] = le.transform(train[col])

    # Values that occur only in the test column are appended after the
    # fitted classes so that transform() does not fail on them.
    # np.setdiff1d returns the sorted unique test values missing from
    # le.classes_ in one pass, replacing a per-label membership scan and a
    # reallocation of classes_ for every unseen label.
    unseen = np.setdiff1d(test[col].to_numpy(), le.classes_)
    if unseen.size:
        le.classes_ = np.append(le.classes_, unseen)
    test[col] = le.transform(test[col])
print('Done.')

In [None]:
# Pseudo-labelling, step 1: fit a helper classifier on the rows whose Delay
# label is present so it can later label the rows where Delay is missing.
semi_supervise_train = train

# Compute the "has a label" mask once and reuse it for every split below.
has_label = semi_supervise_train['Delay'].notnull()

train_x_labled = semi_supervise_train[has_label].drop(['Delay', 'ID'], axis=1)
train_y_labled = semi_supervise_train.loc[has_label, 'Delay']

# Encode the label values as integers for the helper classifier.
labelEncoder = LabelEncoder()
train_y_labled = labelEncoder.fit_transform(train_y_labled)

train_x_unlabeled = semi_supervise_train[~has_label].drop(['Delay', 'ID'], axis=1)

# An explicit random_state makes the forest (and therefore the pseudo-labels)
# independent of how much of the global NumPy RNG stream earlier cells or
# earlier runs of this cell have consumed.
model = RandomForestClassifier(random_state=42)

model.fit(train_x_labled, train_y_labled)


In [None]:
# Predict an (encoded) Delay class for every row that has no label.
train_y_unlabeled = model.predict(train_x_unlabeled)

In [None]:
# Attach the predicted labels to the unlabeled rows.
#
# The integer predictions are decoded with the fitted LabelEncoder rather
# than a hard-coded "1 means Not_Delayed" rule; for the labels
# 'Delayed' / 'Not_Delayed' the encoder's sorted classes give
# 0 = Delayed and 1 = Not_Delayed.
#
# assign() returns a new frame, so train_x_unlabeled keeps exactly the
# feature columns the helper model was fitted on and the prediction cell can
# be re-run safely.
null_data = train_x_unlabeled.assign(
    Delay=labelEncoder.inverse_transform(train_y_unlabeled)
)


In [None]:
# Rows of the original training frame whose Delay label is present.
labeled_rows = semi_supervise_train['Delay'].notnull()
notnull_data = semi_supervise_train[labeled_rows]

In [None]:
# Stack the pseudo-labelled rows on top of the originally labelled rows.
#
# Rebinding (instead of inplace=True) leaves any other variable that refers
# to the same frame untouched.
null_data = null_data.reset_index(drop=True)
notnull_data = notnull_data.reset_index(drop=True)

# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs contribute labels starting at 0 and the combined frame has duplicate
# row labels.
combined_df = pd.concat([null_data, notnull_data], axis=0, ignore_index=True)

In [None]:
# From here on the combined frame is used as the training set.
train = combined_df

# Preview a few rows. A bare expression is only rendered when it is the last
# statement of the cell, so the preview comes after the assignment.
train.head()

In [None]:
# Target column for training.
train_y = train['Delay']

# Model inputs: everything except the ID column (and, for train, the target).
train_x = train.drop(['ID', 'Delay'], axis=1)
test_x = test.drop('ID', axis=1)

In [None]:
# Drop training rows that still contain a missing feature value.
# (The original comment said this removes rows without a label, but the
# filter is applied to the feature frame train_x.)
#
# The same rows are removed from train_y so that features and labels keep the
# same length; the mask is applied positionally (NumPy array), so it does not
# depend on the row labels being unique.
complete_rows = train_x.notna().all(axis=1).to_numpy()
train_x = train_x[complete_rows]
train_y = train_y[complete_rows]

In [None]:
# Map every column name of the submission template to its position.
column_number = {
    name: position for position, name in enumerate(sample_submission.columns)
}
    
def to_number(x, dic):
    """Look up ``x`` in ``dic`` and return the stored value.

    Args:
        x: Key to look up.
        dic: Mapping from keys to numbers.

    Returns:
        The value stored under ``x``.

    Raises:
        KeyError: If ``dic`` is a plain dict and ``x`` is not one of its keys.
    """
    number = dic[x]
    return number

# Store, for every row, the submission-column position of its Delay label.
delay_codes = train['Delay'].apply(to_number, args=(column_number,))
train.loc[:, 'Delay_num'] = delay_codes
print('Done.')

In [None]:
# Stratified, shuffled cross-validation folds with a fixed seed.
n_splits = 6
kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123)

# Every hyper-parameter lists a single candidate, so the grid holds exactly
# one configuration.
cat_paramets = {
    'learning_rate': [0.20909079092170735],
    'depth': [8],
    'od_pval': [0.236844398775451],
    'model_size_reg': [0.30614059763442997],
    'l2_leaf_reg': [5.535171839105427],
}

cat = CatBoostClassifier(n_estimators=500, random_state=123, verbose=False)
clf = GridSearchCV(estimator=cat, param_grid=cat_paramets, cv=kfold, n_jobs=-1)


In [None]:
# Run the cross-validated search on the training features and labels.
clf.fit(train_x, train_y)

In [None]:
# Class-probability predictions for the test features.
y_pred = clf.predict_proba(test_x)

In [None]:
# Display the predicted probabilities.
y_pred

In [None]:
# predict_proba returns one column per class in the order of clf.classes_,
# which need not be the column order of the submission template. Label the
# columns by class first, then reorder them to the template's order so each
# probability lands under the matching column name. Selecting by name raises
# KeyError if a template column is not one of the model's classes, instead of
# silently writing probabilities under the wrong header.
submission = pd.DataFrame(data=y_pred, columns=clf.classes_, index=sample_submission.index)
submission = submission[sample_submission.columns]

In [None]:
# Write the submission to CSV, including the index as the first column.
submission.to_csv('tea_submission.csv', index=True)