In [35]:
import pandas as pd
import numpy as np
import random
import os
import gc
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import catboost as cb
from sklearn.model_selection import train_test_split

from scipy import stats

In [36]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

# csv to parquet

In [37]:
def csv_to_parquet(csv_path, save_name):
    df = pd.read_csv(csv_path)
    df.to_parquet(f'./{save_name}.parquet')
    del df
    gc.collect()
    print(save_name, 'Done.')

In [38]:
csv_to_parquet('C:/Users/systj/bigdata/train.csv', 'train')
csv_to_parquet('C:/Users/systj/bigdata/test.csv', 'test')

train Done.
test Done.


# Data load

In [39]:
train = pd.read_parquet('C:/Users/systj/bigdata/train.parquet')
test = pd.read_parquet('C:/Users/systj/bigdata/test.parquet')
sample_submission = pd.read_csv('sample_submission.csv', index_col = 0)

In [40]:
print(train.columns)
print(test.columns)
print(sample_submission.columns)

Index(['ID', 'Month', 'Day_of_Month', 'Estimated_Departure_Time',
       'Estimated_Arrival_Time', 'Cancelled', 'Diverted', 'Origin_Airport',
       'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
       'Destination_Airport_ID', 'Destination_State', 'Distance', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number', 'Delay'],
      dtype='object')
Index(['ID', 'Month', 'Day_of_Month', 'Estimated_Departure_Time',
       'Estimated_Arrival_Time', 'Cancelled', 'Diverted', 'Origin_Airport',
       'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
       'Destination_Airport_ID', 'Destination_State', 'Distance', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number'],
      dtype='object')
Index(['Not_Delayed', 'Delayed'], dtype='object')


In [41]:
print(train.Delay.value_counts())
print()
print('train.csv의 Delay Column 결측치 비율:', round(train.Delay.isnull().sum()/len(train.Delay)*100,3),'%')

Delay
Not_Delayed    210001
Delayed         45000
Name: count, dtype: int64

train.csv의 Delay Column 결측치 비율: 74.5 %


### train의 결측치가 아닌 비율은 25.5%
### 25.5%로 학습 -> 나머지 74.5%를 예측

### 이렇게 label된 100만개 데이터로 test.csv 예측


# Data Pre-Processing

In [42]:
#레이블(Delay)을 제외한 결측값이 존재하는 변수들을 학습 데이터의 최빈값으로 대체합니다
NaN_col = ['Origin_State','Destination_State','Airline','Estimated_Departure_Time', 'Estimated_Arrival_Time','Carrier_Code(IATA)','Carrier_ID(DOT)']

for col in NaN_col:
    mode = train[col].mode()[0]
    train[col] = train[col].fillna(mode)
    
    if col in test.columns:
        test[col] = test[col].fillna(mode)
print('Done.')

Done.


In [43]:
#질적 변수들을 수치화합니다
qual_col = ['Origin_Airport', 'Origin_State', 'Destination_Airport', 'Destination_State', 'Airline', 'Carrier_Code(IATA)', 'Tail_Number']

for i in qual_col:
    le = LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])
print('Done.')

Done.


In [44]:
#레이블이 없는 데이터들을 제거합니다
train_label_is_null = train[train.Delay.isnull()]
train = train.dropna()

In [45]:
train_label_is_null

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
0,TRAIN_000000,4,15,600.0,1900.0,0,0,252,13851,34,159,12191,42,419.0,23,10,19393.0,4319,
1,TRAIN_000001,8,15,740.0,1024.0,0,0,256,13930,11,331,14869,45,1250.0,22,8,20304.0,310,
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,74,11057,31,204,12953,30,544.0,3,0,19805.0,140,
3,TRAIN_000003,7,10,905.0,1735.0,0,0,195,12892,4,119,11618,28,2454.0,26,8,19393.0,3021,
4,TRAIN_000004,1,11,900.0,1019.0,0,0,322,14771,4,7,10157,4,250.0,22,8,20304.0,556,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,256,13930,4,270,14100,4,678.0,26,8,19977.0,2477,
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,122,11637,4,242,13487,21,223.0,22,3,19393.0,2294,
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,248,13796,4,159,12191,42,1642.0,23,10,19393.0,994,
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,45,10693,41,22,10397,4,214.0,9,3,19790.0,6207,


In [46]:
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay
5,TRAIN_000005,4,13,1545.0,1900.0,0,0,119,11618,4,93,11278,47,199.0,21,8,20452.0,3435,Not_Delayed
6,TRAIN_000006,1,20,1742.0,1903.0,0,0,119,11618,28,47,10721,19,200.0,26,8,19393.0,3495,Not_Delayed
8,TRAIN_000008,6,13,1420.0,1550.0,0,0,59,10821,4,74,11057,31,361.0,23,10,19393.0,4083,Not_Delayed
10,TRAIN_000010,8,13,1730.0,1844.0,0,0,93,11278,47,277,14122,36,204.0,21,0,19393.0,241,Delayed
12,TRAIN_000012,1,12,1015.0,1145.0,0,0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5171,Not_Delayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999962,TRAIN_999962,10,11,600.0,2003.0,0,0,310,14683,42,256,13930,11,1041.0,22,8,20304.0,488,Not_Delayed
999963,TRAIN_999963,5,2,1759.0,1926.0,0,0,204,12953,30,93,11278,47,214.0,23,3,20452.0,5204,Delayed
999969,TRAIN_999969,10,10,940.0,1056.0,0,0,223,13256,42,169,12266,42,316.0,19,8,20378.0,5350,Delayed
999985,TRAIN_999985,8,8,1914.0,2039.0,0,0,296,14492,31,183,12451,7,407.0,14,4,20436.0,1499,Not_Delayed


In [47]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train.loc[:, 'Delay_num'] = train['Delay'].apply(lambda x: to_number(x, column_number))
print('Done.')

Done.


In [48]:
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
5,TRAIN_000005,4,13,1545.0,1900.0,0,0,119,11618,4,93,11278,47,199.0,21,8,20452.0,3435,Not_Delayed,0
6,TRAIN_000006,1,20,1742.0,1903.0,0,0,119,11618,28,47,10721,19,200.0,26,8,19393.0,3495,Not_Delayed,0
8,TRAIN_000008,6,13,1420.0,1550.0,0,0,59,10821,4,74,11057,31,361.0,23,10,19393.0,4083,Not_Delayed,0
10,TRAIN_000010,8,13,1730.0,1844.0,0,0,93,11278,47,277,14122,36,204.0,21,0,19393.0,241,Delayed,1
12,TRAIN_000012,1,12,1015.0,1145.0,0,0,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5171,Not_Delayed,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999962,TRAIN_999962,10,11,600.0,2003.0,0,0,310,14683,42,256,13930,11,1041.0,22,8,20304.0,488,Not_Delayed,0
999963,TRAIN_999963,5,2,1759.0,1926.0,0,0,204,12953,30,93,11278,47,214.0,23,3,20452.0,5204,Delayed,1
999969,TRAIN_999969,10,10,940.0,1056.0,0,0,223,13256,42,169,12266,42,316.0,19,8,20378.0,5350,Delayed,1
999985,TRAIN_999985,8,8,1914.0,2039.0,0,0,296,14492,31,183,12451,7,407.0,14,4,20436.0,1499,Not_Delayed,0


In [49]:
train = train.sample(n=50000)

In [50]:
train.drop(columns={'Cancelled','Diverted'}, inplace=True)
train

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
99048,TRAIN_099048,11,1,929.0,1047.0,22,10397,8,170,12278,14,782.0,9,3,19790.0,6241,Not_Delayed,0
728920,TRAIN_728920,9,11,720.0,838.0,242,13487,21,122,11637,32,223.0,23,3,20304.0,5716,Not_Delayed,0
520722,TRAIN_520722,4,8,1735.0,1817.0,95,11298,42,113,11540,42,551.0,19,0,20378.0,6040,Not_Delayed,0
14119,TRAIN_014119,7,14,1105.0,1243.0,22,10397,8,195,12892,4,1947.0,9,3,19790.0,5595,Not_Delayed,0
955604,TRAIN_955604,10,2,600.0,644.0,257,13931,47,74,11057,31,290.0,6,8,20397.0,3718,Not_Delayed,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
883850,TRAIN_883850,8,9,1105.0,1432.0,266,14057,35,94,11292,5,991.0,23,8,19977.0,1846,Not_Delayed,0
632842,TRAIN_632842,9,14,600.0,1900.0,90,11259,42,193,12889,26,1067.0,23,10,19393.0,2461,Not_Delayed,0
191094,TRAIN_191094,10,13,1822.0,2042.0,174,12339,12,267,14082,7,916.0,2,8,20368.0,13,Not_Delayed,0
752923,TRAIN_752923,10,8,2225.0,1.0,195,12892,4,301,14570,26,391.0,23,8,20304.0,288,Not_Delayed,0


In [51]:
train_x = train.drop(columns=['ID', 'Delay', 'Delay_num'])
train_y = train['Delay_num']
test_x = test.drop(columns=['ID'])

In [52]:
print(len(train_x))
train_y.value_counts()

50000


Delay_num
0    41072
1     8928
Name: count, dtype: int64

# Classification Model Fit

In [53]:
# 1. 레이블이 있는 데이터로 분류 모델을 훈련합니다.

# CatBoostClassifier 모델 초기화
model = cb.CatBoostClassifier(task_type="GPU", devices='0:1', verbose=200)

# 튜닝할 하이퍼파라미터 설정
param_grid = {
    'iterations': [500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5]
}

# GridSearchCV 사용
grid_search_train = GridSearchCV(model, param_grid, cv=3, scoring='neg_log_loss', verbose=10)
grid_search_train.fit(train_x, train_y)

# 최적의 하이퍼파라미터 출력
print("Best Parameters: ", grid_search_train.best_params_)

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV 1/3; 1/54] START depth=6, iterations=500, l2_leaf_reg=1, learning_rate=0.01.
0:	learn: 0.6878792	total: 36.1ms	remaining: 18s
200:	learn: 0.4563221	total: 7.86s	remaining: 11.7s
400:	learn: 0.4489937	total: 16.1s	remaining: 3.96s
499:	learn: 0.4469688	total: 20.1s	remaining: 0us
[CV 1/3; 1/54] END depth=6, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=-0.454 total time=  20.6s
[CV 2/3; 1/54] START depth=6, iterations=500, l2_leaf_reg=1, learning_rate=0.01.
0:	learn: 0.6879620	total: 37.3ms	remaining: 18.6s
200:	learn: 0.4567523	total: 8.1s	remaining: 12s
400:	learn: 0.4489431	total: 16.3s	remaining: 4.03s
499:	learn: 0.4468632	total: 20.3s	remaining: 0us
[CV 2/3; 1/54] END depth=6, iterations=500, l2_leaf_reg=1, learning_rate=0.01;, score=-0.454 total time=  20.6s
[CV 3/3; 1/54] START depth=6, iterations=500, l2_leaf_reg=1, learning_rate=0.01.
0:	learn: 0.6879557	total: 34.1ms	remaining: 17s
200:	learn: 0.457

In [54]:
best_params_train = grid_search_train.best_params_

best_model_train = cb.CatBoostClassifier(
    iterations=best_params_train['iterations'],
    learning_rate=best_params_train['learning_rate'],
    depth=best_params_train['depth'],
    l2_leaf_reg=best_params_train['l2_leaf_reg'],
    task_type="GPU", 
    devices='0:1', 
    verbose=200
)

best_model_train.fit(train_x, train_y)

0:	learn: 0.6876056	total: 16.2ms	remaining: 16.2s
200:	learn: 0.4327177	total: 4.08s	remaining: 16.2s
400:	learn: 0.4090320	total: 8.19s	remaining: 12.2s
600:	learn: 0.3914785	total: 12.3s	remaining: 8.15s
800:	learn: 0.3755851	total: 16.4s	remaining: 4.08s
999:	learn: 0.3605026	total: 20.5s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x19906214b50>

In [55]:
# 2. 훈련된 모델을 사용하여 레이블이 없는 데이터의 레이블을 예측합니다.
train_label_is_null_x = train_label_is_null.drop(['ID', 'Delay'], axis=1)
train_label_is_null_y = best_model_train.predict(train_label_is_null_x)

# TEST (train test split을 통한 정확도 측정)

In [56]:
train_x_, valid_x_, train_y_, valid_y_ = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

In [57]:
# 검증 데이터에 대한 예측
valid_predictions = best_model_train.predict(valid_x_)

# 예측값이 2D 배열로 반환될 수 있으므로, 1D로 변환합니다.
# valid_predictions = valid_predictions.reshape(-1)

# 정확도 계산
accuracy = accuracy_score(valid_y_, valid_predictions)

print(f"Validation Accuracy: {accuracy * 100:.2f}%")

Validation Accuracy: 83.21%


# TEST END

In [58]:
pd.Series(train_label_is_null_y).value_counts()

0    743119
1      1880
Name: count, dtype: int64

In [59]:
train.Delay_num.value_counts()

Delay_num
0    41072
1     8928
Name: count, dtype: int64

In [60]:
# 'ID', 'Delay_num'컬럼 생성 및 컬럼 병합
trained_label_is_null = train_label_is_null_x.copy()
trained_label_is_null['ID'] = train_label_is_null.ID
trained_label_is_null['Delay_num'] = train_label_is_null_y
trained_label_is_null['Delay'] = trained_label_is_null['Delay_num'].map(lambda x: 'Delayed' if x==1 else 'Not_Delayed')

# 컬럼 순서 변경
trained_label_is_null = trained_label_is_null[['ID','Month', 'Day_of_Month', 'Estimated_Departure_Time',
       'Estimated_Arrival_Time', 'Cancelled', 'Diverted', 'Origin_Airport',
       'Origin_Airport_ID', 'Origin_State', 'Destination_Airport',
       'Destination_Airport_ID', 'Destination_State', 'Distance', 'Airline',
       'Carrier_Code(IATA)', 'Carrier_ID(DOT)', 'Tail_Number', 'Delay',
       'Delay_num']]
trained_label_is_null

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
0,TRAIN_000000,4,15,600.0,1900.0,0,0,252,13851,34,159,12191,42,419.0,23,10,19393.0,4319,Not_Delayed,0
1,TRAIN_000001,8,15,740.0,1024.0,0,0,256,13930,11,331,14869,45,1250.0,22,8,20304.0,310,Not_Delayed,0
2,TRAIN_000002,9,6,1610.0,1805.0,0,0,74,11057,31,204,12953,30,544.0,3,0,19805.0,140,Not_Delayed,0
3,TRAIN_000003,7,10,905.0,1735.0,0,0,195,12892,4,119,11618,28,2454.0,26,8,19393.0,3021,Not_Delayed,0
4,TRAIN_000004,1,11,900.0,1019.0,0,0,322,14771,4,7,10157,4,250.0,22,8,20304.0,556,Not_Delayed,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,TRAIN_999995,9,18,936.0,1243.0,0,0,256,13930,4,270,14100,4,678.0,26,8,19977.0,2477,Not_Delayed,0
999996,TRAIN_999996,5,30,920.0,1028.0,0,0,122,11637,4,242,13487,21,223.0,22,3,19393.0,2294,Not_Delayed,0
999997,TRAIN_999997,6,28,800.0,1340.0,0,0,248,13796,4,159,12191,42,1642.0,23,10,19393.0,994,Not_Delayed,0
999998,TRAIN_999998,9,27,1613.0,1824.0,0,0,45,10693,41,22,10397,4,214.0,9,3,19790.0,6207,Not_Delayed,0


In [61]:
# 기존 labeling된 DF와 예측된 DF 병합
combined_df = pd.concat([trained_label_is_null, train])
combined_df = combined_df.sort_index(ascending=True)
combined_df.head(10)

Unnamed: 0,ID,Month,Day_of_Month,Estimated_Departure_Time,Estimated_Arrival_Time,Cancelled,Diverted,Origin_Airport,Origin_Airport_ID,Origin_State,Destination_Airport,Destination_Airport_ID,Destination_State,Distance,Airline,Carrier_Code(IATA),Carrier_ID(DOT),Tail_Number,Delay,Delay_num
0,TRAIN_000000,4,15,600.0,1900.0,0.0,0.0,252,13851,34,159,12191,42,419.0,23,10,19393.0,4319,Not_Delayed,0
1,TRAIN_000001,8,15,740.0,1024.0,0.0,0.0,256,13930,11,331,14869,45,1250.0,22,8,20304.0,310,Not_Delayed,0
2,TRAIN_000002,9,6,1610.0,1805.0,0.0,0.0,74,11057,31,204,12953,30,544.0,3,0,19805.0,140,Not_Delayed,0
3,TRAIN_000003,7,10,905.0,1735.0,0.0,0.0,195,12892,4,119,11618,28,2454.0,26,8,19393.0,3021,Not_Delayed,0
4,TRAIN_000004,1,11,900.0,1019.0,0.0,0.0,322,14771,4,7,10157,4,250.0,22,8,20304.0,556,Not_Delayed,0
7,TRAIN_000007,4,20,1815.0,1955.0,0.0,0.0,256,13930,11,217,13198,23,403.0,23,8,20304.0,173,Not_Delayed,0
9,TRAIN_000009,6,6,650.0,838.0,0.0,0.0,207,12992,3,169,12266,42,374.0,13,8,20366.0,468,Not_Delayed,0
11,TRAIN_000011,3,18,600.0,748.0,0.0,0.0,270,14100,36,103,11433,20,453.0,9,3,19790.0,1859,Not_Delayed,0
12,TRAIN_000012,1,12,1015.0,1145.0,,,72,11042,33,94,11292,5,1201.0,23,10,19393.0,5171,Not_Delayed,0
14,TRAIN_000014,12,18,845.0,855.0,0.0,0.0,271,14107,2,193,12889,26,255.0,23,10,19393.0,3734,Not_Delayed,0


In [62]:
# 모델 다시 학습
combined_df_x = combined_df.drop(['ID','Delay','Delay_num'], axis=1)
combined_df_y = combined_df['Delay_num']

# CatBoostClassifier 모델 초기화
model = cb.CatBoostClassifier(task_type="GPU", devices='0:1', verbose=200)

# 튜닝할 하이퍼파라미터 설정
param_grid = {
    'iterations': [200, 500, 700, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5]
}

# GridSearchCV 사용
#grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_log_loss', verbose=10, n_jobs=-1)
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_log_loss', verbose=10)
grid_search.fit(combined_df_x, combined_df_y)

# 최적의 하이퍼파라미터 출력
print("Best Parameters: ", grid_search.best_params_)

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV 1/3; 1/108] START depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01
0:	learn: 0.6638665	total: 8.01ms	remaining: 1.59s
199:	learn: 0.0394143	total: 2.36s	remaining: 0us
[CV 1/3; 1/108] END depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01;, score=-0.040 total time=   2.9s
[CV 2/3; 1/108] START depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01
0:	learn: 0.6633253	total: 7.63ms	remaining: 1.52s
199:	learn: 0.0395526	total: 2.34s	remaining: 0us
[CV 2/3; 1/108] END depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01;, score=-0.040 total time=   2.8s
[CV 3/3; 1/108] START depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01
0:	learn: 0.6567644	total: 7.43ms	remaining: 1.48s
199:	learn: 0.0395149	total: 2.4s	remaining: 0us
[CV 3/3; 1/108] END depth=6, iterations=200, l2_leaf_reg=1, learning_rate=0.01;, score=-0.040 total time=   2.8s
[CV 1/3; 2/108] START depth=6, iterations=200, l2_lea

In [63]:
# 최적의 하이퍼파라미터로 모델 학습
best_params = grid_search.best_params_

best_model = cb.CatBoostClassifier(
    iterations=best_params['iterations'],
    learning_rate=best_params['learning_rate'],
    depth=best_params['depth'],
    l2_leaf_reg=best_params['l2_leaf_reg'],
    task_type="GPU", 
    devices='0:1', 
    verbose=200
)

best_model.fit(combined_df_x, combined_df_y)

0:	learn: 0.5258332	total: 12.9ms	remaining: 12.8s
200:	learn: 0.0321827	total: 3.44s	remaining: 13.7s
400:	learn: 0.0296427	total: 6.79s	remaining: 10.1s
600:	learn: 0.0278503	total: 10.1s	remaining: 6.71s
800:	learn: 0.0264005	total: 13.6s	remaining: 3.38s
999:	learn: 0.0251581	total: 17.2s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1990e5f4d30>

# Inference

In [64]:
y_pred = best_model.predict_proba(test_x)

# Submit

In [65]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)

In [66]:
submission.to_csv('C:/Users/systj/bigdata/baseline_submission.csv', index=True)

In [67]:
pd.read_csv('baseline_submission.csv')

Unnamed: 0,ID,Not_Delayed,Delayed
0,TEST_000000,1.000000,2.798819e-07
1,TEST_000001,1.000000,1.130772e-07
2,TEST_000002,0.999998,2.232956e-06
3,TEST_000003,0.999958,4.185915e-05
4,TEST_000004,0.898151,1.018494e-01
...,...,...,...
999995,TEST_999995,0.997819,2.180906e-03
999996,TEST_999996,1.000000,2.947370e-07
999997,TEST_999997,0.999778,2.216025e-04
999998,TEST_999998,0.999787,2.127007e-04


In [68]:
pd.read_csv('sample_submission.csv')

Unnamed: 0,ID,Not_Delayed,Delayed
0,TEST_000000,0,1
1,TEST_000001,0,1
2,TEST_000002,0,1
3,TEST_000003,0,1
4,TEST_000004,0,1
...,...,...,...
999995,TEST_999995,0,1
999996,TEST_999996,0,1
999997,TEST_999997,0,1
999998,TEST_999998,0,1
