In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from category_encoders import TargetEncoder
import warnings

In [2]:
def cross_validation (X_train, X_valid, y_train, categ_feat_idx):
    n_fold = 5 
    folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=12)

    params = {'loss_function':'Logloss',
              'eval_metric':'AUC',
              'verbose': 100,
              'random_seed': 12,
              'depth': 7,
              'iterations': 2000,
              'learning_rate': 0.06,
             }

    test_data = Pool(data=X_valid,
                     cat_features=categ_feat_idx)

    scores = []
    prediction = np.zeros(X_valid.shape[0])
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X_train, y_train)):
    
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index] 
        y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]
   
        train_data = Pool(data=X_train_fold, 
                          label=y_train_fold,
                          cat_features=categ_feat_idx)
        valid_data = Pool(data=X_valid_fold, 
                          label=y_valid_fold,
                          cat_features=categ_feat_idx)
     
        model = CatBoostClassifier(**params, task_type = "GPU")
        model.fit(train_data,
                  eval_set=valid_data, 
                  use_best_model=True,
                  early_stopping_rounds=20
                 )
    
        score = model.get_best_score()['validation']['AUC']
        scores.append(score)

        y_pred = model.predict_proba(X_valid)[:, 1]
        prediction += y_pred
        print(model.feature_importances_)
    prediction /= n_fold
    print(scores)
    print(np.mean(scores), np.std(scores))
    return prediction

In [3]:
def add_feature_w_prop (X, X_test, feature_name):

    rating_df = pd.crosstab(X['dep_delayed_15min'], X[feature_name], normalize='columns').iloc[1,:]

    
    rating = {}
    for key, item in zip(list(rating_df.index), rating_df.values):
        rating[key] = item
    
    X[feature_name+'DelayProp'] = X[feature_name].map(rating)
    X_test[feature_name+'DelayProp'] = X_test[feature_name].map(rating)
    return X, X_test

In [4]:
df = pd.read_csv('flight_delays_train.csv')
df['dep_delayed_15min'] = df['dep_delayed_15min'].map({'N':0, 'Y':1})
X_test = pd.read_csv('flight_delays_test.csv')


In [5]:
X = df.copy()

In [6]:
X['flightBy'] = X['Origin'] + '-->' + X['Dest'] + 'by' + X['UniqueCarrier']
X_test['flightBy'] = X_test['Origin'] + '-->' + X_test['Dest'] + 'by' + X_test['UniqueCarrier']


X['DepTime_hour'] = X['DepTime'].apply(lambda x: (x)//100)
X_test['DepTime_hour'] = X_test['DepTime'].apply(lambda x: (x)//100)
X['Minute'] = X['DepTime_hour']*60 + X['DepTime'] % 100


#X_test['DepTime_minute'] = X_test['DepTime'] % 100
X_test['Minute'] = X_test['DepTime_hour']*60 + X_test['DepTime'] % 100


X['DepTime_2hour'] = X['Minute'].apply(lambda x: x//120)
X_test['DepTime_2hour'] = X_test['Minute'].apply(lambda x: x//120)

X['DepTime_2hour'] = X['Minute'].apply(lambda x: x//120)
X_test['DepTime_2hour'] = X_test['Minute'].apply(lambda x: x//120)

X['DepTime_2hour_intersection'] = X['Minute'].apply(lambda x: (x-60)//120 +1)
X_test['DepTime_2hour_intersection'] = X_test['Minute'].apply(lambda x: (x-60)//120 +1)

X['DepTime_hour_intersection'] = X['Minute'].apply(lambda x: (x-30)//60 +1)
X_test['DepTime_hour_intersection'] = X_test['Minute'].apply(lambda x: (x-30)//60 +1)

X['DepTime_3hour'] = X['Minute'].apply(lambda x: (x)//180)
X_test['DepTime_3hour'] = X_test['Minute'].apply(lambda x: (x)//180)

X['DepTime_3hour_intersection'] = X['Minute'].apply(lambda x: (x-90)//180 + 1)
X_test['DepTime_3hour_intersection'] = X_test['Minute'].apply(lambda x: (x-90)//180 + 1)

In [7]:
X, X_test = add_feature_w_prop(X, X_test, 'UniqueCarrier')


In [8]:
y = X['dep_delayed_15min']
X.drop(columns='dep_delayed_15min', inplace=True)

In [9]:
X['train'] = 1
X_test['train'] = 0
combined = pd.concat([X, X_test], sort=False)

In [10]:

combined['diff_w_min'] = combined['Minute'] - combined.groupby(['flightBy', 'DepTime_hour'])['Minute'].transform(min)
combined['diff_w_min_1'] = combined['Minute'] - combined.groupby(['flightBy', 'DepTime_hour_intersection'])['Minute'].transform(min)
combined['diff_w_min2'] = combined['Minute'] - combined.groupby(['flightBy', 'DepTime_2hour'])['Minute'].transform(min)
combined['diff_w_min2_1'] = combined['Minute'] - combined.groupby(['flightBy', 'DepTime_2hour_intersection'])['Minute'].transform(min)
combined['diff_w_min3'] = combined['Minute'] - combined.groupby(['flightBy', 'DepTime_3hour'])['Minute'].transform(min)
combined['diff_w_min3_1'] = combined['Minute'] - combined.groupby(['flightBy', 'DepTime_3hour_intersection'])['Minute'].transform(min)

In [11]:
X = combined[combined['train'] == 1]
X_test = combined[combined['train'] == 0]
X.drop(columns='train', inplace=True)
X_test.drop(columns='train', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [19]:
cols = [
         'Month', 
         "DayofMonth", 
         'DayOfWeek', 
         'UniqueCarrier', 
         "Origin", 
 #        'Dest', 
 
         'DepTime_hour', 
         'UniqueCarrierDelayProp', 
         'flightBy', 

         'diff_w_min',
         'diff_w_min2',
         'diff_w_min2_1',
         'diff_w_min_1',
         'diff_w_min3_1',
         'diff_w_min3',
         ]

In [20]:
X_ = X[cols]




In [21]:
###categorical columns for catboost
categ_feat_idx = np.where(X_.dtypes == 'object')[0]


In [22]:
X_train, X_valid, y_train, y_valid = train_test_split(X_, y, test_size=0.3, random_state=12)

In [23]:
valid_pred = cross_validation(X_train[cols], X_valid[cols], y_train, categ_feat_idx)

0:	learn: 0.7458495	test: 0.7405171	best: 0.7405171 (0)	total: 53.4ms	remaining: 1m 46s
100:	learn: 0.8305536	test: 0.8151867	best: 0.8152268 (99)	total: 5.62s	remaining: 1m 45s
200:	learn: 0.8452354	test: 0.8201033	best: 0.8201097 (198)	total: 11.3s	remaining: 1m 40s
300:	learn: 0.8575328	test: 0.8238963	best: 0.8238963 (300)	total: 16.7s	remaining: 1m 34s
400:	learn: 0.8685365	test: 0.8256580	best: 0.8256605 (399)	total: 21.9s	remaining: 1m 27s
bestTest = 0.8258501887
bestIteration = 403
Shrink model to first 404 iterations.
[ 4.53471571  3.97048491  3.72670084  3.60185743  4.23272042 21.36537096
  2.42484419  6.56718677  8.39256688  7.75977232  7.88747486 11.48605369
  6.90346586  7.14678517]
0:	learn: 0.7529227	test: 0.7511255	best: 0.7511255 (0)	total: 57.5ms	remaining: 1m 54s
100:	learn: 0.8287400	test: 0.8198538	best: 0.8198538 (100)	total: 5.62s	remaining: 1m 45s
200:	learn: 0.8438101	test: 0.8244916	best: 0.8244916 (200)	total: 11.2s	remaining: 1m 39s
300:	learn: 0.8564163	tes

In [24]:
roc_auc_score(y_valid, valid_pred)


0.8342459175770102

In [25]:
test_pred = cross_validation(X[cols], X_test[cols], y, categ_feat_idx)

0:	learn: 0.7491795	test: 0.7555883	best: 0.7555883 (0)	total: 64ms	remaining: 2m 7s
100:	learn: 0.8317708	test: 0.8257467	best: 0.8257467 (100)	total: 6.3s	remaining: 1m 58s
200:	learn: 0.8449551	test: 0.8321395	best: 0.8321395 (200)	total: 12.6s	remaining: 1m 52s
300:	learn: 0.8550577	test: 0.8362761	best: 0.8362761 (300)	total: 18.7s	remaining: 1m 45s
400:	learn: 0.8640405	test: 0.8383698	best: 0.8383698 (400)	total: 24.8s	remaining: 1m 38s
500:	learn: 0.8713101	test: 0.8398003	best: 0.8398003 (500)	total: 30.6s	remaining: 1m 31s
600:	learn: 0.8778879	test: 0.8407631	best: 0.8407644 (599)	total: 36.5s	remaining: 1m 24s
bestTest = 0.8413373828
bestIteration = 677
Shrink model to first 678 iterations.
[ 4.43405458  3.59437124  3.92631417  4.39374371  3.96233974 20.57570603
  2.26884426  8.66093005  8.50300286  8.39818904  7.66754306 10.6342488
  7.53930245  5.44141   ]
0:	learn: 0.7489627	test: 0.7472855	best: 0.7472855 (0)	total: 68.1ms	remaining: 2m 16s
100:	learn: 0.8308379	test: 0

In [39]:
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    sample_sub = pd.read_csv('sample_submission.csv', 
                             index_col='id')
    sample_sub['dep_delayed_15min'] = test_pred
    sample_sub.to_csv('ctb_pred.csv')