In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/flight-delays-spring-2018/sample_submission.csv
/kaggle/input/flight-delays-spring-2018/flight_delays_test.csv
/kaggle/input/flight-delays-spring-2018/flight_delays_train.csv


In [2]:
# Read the training and test CSVs from the competition input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/flight-delays-spring-2018/flight_delays_train.csv',
        '/kaggle/input/flight-delays-spring-2018/flight_delays_test.csv',
    )
)

In [3]:
# Display (rows, columns) of the training and test frames side by side.
train.shape, test.shape

((100000, 9), (100000, 8))

In [4]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [5]:
# Encode the target column as integers: 'N' -> 0, 'Y' -> 1.
# The identity entries for 0 and 1 keep this cell safe to re-run: without them
# a second execution would map the already-encoded integers to NaN and
# silently wipe out the labels.
train['dep_delayed_15min'] = train['dep_delayed_15min'].map({'N': 0, 'Y': 1, 0: 0, 1: 1})
train.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,0
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,0
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,0
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,0
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,1


In [6]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance
0,c-7,c-25,c-3,615,YV,MRY,PHX,598
1,c-4,c-17,c-2,739,WN,LAS,HOU,1235
2,c-12,c-2,c-7,651,MQ,GSP,ORD,577
3,c-3,c-25,c-7,1614,WN,BWI,MHT,377
4,c-6,c-6,c-3,1505,UA,ORD,STL,258


In [7]:
# Stack train on top of test into one frame with a fresh 0..n-1 index, so the
# feature engineering below is applied to both sets in one pass.
# sort=True orders the (non-identical) column sets alphabetically.
all_data = pd.concat([train, test], sort=True, ignore_index=True)
all_data.head()

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min
0,c-7,c-21,1934,DFW,732,c-8,ATL,AA,0.0
1,c-3,c-20,1548,MCO,834,c-4,PIT,US,0.0
2,c-5,c-2,1422,CLE,416,c-9,RDU,XE,0.0
3,c-6,c-25,1015,MEM,872,c-11,DEN,OO,0.0
4,c-6,c-7,1828,OMA,423,c-10,MDW,WN,1.0


In [8]:
# Print column dtypes and non-null counts for the combined frame.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   DayOfWeek          200000 non-null  object 
 1   DayofMonth         200000 non-null  object 
 2   DepTime            200000 non-null  int64  
 3   Dest               200000 non-null  object 
 4   Distance           200000 non-null  int64  
 5   Month              200000 non-null  object 
 6   Origin             200000 non-null  object 
 7   UniqueCarrier      200000 non-null  object 
 8   dep_delayed_15min  100000 non-null  float64
dtypes: float64(1), int64(2), object(6)
memory usage: 13.7+ MB


In [9]:
# The calendar fields hold strings such as 'c-7'; drop the first two
# characters so only the numeric part remains (still stored as a string).
for calendar_col in ('DayOfWeek', 'DayofMonth', 'Month'):
    all_data[calendar_col] = all_data[calendar_col].str[2:]
all_data.head()

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min
0,7,21,1934,DFW,732,8,ATL,AA,0.0
1,3,20,1548,MCO,834,4,PIT,US,0.0
2,5,2,1422,CLE,416,9,RDU,XE,0.0
3,6,25,1015,MEM,872,11,DEN,OO,0.0
4,6,7,1828,OMA,423,10,MDW,WN,1.0


In [10]:
from sklearn.preprocessing import OneHotEncoder
# One-hot encoder created with default settings.
ohe = OneHotEncoder()

In [11]:
# Select the three calendar columns as their own frame.
time = all_data.loc[:, ['DayOfWeek', 'DayofMonth', 'Month']]
time.head()

Unnamed: 0,DayOfWeek,DayofMonth,Month
0,7,21,8
1,3,20,4
2,5,2,9
3,6,25,11
4,6,7,10


In [12]:
# Fit the encoder on the calendar columns, transform them, and convert the
# sparse result into a dense array.
calendar_sparse = ohe.fit(time).transform(time)
encod = calendar_sparse.toarray()
encod.shape

(200000, 50)

In [13]:
# Attach each one-hot column to the frame as OHE_<position>.
for position, column_values in enumerate(encod.T):
    all_data['OHE_{}'.format(position)] = column_values
all_data.head()

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min,OHE_0,...,OHE_40,OHE_41,OHE_42,OHE_43,OHE_44,OHE_45,OHE_46,OHE_47,OHE_48,OHE_49
0,7,21,1934,DFW,732,8,ATL,AA,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3,20,1548,MCO,834,4,PIT,US,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,5,2,1422,CLE,416,9,RDU,XE,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,6,25,1015,MEM,872,11,DEN,OO,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,7,1828,OMA,423,10,MDW,WN,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Re-check dtypes and non-null counts after the one-hot columns were added.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 59 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   DayOfWeek          200000 non-null  object 
 1   DayofMonth         200000 non-null  object 
 2   DepTime            200000 non-null  int64  
 3   Dest               200000 non-null  object 
 4   Distance           200000 non-null  int64  
 5   Month              200000 non-null  object 
 6   Origin             200000 non-null  object 
 7   UniqueCarrier      200000 non-null  object 
 8   dep_delayed_15min  100000 non-null  float64
 9   OHE_0              200000 non-null  float64
 10  OHE_1              200000 non-null  float64
 11  OHE_2              200000 non-null  float64
 12  OHE_3              200000 non-null  float64
 13  OHE_4              200000 non-null  float64
 14  OHE_5              200000 non-null  float64
 15  OHE_6              200000 non-null  float64
 16  OH

In [15]:
# List the rows whose DepTime exceeds 2400.
all_data[all_data['DepTime'].gt(2400)]

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min,OHE_0,...,OHE_40,OHE_41,OHE_42,OHE_43,OHE_44,OHE_45,OHE_46,OHE_47,OHE_48,OHE_49
8189,2,14,2435,AVL,275,6,CVG,EV,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
20766,2,31,2534,HSV,151,5,ATL,EV,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
27391,4,23,2505,AGS,143,3,ATL,EV,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
44332,5,15,2440,SHV,552,7,ATL,EV,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
45796,4,18,2447,JAN,341,8,ATL,EV,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
47218,1,2,2500,ILM,377,1,ATL,EV,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48180,7,27,2514,CAE,191,2,ATL,EV,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
55909,3,9,2417,SYR,793,8,ATL,EV,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
60639,7,8,2459,JAN,341,1,ATL,EV,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
62669,1,20,2412,GSP,153,3,ATL,EV,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
def day(x):
    """Bucket a numeric departure time into a one-letter code.

    Returns 'M' when x <= 600, 'D' when x <= 1200, 'E' when x <= 1800,
    'N' when x <= 2400, and 'L' otherwise.
    """
    for upper_bound, code in ((600, 'M'), (1200, 'D'), (1800, 'E'), (2400, 'N')):
        if x <= upper_bound:
            return code
    return 'L'

In [17]:
# Derive the departure-time bucket code for every row via day().
all_data['time'] = all_data['DepTime'].map(day)
all_data

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min,OHE_0,...,OHE_41,OHE_42,OHE_43,OHE_44,OHE_45,OHE_46,OHE_47,OHE_48,OHE_49,time
0,7,21,1934,DFW,732,8,ATL,AA,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,N
1,3,20,1548,MCO,834,4,PIT,US,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,E
2,5,2,1422,CLE,416,9,RDU,XE,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,E
3,6,25,1015,MEM,872,11,DEN,OO,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,D
4,6,7,1828,OMA,423,10,MDW,WN,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,2,5,852,HOU,187,6,CRP,WN,,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,D
199996,6,24,1446,LAS,1515,11,ORD,UA,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,E
199997,2,30,1509,SGF,438,1,ORD,OO,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,E
199998,5,5,804,ATL,761,1,LGA,DL,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,D


In [18]:
# Fresh, unfitted encoder; replaces the one bound to `ohe` earlier.
ohe = OneHotEncoder()

In [19]:
# One-hot encode the 'time' bucket; it is reshaped into a single-column 2-D
# array before being handed to the encoder.
time_column = all_data['time'].to_numpy().reshape(-1, 1)
encod = ohe.fit_transform(time_column).toarray()
encod.shape

(200000, 5)

In [20]:
# Attach each one-hot column of the time bucket as time_<position>.
for position, column_values in enumerate(encod.T):
    all_data['time_{}'.format(position)] = column_values
all_data.head()

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min,OHE_0,...,OHE_46,OHE_47,OHE_48,OHE_49,time,time_0,time_1,time_2,time_3,time_4
0,7,21,1934,DFW,732,8,ATL,AA,0.0,0.0,...,0.0,0.0,1.0,0.0,N,0.0,0.0,0.0,0.0,1.0
1,3,20,1548,MCO,834,4,PIT,US,0.0,0.0,...,0.0,0.0,0.0,0.0,E,0.0,1.0,0.0,0.0,0.0
2,5,2,1422,CLE,416,9,RDU,XE,0.0,0.0,...,0.0,0.0,0.0,1.0,E,0.0,1.0,0.0,0.0,0.0
3,6,25,1015,MEM,872,11,DEN,OO,0.0,0.0,...,0.0,0.0,0.0,0.0,D,1.0,0.0,0.0,0.0,0.0
4,6,7,1828,OMA,423,10,MDW,WN,1.0,0.0,...,0.0,0.0,0.0,0.0,N,0.0,0.0,0.0,0.0,1.0


In [21]:
# 1 when DayOfWeek is '6' or '7' (presumably Saturday/Sunday), else 0.
# DayOfWeek holds strings at this point, hence the string comparison.
all_data['is_weeked'] = all_data['DayOfWeek'].isin(['6', '7']).astype('int64')
all_data['is_weeked']

0         1
1         0
2         0
3         1
4         1
         ..
199995    0
199996    1
199997    0
199998    0
199999    0
Name: is_weeked, Length: 200000, dtype: int64

In [22]:
# The raw bucket letter is no longer needed now that its one-hot columns exist.
all_data = all_data.drop(columns=['time'])
all_data.head()

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min,OHE_0,...,OHE_46,OHE_47,OHE_48,OHE_49,time_0,time_1,time_2,time_3,time_4,is_weeked
0,7,21,1934,DFW,732,8,ATL,AA,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1
1,3,20,1548,MCO,834,4,PIT,US,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0
2,5,2,1422,CLE,416,9,RDU,XE,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0
3,6,25,1015,MEM,872,11,DEN,OO,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
4,6,7,1828,OMA,423,10,MDW,WN,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [23]:
def year(x):
    """Map a month given as a string to a one-letter code for its quarter-like group.

    'W' for '12', '1', '2'; 'P' for '3', '4', '5'; 'S' for '6', '7', '8';
    'A' for '9', '10', '11' (presumably winter/spring/summer/autumn).
    Any other value, including integer months, yields None.
    """
    groups = (
        (('1', '2', '12'), 'W'),
        (('3', '4', '5'), 'P'),
        (('6', '7', '8'), 'S'),
        (('9', '10', '11'), 'A'),
    )
    for months, code in groups:
        if x in months:
            return code
    return None

In [24]:
# Derive the month-group code for every row via year().
all_data['year'] = all_data['Month'].map(year)
all_data

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min,OHE_0,...,OHE_47,OHE_48,OHE_49,time_0,time_1,time_2,time_3,time_4,is_weeked,year
0,7,21,1934,DFW,732,8,ATL,AA,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1,S
1,3,20,1548,MCO,834,4,PIT,US,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,P
2,5,2,1422,CLE,416,9,RDU,XE,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0,A
3,6,25,1015,MEM,872,11,DEN,OO,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1,A
4,6,7,1828,OMA,423,10,MDW,WN,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,2,5,852,HOU,187,6,CRP,WN,,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,S
199996,6,24,1446,LAS,1515,11,ORD,UA,,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,A
199997,2,30,1509,SGF,438,1,ORD,OO,,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,W
199998,5,5,804,ATL,761,1,LGA,DL,,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,W


In [25]:
# One-hot encode the 'year' code column with a fresh encoder.
ohe = OneHotEncoder()
season_column = all_data['year'].to_numpy().reshape(-1, 1)
encod = ohe.fit_transform(season_column).toarray()
encod.shape

(200000, 4)

In [26]:
# Attach each one-hot column of the 'year' code as year_<position>.
for position, column_values in enumerate(encod.T):
    all_data['year_{}'.format(position)] = column_values
all_data.head()

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min,OHE_0,...,time_1,time_2,time_3,time_4,is_weeked,year,year_0,year_1,year_2,year_3
0,7,21,1934,DFW,732,8,ATL,AA,0.0,0.0,...,0.0,0.0,0.0,1.0,1,S,0.0,0.0,1.0,0.0
1,3,20,1548,MCO,834,4,PIT,US,0.0,0.0,...,1.0,0.0,0.0,0.0,0,P,0.0,1.0,0.0,0.0
2,5,2,1422,CLE,416,9,RDU,XE,0.0,0.0,...,1.0,0.0,0.0,0.0,0,A,1.0,0.0,0.0,0.0
3,6,25,1015,MEM,872,11,DEN,OO,0.0,0.0,...,0.0,0.0,0.0,0.0,1,A,1.0,0.0,0.0,0.0
4,6,7,1828,OMA,423,10,MDW,WN,1.0,0.0,...,0.0,0.0,0.0,1.0,1,A,1.0,0.0,0.0,0.0


In [27]:
# Drop the raw letter code now that its one-hot columns exist.
all_data = all_data.drop(columns=['year'])
all_data.head()

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min,OHE_0,...,time_0,time_1,time_2,time_3,time_4,is_weeked,year_0,year_1,year_2,year_3
0,7,21,1934,DFW,732,8,ATL,AA,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1,0.0,0.0,1.0,0.0
1,3,20,1548,MCO,834,4,PIT,US,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0
2,5,2,1422,CLE,416,9,RDU,XE,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0
3,6,25,1015,MEM,872,11,DEN,OO,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1,1.0,0.0,0.0,0.0
4,6,7,1828,OMA,423,10,MDW,WN,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1,1.0,0.0,0.0,0.0


In [28]:
from sklearn.preprocessing import LabelEncoder


In [29]:
# Replace each carrier code with an integer label learned from the column itself.
le = LabelEncoder().fit(all_data['UniqueCarrier'])
all_data['UniqueCarrier'] = le.transform(all_data['UniqueCarrier'])
all_data.head()

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min,OHE_0,...,time_0,time_1,time_2,time_3,time_4,is_weeked,year_0,year_1,year_2,year_3
0,7,21,1934,DFW,732,8,ATL,1,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1,0.0,0.0,1.0,0.0
1,3,20,1548,MCO,834,4,PIT,19,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0
2,5,2,1422,CLE,416,9,RDU,21,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0
3,6,25,1015,MEM,872,11,DEN,16,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1,1.0,0.0,0.0,0.0
4,6,7,1828,OMA,423,10,MDW,20,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1,1.0,0.0,0.0,0.0


In [30]:
from sklearn.preprocessing import LabelEncoder
# Origin and Dest are encoded with one shared encoder, so fit it on the union
# of both columns. Fitting on Origin alone makes transform() raise ValueError
# for any airport that appears only as a destination. When every destination
# also occurs as an origin, the learned classes (and therefore the integer
# codes) are exactly the same as with an Origin-only fit.
le = LabelEncoder()
le.fit(pd.concat([all_data['Origin'], all_data['Dest']], ignore_index=True))
all_data['Origin'] = le.transform(all_data['Origin'])
all_data['Dest'] = le.transform(all_data['Dest'])
all_data.head()

Unnamed: 0,DayOfWeek,DayofMonth,DepTime,Dest,Distance,Month,Origin,UniqueCarrier,dep_delayed_15min,OHE_0,...,time_0,time_1,time_2,time_3,time_4,is_weeked,year_0,year_1,year_2,year_3
0,7,21,1934,82,732,8,19,1,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1,0.0,0.0,1.0,0.0
1,3,20,1548,180,834,4,226,19,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0,0.0,1.0,0.0,0.0
2,5,2,1422,62,416,9,239,21,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0,1.0,0.0,0.0,0.0
3,6,25,1015,184,872,11,81,16,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1,1.0,0.0,0.0,0.0
4,6,7,1828,210,423,10,182,20,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1,1.0,0.0,0.0,0.0


In [31]:
# Split the combined frame back into train and test by position. The training
# rows come first because `train` was concatenated before `test` and the index
# was reset. Taking the boundary from len(train) instead of a hard-coded
# 99999/100000 keeps the split correct if the input files change size.
n_train = len(train)

X_train = all_data.iloc[:n_train].drop(columns=['dep_delayed_15min']).values
y_train = all_data['dep_delayed_15min'].iloc[:n_train].values
X_test = all_data.iloc[n_train:].drop(columns=['dep_delayed_15min']).values

# Hold out 30% of the training rows for validation; the seed is fixed so the
# split is reproducible.
X_train_part, X_valid, y_train_part, y_valid = \
    train_test_split(X_train, y_train,
                     test_size=0.3, random_state=17)

In [32]:
# XGBoost classifier with default hyper-parameters and a fixed seed.
# `random_state` is the seed argument documented for the scikit-learn wrapper;
# `seed` is the legacy spelling of the same setting.
xgb_model = XGBClassifier(random_state=17)

xgb_model.fit(X_train_part, y_train_part)
# Column 1 of predict_proba is the predicted probability of class 1.
xgb_valid_pred = xgb_model.predict_proba(X_valid)[:, 1]

# ROC AUC of those probabilities on the hold-out split.
roc_auc_score(y_valid, xgb_valid_pred)





0.7351132281077364

In [33]:
# Refit the same model on all training rows, then score the test rows.
xgb_model.fit(X_train, y_train)
xgb_test_pred = xgb_model.predict_proba(X_test)[:, 1]

# Write the predictions to CSV: the positional index becomes the 'id' column
# and the class-1 probability is stored under the target's name.
submission = pd.Series(xgb_test_pred, name='dep_delayed_15min')
submission.to_csv('xgb_2feat.csv', index_label='id', header=True)



