In [46]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [47]:
# Read the train and test splits of the flight-delay data from CSV.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../../dataset/flight_delays_train.csv',
                     '../../dataset/flight_delays_test.csv')
)

In [48]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


## First baseline (logit)

In [49]:
# Baseline features: just the two numeric columns.
X_train = train[['Distance', 'DepTime']].values
# Target: the 'Y'/'N' delay flag mapped to 1/0.
y_train = train['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values
X_test = test[['Distance', 'DepTime']].values

# Hold out 30% of the training rows for validation (fixed seed 17).
X_train_part, X_valid, y_train_part, y_valid = train_test_split(
    X_train, y_train, test_size=0.3, random_state=17
)

In [50]:
# Fit the baseline logistic regression on the training part only.
logit = LogisticRegression(random_state=17)
logit.fit(X=X_train_part, y=y_train_part)

# Column 1 of predict_proba is the probability of the positive class (label 1).
logit_valid_pred = logit.predict_proba(X=X_valid)[:, 1]

# Hold-out ROC AUC; as the last expression it becomes the cell output.
roc_auc_score(y_true=y_valid, y_score=logit_valid_pred)

0.6789733731013721

In [51]:
# Refit the baseline on the full training set before scoring the test set.
logit.fit(X=X_train, y=y_train)
logit_test_pred = logit.predict_proba(X=X_test)[:, 1]

# Write the predicted probabilities as a two-column CSV: the index goes out
# under the header 'id', the values under 'dep_delayed_15min'.
submission = pd.Series(logit_test_pred, name='dep_delayed_15min')
submission.to_csv('logit_2feat.csv', index_label='id', header=True)

## Second benchmark

In [52]:
# Separate the target from the feature frame.
#
# DataFrame.pop removes the column from `train` in place and returns it, so
# the 'Y'/'N' flag is mapped to 1/0 and dropped in a single step.
# The membership guard makes the cell idempotent: once the column is gone,
# re-running the cell keeps the existing `y_train` instead of raising KeyError.
if 'dep_delayed_15min' in train.columns:
    y_train = train.pop('dep_delayed_15min').map({'Y': 1, 'N': 0}).values

### Создание признака "маршрут" (creating the "Route" feature)

In [53]:
# Join the Origin and Dest values into a single key such as 'ATL-DFW'.
train['Route'] = train['Origin'].str.cat(train['Dest'], sep='-')


In [54]:
# Show the first row to check the new 'Route' column.
train.iloc[:1]

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,Route
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,ATL-DFW


### One-hot encoding (OHE)

In [55]:
# Label binarizer used for the one-hot encoding in this section.
lb = LabelBinarizer()

In [56]:
# Binarize each categorical column in turn; every call returns that column's
# own indicator matrix.
# NOTE(review): the same `lb` is re-fitted on each column, so after this cell
# it only holds the classes learned from 'Route'. The fits for the other four
# columns are gone and cannot be reused to encode another frame (e.g. `test`).
month, day_month, day_w, car, rt = (
    lb.fit_transform(train[column])
    for column in ('Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Route')
)

In [57]:
# Remove the categorical columns from the frame in place: the first five were
# binarized above, and 'Origin'/'Dest' were combined into 'Route'.
# NOTE(review): drop() raises KeyError when a label is missing, so this cell
# cannot be re-run without reloading `train`.
train.drop(
    columns=['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Route',
             'Origin', 'Dest'],
    inplace=True,
)

In [58]:
# Whatever columns are left in `train` form the leading block of the matrix.
X_train = train.values

# Append the indicator blocks column-wise (for 2-D inputs np.hstack is a
# concatenation along axis 1).
# NOTE(review): the result is a dense array; the 'Route' block presumably has
# one column per distinct origin-destination pair, so this matrix may be very
# large in memory. Confirm, and consider a sparse representation.
trainset = np.concatenate(
    (X_train, month, day_month, day_w, car, rt),
    axis=1,
)

In [59]:
# Show the assembled feature matrix as the cell output.
trainset

array([[1934,  732,    0, ...,    0,    0,    0],
       [1548,  834,    0, ...,    0,    0,    0],
       [1422,  416,    0, ...,    0,    0,    0],
       ...,
       [1901, 1076,    1, ...,    0,    0,    0],
       [1515,  140,    0, ...,    0,    0,    0],
       [1800,  605,    0, ...,    0,    0,    0]], dtype=int64)

### Train test split

In [61]:
# 70/30 hold-out split of the feature matrix and target, with fixed seed 17.
split_parts = train_test_split(trainset, y_train, test_size=0.3, random_state=17)
X_train_part, X_valid, y_train_part, y_valid = split_parts

### Fitting

In [62]:
# Fresh logistic regression fitted on the current training part.
logit = LogisticRegression(random_state=17)
logit.fit(X=X_train_part, y=y_train_part)

# Positive-class probabilities on the hold-out part, scored with ROC AUC
# (the last expression is the cell output).
valid_scores = logit.predict_proba(X=X_valid)[:, 1]
roc_auc_score(y_true=y_valid, y_score=valid_scores)

0.6901826103025382

In [65]:
# XGBoost classifier (500 estimators, seed 17) on the same train/validation
# split.
# NOTE(review): the recorded run of this cell ended in
# "OSError: [WinError -529697949] Windows Error 0xe06d7363" instead of a score.
# The cause cannot be determined from the notebook; possibly memory pressure
# from the dense feature matrix. Confirm before relying on this cell.
xgb = XGBClassifier(n_estimators=500, random_state=17)
xgb.fit(X_train_part, y_train_part)

# Positive-class probabilities on the hold-out part, scored with ROC AUC.
xgb_valid_scores = xgb.predict_proba(X_valid)[:, 1]
roc_auc_score(y_true=y_valid, y_score=xgb_valid_scores)

OSError: [WinError -529697949] Windows Error 0xe06d7363