In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix,hstack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier
import xgboost
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Flight-delay train/test CSVs, read straight from the URLs below.
train_url = 'https://raw.githubusercontent.com/Yorko/mlcourse.ai/master/data/flight_delays_train.csv'
test_url = 'https://raw.githubusercontent.com/Yorko/mlcourse.ai/master/data/flight_delays_test.csv'
data_train, data_test = (pd.read_csv(url) for url in (train_url, test_url))

In [3]:
# Preview the training frame; pandas elides the middle rows in the rendered output.
data_train

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y
...,...,...,...,...,...,...,...,...,...
99995,c-5,c-4,c-3,1618,OO,SFO,RDD,199,N
99996,c-1,c-18,c-3,804,CO,EWR,DAB,884,N
99997,c-1,c-24,c-2,1901,NW,DTW,IAH,1076,N
99998,c-4,c-27,c-4,1515,MQ,DFW,GGG,140,N


In [4]:
# (rows, columns) of the training and test frames, shown as a pair.
(data_train.shape, data_test.shape)

((100000, 9), (100000, 8))

In [5]:
# Separate the target column from the predictor columns.
target_column = 'dep_delayed_15min'
y = data_train[target_column]
X = data_train.drop(columns=target_column)
(X.shape, y.shape)

((100000, 8), (100000,))

In [6]:
# Column dtypes and non-null counts of the feature frame.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   Month          100000 non-null  object
 1   DayofMonth     100000 non-null  object
 2   DayOfWeek      100000 non-null  object
 3   DepTime        100000 non-null  int64 
 4   UniqueCarrier  100000 non-null  object
 5   Origin         100000 non-null  object
 6   Dest           100000 non-null  object
 7   Distance       100000 non-null  int64 
dtypes: int64(2), object(6)
memory usage: 6.1+ MB


In [7]:
# Missing-value check: per-column counts for the features, one total for the target.
feature_null_counts = X.isna().sum()
target_null_count = y.isna().sum()
(feature_null_counts, target_null_count)

(Month            0
 DayofMonth       0
 DayOfWeek        0
 DepTime          0
 UniqueCarrier    0
 Origin           0
 Dest             0
 Distance         0
 dtype: int64,
 0)

In [8]:
# Number of distinct values in each categorical column, in the order listed.
categorical_columns = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']
tuple(X[name].nunique() for name in categorical_columns)

(12, 31, 7, 22, 289, 289)

In [9]:
# Column names of the feature frame at this point in the notebook.
X.columns

Index(['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier',
       'Origin', 'Dest', 'Distance'],
      dtype='object')

In [10]:
%%time
X = X.join(pd.get_dummies(X['Month'],prefix='Month'))
X = X.join(pd.get_dummies(X['DayofMonth'],prefix='DayofMonth'))
X = X.join(pd.get_dummies(X['DayOfWeek'],prefix='DayOfWeek'))
X = X.join(pd.get_dummies(X['UniqueCarrier'],prefix='UniqueCarrier'))

Wall time: 209 ms


In [11]:
# Shape after joining the indicator columns (the raw categorical columns are still present).
X.shape

(100000, 80)

In [12]:
# Remove the raw categorical columns from X in place by name.
encoded_sources = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier']
X.drop(columns=encoded_sources, inplace=True)

In [13]:
# Shape after dropping the four raw categorical columns.
X.shape

(100000, 76)

In [14]:
# Preview the encoded frame; Origin and Dest are still raw strings at this point.
X

Unnamed: 0,DepTime,Origin,Dest,Distance,Month_c-1,Month_c-10,Month_c-11,Month_c-12,Month_c-2,Month_c-3,...,UniqueCarrier_MQ,UniqueCarrier_NW,UniqueCarrier_OH,UniqueCarrier_OO,UniqueCarrier_TZ,UniqueCarrier_UA,UniqueCarrier_US,UniqueCarrier_WN,UniqueCarrier_XE,UniqueCarrier_YV
0,1934,ATL,DFW,732,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1548,PIT,MCO,834,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1422,RDU,CLE,416,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1015,DEN,MEM,872,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1828,MDW,OMA,423,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1618,SFO,RDD,199,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
99996,804,EWR,DAB,884,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,1901,DTW,IAH,1076,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
99998,1515,DFW,GGG,140,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [15]:
%%time
cv = CountVectorizer()
X_origin = cv.fit_transform(X['Origin'])
X_dest = cv.fit_transform(X['Dest'])

Wall time: 1.23 s


In [16]:
# Shapes of the Origin and Dest indicator matrices, shown as a pair.
(X_origin.shape, X_dest.shape)

((100000, 289), (100000, 289))

In [17]:
# Everything except the two raw airport columns goes into a CSR matrix.
non_airport_features = X.drop(columns=['Origin', 'Dest'])
X_sparse = csr_matrix(non_airport_features)

In [18]:
# Shape of the sparse matrix before the airport indicator columns are appended.
X_sparse.shape

(100000, 74)

In [19]:
# Append the Origin and Dest indicator columns to the right of the base features.
# NOTE: X_sparse is rebound to the stacked result, so running this cell twice
# appends the airport columns a second time.
feature_blocks = (X_sparse, X_origin, X_dest)
X_sparse = hstack(feature_blocks)

In [20]:
# Show the stacked matrix summary (shape, dtype, stored-element count, storage format).
X_sparse

<100000x652 sparse matrix of type '<class 'numpy.int64'>'
	with 800000 stored elements in COOrdinate format>

In [21]:
# Encode the target as integers: 'N' -> 0, 'Y' -> 1.
#
# A new Series is built instead of assigning through `y.loc[...]`. `y` was taken
# from `data_train`, so an in-place write can propagate back into that frame
# (and is chained assignment as far as pandas is concerned).
# Values that are not 'N'/'Y' pass through unchanged, so re-running the cell on
# an already-encoded `y` is a no-op, as it was before.
label_codes = {'N': 0, 'Y': 1}
y = y.map(lambda label: label_codes.get(label, label)).astype('int64')

In [22]:
# Hold out 30% of the rows for validation; the seed is fixed so the split is repeatable.
split_parts = train_test_split(X_sparse, y, test_size=0.3, random_state=17)
X_train_sparse, X_valid_sparse, y_train, y_valid = split_parts
tuple(part.shape for part in split_parts)

((70000, 652), (30000, 652), (70000,), (30000,))

In [23]:
# Dense copies of both splits (every zero is materialised in memory).
X_train_array, X_valid_array = (
    part.toarray() for part in (X_train_sparse, X_valid_sparse)
)

In [24]:
%%time
scaler = StandardScaler()
X_train_array_scaled = scaler.fit_transform(X_train_array)
X_valid_array_scaled = scaler.transform(X_valid_array)

Wall time: 2.19 s


In [25]:
# Convert the scaled arrays to CSR, which stores only their non-zero entries.
X_train_sparse_scaled, X_valid_sparse_scaled = map(
    csr_matrix, (X_train_array_scaled, X_valid_array_scaled)
)

In [26]:
# Show both scaled matrices; the stored-element counts reveal how sparse they really are.
X_train_sparse_scaled , X_valid_sparse_scaled

(<70000x652 sparse matrix of type '<class 'numpy.float64'>'
 	with 45360000 stored elements in Compressed Sparse Row format>,
 <30000x652 sparse matrix of type '<class 'numpy.float64'>'
 	with 19440006 stored elements in Compressed Sparse Row format>)

In [27]:
%%time
log_reg = LogisticRegression()
log_reg.fit(X_train_sparse_scaled,y_train)
roc_auc_score(y_valid,log_reg.predict(X_valid_sparse_scaled)) , accuracy_score(y_valid,log_reg.predict(X_valid_sparse_scaled))

Wall time: 3.93 s


(0.526573999906788, 0.8125333333333333)

In [28]:
# Wrap the unscaled splits in DMatrix containers for the native xgboost API.
# Only the training matrix carries labels.
X_train_gb = xgboost.DMatrix(data=X_train_sparse, label=y_train)
X_valid_gb = xgboost.DMatrix(data=X_valid_sparse)

In [29]:
%%time
params = {
    
}
xgb = xgboost.train(params=params,dtrain=X_train_gb)
roc_auc_score(y_valid,xgb.predict(X_valid_gb))

Wall time: 465 ms


0.7166402765458822

In [30]:
%%time
xgb_clf = xgboost.XGBClassifier()
xgb_clf.fit(X_train_sparse,y_train)
roc_auc_score(y_valid,xgb_clf.predict(X_valid_sparse))

Wall time: 3.82 s


0.5558879976120317

In [31]:
%%time
bagging = BaggingClassifier(base_estimator=LogisticRegression(random_state=17),n_estimators=100,n_jobs=-1)
bagging.fit(X_train_sparse_scaled,y_train)
roc_auc_score(y_valid,bagging.predict(X_valid_sparse_scaled))

Wall time: 6min 8s


0.5268532551029494