# Airlines:
* 1. Cargar y explorar los datos
* 2. Transformar los datos
* 3. Modelar los Datos
* 4. Crear el proceso de Score y evaluar los modelos
* 5. Crear pipeline para scorear los datos (score.py)

In [3]:
# 1. Carga de datos:
import pandas as pd
pd.set_option('display.max_columns', 500)
df = pd.read_csv('airlines_train.csv')
df.head(3)

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,56532,CO,229,DEN,EWR,6,505,223,1
1,258942,YV,2902,SBA,PHX,4,360,129,0
2,226491,AA,114,LAX,EWR,2,495,325,1


In [4]:
tar = 'Delay'
varc = ['Length', 'Time']
vard = [c for c in df.columns if c not in [tar,'id']+varc]

for v in vard:
    print(v)
    print(df[v].value_counts(1, dropna=False))
    print('\n'*2)

Airline
WN    0.174607
DL    0.112986
OO    0.093137
AA    0.084627
MQ    0.067614
US    0.063804
XE    0.057756
EV    0.052060
UA    0.051139
CO    0.039239
FL    0.038451
9E    0.038150
B6    0.033731
YV    0.025582
OH    0.023432
AS    0.021316
F9    0.011998
HA    0.010371
Name: Airline, dtype: float64



Flight
9       0.000781
16      0.000774
5       0.000765
8       0.000732
12      0.000684
          ...   
4710    0.000002
7340    0.000002
4813    0.000002
4565    0.000002
7777    0.000002
Name: Flight, Length: 6575, dtype: float64



AirportFrom
ATL    0.063670
ORD    0.045932
DFW    0.041072
DEN    0.036577
LAX    0.030869
         ...   
MMH    0.000028
SJT    0.000023
GUM    0.000019
ADK    0.000016
ABR    0.000005
Name: AirportFrom, Length: 293, dtype: float64



AirportTo
ATL    0.064071
ORD    0.046027
DFW    0.041047
DEN    0.036746
LAX    0.031110
         ...   
FLO    0.000028
MMH    0.000023
GUM    0.000023
ADK    0.000014
ABR    0.000002
Name: AirportTo, Length: 

# 2. Transformar los Datos:

In [5]:
from sklearn.model_selection import train_test_split

tar = 'Delay'
X = df[varc+vard].copy()
y = df[tar].copy()

Xt, Xv, yt, yv = train_test_split(X, y, test_size=.2)

In [6]:
int(Xt['Time'][0]/60)

8

In [7]:
hours_dict = (
    [(str(i), 'Midnight') for i in [23, 0, 1, 2, 3, 4, 5, 6]] +
    [(str(i), 'Morning') for i in [7, 8, 9, 10, 11, 12]] +
    [(str(i), 'Mid-day') for i in [13, 14, 15, 16, 17]] +
    [(str(i), 'Eve') for i in [18, 19, 20, 21, 22]]
)
hours_dict = dict(hours_dict)

def disc_time_and_hour(df, hours_dict, column='Time'):
    '''Obtener la columna discreta a partir de la columna Time'''
    
    df['hour'] = df[column].map(lambda x:int(x/60))
    df['disctime'] = df['hour'].map(lambda x:str(x)).map(hours_dict)
    return(df)

In [8]:
Xt = disc_time_and_hour(Xt, hours_dict)
Xv = disc_time_and_hour(Xv, hours_dict)

In [9]:
# Discretizamos la última variable continua:
aux = list(set(pd.cut(Xt['Length'], bins=10)))
label = 'Length_intervals'
save_dict = {label:[str(a)[1:-1].split(',') for a in aux]}

# Guardado:
import json
file_name = 'airlines_models/'+label+'.json'
with open(file_name, 'w') as outfile:
    json.dump(save_dict, outfile)

In [10]:
# Cargamos los intervalos:
with open(file_name) as json_file:
    cuts = json.load(json_file)
intervals = pd.Series(cuts.get(label)).map(
    lambda x:pd.Interval(float(x[0]), float(x[1]), closed='right'))
intervals = dict(zip(intervals, [str(a) for a in intervals]))

In [11]:
def len_cut(df, column, interval_dict, label='_interval'):
    df[column+label] = df[column].map(interval_dict)
    return(df)

In [12]:
Xt = len_cut(Xt, 'Length', intervals)
Xv = len_cut(Xv, 'Length', intervals)

In [13]:
import numpy as np

def norm_cat(df, column, threshold=0.05, label='category', others_label='Others',
            new_col=True):
    
    '''Salida de mi función: 
    1. Df con variables normalizadas.
    2. Diccionario de normalización'''
    
    aux = pd.DataFrame(df[column].value_counts(1, dropna=False))
    aux[label] = aux.index
    aux[label] = aux[label].map(lambda x:
                                x if aux.loc[x, column]>threshold else others_label)
    aux_dict = dict(zip(aux.index, aux[label]))
    
    if new_col:
        df[column+'_norm'] = df[column].map(aux_dict)
    else:
        df[column] = df[column].map(aux_dict)
        
    return(df, aux_dict)

def WoE(df, column, tar, label='_WoE'):
    
    '''Salida de mi función: 
    1. Df con variables normalizadas.
    2. Diccionario de WoEs'''
    
    df[column].fillna('Missings', inplace=True)
    aux = df[[tar, column]].pivot_table(index=column, columns=tar, aggfunc='size')
    woe = aux.apply(lambda x:x/sum(x)).apply(lambda x:np.log(x[1]/x[0]), axis=1)
    aux['WoE'] = woe
    aux_dict = dict(zip(aux.index, aux['WoE']))
    
    df[column+label] = df[column].map(aux_dict)
    
    return(df, aux_dict)

def prop_delay(df, column, tar='Delay', label='_%Delay', id_='id'):
    
    aux = df[[column, tar, id_]].pivot_table(columns=tar, 
                                            index=column,
                                            aggfunc='count',
                                            values=id_)
    aux['%Delay'] = aux.apply(lambda x:x/sum(x), axis=1)[1]
    delay_aux = dict(zip(aux.index, aux['%Delay']))
    df[column+label] = df[column].map(delay_aux).fillna(0)
    return(df, delay_aux)

In [14]:
import numpy as np

Xt_aux = Xt.copy()
Xt_aux['id'] = Xt_aux.index
Xt_aux['Delay'] = yt
del_cols = ['Flight', 'hour', 'Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek']
del_dict = dict()

# Propension a delay:
for c in del_cols:
    aux_delays, del_dict[c] = prop_delay(Xt_aux, c)
    
# Normalizar:
disc_columns = ['Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek',
               'disctime', 'Length_interval']
norm_dict = {}
for c in disc_columns:
    aux_norm, norm_dict[c] = norm_cat(aux_delays, c, threshold=.02)
    
# WoE:
columns = [c+'_norm' for c in disc_columns]
woes_dict = {}
for c in columns:
    aux_woe, woes_dict[c] = WoE(aux_norm, c, tar)

In [15]:
aux_woe.head(1)

Unnamed: 0,Length,Time,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,hour,disctime,Length_interval,id,Delay,Flight_%Delay,hour_%Delay,Airline_%Delay,AirportFrom_%Delay,AirportTo_%Delay,DayOfWeek_%Delay,Airline_norm,AirportFrom_norm,AirportTo_norm,DayOfWeek_norm,disctime_norm,Length_interval_norm,Airline_norm_WoE,AirportFrom_norm_WoE,AirportTo_norm_WoE,DayOfWeek_norm_WoE,disctime_norm_WoE,Length_interval_norm_WoE
153794,85,565,WN,1115,MDW,MSP,4,9,Morning,"(65.5, 131.0]",153794,0,0.602564,0.405846,0.698427,0.733493,0.464771,0.452545,WN,Others,Others,4,Morning,"(65.5, 131.0]",1.055717,-0.041251,0.070165,0.025502,-0.188579,-0.051707


In [16]:
def woes_norms_propensity(df, del_cols, disc_cols, woe_cols, del_dict,
                         disc_dict, woe_dict,
                         del_label='_%Delay', disc_label='_norm', woe_label='_WoE'):
    
    for c in del_cols:
        df[c+del_label] = df[c].map(lambda x:del_dict.get(c).get(x,0))
        
    for c in disc_cols:
        df[c+disc_label] = df[c].map(lambda x:disc_dict.get(c).get(x, 'Others'))
        
    for c in woe_cols:
        df[c+woe_label] = df[c].map(lambda x:woe_dict.get(c).get(x, 0))
        
    return(df)

In [17]:
Xt = woes_norms_propensity(Xt, del_cols, disc_columns, 
                           columns, del_dict, norm_dict, woes_dict).fillna(0)
Xv = woes_norms_propensity(Xv, del_cols, disc_columns, 
                           columns, del_dict, norm_dict, woes_dict).fillna(0)

# 3. Modelar los Datos:

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, classification_report
from sklearn.feature_selection import SelectKBest

In [19]:
predictors = ([c for c in Xt.columns if 'WoE' in c] +
             [c for c in Xt.columns if '%Delay' in c])

In [20]:
kbest = SelectKBest(k=7)
kbest.fit(Xt[predictors].fillna(0), yt)

In [21]:
new_vars = list(kbest.get_feature_names_out())

In [22]:
new_vars

['Airline_norm_WoE',
 'disctime_norm_WoE',
 'Flight_%Delay',
 'hour_%Delay',
 'Airline_%Delay',
 'AirportFrom_%Delay',
 'AirportTo_%Delay']

In [23]:
Xt[new_vars]

Unnamed: 0,Airline_norm_WoE,disctime_norm_WoE,Flight_%Delay,hour_%Delay,Airline_%Delay,AirportFrom_%Delay,AirportTo_%Delay
153794,1.055717,-0.188579,0.602564,0.405846,0.698427,0.733493,0.464771
218200,-0.230875,-0.188579,0.406250,0.468960,0.390129,0.400694,0.489312
350655,-0.159277,0.249763,0.555556,0.523624,0.407292,0.412272,0.385203
9755,-0.533089,0.249763,0.379747,0.500297,0.321042,0.262903,0.523381
397343,0.028761,-0.188579,0.312500,0.346876,0.453352,0.500514,0.584563
...,...,...,...,...,...,...,...
265970,0.028761,-0.188579,0.370370,0.405846,0.453352,0.496148,0.520000
267897,0.028761,-0.188579,0.309091,0.405846,0.453352,0.500514,0.551484
69458,1.055717,0.249763,0.573529,0.513565,0.698427,0.725144,0.514523
309837,0.028761,0.249763,0.500000,0.513565,0.453352,0.384494,0.375641


In [24]:
from Data_eng import transformations as trf

Xpt, pca_ = trf.pca(Xt[new_vars], components=2)
knn = KNeighborsClassifier()
knn.fit(Xpt, yt)

# 4. Score y evaluación:

In [25]:
print(recall_score(yt, knn.predict(pca_.transform(Xt[new_vars]))))
print(recall_score(yv, knn.predict(pca_.transform(Xv[new_vars]))))

0.5435270900144117
0.54271644824262


In [26]:
out_path = 'airlines_models/'
import pickle as pk

In [27]:
pca_name='pca_airlines_individual.pkl'
pk.dump(pca_, open(out_path+pca_name, 'wb'))
knn_name = 'knn_model_airlines_individual.sav'
pk.dump(knn, open(out_path+knn_name, 'wb'))

# 5. Construcción del pipeline

In [28]:
file_name = 'airlines_test.csv'
test_data = pd.read_csv(general_data_path+file_name)
test_data.head(3)

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length
0,535217,CO,678,SAT,IAH,5,954,59
1,504611,US,587,PHX,GEG,3,1260,167
2,47611,OO,4618,DEN,SLC,5,955,89


In [29]:
import pandas as pd
import numpy as np
import pickle as pk

def will_my_flight_be_delayed(df, features, pca_name, model_name, label='Delay'):
    
    pca_object = pk.load(open(pca_name, 'rb'))
    model = pk.load(open(model_name, 'rb'))
    pca_array = pca_object.transform(df[features].fillna(0))
    pca_frame = pd.DataFrame(pca_array, columns=[f'P_{i}' for i in range(pca_array.shape[1])])
    df[label] = model.predict(pca_frame)
    return(df)

In [30]:
# Ejecución con Xv desde 0, para evaluar el funcionamiento:
will_my_flight_be_delayed(Xv, new_vars, out_path+pca_name, out_path+knn_name)

Unnamed: 0,Length,Time,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,hour,disctime,Length_interval,Flight_%Delay,hour_%Delay,Airline_%Delay,AirportFrom_%Delay,AirportTo_%Delay,DayOfWeek_%Delay,Airline_norm,AirportFrom_norm,AirportTo_norm,DayOfWeek_norm,disctime_norm,Length_interval_norm,Airline_norm_WoE,AirportFrom_norm_WoE,AirportTo_norm_WoE,DayOfWeek_norm_WoE,disctime_norm_WoE,Length_interval_norm_WoE,Delay
259599,85,450,DL,2291,ATL,TPA,4,7,Morning,"(65.5, 131.0]",0.381818,0.304386,0.448902,0.425379,0.513222,0.452545,DL,ATL,Others,4,Morning,"(65.5, 131.0]",0.010789,-0.084834,0.070165,0.025502,-0.188579,-0.051707,1
257819,116,855,9E,3921,MEM,CLE,5,14,Mid-day,"(65.5, 131.0]",0.250000,0.500297,0.404474,0.327015,0.431175,0.417648,9E,Others,Others,5,Mid-day,"(65.5, 131.0]",-0.170962,-0.041251,0.070165,-0.116538,0.249763,-0.051707,1
60390,95,434,XE,5956,ORD,ROA,6,7,Morning,"(65.5, 131.0]",0.484848,0.304386,0.376954,0.479780,0.446328,0.401248,XE,ORD,Others,6,Morning,"(65.5, 131.0]",-0.286600,0.134974,0.070165,-0.184371,-0.188579,-0.051707,1
217504,60,805,DL,1482,JAC,SLC,7,13,Mid-day,"(-0.655, 65.5]",0.373626,0.487051,0.448902,0.426573,0.489889,0.456014,DL,Others,Others,7,Mid-day,"(-0.655, 65.5]",0.010789,-0.041251,0.070165,0.039495,0.249763,-0.080263,0
257128,121,360,OO,6471,ABQ,LAX,3,6,Midnight,"(65.5, 131.0]",0.270270,0.256602,0.453352,0.495233,0.489312,0.471317,OO,Others,LAX,3,Midnight,"(65.5, 131.0]",0.028761,-0.041251,0.173138,0.101040,-0.793184,-0.051707,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363192,370,493,UA,846,IAD,SFO,3,8,Morning,"(327.5, 393.0]",0.407895,0.346876,0.321042,0.379672,0.523381,0.471317,UA,Others,SFO,3,Morning,"(327.5, 393.0]",-0.533089,-0.041251,0.309487,0.101040,-0.188579,0.002499,1
184968,80,750,WN,1192,BHM,TPA,5,12,Morning,"(65.5, 131.0]",0.448718,0.468960,0.698427,0.448615,0.513222,0.417648,WN,Others,Others,5,Morning,"(65.5, 131.0]",1.055717,-0.041251,0.070165,-0.116538,-0.188579,-0.051707,1
421746,105,860,WN,755,STL,DAL,7,14,Mid-day,"(65.5, 131.0]",0.483871,0.500297,0.698427,0.564750,0.599834,0.456014,WN,Others,Others,7,Mid-day,"(65.5, 131.0]",1.055717,-0.041251,0.070165,0.039495,0.249763,-0.051707,0
17885,107,1164,YV,7308,JAX,IAD,4,19,Eve,"(65.5, 131.0]",0.285714,0.524887,0.245697,0.326824,0.357093,0.452545,YV,Others,Others,4,Eve,"(65.5, 131.0]",-0.905800,-0.041251,0.070165,0.025502,0.281624,-0.051707,1


In [31]:
# Paso 1. Variables auxiliares iniciales:
df_test = disc_time_and_hour(test_data, hours_dict)

# Paso 2. Intervalos:
df_test = len_cut(df_test, 'Length', intervals)

# Paso 3. WoEs, Norm, Propensity:
df_test = woes_norms_propensity(df_test, del_cols, disc_columns, columns, del_dict, norm_dict, woes_dict)

# Listo para mandar al pickle:
df_test

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,hour,disctime,Length_interval,Flight_%Delay,hour_%Delay,Airline_%Delay,AirportFrom_%Delay,AirportTo_%Delay,DayOfWeek_%Delay,Airline_norm,AirportFrom_norm,AirportTo_norm,DayOfWeek_norm,disctime_norm,Length_interval_norm,Airline_norm_WoE,AirportFrom_norm_WoE,AirportTo_norm_WoE,DayOfWeek_norm_WoE,disctime_norm_WoE,Length_interval_norm_WoE
0,535217,CO,678,SAT,IAH,5,954,59,15,Mid-day,"(-0.655, 65.5]",0.369863,0.513565,0.564585,0.428436,0.401167,0.417648,CO,Others,IAH,5,Mid-day,"(-0.655, 65.5]",0.475689,-0.041251,-0.184710,-0.116538,0.249763,-0.080263
1,504611,US,587,PHX,GEG,3,1260,167,21,Eve,"(131.0, 196.5]",0.477477,0.484627,0.337493,0.437437,0.554074,0.471317,US,PHX,Others,3,Eve,"(131.0, 196.5]",-0.458590,-0.035674,0.070165,0.101040,0.281624,0.014437
2,47611,OO,4618,DEN,SLC,5,955,89,15,Mid-day,"(65.5, 131.0]",0.620690,0.513565,0.453352,0.475602,0.489889,0.417648,OO,DEN,Others,5,Mid-day,"(65.5, 131.0]",0.028761,0.118226,0.070165,-0.116538,0.249763,-0.051707
3,96558,UA,118,LAX,SFO,1,920,83,15,Mid-day,"(65.5, 131.0]",0.448980,0.513565,0.321042,0.500514,0.523381,0.468174,UA,LAX,SFO,1,Mid-day,"(65.5, 131.0]",-0.533089,0.217954,0.309487,0.088421,0.249763,-0.051707
4,321490,US,2133,BOS,LGA,7,900,76,15,Mid-day,"(65.5, 131.0]",0.352941,0.513565,0.337493,0.392170,0.402245,0.456014,US,Others,Others,7,Mid-day,"(65.5, 131.0]",-0.458590,-0.041251,0.070165,0.039495,0.249763,-0.051707
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
107872,313963,OH,6494,JFK,ORF,7,500,95,8,Morning,"(65.5, 131.0]",0.423077,0.346876,0.280000,0.416586,0.438710,0.456014,OH,Others,Others,7,Morning,"(65.5, 131.0]",-0.728566,-0.041251,0.070165,0.039495,-0.188579,-0.051707
107873,193243,WN,2661,RNO,PHX,7,625,100,10,Morning,"(65.5, 131.0]",0.281690,0.435741,0.698427,0.582873,0.462407,0.456014,WN,Others,PHX,7,Morning,"(65.5, 131.0]",1.055717,-0.041251,0.065239,0.039495,-0.188579,-0.051707
107874,77999,XE,3121,LIT,IAH,7,898,82,14,Mid-day,"(65.5, 131.0]",0.428571,0.500297,0.376954,0.380350,0.401167,0.456014,XE,Others,IAH,7,Mid-day,"(65.5, 131.0]",-0.286600,-0.041251,-0.184710,0.039495,0.249763,-0.051707
107875,538303,OO,6503,SFO,EUG,5,1175,91,19,Eve,"(65.5, 131.0]",0.408163,0.524887,0.453352,0.529598,0.521739,0.417648,OO,SFO,Others,5,Eve,"(65.5, 131.0]",0.028761,0.334428,0.070165,-0.116538,0.281624,-0.051707


In [32]:
# Paso 4. Predecir:
df_test = will_my_flight_be_delayed(df_test, new_vars, out_path+pca_name, out_path+knn_name)
df_test[['id', 'Delay']]

Unnamed: 0,id,Delay
0,535217,0
1,504611,0
2,47611,1
3,96558,0
4,321490,0
...,...,...
107872,313963,0
107873,193243,1
107874,77999,0
107875,538303,0
