**Nota:**
Si es la primera vez que usas este notebook, descomenta y ejecuta la siguiente celda, o instala de tu manera preferida los siguientes paquetes.

* matplotlib==3.5.2
* numpy==1.22.4
* pandas==1.4.2
* scikit-learn==1.1.1
* tqdm==4.64.0
* seaborn==0.10.1
* missingno==0.4.2
* dill==0.3.4
* humanfriendly (importado por el notebook; sin versión fijada)
* statsmodels (importado por el notebook; sin versión fijada)

In [None]:
#!pip install matplotlib==3.5.2 numpy==1.22.4 pandas==1.4.2 scikit-learn==1.1.1 tqdm==4.64.0 seaborn==0.10.1 missingno==0.4.2 dill==0.3.4 humanfriendly statsmodels

In [None]:
import pandas as pd
import numpy as np
import os
import datetime
import glob
import missingno as msno
import matplotlib.pyplot as plt
import dill as pickle
import warnings
import seaborn as sns

from datetime import date, datetime, time
from os.path import dirname, exists
from tqdm import tqdm
from time import perf_counter, sleep
from humanfriendly import format_timespan
from pandas.api.types import is_numeric_dtype  
tqdm.pandas()

from sklearn import linear_model
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split

from statsmodels.stats.outliers_influence import variance_inflation_factor

# Notebook-wide settings.
# NOTE(review): ignoring every warning also hides pandas/sklearn deprecation
# and convergence warnings raised by later cells.
warnings.filterwarnings("ignore")  # Suppress all warnings
plt.style.use('ggplot')  # apply the ggplot style to matplotlib figures
pd.set_option('display.max_rows', 1000000)  # raise the display row limit to 1,000,000

In [None]:
# Load the flight dataset. Fail immediately with a clear error when the CSV is
# missing; previously the guard silently skipped the read and the next line
# failed with a NameError on `dataset`.
if not exists('dataset_SCL.csv'):
    raise FileNotFoundError("dataset_SCL.csv was not found in the working directory")

dataset = pd.read_csv('dataset_SCL.csv', sep=",", decimal=".", na_values="NA",
                      encoding='utf8', low_memory=False)

# Parse both timestamp columns so they support datetime arithmetic later on.
dataset['Fecha-I'] = pd.to_datetime(dataset['Fecha-I'])
dataset['Fecha-O'] = pd.to_datetime(dataset['Fecha-O'])
dataset.head()

In [None]:
# Print the distinct values found in every column of the dataset.
for column_name in dataset.columns:
    distinct_values = dataset[column_name].unique()
    print("Valores Unicos de la Variable ", column_name)
    print(distinct_values)
    print("")

In [None]:
# Report the dataset dimensions and the number of missing values per column.
dataset_shape = dataset.shape
nulls_per_column = dataset.isnull().sum()
print(" \nDataset shape: \n\n", dataset_shape)
print(" \nCount total NaN at each column in the dataset : \n\n", nulls_per_column)

**Limpieza de Datos**

In [None]:
# Show column dtypes and non-null counts before cleaning.
print("Información Dataset")
dataset.info()

In [None]:
# Draw the missingno matrix only when at least one value is missing.
if dataset.isnull().values.any():
    msno.matrix(dataset)

In [None]:
# Remove, in place, every row that has at least one missing value; the
# original index labels of the remaining rows are kept.
dataset.dropna(inplace=True)
dataset.info()

In [None]:
def histogramgraph(data):
    """Plot histograms of *data* on a single 20x15 inch figure.

    Args:
        data: DataFrame whose ``hist`` method is used for plotting.
    """
    figure = plt.figure(figsize=(20, 15))
    # Hand the figure's current axes to pandas so it draws on this figure.
    data.hist(ax=figure.gca(), alpha=0.7)

In [None]:
# Histograms of the cleaned dataset.
histogramgraph(dataset)

In [None]:
def boxplot(data):
    """Draw a box plot of *data* with a grid and x labels rotated 90 degrees.

    Args:
        data: DataFrame plotted through its ``plot.box`` accessor.
    """
    plot_options = {"figsize": (15, 10), "rot": 90, "grid": True}
    data.plot.box(**plot_options)

In [None]:
# Box plot of the cleaned dataset.
boxplot(dataset)

In [None]:
def correlationgraph(data):
    """Plot the correlation matrix of *data* with one annotated value per cell.

    The matrix is computed once and reused for the image, the annotations and
    the axis labels, so the tick labels always match the rows and columns that
    are actually drawn.

    Args:
        data: DataFrame whose ``corr()`` matrix is plotted.

    Side effects:
        Sets the global matplotlib ``font.size`` rcParam to 14.
    """
    corr = data.corr()
    labels = corr.columns
    positions = range(len(labels))

    f = plt.figure(figsize=(15, 10))
    plt.matshow(corr, fignum=f.number)
    plt.rcParams.update({'font.size': 14})
    # Write each coefficient, with two decimals, on top of its cell.
    for (i, j), z in np.ndenumerate(corr.to_numpy()):
        plt.text(j, i, '{:0.2f}'.format(z), ha='center', va='center',
                 bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3'))
    plt.xticks(positions, labels, fontsize=14, rotation=45)
    plt.yticks(positions, labels, fontsize=14)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=14)
    plt.title('Correlation Matrix', fontsize=16)

In [None]:
def correlationgraph2(data, scale):
    """Show an annotated seaborn heatmap of the correlation matrix of *data*.

    Args:
        data: DataFrame whose ``corr()`` matrix is drawn.
        scale: value passed to ``sns.set(font_scale=...)``; note that this
            changes seaborn's global settings.
    """
    plt.figure(figsize=(15, 10))
    sns.set(font_scale=scale)
    correlations = data.corr()
    sns.heatmap(correlations, annot=True, cmap="YlGnBu")
    plt.show()

In [None]:
# Correlation heatmap of the cleaned dataset, with font scale 2.
correlationgraph2(dataset, 2)

**Generando archivo synthetic_features.csv**

In [None]:
def temporadas(row):
    """Return "Si" when the date falls in a high-season window, else "No".

    High-season windows, inclusive and identical every year:
    Dec 15 - Mar 3, Jul 15 - Jul 31 and Sep 11 - Sep 30.

    Args:
        row: a ``datetime`` instance (``pandas.Timestamp`` is a ``datetime``
            subclass) or any value whose ``str()`` is formatted as
            ``'%Y-%m-%d %H:%M:%S'``.

    Returns:
        "Si" or "No".

    Raises:
        ValueError: if *row* is not a datetime and its text does not match
            the expected format.
    """
    # Use datetime objects directly: formatting them to text and parsing the
    # text back fails as soon as the value carries fractional seconds.
    if isinstance(row, datetime):
        my_date = row.date()
    else:
        my_date = datetime.strptime(str(row), '%Y-%m-%d %H:%M:%S').date()

    # The windows do not depend on the year, so compare (month, day) tuples.
    month_day = (my_date.month, my_date.day)
    high_season = (
        ((1, 1), (3, 3)),
        ((7, 15), (7, 31)),
        ((9, 11), (9, 30)),
        ((12, 15), (12, 31)),
    )
    if any(start <= month_day <= end for start, end in high_season):
        return "Si"
    return "No"

In [None]:
def periodo(row):
    """Classify the time of day of *row* as "mañana", "tarde" or "noche".

    Ranges: mañana 05:00-11:59, tarde 12:00-18:59, noche 19:00-04:59.
    Classification uses the hour only, so every second of a range is covered.
    The previous boundaries ended at ``HH:59:00``, which left times such as
    11:59:30 or 23:59:30 unmatched and returned ``None``.

    Args:
        row: a ``datetime`` instance (``pandas.Timestamp`` is a ``datetime``
            subclass) or any value whose ``str()`` is formatted as
            ``'%Y-%m-%d %H:%M:%S'``.

    Returns:
        "mañana", "tarde" or "noche".

    Raises:
        ValueError: if *row* is not a datetime and its text does not match
            the expected format.
    """
    if isinstance(row, datetime):
        hour = row.hour
    else:
        hour = datetime.strptime(str(row), '%Y-%m-%d %H:%M:%S').hour

    if 5 <= hour < 12:
        return "mañana"
    if 12 <= hour < 19:
        return "tarde"
    return "noche"

In [None]:
# Empty frame for the derived features, aligned with the dataset's row index.
feature_columns = ['temporada_alta', 'dif_min', 'atraso_15', 'periodo_dia']
synthetic_features = pd.DataFrame(index=dataset.index, columns=feature_columns)

In [None]:
# High-season flag ("Si"/"No") computed from each row's 'Fecha-I' value.
synthetic_features['temporada_alta'] = dataset['Fecha-I'].progress_apply(temporadas)

In [None]:
# Difference in minutes between 'Fecha-O' and 'Fecha-I'; negative differences
# are floored at 0. clip() replaces the previous chained-indexing assignment
# (df[col][mask] = 0), which pandas does not guarantee to write through to
# the frame.
synthetic_features['dif_min'] = (
    (dataset['Fecha-O'] - dataset['Fecha-I']) / np.timedelta64(1, 'm')
).clip(lower=0)

In [None]:
# Binary flag: 1 when dif_min is strictly greater than 15 minutes, else 0.
synthetic_features['atraso_15'] = synthetic_features['dif_min'].progress_apply(
    lambda minutes: int(minutes > 15.0)
)

In [None]:
# Time-of-day label computed from each row's 'Fecha-I' value.
synthetic_features['periodo_dia'] = dataset['Fecha-I'].progress_apply(periodo)

In [None]:
# Write the derived features to disk. index=False omits the row labels, so the
# file cannot be re-aligned with the dataset by index afterwards.
synthetic_features.to_csv("synthetic_features.csv", sep=',', decimal = ".", encoding='utf-8', index=False)

In [None]:
# Print the distinct values of every derived feature.
for feature_name in synthetic_features.columns:
    distinct_values = synthetic_features[feature_name].unique()
    print("Valores Unicos de la Variable ", feature_name)
    print(distinct_values)
    print("")

**Tasa de atraso**

A continuación observamos las tasas de retraso usando como variables de medición distintos atributos del conjunto de datos. Se puede ver que....

Con base en esto esperamos que las variables de... sean las de mayor aporte al momento de predecir retrasos.

In [None]:
def tasadeatraso(df, col, filas, decimales):
    """Print, per distinct value of *col*, its row count in *df* over *filas*.

    Every group is divided by the same total, so the printed figures are
    shares of *filas*, not rates relative to each group's own size.

    Args:
        df: DataFrame to group.
        col: name of the column used both as grouping key and as counted column.
        filas: denominator applied to every group count.
        decimales: number of decimals kept in the printed values.
    """
    group_counts = df.groupby([col])[col].count()
    shares = group_counts / filas
    print(round(shares, decimales))

In [None]:
# Keep only the rows flagged as delayed (atraso_15 == 1), then pull the
# matching rows of the dataset by index label.
is_delayed = synthetic_features['atraso_15'] == 1
atraso_synthetic_features = synthetic_features.loc[is_delayed]
atraso_dataset = dataset.loc[atraso_synthetic_features.index, :]

In [None]:
# Delayed rows per 'Des-I' value, divided by the total number of dataset rows.
print("tasa de atraso/destino origen")
tasadeatraso(df = atraso_dataset, col = 'Des-I', 
             filas = dataset.shape[0], decimales = 3)

In [None]:
# Delayed rows per 'Des-O' value, divided by the total number of dataset rows.
print("tasa de atraso/destino operación")
tasadeatraso(df = atraso_dataset, col = 'Des-O', 
             filas = dataset.shape[0], decimales = 3)

In [None]:
# Delayed rows per 'OPERA' value, divided by the total number of dataset rows.
print("tasa de atraso/aerolinea")
tasadeatraso(df = atraso_dataset, col = 'OPERA', 
             filas = dataset.shape[0], decimales = 3)

In [None]:
# Delayed rows per 'MES' value, divided by the total number of dataset rows.
print("tasa de atraso/mes")
tasadeatraso(df = atraso_dataset, col = 'MES', 
             filas = dataset.shape[0], decimales = 3)

In [None]:
# Delayed rows per 'DIANOM' value, divided by the total number of dataset rows.
print("tasa de atraso/día")
tasadeatraso(df = atraso_dataset, col = 'DIANOM', 
             filas = dataset.shape[0], decimales = 3)

In [None]:
# Delayed rows per 'TIPOVUELO' value, divided by the total number of dataset rows.
print("tasa de atraso/tipo de vuelo")
tasadeatraso(df = atraso_dataset, col = 'TIPOVUELO', 
             filas = dataset.shape[0], decimales = 3)

In [None]:
# Delayed rows per 'temporada_alta' value, divided by the total number of
# rows in synthetic_features.
print("tasa de atraso/temporada")
tasadeatraso(df = atraso_synthetic_features, col = 'temporada_alta', 
             filas = synthetic_features.shape[0], decimales = 3)

**Generación Algoritmo de Predicción**

Categorizamos las variables no numéricas.

In [None]:
def convert_cats(df):
    """Replace every non-numeric column of *df* with integer category codes.

    Columns whose name contains "Fecha" are left untouched, as are numeric
    columns. The frame is modified in place.

    Args:
        df: DataFrame to encode.

    Returns:
        The same DataFrame object that was passed in.
    """
    categorical_columns = [
        name for name in df.columns
        if not is_numeric_dtype(df[name]) and "Fecha" not in name
    ]
    for name in categorical_columns:
        as_category = df[name].astype('category')
        df[name] = as_category.cat.codes
    return df

In [None]:
def delete_highly_correlated(data, threshold):
    """Drop, in place, columns correlated at or above *threshold* with an earlier one.

    A column is removed when its correlation with any earlier column that has
    not itself been removed is ``>= threshold``. The comparison is on the
    signed coefficient, so strong negative correlations are not removed.

    Args:
        data: DataFrame to prune; it is mutated.
        threshold: minimum correlation coefficient that triggers removal.

    Returns:
        The same DataFrame object, without the removed columns.
    """
    corr = data.corr()
    names = list(corr.columns)
    removed = set()
    for i, candidate in enumerate(names):
        for j in range(i):
            if corr.iloc[i, j] >= threshold and names[j] not in removed:
                removed.add(candidate)
    # Drop everything at once, preserving the order of the surviving columns.
    data.drop(columns=[name for name in names if name in removed], inplace=True)
    return data

In [None]:
def probatraso(row):
    """Return ``row.dif_min`` as a percentage of 15, rounded to 3 decimals and capped at 100.

    Args:
        row: object exposing a numeric ``dif_min`` attribute.

    Returns:
        The rounded percentage, or 100.0 when it reaches or exceeds 100.
    """
    percentage = round((row.dif_min * 100 / 15), 3)
    return 100.000 if percentage >= 100.000 else percentage

In [None]:
def savemodel(model, name, verbose):
    """Serialize *model* to the file path *name*.

    The parent directory of *name* is created when missing. The previous
    version always created a hard-coded ``Models`` folder, which only helped
    when *name* happened to point inside it.

    Args:
        model: object to serialize.
        name: destination file path.
        verbose: when truthy, print the path being written.
    """
    if verbose:
        print("Saving model:", name)
    target_dir = os.path.dirname(name)
    if target_dir:
        # exist_ok avoids failing when the directory is already there.
        os.makedirs(target_dir, exist_ok=True)
    with open(name, 'wb') as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

In [None]:
def _build_model(modelo):
    """Return a fresh, unfitted regressor for the short name *modelo*.

    Raises:
        ValueError: if *modelo* is not one of the supported names.
    """
    builders = {
        "linear": lambda: linear_model.LinearRegression(),
        "Ridge": lambda: linear_model.Ridge(alpha=.5),
        "Lasso": lambda: linear_model.Lasso(alpha=.1),
        "BayesianRidge": lambda: linear_model.BayesianRidge(),
        "SVM": lambda: svm.SVR(),
        "Tree": lambda: tree.DecisionTreeRegressor(),
        "RF": lambda: RandomForestRegressor(random_state=1),
        "GBR": lambda: GradientBoostingRegressor(random_state=1),
        "VR": lambda: VotingRegressor(estimators=[
            ('GBR', GradientBoostingRegressor(random_state=1)),
            ('RF', RandomForestRegressor(random_state=1)),
            ('linear', linear_model.LinearRegression()),
        ]),
        "KNN": lambda: KNeighborsRegressor(n_neighbors=20, metric='euclidean'),
        "MLP": lambda: MLPRegressor(random_state=1, max_iter=500),
    }
    try:
        builder = builders[modelo]
    except KeyError:
        raise ValueError(
            "Unknown model name {!r}; expected one of {}".format(modelo, sorted(builders))
        ) from None
    return builder()


def modelraining(data, modelos, verbose):
    """Train, evaluate and save one regressor per name in *modelos*.

    The target is the ``Probabilidad_Atraso`` column of *data*; every other
    column is used as a feature. A single random 70/30 train/test split is
    shared by all models. Each fitted model is written to
    ``Models/<name>_PrData.pkl`` through ``savemodel``.

    Args:
        data: DataFrame containing the features and ``Probabilidad_Atraso``.
        modelos: iterable of model short names (see ``_build_model``).
        verbose: when truthy, print R2, MAE and MSE per model plus total time.

    Raises:
        ValueError: if any name in *modelos* is unknown. Names are validated
            before any training starts; previously an unknown name either
            raised a NameError or silently re-fitted the preceding model.
    """
    startf = perf_counter()
    # Build every estimator up front so a typo fails before any long fit.
    estimators = [(modelo, _build_model(modelo)) for modelo in modelos]

    X = data.drop("Probabilidad_Atraso", axis = 1)
    y = data['Probabilidad_Atraso']
    # NOTE(review): no random_state is passed, so the split differs per run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

    for modelo, model in estimators:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        if verbose:
            print(modelo + " r2_score:",metrics.r2_score(y_test, y_pred))
            print(modelo + " MAE:",metrics.mean_absolute_error(y_test, y_pred))
            print(modelo + " MSE:",metrics.mean_squared_error(y_test, y_pred))

        savemodel(model, "Models/"+ modelo +"_PrData.pkl", verbose)
        if verbose:
            print("-----------------------------------------------------------------------------")
            print("")

    if verbose:
        print("--Took", format_timespan(perf_counter() - startf), "to process the file with the selected models.")
        print("-----------------------------------------------------------------------------")
        print("")

In [None]:
# Encode a copy of the dataset so the original frame keeps its raw values.
dataset2 = convert_cats(dataset.copy())
dataset2.head()

In [None]:
# Summary statistics of the encoded dataset.
dataset2.describe()

In [None]:
# Remove the three origin columns in a single in-place call.
# NOTE(review): presumably these are constant in this dataset — confirm.
dataset2.drop(columns=['Ori-I', 'Ori-O', 'SIGLAORI'], inplace=True)

In [None]:
# Encode a copy of the derived features; the original frame is left untouched.
synthetic_features2 = convert_cats(synthetic_features.copy())
synthetic_features2.head()

In [None]:
# Correlation heatmap of the encoded dataset.
correlationgraph2(dataset2, 1)

In [None]:
# Histograms of the encoded dataset.
histogramgraph(dataset2)

In [None]:
# Correlation heatmap of the encoded derived features.
correlationgraph2(synthetic_features2, 2)

In [None]:
# Histograms of the encoded derived features.
histogramgraph(synthetic_features2)

In [None]:
# Join the encoded dataset and the encoded derived features column-wise
# (axis=1), aligned on the row index.
allDataset = pd.concat([dataset2, synthetic_features2], axis=1)
allDataset.head()

In [None]:
# Correlation heatmap of the combined frame.
correlationgraph2(allDataset, 1)

In [None]:
# Remove both timestamp columns in a single in-place call.
allDataset.drop(columns=['Fecha-I', 'Fecha-O'], inplace=True)

In [None]:
def vif(data, cols):
    """Print the variance inflation factor of each column of *data*.

    The computation uses the *data* argument; the previous version ignored
    it and always read the global ``allDataset``.

    Args:
        data: DataFrame whose columns are evaluated.
        cols: column names to exclude before computing the factors; an empty
            list keeps every column.
    """
    X = data.drop(cols, axis=1) if cols else data
    values = X.values  # materialize the array once instead of once per column
    report = pd.DataFrame()
    report["features"] = X.columns
    report["vif_Factor"] = [variance_inflation_factor(values, i) for i in range(X.shape[1])]
    print(report)

In [None]:
# VIF over every column (nothing excluded).
collist = []
vif(allDataset, collist)

In [None]:
# VIF again, excluding the two destination columns.
collist = ["Des-O", "Des-I"]
vif(allDataset, collist)

In [None]:
# VIF again, additionally excluding 'Vlo-I' and 'Vlo-O'.
collist = ["Vlo-I", "Vlo-O", "Des-O", "Des-I"]
vif(allDataset, collist)

In [None]:
# VIF again, additionally excluding 'Emp-O' and 'Emp-I'.
collist = ["Vlo-I", "Vlo-O", "Des-O", "Des-I", "Emp-O", "Emp-I"]
vif(allDataset, collist)

In [None]:
# VIF again, additionally excluding 'AÑO'; this list is the one dropped from
# allDataset in the next cell.
collist = ["Vlo-I", "Vlo-O", "Des-O", "Des-I", "Emp-O", "Emp-I", "AÑO"]
vif(allDataset, collist)

In [None]:
# Drop, in place, the columns currently listed in collist, then redraw the heatmap.
allDataset.drop(collist, inplace = True, axis=1)
correlationgraph2(allDataset, 1)

In [None]:
# Remove columns whose correlation with an earlier column is >= 0.8.
# NOTE(review): if 'dif_min' or 'atraso_15' were removed here, the later cells
# that read or drop them would fail — confirm they survive this step.
allDataset = delete_highly_correlated(data = allDataset, threshold = 0.8)

In [None]:
# Correlation heatmap after pruning highly correlated columns.
correlationgraph2(allDataset, 1)

In [None]:
# Preview the first rows of the pruned frame.
allDataset.head()

In [None]:
# Summary statistics of the pruned frame.
allDataset.describe()

In [None]:
# Regression target: probatraso applied row by row (axis=1), which reads each
# row's dif_min.
allDataset['Probabilidad_Atraso'] = allDataset.progress_apply(probatraso, axis=1)

In [None]:
# Remove the two delay columns in a single in-place call; the target was
# computed from dif_min, so they must not remain as features.
allDataset.drop(columns=['atraso_15', 'dif_min'], inplace=True)

In [None]:
# Short names of the regressors to train; each one must match a name that
# modelraining recognizes.
TestModels = ["linear","Ridge","Lasso","BayesianRidge","SVM","Tree","RF","GBR","VR","KNN", "MLP"]

In [None]:
# Train, evaluate and save every model listed in TestModels, printing metrics.
modelraining(data = allDataset, modelos = TestModels, verbose = True)