# Logistic Regression with Feature Importance and with Balance

In [33]:
import numpy as np
import pandas as pd
import warnings
import sys
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings raised by pandas / scikit-learn are hidden as well.
warnings.filterwarnings('ignore')

# Load the dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/data.csv')

from datetime import datetime

def get_period_day(date):
    """Classify a timestamp string into a period of the day.

    Args:
        date: Timestamp formatted as '%Y-%m-%d %H:%M:%S'.

    Returns:
        'mañana' for 05:00:00-11:59:59, 'tarde' for 12:00:00-18:59:59 and
        'noche' for 19:00:00-04:59:59.

    Raises:
        ValueError: If ``date`` does not match the expected format.
    """
    hour = datetime.strptime(date, '%Y-%m-%d %H:%M:%S').hour

    # Half-open hour ranges cover the whole day with no gaps. Strict
    # comparisons against minute-resolution bounds would leave the exact
    # boundary times (e.g. 05:00:00, 12:00:00) and the final minute of each
    # period (e.g. 11:59:30) unclassified, yielding None.
    if 5 <= hour < 12:
        return 'mañana'
    if 12 <= hour < 19:
        return 'tarde'
    return 'noche'
    
def is_high_season(fecha):
    """Flag whether a timestamp falls inside one of the high-season windows.

    The windows are inclusive on both ends and repeat every year:
    15-Dec to 31-Dec, 1-Jan to 3-Mar, 15-Jul to 31-Jul and 11-Sep to 30-Sep.

    Args:
        fecha: Timestamp formatted as '%Y-%m-%d %H:%M:%S'.

    Returns:
        1 if the date is in a high-season window, otherwise 0.

    Raises:
        ValueError: If ``fecha`` does not match the expected format.
    """
    fecha = datetime.strptime(fecha, '%Y-%m-%d %H:%M:%S')

    # Compare on (month, day) only. Comparing full datetimes against bounds
    # set at midnight would drop every flight departing later on the last day
    # of a window (e.g. 31-Dec 10:00). It also avoids parsing month
    # abbreviations such as 'Dec', which depends on the process locale.
    month_day = (fecha.month, fecha.day)
    high_season_windows = (
        ((12, 15), (12, 31)),
        ((1, 1), (3, 3)),
        ((7, 15), (7, 31)),
        ((9, 11), (9, 30)),
    )

    return int(any(start <= month_day <= end for start, end in high_season_windows))
    
def get_min_diff(data):
    """Return the signed gap, in minutes, from 'Fecha-I' to 'Fecha-O'.

    Args:
        data: Mapping-like record (e.g. a DataFrame row) whose 'Fecha-O' and
            'Fecha-I' entries are strings formatted as '%Y-%m-%d %H:%M:%S'.

    Returns:
        float: ('Fecha-O' - 'Fecha-I') expressed in minutes; negative when
        'Fecha-O' is earlier than 'Fecha-I'.
    """
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    # 'Fecha-O' is read and parsed before 'Fecha-I'.
    later, earlier = (
        datetime.strptime(data[key], timestamp_format)
        for key in ('Fecha-O', 'Fecha-I')
    )
    return (later - earlier).total_seconds() / 60

# Derive three columns on the loaded frame (mutates `data` in place):
#   period_day  - label returned by get_period_day for 'Fecha-I'
#   high_season - 1/0 flag returned by is_high_season for 'Fecha-I'
#   min_diff    - minutes between 'Fecha-I' and 'Fecha-O', computed row by row
data['period_day'] = data['Fecha-I'].apply(get_period_day)
data['high_season'] = data['Fecha-I'].apply(is_high_season)
data['min_diff'] = data.apply(get_min_diff, axis = 1)

# Binary target: 1 when min_diff is strictly greater than the threshold, else 0.
threshold_in_minutes = 15
data['delay'] = np.where(data['min_diff'] > threshold_in_minutes, 1, 0)

# Shuffled copy of the selected columns with a fixed seed.
# NOTE(review): `training_data` is not referenced again in the visible cells;
# the features and target below are built from the unshuffled `data`.
training_data = shuffle(data[['OPERA', 'MES', 'TIPOVUELO', 'SIGLADES', 'DIANOM', 'delay']], random_state = 111)

# One-hot encode OPERA, TIPOVUELO and MES and place the dummy columns side by side.
features = pd.concat([
    pd.get_dummies(data['OPERA'], prefix = 'OPERA'),
    pd.get_dummies(data['TIPOVUELO'], prefix = 'TIPOVUELO'), 
    pd.get_dummies(data['MES'], prefix = 'MES')], 
    axis = 1
)
target = data['delay']

# Hold out 33% of the rows for testing; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(features, target, test_size = 0.33, random_state = 42)

# Hard-coded subset of dummy columns used to train the final model.
# NOTE(review): presumably the output of an earlier feature-importance step
# that is not shown here - confirm before changing.
top_10_features = [
    "OPERA_Latin American Wings", 
    "MES_7",
    "MES_10",
    "OPERA_Grupo LATAM",
    "MES_12",
    "TIPOVUELO_I",
    "MES_4",
    "MES_11",
    "OPERA_Sky Airline",
    "OPERA_Copa Air"
]

# Class counts in the training target (0 and 1) and their ratio.
# NOTE(review): `scale` is not used again in the visible cells.
n_y0 = len(y_train[y_train == 0])
n_y1 = len(y_train[y_train == 1])
scale = n_y0/n_y1

# Split again on the reduced feature set, with the same test_size and
# random_state as the first split.
x_train2, x_test2, y_train2, y_test2 = train_test_split(features[top_10_features], target, test_size = 0.33, random_state = 42)

# Class balancing: each class is weighted by the share of the *other* class in
# y_train, so the less frequent class receives the larger weight.
reg_model_2 = LogisticRegression(class_weight={1: n_y0/len(y_train), 0: n_y1/len(y_train)})
reg_model_2.fit(x_train2, y_train2)

# Predictions on the held-out rows of the reduced feature set.
reg_y_preds_2 = reg_model_2.predict(x_test2)


In [34]:

# Build a single new record as a DataFrame.
new_data = pd.DataFrame({
    'OPERA': ['Grupo LATAM'],
    'MES': [7],
    'TIPOVUELO': ['I'],
    'SIGLADES': ['SCL'],
    'DIANOM': ['Lunes']
})

# One-hot encode the record with the same prefixes used for the training features.
new_data_transformed = pd.concat([
    pd.get_dummies(new_data['OPERA'], prefix='OPERA'),
    pd.get_dummies(new_data['MES'], prefix='MES'),
    pd.get_dummies(new_data['TIPOVUELO'], prefix='TIPOVUELO')
], axis=1)

# Align with the training columns in a single step: dummies the model was not
# trained on are dropped, training columns missing from this record are added
# with 0, and the column order matches the one used when fitting.
new_data_transformed = new_data_transformed.reindex(columns=x_train2.columns, fill_value=0)

# Predict with the trained model.
new_prediction = reg_model_2.predict(new_data_transformed)

# Report the predicted class (1 -> delayed, otherwise on time).
print("La predicción para el nuevo registro es:", "Retrasado" if new_prediction[0] == 1 else "A tiempo")


La predicción para el nuevo registro es: Retrasado


In [41]:
from joblib import dump

# Save the trained model to a file.
# The path is relative to the notebook's working directory.
model_filename = '../challenge/reg_model_2.joblib'
dump(reg_model_2, model_filename)

# Confirm where the model was written.
print(f'Modelo guardado como {model_filename}')


Modelo guardado como ../challenge/reg_model_2.joblib


Data Science Conclusions

By looking at the results of the 6 trained models, it can be determined:
- There is no noticeable difference in results between XGBoost and LogisticRegression.
- Reducing the features to the 10 most important does not decrease the model's performance.
- Balancing the classes improves the model's performance, since it increases the recall of class "1".

**Given this, the model to put into production should be the one trained with the top 10 features and class balancing, but which one?**