pair 2 regresión logística:

- Estandarizar las variables numéricas de vuestro set de datos

- Codificar las variables categóricas. Recordad que tendréis que tener en cuenta si vuestras variables tienen orden o no.

- Chequear si vuestros datos están balanceados. En caso de que no lo estén utilizad algunas de las herramientas aprendidas en la lección para balancearlos.

- Guardad el dataframe con los cambios que habéis aplicado para utilizarlo en la siguiente lección.

In [57]:
import numpy as np
import pandas as pd
import sidetable as stb

import matplotlib.pyplot as plt
import seaborn as sns

import math

# Estandarización variables numéricas y Codificación variables categóricas
# ------------------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import LogisticRegression

# Para separar los datos en train y test
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_columns = None 

In [3]:
df_travel = pd.read_pickle('data/df_travel_1.pkl')
df_travel.head(2)

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Age,Commision
0,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,36.782695,81,yes
1,CBH,Travel Agency,Offline,Comprehensive Plan,0,186,MALAYSIA,36.782695,71,yes


In [4]:
df_num = pd.read_pickle('data/df_num_1.pkl')
df_cat = pd.read_pickle('data/df_cat_1.pkl')

In [5]:
#Estandarizamos las variables numéricas
#llamamos al método
scaler = StandardScaler()

In [6]:
# ahora ya podemos ajustar nuestros datos. 
scaler.fit(df_num)

# transformamos los datos
X_escaladas = scaler.transform(df_num)

# por último convertiremos el array que nos devuelve en un dataframe.
numericas_estandar = pd.DataFrame(X_escaladas, columns = df_num.columns)
numericas_estandar.head(2)

Unnamed: 0,Duration,Net Sales,Age
0,2.58677,0.337585,4.254458
1,2.58677,0.337585,3.248902


In [7]:
#Eliminamos las columnas que ya no necesitamos

df_travel.drop(df_num.columns, axis = 1, inplace=True)
df_travel.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Destination,Commision
0,CBH,Travel Agency,Offline,Comprehensive Plan,0,MALAYSIA,yes
1,CBH,Travel Agency,Offline,Comprehensive Plan,0,MALAYSIA,yes
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,AUSTRALIA,yes
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,AUSTRALIA,yes
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,ITALY,yes


In [8]:
df_travel.stb.missing()

Unnamed: 0,missing,total,percent
Agency,0,61997,0.0
Agency Type,0,61997,0.0
Distribution Channel,0,61997,0.0
Product Name,0,61997,0.0
Claim,0,61997,0.0
Destination,0,61997,0.0
Commision,0,61997,0.0


In [9]:
numericas_estandar.stb.missing()

Unnamed: 0,missing,total,percent
Duration,0,61997,0.0
Net Sales,0,61997,0.0
Age,0,61997,0.0


In [10]:
#Hacemos un reset_index sobre el df_travel para que al concatenarlo con numericas_estandar no nos genere nulos en el df resultante.

df_travel.reset_index(inplace = True, drop = True)

In [11]:
#Añadimos las columnas estandarizadas

df_travel_est = pd.concat([df_travel, numericas_estandar], axis = 1)

In [12]:
df_travel_est.stb.missing()

Unnamed: 0,missing,total,percent
Agency,0,61997,0.0
Agency Type,0,61997,0.0
Distribution Channel,0,61997,0.0
Product Name,0,61997,0.0
Claim,0,61997,0.0
Destination,0,61997,0.0
Commision,0,61997,0.0
Duration,0,61997,0.0
Net Sales,0,61997,0.0
Age,0,61997,0.0


In [13]:
df_travel_est.head(2)

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Destination,Commision,Duration,Net Sales,Age
0,CBH,Travel Agency,Offline,Comprehensive Plan,0,MALAYSIA,yes,2.58677,0.337585,4.254458
1,CBH,Travel Agency,Offline,Comprehensive Plan,0,MALAYSIA,yes,2.58677,0.337585,3.248902


Vamos a codificar las variables categóricas, según si tienen orden o no:

- Creamos listas con las agrupaciones por orden.

- Creamos funciones.

- Aplicamos la función.

In [14]:
def agrupar_prod(element):

    producto_0 = ["Rental Vehicle Excess Insurance", "1 way Comprehensive Plan", "2 way Comprehensive Plan", "24 Protect", "Annual Travel Protect Silver", "Basic Plan", "Cancellation Plan", "Child Comprehensive Plan", "Comprehensive Plan", "Premier Plan", "Single Trip Travel Protect Gold", "Single Trip Travel Protect Silver", "Spouse or Parents Comprehensive Plan", "Ticket Protector", "Travel Cruise Protect", "Travel Cruise Protect Family", "Value Plan"]
    product_1= ["Annual Travel Protect Platinum", "Bronze Plan", "Gold Plan", "Individual Comprehensive Plan", "Silver Plan", "Single Trip Travel Protect Platinum"]
    product_2 = ["Annual Gold Plan", "Annual Silver Plan", "Annual Travel Protect Gold"]

    if element in producto_0:
        return 0
    
    elif element in product_1:
        return 1
    
    elif element in product_2:
        return 2
    
    else:
        return "Unknown"

In [15]:
df_travel_est['products'] = df_travel_est['Product Name'].apply(agrupar_prod)
df_travel.sample()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Destination,Commision
53074,SSI,Airlines,Online,Ticket Protector,0,SINGAPORE,yes


In [16]:
def agrupar_agencia(element):

    lista_2 =  ['C2B', 'LWC', 'TTW']
    lista_1 = ['CBH', 'CWT', 'KML', 'CSR', 'CCR']
    lista_0 = ['JZI', 'EPX', 'JWT', 'RAB', 'SSI', 'ART', 'ADM', 'TST']

    if element in lista_0:
        return 0
    
    elif element in lista_1:
        return 1
    
    elif element in lista_2:
        return 2
    
    else:
        return "Unknown"

In [17]:
df_travel_est['agency'] = df_travel_est['Agency'].apply(agrupar_agencia)
df_travel_est.sample()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Destination,Commision,Duration,Net Sales,Age,products,agency
59061,JZI,Airlines,Online,Basic Plan,0,HONG KONG,yes,-0.378575,-0.234784,1.4389,0,0


In [18]:
def agrupar_pais (element):
    
    if element == "BULGARIA":
        return 2
    
    elif element == "COSTA RICA":
        return 3
    
    elif element == "ARGENTINA":
        return 1
    
    else:
        return 0

In [19]:
df_travel_est['country'] = df_travel_est['Destination'].apply(agrupar_pais)
df_travel_est.sample()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Destination,Commision,Duration,Net Sales,Age,products,agency,country
51176,EPX,Travel Agency,Online,Cancellation Plan,0,CHINA,no,-0.450029,0.826859,-0.270547,0,0,0


Las variables categóricas con dos subcategorías las vamos a codificar con el método Ordinal Encoding

In [20]:
# hacemos un reset index antes de aplicar la función para evitar problemas en el encoding
df_travel_est.reset_index(inplace = True, drop = True)

In [21]:
orden = [ "no", "yes" ]

def ordinal_encoder1(df, columna, orden_valores):
    # iniciamos el método y aplicamos la transformación a los datos.
    ordinal = OrdinalEncoder(categories = [orden_valores], dtype = int)
    transformados_oe = ordinal.fit_transform(df[[columna]])
    # lo convertimos a dataframe
    oe_df = pd.DataFrame(transformados_oe)
    # cambiamos el nombre de la columna
    oe_df.columns = ordinal.feature_names_in_
    columna += "_oe"
    # sobre escribimos la columna con los valores de la tranformación
    df[columna] = oe_df
    return df

In [22]:
df_travel_est.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Destination,Commision,Duration,Net Sales,Age,products,agency,country
0,CBH,Travel Agency,Offline,Comprehensive Plan,0,MALAYSIA,yes,2.58677,0.337585,4.254458,0,1,0
1,CBH,Travel Agency,Offline,Comprehensive Plan,0,MALAYSIA,yes,2.58677,0.337585,3.248902,0,1,0
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,AUSTRALIA,yes,0.425283,0.337585,-0.672769,0,1,0
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,AUSTRALIA,yes,0.335966,0.337585,-0.672769,0,1,0
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,ITALY,yes,0.675373,-2.665946,0.232232,0,1,0


In [23]:
df_travel_est['Agency Type'].unique()

array(['Travel Agency', 'Airlines'], dtype=object)

In [24]:
df_travel_est = ordinal_encoder1(df_travel_est, "Commision", orden)

In [25]:
df_travel_est['Agency Type'].unique()

array(['Travel Agency', 'Airlines'], dtype=object)

In [26]:
orden_2 = ["Travel Agency", "Airlines"]

df_travel_est = ordinal_encoder1(df_travel_est, "Agency Type",  orden_2)

In [27]:
df_travel_est["Commision_oe"].unique()

array([1, 0])

In [28]:
df_travel_est.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Claim,Destination,Commision,Duration,Net Sales,Age,products,agency,country,Commision_oe,Agency Type_oe
0,CBH,Travel Agency,Offline,Comprehensive Plan,0,MALAYSIA,yes,2.58677,0.337585,4.254458,0,1,0,1,0
1,CBH,Travel Agency,Offline,Comprehensive Plan,0,MALAYSIA,yes,2.58677,0.337585,3.248902,0,1,0,1,0
2,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,AUSTRALIA,yes,0.425283,0.337585,-0.672769,0,1,0,1,0
3,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,AUSTRALIA,yes,0.335966,0.337585,-0.672769,0,1,0,1,0
4,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,0,ITALY,yes,0.675373,-2.665946,0.232232,0,1,0,1,0


In [29]:
#Borramos las columnas originales
df_travel_est.drop(["Agency", "Agency Type", "Product Name", "Destination", "Commision"], inplace = True, axis = 1)

Usamos el método OneHotEncoder para la columna Distribution Channel	ya que no tiene orden, y este método lo añade sólo a una columna.

In [30]:
oh = OneHotEncoder()

In [31]:
# hacemos la codificación de los datos para la variable dada
transformados = oh.fit_transform(df_travel_est[["Distribution Channel"]])

In [32]:
# convertimos nuestro array con la codificación hecha en un dataframe, donde tendremos dos columnas, una para los "Yes" y otra para los "No"
oh_df = pd.DataFrame(transformados.toarray())

In [33]:
# el método get_feature_names nos va a dar el nombre de las columnas nuevas que se nos generarán
oh_df.columns = oh.get_feature_names_out()
oh_df.columns

Index(['Distribution Channel_Offline', 'Distribution Channel_Online'], dtype='object')

In [34]:
# concatenamos el dataframe original con el dataframe que acabamos de crear
df_travel_enc = pd.concat([df_travel_est,oh_df],axis=1)

In [35]:
df_travel_enc.drop(["Distribution Channel"], inplace = True, axis = 1)

In [36]:
df_travel_enc.to_csv("data/df_travel_estenc.csv")

### MODELO A: balanceo 50-50%

In [41]:
df_travel_a = pd.read_csv("data/df_travel_estenc.csv")

Balanceamos nuestra VR con el método SMOTETomek

In [42]:
# separamos las variables en diferentes df

X = df_travel_a.drop("Claim", axis = 1)
y = df_travel_a["Claim"]

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [44]:
# iniciamos el método elegido y lo ajustamos

os_us = SMOTETomek()
X_train_res, y_train_res = os_us.fit_resample(X_train, y_train)

In [45]:
# antes del modelo

y_train.value_counts()

Claim
0    42860
1      537
Name: count, dtype: int64

In [46]:
# después del modelo

y_train_res.value_counts()

Claim
0    40453
1    40453
Name: count, dtype: int64

In [48]:
df_travel_balanceado_a = pd.concat([X_train_res, y_train_res], axis=1)
df_travel_balanceado_a["Claim"].value_counts()

Claim
0    40453
1    40453
Name: count, dtype: int64

In [49]:
# guardamos df

df_travel_balanceado_a.to_csv('data/df_travel_balanceado.csv')

### MODELO B: balanceo 60-40%

In [51]:
df_travel_b = pd.read_csv("data/df_travel_estenc.csv")

Balanceamos nuestra VR con el método SMOTETomek

In [52]:
A = df_travel_b.drop("Claim", axis = 1)
b = df_travel_b["Claim"]

In [64]:
# Dividimos los datos en conjuntos de entrenamiento y prueba

A_train, A_test, b_train, b_test = train_test_split(A, b, test_size=0.4, random_state=42)

# iniciamos el método elegido y lo ajustamos

smote = SMOTE(sampling_strategy=0.6)
tomek = TomekLinks(sampling_strategy='majority')
os_us_b = SMOTETomek(smote=smote, tomek=tomek)

# Aplicar SMOTETomek al conjunto de entrenamiento
A_train_res, b_train_res = smote_tomek.fit_resample(A_train, b_train)

In [65]:
df_travel_balanceado_b = pd.concat([A_train_res, b_train_res], axis=1)
df_travel_balanceado_b["Claim"].value_counts()

Claim
0    34570
1    22019
Name: count, dtype: int64

In [66]:
# guardamos df

df_travel_balanceado_b.to_csv('data/df_travel_balanceado_b.csv')