In [None]:
import os
import pickle

import numpy as np
import pandas as pd

# Class methods

In [None]:
def __init__(self, home_path="/home/marcos/Documentos/comunidade_DS/pa004_health_insurance_cross_sell/"):
    """Load the fitted scalers and encoders from ``<home_path>/encoders``.

    Parameters
    ----------
    home_path : str, optional
        Project root that contains the ``encoders`` directory. Defaults to
        the original hard-coded location, so existing ``__init__()`` calls
        behave as before. A trailing slash is optional.

    Raises
    ------
    FileNotFoundError
        If any of the expected pickle files is missing.

    Attributes set
    --------------
    home_path, mms_age, region_freq, channel_freq, age_encoder,
    damage_encoder, ss_premium, mms_vintage
    """
    self.home_path = home_path
    encoders_dir = os.path.join(home_path, "encoders")

    def load_artifact(file_name):
        # SECURITY: pickle.load can execute arbitrary code while unpickling;
        # only point home_path at artifacts produced by a trusted pipeline.
        # The context manager guarantees the handle is closed after loading.
        with open(os.path.join(encoders_dir, file_name), "rb") as artifact_file:
            return pickle.load(artifact_file)

    self.mms_age = load_artifact("age_new_scaler.pkl")
    self.region_freq = load_artifact("region_freq_new_scaler.pkl")
    self.channel_freq = load_artifact("channel_freq_new_scaler.pkl")
    self.age_encoder = load_artifact("v_age_new_encoder.pkl")
    self.damage_encoder = load_artifact("v_dam_new_encoder.pkl")
    self.ss_premium = load_artifact("premium_new_scaler.pkl")
    self.mms_vintage = load_artifact("vintage_new_scaler.pkl")

In [None]:
def data_cleaning(self, data):
    """Cast the code-like columns of ``data`` to integers.

    ``region_code`` and ``policy_sales_channel`` are converted with
    ``astype(int)``. The frame is modified in place and the same object is
    returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``region_code`` and ``policy_sales_channel``.

    Returns
    -------
    pandas.DataFrame
        The input frame itself, with both columns cast.
    """
    for code_column in ('region_code', 'policy_sales_channel'):
        data[code_column] = data[code_column].astype(int)
    return data

In [None]:
def feature_engineering(self, data):
    """Add the derived ``vehicle_hist`` column to ``data``.

    ``vehicle_hist`` is computed as
    ``(1 if vehicle_damage == 'Yes' else 0) + 1 - previously_insured``.
    The new column is inserted immediately before the frame's last column.
    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``vehicle_damage`` and ``previously_insured``
        (presumably a 0/1 flag - confirm against the dataset).

    Returns
    -------
    pandas.DataFrame
        The input frame itself, with ``vehicle_hist`` added.
    """
    # Any value other than the exact string 'Yes' counts as "no damage".
    damage_flag = data['vehicle_damage'].map(lambda answer: 1 if answer == 'Yes' else 0)
    history_score = (damage_flag + 1) - data['previously_insured']

    # Position of the current last column; the new column goes right before it.
    insert_position = len(data.columns) - 1
    data.insert(loc=insert_position, column='vehicle_hist', value=history_score)
    return data

In [None]:
def data_preparation(self, data):
    """Encode and scale the features and return the model's input columns.

    Works on a copy of ``data``; the caller's frame is not modified.
    Column names are lowercased first, then the fitted transformers stored
    on ``self`` are applied.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain (case-insensitively) ``id``, ``gender``, ``age``,
        ``region_code``, ``policy_sales_channel``, ``previously_insured``,
        ``vehicle_age``, ``vehicle_damage``, ``annual_premium``,
        ``vintage`` and ``vehicle_hist``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, age, policy_sales_channel, previously_insured,
        annual_premium, vintage, vehicle_hist, gender_Female, gender_Male``.
    """
    df9 = data.copy()

    # lowercase columns
    df9.columns = [column_name.lower() for column_name in df9.columns]

    # apply encoders and scalers
    # gender: "One-hot" encoder
    df9 = pd.get_dummies(df9, prefix='gender', columns=['gender'])
    # get_dummies only creates columns for categories present in the input,
    # so a batch with a single gender (e.g. a one-row request) would lack
    # one of the dummies and fail at the column selection below.
    for gender_dummy in ('gender_Female', 'gender_Male'):
        if gender_dummy not in df9.columns:
            df9[gender_dummy] = 0

    # age - MinMaxScaler
    df9['age'] = self.mms_age.transform(df9[['age']].values)

    # region_code - frequency encoding
    df9['region_code'] = df9['region_code'].map(self.region_freq)

    # policy_sales_channel - frequency encoding
    df9['policy_sales_channel'] = df9['policy_sales_channel'].map(self.channel_freq)

    # vehicle_age - LabelEncoder
    df9['vehicle_age'] = self.age_encoder.transform(df9['vehicle_age'])

    # vehicle_damage - LabelEncoder
    df9['vehicle_damage'] = self.damage_encoder.transform(df9['vehicle_damage'])

    # annual_premium - StandardScaler
    df9['annual_premium'] = self.ss_premium.transform(df9[['annual_premium']].values)

    # vintage - MinMaxScaler
    df9['vintage'] = self.mms_vintage.transform(df9[['vintage']].values)

    # select columns
    cols_selected_full = ['id', 'age', 'policy_sales_channel', 'previously_insured', 'annual_premium', 'vintage',
                          'vehicle_hist', 'gender_Female', 'gender_Male']
    return df9[cols_selected_full].copy()

In [None]:
def model_predict(trained_model, data_test):
    """Score ``data_test`` with ``trained_model`` and rank rows by score.

    Note: unlike the other functions in this section, this one takes no
    ``self`` argument.

    Parameters
    ----------
    trained_model : object
        Anything exposing ``predict_proba(X)`` that returns a 2-D array;
        column index 1 is used as the score (presumably the positive-class
        probability of a binary classifier - confirm for the model in use).
    data_test : pandas.DataFrame
        Prepared features plus an ``id`` column. Not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data_test`` with a ``score`` column, sorted by ``score``
        in descending order, with a fresh 0..n-1 index.
    """
    # The identifier is not a feature, so keep it out of the model input.
    model_input = data_test.drop(columns=['id']).copy()

    # Keep only column 1 of the probability matrix, as a plain list.
    scores = trained_model.predict_proba(model_input)[:, 1].tolist()

    ranked = data_test.copy()
    ranked['score'] = scores
    return ranked.sort_values('score', ascending=False).reset_index(drop=True)