# Programa com Funções

## Caminhos

In [1]:
import os

# Base folder: the current working directory of the kernel
path = os.getcwd()

# Sub-folders. os.path.join picks the separator of the running platform, and the
# trailing '' makes every path end with that separator, so file names can still
# be appended with plain string concatenation (e.g. pathaux + 'Functions.pickle').
pathin = os.path.join(path, 'Entrada', '')
pathfixo = os.path.join(path, 'Fixo', '')
pathout = os.path.join(path, 'Saida', '')
pathparcial = os.path.join(path, 'Parcial', '')
pathaux = os.path.join(path, 'Auxiliar', '')

## Pacotes

In [2]:
import dill
import pandas as pd
import numpy as np

import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

from time import gmtime, strftime

import re

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

import unidecode
from unicodedata import normalize

import pygtrie

## Função para remoção de acentos

In [3]:
def rem_acentos(x):
    """Return ``x`` with accents stripped and every non-ASCII character dropped.

    The string is decomposed with Unicode NFKD, so an accented letter becomes
    its base letter followed by combining marks; encoding to ASCII with
    ``'ignore'`` then discards the marks and anything else outside ASCII.
    """
    decomposed = normalize('NFKD', x)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')

## Função de Pré-processamento (mudar conforme a base de dados)

In [4]:
def PreProcess(dataset, y):
    """Clean the raw credit data and expand categorical columns into 0/1 indicators.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw data. It must contain 'id', the target column named by ``y`` and
        every source column listed in ``steps`` below; a missing column raises
        ``KeyError``. The frame passed in is not modified.
    y : str
        Name of the binary target column. Rows whose target is not exactly
        0 or 1 are dropped.

    Returns
    -------
    pandas.DataFrame
        'id', the target cast to int64, the numeric columns with negative
        values replaced by NaN, and one 0/1 indicator column per known category
        level (named ``<source column>_<suffix>``), in the order of ``steps``.
    """
    # One entry per source column, in processing/output order:
    #   (source column, category levels or None for numeric, returned?)
    # For categorical columns each level is (indicator suffix, normalized text);
    # the text is what the raw label looks like after clean_text() below.
    # Indicator names (including their historical spellings) are kept as-is
    # because they are the column names of the returned frame.
    # NOTE(review): 'credit_history' and 'age' are processed but not returned,
    # exactly as in the previous version of this function -- confirm that
    # leaving them out of the result is intentional.
    steps = [
        ('account_check_status', [
            ('No', 'no checking account'),
            ('Less0', '0 dm'),
            ('0To200', '0 200 dm'),
            ('200OrMore', '200 dm salary assignments for at least 1 year'),
        ], True),
        ('duration_in_month', None, True),
        ('credit_history', [
            ('Critical', 'critical account other credits existing not at this bank'),
            ('ExistingCreditPaidTillNow', 'existing credits paid back duly till now'),
            ('DelayInPast', 'delay in paying off in the past'),
            ('NoTakenOrAllPaid', 'no credits taken all credits paid back duly'),
            ('AllAtBankPaid', 'all credits at this bank paid back duly'),
        ], False),
        ('purpose', [
            ('DomesticAppliances', 'domestic appliances'),
            ('VacationOrNone', 'vacation does not exist'),
            ('RadioTelevision', 'radio television'),
            ('CarNew', 'car new'),
            ('CarUsed', 'car used'),
            ('Business', 'business'),
            ('Repairs', 'repairs'),
            ('Education', 'education'),
            ('Furniture', 'furniture equipment'),
            ('Retraining', 'retraining'),
        ], True),
        ('credit_amount', None, True),
        ('savings', [
            ('Unknown', 'unknown no savings account'),
            ('Under100', '100 dm'),
            ('100To500', '100 500 dm'),
            ('500To100', '500 1000 dm'),
            ('Over1000', '1000 dm'),
        ], True),
        ('present_emp_since', [
            ('Less1Y', '1 year'),
            ('1To4', '1 4 years'),
            ('4To7', '4 7 years'),
            ('7OrMore', '7 years'),
            ('Unemployed', 'unemployed'),
        ], True),
        ('installment_as_income_perc', None, True),
        ('personal_status_sex', [
            ('MaleSigle', 'male single'),
            ('MaleMarriedWidowed', 'male married widowed'),
            ('MaleDivorced', 'male divorced separated'),
            ('FemaleDivorcedMarried', 'female divorced separated married'),
        ], True),
        ('other_debtors', [
            ('None', 'none'),
            ('Guarantor', 'guarantor'),
            ('CoApplicant', 'co applicant'),
        ], True),
        ('present_res_since', None, True),
        ('property', [
            ('RealEstate', 'real estate'),
            ('BuilginSocietySavingsLifeInsurance',
             'if not a121 building society savings agreement life insurance'),
            ('UnknownOrNoProperty', 'unknown no property'),
            ('CarOrOther', 'if not a121 a122 car or other not in attribute 6'),
        ], True),
        ('age', None, False),
        ('other_installment_plans', [
            ('None', 'none'),
            ('Bank', 'bank'),
            ('Stores', 'stores'),
        ], True),
        ('housing', [
            ('Own', 'own'),
            ('ForFree', 'for free'),
            ('Rent', 'rent'),
        ], True),
        ('credits_this_bank', None, True),
        ('job', [
            ('SkilledEmployeeOrOfficial', 'skilled employee official'),
            ('UnskilledResident', 'unskilled resident'),
            ('ManagementOrSelfEmployedOrHighlyQualEmployeeOrOfficer',
             'management self employed highly qualified employee officer'),
            ('UnemployedOrUnskilledNonResident', 'unemployed unskilled non resident'),
        ], True),
        # NOTE(review): the value is compared as text, so a float-typed column
        # (1.0 -> '1 0') would match neither level -- verify the input dtype.
        ('people_under_maintenance', [
            ('1', '1'),
            ('2', '2'),
        ], True),
        ('telephone', [
            ('None', 'none'),
            ('Yes', 'yes registered under the customers name'),
        ], True),
        ('foreign_worker', [
            ('Yes', 'yes'),
            ('No', 'no'),
        ], True),
    ]

    def clean_text(value):
        # Lower-case, strip accents, turn every character outside [a-z0-9]
        # into a space, collapse runs of spaces and trim the ends.
        value = rem_acentos(value.lower())
        value = re.sub(r'[^a-z0-9]', ' ', value)
        return re.sub(r' +', ' ', value).strip()

    #### Target variable
    # Keep only rows whose target is exactly 0 or 1. Working on a copy leaves
    # the caller's frame untouched and avoids assigning into a filtered view.
    valid_target = (dataset[y] == 1) | (dataset[y] == 0)
    dataset = dataset.loc[valid_target].copy()
    dataset[y] = dataset[y].astype('int64')

    #### Features
    selected = ['id', y]
    for column, levels, returned in steps:
        if levels is None:
            # Numeric column: a negative value is treated as missing.
            dataset[column] = np.where(dataset[column] < 0, np.nan, dataset[column])
            produced = [column]
        else:
            # Categorical column: one indicator per known level; a label that
            # matches no level gets 0 in every indicator of that column.
            text = dataset[column].astype(str).apply(clean_text)
            produced = []
            for suffix, level in levels:
                indicator = column + '_' + suffix
                dataset[indicator] = np.where(text == level, 1, 0)
                produced.append(indicator)
        if returned:
            selected.extend(produced)

    return dataset[selected]

## Substituir Missing

In [5]:
#Mediana

def dicMedian(data, variables):
    """Build a dictionary with the median of each listed column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns named in ``variables``.
    variables : iterable of str
        Column names to summarize.

    Returns
    -------
    dict
        Maps ``'median_' + column`` to ``data[column].median()`` for every
        column in ``variables``; empty when ``variables`` is empty.
    """
    # A single comprehension replaces the former pair of consecutive `if`
    # tests, which computed and stored the first column's median twice.
    return {'median_' + v: data[v].median() for v in variables}

In [6]:
def RemoveNA(data, variables, dic):
    """Fill the missing values of each listed column with its stored median.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to fill. It is modified in place and also returned.
    variables : iterable of str
        Columns to fill.
    dic : dict
        Must hold a ``'median_' + column`` entry for every column in
        ``variables``; a missing entry raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with NaN replaced in the listed columns.
    """
    for x in variables:
        # NaN never compares equal to anything, itself included, so a test of
        # the form `== np.nan` matches no row; isna() is what finds the gaps.
        data[x] = np.where(data[x].isna(), dic['median_' + x], data[x])
    return data

## Scale das Variáveis

In [7]:
def dicMeanSd(data, variables):
    """Build a dictionary with the mean and standard deviation of each listed column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns named in ``variables``.
    variables : iterable of str
        Column names to summarize.

    Returns
    -------
    dict
        For every column, ``'mean_' + column`` -> ``data[column].mean()`` and
        ``'sd_' + column`` -> ``data[column].std()``, inserted in that order;
        empty when ``variables`` is empty.
    """
    dic = {}

    for v in variables:
        # Plain assignments replace the former pair of consecutive `if` tests,
        # which computed and stored the first column's mean twice.
        dic['mean_' + v] = data[v].mean()
        dic['sd_' + v] = data[v].std()

    return dic

In [8]:
def Scale(data, variables, dic):
    """Standardize the listed columns with stored means and standard deviations.

    Each column named in ``variables`` is overwritten in ``data`` with
    ``(value - dic['mean_' + column]) / dic['sd_' + column]``. The frame is
    modified in place and the same object is returned.
    """
    for column in variables:
        centered = data[column] - dic['mean_' + column]
        data[column] = centered / dic['sd_' + column]
    return data

## Salvando

In [9]:
# Create the auxiliary folder when it is missing; otherwise open() below fails
# with FileNotFoundError on a fresh working directory.
os.makedirs(pathaux, exist_ok=True)

# Serialize all helper functions as one tuple, in this fixed order, to a single file.
with open(pathaux + 'Functions.pickle', 'wb') as f:
    dill.dump((rem_acentos, PreProcess, dicMedian, RemoveNA, dicMeanSd, Scale), f)