# Análise de dados

In [None]:
import pandas as pd

In [None]:
# Load the application data; the path is relative to the notebook's working directory.
df = pd.read_csv("../datasets/application_data.csv")

In [None]:
# Summarize the frame: column names, dtypes and non-null counts.
df.info()

## Origem dos dados
https://www.kaggle.com/c/home-credit-default-risk

## Dicionário do dataset

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Preview only the text (object-dtype) columns.
df.select_dtypes(include="object").head()

## Retirar variáveis que não pertencem ao estudo de caso (drop, axis=1)

In [None]:
# The row identifier carries no predictive information, so remove that column.
df = df.drop(columns=["SK_ID_CURR"])

## Identificar variáveis nulas e preencher com a média da categoria (NaN)

In [None]:
# Count missing values in each text (object-dtype) column.
df.select_dtypes(include="object").isnull().sum()

In [None]:
# Back-fill missing values in the text columns: each NaN takes the next valid
# value further down the same column.
# The result is assigned back to the frame because calling
# fillna(..., inplace=True) on a selected column is chained assignment, which
# may not modify `df` (and never does under pandas Copy-on-Write).
# NOTE: a back-fill cannot fill trailing NaNs (rows with no later valid value).
for col in [
    "NAME_TYPE_SUITE",
    "OCCUPATION_TYPE",
    "FONDKAPREMONT_MODE",
    "HOUSETYPE_MODE",
    "WALLSMATERIAL_MODE",
    "EMERGENCYSTATE_MODE",
]:
    df[col] = df[col].bfill()

In [None]:
# Any FONDKAPREMONT_MODE value that is still missing gets a fixed category.
# Assigned back instead of using inplace=True on the selected column, which is
# chained assignment and may leave `df` unchanged.
df["FONDKAPREMONT_MODE"] = df["FONDKAPREMONT_MODE"].fillna("reg oper account")

In [None]:
# df.dropna(axis=1)

In [None]:
# Count missing values in each int64 column.
df.select_dtypes(include="int64").isnull().sum()

In [None]:
# Count missing values in each float64 column.
df.select_dtypes(include="float64").isnull().sum()

In [None]:
# Replace missing values in the amount / credit-bureau columns with the mean of
# the respective column.
# The filled column is assigned back to the frame: fillna(..., inplace=True) on
# a selected column is chained assignment and may not modify `df`.
for col in [
    "AMT_ANNUITY",
    "AMT_GOODS_PRICE",
    "AMT_REQ_CREDIT_BUREAU_DAY",
    "AMT_REQ_CREDIT_BUREAU_WEEK",
    "AMT_REQ_CREDIT_BUREAU_MON",
    "AMT_REQ_CREDIT_BUREAU_QRT",
    "AMT_REQ_CREDIT_BUREAU_YEAR",
]:
    df[col] = df[col].fillna(df[col].mean())


In [None]:
# Fill every remaining numeric NaN with the mean of its column.
# numeric_only=True is required here: the frame still contains text columns,
# and DataFrame.mean() raises TypeError on them in pandas >= 2.0.
df.fillna(value=df.mean(numeric_only=True), inplace=True)

In [None]:
# Total number of missing values left in the whole frame.
df.isnull().sum().sum()

## Converter variáveis de texto/categóricas para numéricas/categóricas

In [None]:
# Re-check missing values in the text columns before encoding them.
df.select_dtypes(include="object").isnull().sum()

In [None]:
def convert_text_to_numetic(value, lstText):
    """Return the position of the first element of ``lstText`` equal to ``value``.

    Elements are compared with ``==`` in iteration order. ``None`` is returned
    when no element matches (including when ``lstText`` is empty).
    """
    matching_positions = (
        position for position, candidate in enumerate(lstText) if candidate == value
    )
    return next(matching_positions, None)


In [None]:
# Text columns to encode as integer codes.
lst_Coll = [
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "NAME_TYPE_SUITE",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "OCCUPATION_TYPE",
    "WEEKDAY_APPR_PROCESS_START",
    "ORGANIZATION_TYPE",
    "FONDKAPREMONT_MODE",
    "HOUSETYPE_MODE",
    "WALLSMATERIAL_MODE",
    "EMERGENCYSTATE_MODE",
]

# Encode each text column into a new "CAT_<column>" column holding the position
# of the value inside the column's unique() array (order of first appearance).
# A dict lookup through Series.map replaces the previous per-row linear search,
# which cost O(rows x categories) in pure Python for every column.
for col in lst_Coll:
    lstText = df[col].unique()
    # NaN is left out of the mapping so missing values stay missing in the
    # encoded column; it still occupies its slot in lstText, so the codes of
    # the other values are unchanged.
    code_by_text = {
        text: index for index, text in enumerate(lstText) if not pd.isna(text)
    }
    df["CAT_" + col] = df[col].map(code_by_text)

In [None]:
# Preview the frame with the new CAT_* columns appended.
df.head()

## Retirar registros que tenham variáveis com outliers (Escore Z)

In [None]:
# Column names of the float64 features.
# NOTE(review): name_columns is reassigned to the int64 columns in a later cell
# before the outlier loop runs, so this selection is only displayed.
name_columns = df.select_dtypes(include="float64").columns
# Distinct TARGET values, in order of first appearance.
des_target = df["TARGET"].unique()

In [None]:
# Display the selected float64 column names.
name_columns

In [None]:
# Column names of the int64 features.
# NOTE(review): this overwrites the float64 selection made earlier, so only
# int64 columns reach the outlier loop — confirm this is intended.
name_columns = df.select_dtypes(include="int64").columns

In [None]:
# Display the selected int64 column names.
name_columns

In [None]:
# Remove outliers column by column, separately for each TARGET class.
# A row is kept when its value lies within mean +/- 2.5 standard deviations of
# its own class. `df` is rebuilt after every column, so later columns are
# evaluated on the already-filtered data.
for variavel in name_columns:
    df_defaut = []
    # Iterate over the class labels themselves. The enumerate() position is not
    # a label: unique() returns labels in order of first appearance, so using
    # the position mislabels the printout and breaks for labels other than
    # 0..n-1.
    for target in des_target:
        dados = df[df["TARGET"] == target]
        print(f"Classe: {target} - variavel: {variavel}")

        data_mean, data_std = dados[variavel].mean(), dados[variavel].std()
        data_min, data_max = dados[variavel].min(), dados[variavel].max()
        print("Real Min: %.3f Real Max: %.3f" %  (data_min, data_max))

        cut_off = data_std * 2.5
        lower, upper = data_mean - cut_off, data_mean + cut_off
        print("Limit Min: %.3f Limit Max: %.3f" %  (lower, upper))

        # Keep the rows inside [lower, upper]; both bounds are inclusive.
        df_defaut.append(dados[dados[variavel].between(lower, upper)])

        # Outliers are the rows strictly below or strictly above the limits.
        below = dados[variavel] < lower
        above = dados[variavel] > upper
        print("Identfied outliers: %d \n" % (below.sum() + above.sum()))

    df = pd.concat(df_defaut)

In [None]:
import collections

In [None]:
# Number of rows per TARGET class after the outlier filter.
print(collections.Counter(df.TARGET))

## Balancear as categorias (SMOTE)

In [None]:
# Names of the remaining text (object-dtype) columns, to be dropped below.
name = df.select_dtypes(include="object").columns

In [None]:
# Display the text column names.
name

In [None]:
# Drop the original text columns; their CAT_* encoded counterparts remain.
df = df.drop(columns=name)


In [None]:
# Split the frame into NumPy arrays: X holds every column except TARGET,
# y holds the TARGET labels.
X = df.drop(columns="TARGET").to_numpy()
y = df["TARGET"].to_numpy()

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Oversample with SMOTE to balance the TARGET classes, then print the new
# class counts.
# A fixed random_state makes the synthetic samples reproducible across runs
# (5 matches the seed already used for the random forest in this notebook).
overSampling = SMOTE(random_state=5)
X, y = overSampling.fit_resample(X, y)
print(collections.Counter(y))

In [None]:
# Feature names for the resampled matrix: every column except TARGET, in frame
# order — the same selection X was built from. Dropping TARGET by name avoids
# relying on it being the first column of the frame.
name_coll = df.drop(columns="TARGET").columns
df_smote = pd.DataFrame(X, columns=name_coll)

In [None]:
# Preview the resampled features.
df_smote.head()

## Normalizar as variáveis (StandardScaler)

In [None]:
from sklearn import preprocessing

In [None]:
# Fit a StandardScaler on the resampled feature matrix.
scaler = preprocessing.StandardScaler().fit(X)

In [None]:
# Display the fitted scaler.
scaler

In [None]:
# Apply the fitted scaler to the same matrix it was fitted on.
X_scaled = scaler.transform(X)

In [None]:
# Wrap the scaled matrix in a DataFrame using the feature names in name_coll.
df_scaled = pd.DataFrame(X_scaled, columns=name_coll)

In [None]:
# Preview the scaled features.
df_scaled.head()

## Identificar as melhores relações de variáveis para a variável target (random forest)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import fetch_openml
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Columns handled as categorical by the model pipeline: the CAT_* encoded
# columns plus the FLAG_* columns.
categorical_columns = ['CAT_CODE_GENDER', 'CAT_FLAG_OWN_CAR', 'CAT_FLAG_OWN_REALTY',
       'CAT_NAME_TYPE_SUITE', 'CAT_NAME_INCOME_TYPE', 'CAT_NAME_EDUCATION_TYPE',
       'CAT_NAME_FAMILY_STATUS', 'CAT_NAME_HOUSING_TYPE', 'CAT_OCCUPATION_TYPE',
       'CAT_WEEKDAY_APPR_PROCESS_START', 'CAT_ORGANIZATION_TYPE', 'CAT_FONDKAPREMONT_MODE',
       'CAT_HOUSETYPE_MODE', 'CAT_WALLSMATERIAL_MODE', 'CAT_EMERGENCYSTATE_MODE', 'FLAG_DOCUMENT_2',
       'FLAG_DOCUMENT_3', 'FLAG_DOCUMENT_4', 'FLAG_DOCUMENT_5',
       'FLAG_DOCUMENT_6', 'FLAG_DOCUMENT_7', 'FLAG_DOCUMENT_8',
       'FLAG_DOCUMENT_9', 'FLAG_DOCUMENT_10', 'FLAG_DOCUMENT_11',
       'FLAG_DOCUMENT_12', 'FLAG_DOCUMENT_13', 'FLAG_DOCUMENT_14',
       'FLAG_DOCUMENT_15', 'FLAG_DOCUMENT_16', 'FLAG_DOCUMENT_17',
       'FLAG_DOCUMENT_18', 'FLAG_DOCUMENT_19', 'FLAG_DOCUMENT_20',
       'FLAG_DOCUMENT_21', 'FLAG_MOBIL', 'FLAG_EMP_PHONE', 'FLAG_WORK_PHONE',
       'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL']

In [None]:
# Mark the categorical features with the pandas "category" dtype.
# NOTE(review): df_scaled holds standardized values, and SMOTE may have produced
# non-integer codes for synthetic rows, so every distinct float becomes its own
# category here — confirm this is intended.
for col in categorical_columns:
    df_scaled[col] = df_scaled[col].astype("category")

In [None]:
# Columns handled as numeric and cast to int64 below; the float64 columns are
# appended to this list in a later cell.
numerical_columns =['CNT_CHILDREN', 'DAYS_BIRTH', 'DAYS_EMPLOYED',
       'DAYS_ID_PUBLISH', 'REGION_RATING_CLIENT',
       'REGION_RATING_CLIENT_W_CITY', 'HOUR_APPR_PROCESS_START',
       'REG_REGION_NOT_LIVE_REGION', 'REG_REGION_NOT_WORK_REGION',
       'LIVE_REGION_NOT_WORK_REGION', 'REG_CITY_NOT_LIVE_CITY',
       'REG_CITY_NOT_WORK_CITY', 'LIVE_CITY_NOT_WORK_CITY']

In [None]:
# Give the integer-valued features an int64 dtype so the float64 selection in
# the next cell does not pick them up a second time.
# The values are taken from df_smote (unscaled) and rounded: casting the
# standardized values in df_scaled with astype("int64") truncates the z-scores
# toward zero and discards most of the information in these columns. The
# pipeline's own StandardScaler step scales them again later.
# df_smote and df_scaled are both built from X with a default index, so the
# rows line up.
for col in numerical_columns:
    df_scaled[col] = df_smote[col].round().astype("int64")

In [None]:
# Names of the columns that still have the float64 dtype in df_scaled.
float_columns = df_scaled.select_dtypes(include="float64").columns

In [None]:
# Append the float64 columns to the numeric feature list.
# Names already in the list are skipped so that re-running this cell does not
# feed duplicated columns to the ColumnTransformer.
new_numeric_columns = [col for col in float_columns if col not in numerical_columns]
numerical_columns.extend(new_numeric_columns)

In [None]:
# Numeric features: impute missing values with the median, then standardize.
# NOTE(review): df_scaled was already standardized earlier in the notebook, so
# this scaler runs on scaled data — confirm the double scaling is intended.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', preprocessing.StandardScaler())])

# Categorical features: impute missing values with a constant, then one-hot encode.
# The 'one_hot' step name is looked up again when the feature names are built.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('one_hot', OneHotEncoder())])

# Route each column list to its transformer; output columns follow the order of
# the transformers list ('num' first, then 'cat').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ])

In [None]:
# Total number of missing values in the model input.
df_scaled.isnull().sum().sum()

In [None]:
# Full model: preprocessing followed by a 10-tree random forest with a fixed seed.
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier',  RandomForestClassifier(random_state=5, n_estimators=10))])
    
# Fit on the whole resampled data set; no train/test split is made here.
model = pipe.fit(df_scaled, y)

In [None]:
#!pip install eli5

In [None]:
# Build the feature-name list in the order the ColumnTransformer emits columns:
# numeric features first, then the one-hot expanded categorical features.
one_hot_step = pipe.named_steps['preprocessor'].named_transformers_['cat'].named_steps['one_hot']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(one_hot_step, "get_feature_names_out"):
    onehot_columns = list(one_hot_step.get_feature_names_out(categorical_columns))
else:
    onehot_columns = list(one_hot_step.get_feature_names(input_features=categorical_columns))
numeric_features_list = list(numerical_columns)
numeric_features_list.extend(onehot_columns)

In [None]:
import eli5

In [None]:
# Show the 60 highest-weighted features of the fitted random forest, labelled
# with the names assembled in numeric_features_list.
eli5.explain_weights(pipe.named_steps['classifier'], top=60, feature_names=numeric_features_list)

## Aplicar o PCA

In [None]:
#from sklearn.decomposition import PCA

In [None]:
#pca = PCA(n_components=2)
#pca.fit(X)
