In [1]:
import pandas as pd
import numpy as np

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, matthews_corrcoef
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [4]:
from scipy.stats import ks_2samp

In [5]:
# Load the modelling dataset from the CSV file into a DataFrame.
dados = pd.read_csv('base_tratada.csv')

In [6]:
# Preview the last rows of the frame (tail() defaults to five rows).
dados.tail()

Unnamed: 0,MAU,TPCLIENT_2,LOCALIZA2_1.0,LOCALIZA2_2.0,"TPEMPREG2_(32.0, 130.0]","TPEMPREG2_(130.0, 628.0]",SEXO_M,ESTCIVIL2_1.0,ESTCIVIL2_2.0,SITRESID_P,"LIMITE2_(31.0, 118.0]","LIMITE2_(118.0, 99865.0]","TEMPORES2_(4.0, 70.0]",CEP2_1.0,CEP2_2.0,PROFISSAO2_1.0,PROFISSAO2_2.0,"ANONASCI2_(50.0, 63.0]","ANONASCI2_(63.0, 74.0]","ANONASCI2_(74.0, 87.0]"
7315,1,0,0,1,1,0,1,1,0,1,1,0,1,1,0,0,1,0,1,0
7316,0,0,0,1,1,0,0,1,0,1,0,0,0,1,0,0,1,0,1,0
7317,0,0,0,1,0,1,1,1,0,1,1,0,0,1,0,1,0,1,0,0
7318,0,0,0,1,0,0,1,1,0,1,1,0,1,1,0,0,0,0,1,0
7319,1,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1


In [6]:
# Dimensions of the loaded frame as (rows, columns).
dados.shape

(7320, 20)

In [7]:
# Separate predictors from the target, then hold out 30% of the rows for testing.
# Columns at positions 1..19 are used as predictors; 'MAU' is the target column.
features = dados.iloc[:, 1:20]
target = dados['MAU']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Rescale every feature to the [0, 1] range (min-max scaling). The scaler is
# fitted on the training split only and then reused, unchanged, on the test split.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(X_train)
x_test_scaled = scaler.transform(X_test)

# Append a trailing axis of size 1 so each 2-D matrix (samples, features)
# becomes a 3-D tensor (samples, features, 1), the layout fed to Conv1D below.
x_train_reshaped = x_train_scaled[:, :, np.newaxis]
x_test_reshaped = x_test_scaled[:, :, np.newaxis]

In [415]:
# Seed the TensorFlow and NumPy random number generators.
tf.random.set_seed(42)
np.random.seed(42)

# Baseline 1-D CNN for binary classification, assembled in a single call:
# convolution -> pooling -> flatten -> dense hidden layer -> sigmoid output.
n_features = x_train_scaled.shape[1]
model = Sequential([
    # Each sample is a sequence of `n_features` steps with a single channel.
    Conv1D(filters=64, kernel_size=5, activation='relu', input_shape=(n_features, 1)),
    # Max-pooling layer (a window of 1 keeps every step).
    MaxPooling1D(pool_size=1),
    # Collapse the (steps, channels) output into one vector for the dense layers.
    Flatten(),
    # Fully connected hidden layer followed by a single sigmoid output unit.
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [416]:
# Compile with the Adam optimiser and binary cross-entropy loss, tracking
# accuracy and AUC during training.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'AUC'])

# Train for 10 epochs with mini-batches of 32. The validation inputs go through
# the same preprocessing as the training inputs (scaled and reshaped to 3-D),
# so x_test_reshaped is used rather than the raw X_test frame.
# NOTE(review): the hold-out test split doubles as validation data here, so the
# per-epoch validation metrics are not independent of the final test metrics.
model.fit(
    x_train_reshaped,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(x_test_reshaped, y_test),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x18c8eca3a30>

In [417]:
# Score the test split with the trained network; the output is flattened with
# reshape(-1) wherever a 1-D score vector is needed.
predictions = model.predict(x_test_reshaped)



In [418]:
# ROC AUC of the flattened predicted scores on the test split.
roc_auc_score(y_true=y_test, y_score=predictions.ravel())

0.7002588637683353

In [422]:
# Binarise the predicted scores: 1 when the score exceeds 0.30, otherwise 0.
# The scores are cast to float64 first so the threshold is compared at full
# precision, and the labels are stored as int64.
scores_f64 = predictions.reshape(-1).astype('float64')
y_preds_bin = pd.Series((scores_f64 > 0.30).astype('int64'))

In [423]:
# Matthews correlation coefficient between the true labels and the thresholded predictions.
matthews_corrcoef(y_true=y_test, y_pred=y_preds_bin)

0.2902943698805618

In [419]:
# Two-sample Kolmogorov-Smirnov test comparing the predicted-score distributions
# of the rows labelled 0 against the rows labelled 1 in the test split.
flat_scores = predictions.reshape(-1)
scores_class0 = flat_scores[y_test == 0]
scores_class1 = flat_scores[y_test == 1]
ks_2samp(scores_class0, scores_class1)

KstestResult(statistic=0.3168775392105402, pvalue=3.9474616684155413e-41, statistic_location=0.30024636, statistic_sign=1)

---
## Testando outras funções de perda e fazendo Grid Search.

In [27]:
def create_cnn_model(filters=64, kernel_size=5, pool_size=1, dense_units=64):
    """Build and compile a 1-D CNN binary classifier.

    Args:
        filters: Number of filters in the Conv1D layer.
        kernel_size: Width of the convolution window.
        pool_size: Window size of the MaxPooling1D layer.
        dense_units: Number of units in the hidden Dense layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimiser, binary cross-entropy
        loss, accuracy metric) whose last layer is a single sigmoid unit.

    Note:
        The input length is read from the notebook-level ``x_train_scaled``
        array, which must be defined before this function is called.
    """
    input_dims = (x_train_scaled.shape[1], 1)
    stack = [
        Conv1D(filters, kernel_size, activation='relu', input_shape=input_dims),
        MaxPooling1D(pool_size=pool_size),
        Flatten(),
        Dense(dense_units, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(stack)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Seed TensorFlow and NumPy, as the baseline-model cell does, so repeated runs
# of the search start from the same random state.
tf.random.set_seed(42)
np.random.seed(42)

# Wrap the model factory so scikit-learn can treat it as an estimator; every
# candidate is trained for 10 epochs with mini-batches of 32 and no log output.
cnn = KerasClassifier(build_fn=create_cnn_model, epochs=10, batch_size=32, verbose=0)

# Candidate values for each argument of create_cnn_model
# (2 * 2 * 2 * 2 = 16 combinations).
param_grid = {
    'filters': [64, 128],
    'kernel_size': [3, 5],
    'pool_size': [1, 2],
    'dense_units': [64, 128]
}

# Exhaustive search with 5-fold cross-validation. No `scoring` is passed, so
# candidates are ranked by the estimator's own score method.
grid_search = GridSearchCV(estimator=cnn, param_grid=param_grid, cv=5)

# Fit the search on the training split only.
grid_search.fit(x_train_reshaped, y_train)

# Report the winning combination and its mean cross-validated score.
print("Best parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

  cnn = KerasClassifier(build_fn=create_cnn_model, epochs=10, batch_size=32, verbose=0)


Best parameters found:  {'dense_units': 64, 'filters': 128, 'kernel_size': 3, 'pool_size': 2}
Best accuracy found:  0.707062304019928


In [18]:
# Inspect the parameters of the best estimator selected by the grid search
# (training settings, chosen hyper-parameters and the model factory).
grid_search.best_estimator_.get_params()

{'epochs': 10,
 'batch_size': 32,
 'verbose': 0,
 'dense_units': 64,
 'filters': 64,
 'kernel_size': 3,
 'pool_size': 1,
 'build_fn': <function __main__.create_cnn_model(filters=64, kernel_size=5, pool_size=1, dense_units=64)>}

In [31]:
# Probability estimates for the test split from the best model found by the grid search.
best_cnn = grid_search.best_estimator_
predictions2 = best_cnn.predict_proba(x_test_reshaped)



In [38]:
# ROC AUC on the test split, using column 1 of the probability matrix as the score.
roc_auc_score(y_true=y_test, y_score=predictions2[:, 1])

0.6865248410612319

In [35]:
# Display column 1 of the probability matrix (the scores used for the AUC above).
predictions2[:,1]

array([0.20812996, 0.3836525 , 0.3815777 , ..., 0.26650733, 0.4005829 ,
       0.17777367], dtype=float32)