# Tratamento da base de dados

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import io

In [None]:
# Load the credit-card dataset; header = 1 takes the column names from the
# second line of the file, so the line before it is not used as data.
base_credit = pd.read_csv('credit_card_clients.csv', header = 1)
# Echo the DataFrame as the cell output.
base_credit

In [None]:
# Open a large (figsize 30x30) figure so the annotated cells stay readable.
figura = plt.figure(figsize=(30,30))
# Heatmap of the pairwise column correlations, with the values written in
# each cell; the trailing semicolon suppresses the notebook echo.
sns.heatmap(base_credit.corr(), annot=True);

In [None]:
# Show the column labels of the loaded dataset.
base_credit.columns

In [None]:
# Feature matrix: every column except the first one, as a NumPy array.
# NOTE(review): the dropped first column is presumably a row identifier — confirm.
X = base_credit.iloc[:, 1:].values
X

In [None]:
# Show (rows, columns) of the feature matrix.
X.shape

In [None]:
# Inspect the first row of the feature matrix.
X[0]

In [None]:
# Convert to the format accepted by SageMaker (32-bit floats).
X = np.array(X, dtype='float32')
# Inspect the first row after the cast.
X[0]

# Configurações do SageMaker

In [None]:
import sagemaker
import boto3
from sagemaker import Session

In [None]:
# SageMaker session later handed to the PCA estimator.
session = sagemaker.Session()

# IAM execution role assumed by the training jobs.
role = "arn:aws:iam::339712791663:role/service-role/AmazonSageMaker-ExecutionRole-20240702T173175"

# S3 bucket and the key fragments used for the dataset and the PCA model.
bucket = 'sagemaker-curso-bucket'
subpasta_dataset = 'datasets/credit-card'
subpasta_modelo = 'modelos/credit-card/pca'
key_train = 'credit-card-train-data-pca'

# Full S3 URIs derived from the pieces above.
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, subpasta_dataset, key_train)
output_location = 's3://{}/{}/output'.format(bucket, subpasta_modelo)

# Print the resolved settings for a quick visual check.
print('Role: ', role)
print('Localização da base de dados de treinamento: ', s3_train_data)
print('Modelo final será salvo em: ', output_location)

In [None]:
import sagemaker.amazon.common as smac
# Serialize the feature matrix into an in-memory binary buffer using the
# SDK's dense-tensor writer.
buffer = io.BytesIO()
smac.write_numpy_to_dense_tensor(buffer, X)
# Rewind so that a later read of the buffer starts at the first byte.
buffer.seek(0)

In [None]:
# Upload the serialized training data to S3.
# S3 object keys always use '/' as the separator, so the key is built with the
# same '/'-based formatting as s3_train_data rather than os.path.join, which
# would insert backslashes on Windows and upload to a different key.
boto3.resource('s3').Bucket(bucket).Object(
    '{}/train/{}'.format(subpasta_dataset, key_train)
).upload_fileobj(buffer)

# Treinamento do PCA

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/ecr-sa-east-1.html
# Look up the container image URI of the built-in 'pca' algorithm for the
# region configured in the default boto3 session.
container = sagemaker.image_uris.retrieve(framework = 'pca', region = boto3.Session().region_name)

In [None]:
# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
# Generic estimator wrapping the PCA container: one ml.c4.xlarge training
# instance, model artifacts written under output_location.
pca = sagemaker.estimator.Estimator(image_uri = container,
                                    role = role,
                                    instance_count = 1,
                                    instance_type = 'ml.c4.xlarge',
                                    output_path = output_location,
                                    sagemaker_session = session)

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/PCA-reference.html
# feature_dim is the input dimension (number of columns of X), not the number
# of rows, so it is derived from the training matrix instead of hard-coded.
pca.set_hyperparameters(feature_dim = X.shape[1],
                        num_components = 2, # number of principal components to compute
                        mini_batch_size = 200) # number of rows per mini-batch

In [None]:
# Show the S3 URI that will be used as the training channel.
s3_train_data

In [None]:
# Run the PCA training job with the uploaded S3 object as the 'train' channel.
pca.fit({'train': s3_train_data})

# Redução de dimensionalidade

In [None]:
# Deploy the trained PCA model on a single ml.m4.xlarge instance and keep the
# returned predictor for inference calls.
pca_predictor = pca.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer
# Send requests as CSV and parse the endpoint responses as JSON.
pca_predictor.serializer = CSVSerializer()
pca_predictor.deserializer = JSONDeserializer()

In [None]:
# Show the first row that will be sent to the endpoint as a smoke test.
X[0]

In [None]:
# Project a single row through the PCA endpoint and print the raw response.
X0_pca = pca_predictor.predict(X[0])
print(X0_pca)

In [None]:
# Project the whole feature matrix in one request.
# NOTE(review): the full matrix is sent as a single payload — confirm it stays
# within the endpoint's request size limit for larger datasets.
X_pca = pca_predictor.predict(X)

In [None]:
# Show the raw endpoint response.
X_pca

In [None]:
# Collect the 'projection' entry of every item under 'projections' in the
# response and stack them into one NumPy array.
X_pca = np.array([
    item['projection']
    for item in X_pca['projections']
])

In [None]:
# Show the projected data together with its shape.
X_pca, X_pca.shape

In [None]:
# Cast the projections to 32-bit floats before passing them to k-means.
X_pca = np.array(X_pca, dtype='float32')

# Treinamento do k-means

In [None]:
# S3 prefix (inside the same bucket) where the k-means model will be written.
subpasta_modelo_kmeans = 'modelos/credit-card/kmeans'
output_location_kmeans = 's3://{}/{}'.format(bucket, subpasta_modelo_kmeans)
# Print the resolved location for a quick visual check.
print('Modelo kmeans será salvo em: ', output_location_kmeans)

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/k-means.html
# https://docs.aws.amazon.com/sagemaker/latest/dg/k-means-api-config.html
from sagemaker import KMeans

In [None]:
# Built-in k-means estimator with k = 4 clusters on one ml.c4.xlarge instance.
# The keyword was previously misspelled as `spot_intances`, which the SDK does
# not recognise, so managed spot training was never actually requested
# (NOTE(review): the unknown keyword appears to be swallowed by **kwargs).
# The correct name is `use_spot_instances`; spot training also needs
# `max_wait`, which must be at least `max_run` (both in seconds).
kmeans = KMeans(role = role,
                instance_count = 1,
                instance_type = 'ml.c4.xlarge',
                output_path = output_location_kmeans,
                use_spot_instances = True,
                max_run = 3600,
                max_wait = 3600,
                k = 4)

In [None]:
# Train k-means on the PCA projections; record_set converts the array into
# the input object that the built-in estimator's fit expects.
kmeans.fit(kmeans.record_set(X_pca))

# Agrupamento com o k-means

In [None]:
# Deploy the trained k-means model on a single ml.c4.xlarge instance.
kmeans_predictor = kmeans.deploy(initial_instance_count = 1, instance_type = 'ml.c4.xlarge')

In [None]:
# Ask the k-means endpoint for the cluster assignment of every projected row.
resultados = kmeans_predictor.predict(X_pca)

In [None]:
# Show the raw prediction records.
resultados

In [None]:
# Pull the first value of the 'closest_cluster' float tensor out of each
# prediction record, giving one cluster label per input row.
rotulos = [
    registro.label['closest_cluster'].float32_tensor.values[0]
    for registro in resultados
]

In [None]:
# Print the first ten cluster labels.
print(rotulos[:10])

In [None]:
# Distinct cluster labels and how many rows fall into each one.
np.unique(rotulos, return_counts = True)

In [None]:
# Compare the DataFrame shape with the number of labels before attaching them.
base_credit.shape, len(rotulos)

In [None]:
# Attach the cluster label of each row as a new 'cluster' column.
base_credit['cluster'] = rotulos

In [None]:
# Show the DataFrame, now including the 'cluster' column.
base_credit

In [None]:
# Scatter LIMIT_BAL against BILL_AMT5, coloring the points by cluster.
# The palette lists four colors, one per cluster (k = 4).
plt.figure(figsize=(10,10))
sns.scatterplot(x = 'LIMIT_BAL', y = 'BILL_AMT5', data = base_credit, hue = 'cluster', palette = ['red', 'green', 'blue', 'orange']);

In [None]:
# Keep only the rows assigned to cluster 3 (labels are stored as floats).
base_credit_3 = base_credit.loc[base_credit['cluster'] == 3.0]
base_credit_3

In [None]:
# Save the cluster-3 rows to a CSV file in the working directory
# (the DataFrame index is written as well, since index is left at its default).
base_credit_3.to_csv('base_credit_3.csv')

In [None]:
# https://aws.amazon.com/pt/mxnet/
# https://aws.amazon.com/pt/mxnet/get-started/
import mxnet as mx

In [None]:
# Unpack the model archive from the working directory.
# subprocess.run with check=True raises CalledProcessError when tar fails
# (e.g. missing archive), whereas os.system only returned an exit status that
# was never inspected; the argument list also avoids going through a shell.
import subprocess
subprocess.run(['tar', '-zxvf', 'model.tar.gz'], check=True)

In [None]:
# Load the arrays stored in the extracted 'model_algo-1' file with MXNet.
# NOTE(review): presumably these are the k-means centroids — confirm.
centroides = mx.ndarray.load('model_algo-1')
centroides

In [None]:
# Tear down both inference endpoints (PCA and k-means) once the analysis is done.
pca_predictor.delete_endpoint()
kmeans_predictor.delete_endpoint()