In [0]:
access_key = 'xxxxxxx'
secret_key = 'xxx+xxxxxxxxxxxxxxxxx'
aws_region = "us-east-2"

In [0]:
import time
import pandas as pd

# Configurar acesso ao S3
import boto3
from io import StringIO

In [0]:
# Inicializar o cliente S3
s3_client = boto3.client(
    's3',
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key,
    region_name=aws_region
)


In [0]:
# Ler o arquivo CSV diretamente do S3
bucket_name = "financial-dataset-ada"
file_key = "cards_data.csv"
response = s3_client.get_object(Bucket=bucket_name, Key=file_key)
file_content = response["Body"].read().decode("utf-8")

In [0]:
def read_file(file_content):
    df = pd.read_csv(StringIO(file_content))
    return df

In [0]:
# Função para medir o tempo de execução
def measure_time(func, *args):
    start_time = time.time()
    result = func(*args)
    end_time = time.time()
    print(f"Tempo de execução para {func.__name__}: {end_time - start_time:.4f} segundos")
    return result


In [0]:
# Função para corrigir tipos e substituir dados
def clean_dataframe(df):
    df["has_chip"] = df["has_chip"].replace({"YES": True, "NO": False})
    df["credit_limit"] = df["credit_limit"].str.replace("$", "").astype(int)
    df["expires"] = pd.to_datetime(df["expires"], format="%m/%Y", errors="coerce")
    df["acct_open_date"] = pd.to_datetime(df["acct_open_date"], format="%m/%Y", errors="coerce")
    return df

In [0]:

# Função para ordenar o DataFrame
def sort_dataframe(df):
    return df.sort_values(by="credit_limit", ascending=False)

In [0]:
# Função para agrupar o DataFrame
# Exemplo: Contar quantos cartões de cada marca existem
def group_dataframe(df):
    return df.groupby("card_brand").size().reset_index(name="count")

In [0]:
# Função para filtrar o DataFrame
# Exemplo: Filtrar cartões que estão na dark web e têm chip
def filter_dataframe(df):
    return df[(df["card_on_dark_web"] == "No") & (df["has_chip"] == True)]

In [0]:
def salvar_to_csv(df):
    df.to_csv('output.csv')

In [0]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6146 entries, 0 to 6145
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   id                     6146 non-null   int64         
 1   client_id              6146 non-null   int64         
 2   card_brand             6146 non-null   object        
 3   card_type              6146 non-null   object        
 4   card_number            6146 non-null   int64         
 5   expires                6146 non-null   datetime64[ns]
 6   cvv                    6146 non-null   int64         
 7   has_chip               6146 non-null   bool          
 8   num_cards_issued       6146 non-null   int64         
 9   credit_limit           6146 non-null   int64         
 10  acct_open_date         6146 non-null   datetime64[ns]
 11  year_pin_last_changed  6146 non-null   int64         
 12  card_on_dark_web       6146 non-null   object        
dtypes: 

In [0]:
# Aplicar as transformações com medição de tempo
df = measure_time(read_dataframe)
df_cleaned = measure_time(clean_dataframe, df)
df_sorted = measure_time(sort_dataframe, df_cleaned)
df_grouped = measure_time(group_dataframe, df_cleaned)
df_filtered = measure_time(filter_dataframe, df_cleaned)
df_save = measure_time(salvar_to_csv, df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6146 entries, 0 to 6145
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   id                     6146 non-null   int64 
 1   client_id              6146 non-null   int64 
 2   card_brand             6146 non-null   object
 3   card_type              6146 non-null   object
 4   card_number            6146 non-null   int64 
 5   expires                6146 non-null   object
 6   cvv                    6146 non-null   int64 
 7   has_chip               6146 non-null   object
 8   num_cards_issued       6146 non-null   int64 
 9   credit_limit           6146 non-null   object
 10  acct_open_date         6146 non-null   object
 11  year_pin_last_changed  6146 non-null   int64 
 12  card_on_dark_web       6146 non-null   object
dtypes: int64(6), object(7)
memory usage: 624.3+ KB
Tempo de execução para read_dataframe: 0.0229 segundos
Tempo de execução par

  df["credit_limit"] = df["credit_limit"].str.replace("$", "").astype(int)


Tempo de execução para salvar_to_csv: 0.1937 segundos


In [0]:
# Mostrar os resultados
print("Cleaned DataFrame:")
print(df_cleaned)

print("\nSorted DataFrame:")
print(df_sorted)

print("\nGrouped DataFrame:")
print(df_grouped)

print("\nFiltered DataFrame:")
print(df_filtered)


Cleaned DataFrame:
        id  client_id  card_brand        card_type       card_number  \
0     4524        825        Visa            Debit  4344676511950444   
1     2731        825        Visa            Debit  4956965974959986   
2     3701        825        Visa            Debit  4582313478255491   
3       42        825        Visa           Credit  4879494103069057   
4     4659        825  Mastercard  Debit (Prepaid)  5722874738736011   
...    ...        ...         ...              ...               ...   
6141  5361        185        Amex           Credit   300609782832003   
6142  2711        185        Visa           Credit  4718517475996018   
6143  1305       1007  Mastercard           Credit  5929512204765914   
6144   743       1110  Mastercard            Debit  5589768928167462   
6145  3199       1110        Visa           Credit  4994011318343994   

        expires  cvv  has_chip  num_cards_issued  credit_limit acct_open_date  \
0    2022-12-01  623      True     