In [38]:
from src.clients.spark_client import get_spark_session
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession


# Obtain the Spark session from the project's client helper.
spark = get_spark_session()

# Load the sample CSV: semicolon-delimited, first row is the header, and column
# types are inferred by Spark.
base_dataframe = (
    spark
    .read
    .option('delimiter', ';')
    .option('header', 'true')
    .option('inferSchema', 'true')
    # This option key was previously misspelled ('enconding'); Spark ignores
    # unknown reader options, so the file was actually read with the default
    # charset (UTF-8). The charset that was effectively in use is now explicit.
    # NOTE(review): the recorded cell output shows the accented headers being
    # renamed successfully, which indicates the file decodes correctly as UTF-8.
    # Confirm the real file encoding before switching this to 'ISO-8859-1'.
    .option('encoding', 'UTF-8')
    .csv('./data/amostragem.csv')
)

# Map the original (Portuguese) CSV headers to short snake_case column names.
df = base_dataframe.withColumnsRenamed(
    {
        'Nome da Tarefa': 'nome',
        'Data de Criação': 'data',
        'Data de Conclusão': 'data_conclusao',
        'Status': 'status',
        'Tipo da Tarefa': 'tipo_tarefa',
        'ID do Usuário': 'user_id',
        'Usuário': 'usuario',
    }
)

# Store status and both date columns as strings, whatever type inferSchema chose.
df = df.withColumn('status', df['status'].cast(StringType()))
df = df.withColumn('data', df['data'].cast(StringType()))
df = df.withColumn('data_conclusao', df['data_conclusao'].cast(StringType()))

# Keep only the rows of a single user whose status is different from 3.
df = (
  df
  .filter((df['usuario'] == 'Jeferson Klau') & (df['status'] != 3))
)



In [39]:

from pyspark.sql.functions import when, date_format, lit, col, concat, udf
import uuid

def generate_uuid():
  """Return a newly generated random (version 4) UUID in its string form."""
  fresh_id = uuid.uuid4()
  return f'{fresh_id}'

# generate_uuid returns a different value on every call, so the UDF is flagged
# as non-deterministic. Spark treats UDFs as deterministic by default and may
# then eliminate duplicate invocations or invoke the function more often than
# it appears in the query.
# NOTE(review): `df` is not cached, so every action (show, toPandas, ...)
# re-evaluates the plan and generates fresh SK values; the UUIDs displayed by
# `df.show()` are presumably not the ones a later action sees.
uuid_udf = udf(generate_uuid, StringType()).asNondeterministic()
user_id = '035c6ada-4091-703a-1837-677cad18d4a5'

# status 1 becomes 'TODO'; every other value becomes 'DONE'.
df = df.withColumn('status', when(df['status'] == 1, 'TODO').otherwise('DONE'))

# Replace the one known source user id with `user_id`. Rows with any other id
# keep their original value; without `.otherwise` they would become NULL.
df = df.withColumn(
  'user_id',
  when(df['user_id'] == 'b4853fc1f03a3a4cec530a98a94d89ad', user_id).otherwise(df['user_id'])
)

# PK is 'LIST#' followed by the creation date formatted as yyyyMMdd.
df = df.withColumn('PK', concat(lit('LIST#'), date_format(col('data'), 'yyyyMMdd')))
# SK is 'ITEM#' followed by a random UUID generated per row.
df = df.withColumn('SK', concat(lit('ITEM#'), uuid_udf()))

# Put the key columns first and fix the final column order.
df = df.select(df.PK, df.SK, df.user_id, df.usuario, df.nome, df.data, df.data_conclusao, df.status, df.tipo_tarefa)

df.show()

+-------------+--------------------+--------------------+-------------+--------------------+-------------------+-------------------+------+------------------+
|           PK|                  SK|             user_id|      usuario|                nome|               data|     data_conclusao|status|       tipo_tarefa|
+-------------+--------------------+--------------------+-------------+--------------------+-------------------+-------------------+------+------------------+
|LIST#20240718|ITEM#da8e30c3-ea2...|035c6ada-4091-703...|Jeferson Klau|Cum iure exercita...|2024-07-18 22:51:51|2025-02-12 22:51:51|  DONE|Tarefa a Ser Feita|
|LIST#20250529|ITEM#256acc7e-304...|035c6ada-4091-703...|Jeferson Klau|    Asperiores iusto|2025-05-29 09:07:49|               NULL|  TODO|Tarefa a Ser Feita|
|LIST#20230912|ITEM#f9938eba-1cf...|035c6ada-4091-703...|Jeferson Klau|Asperiores pariat...|2023-09-12 19:56:45|               NULL|  TODO|Tarefa a Ser Feita|
|LIST#20241127|ITEM#feb11093-c07...|035c6ada-4

## Enviando os dados para a tabela do DynamoDB

In [40]:
from src.clients.dynamodb_client import DynamoDBClient

# Resolve the target table through the project's DynamoDB client.
dynamodb = DynamoDBClient()
table = dynamodb.get_table('teste')

# Collect the Spark DataFrame as a pandas DataFrame so its rows can be
# iterated in plain Python.
pandas_df = df.toPandas()

# Put one item per row; each row is converted to a plain dict keyed by column name.
with table.batch_writer() as writer:
  for _, record in pandas_df.iterrows():
    writer.put_item(Item=record.to_dict())

print('inserido no dynamodb')

inserido no dynamodb


In [None]:
from boto3.dynamodb.conditions import Attr, Key
from pyspark.sql.types import StringType, StructField, StructType

# Query the 'user_id-PK-index' index for this user's 'LIST#' items that are
# still open (status 'TODO').
# `status` is not used in the key condition, so the filter is built with Attr;
# Key is kept for the two attributes of the key condition.
query_kwargs = {
    'IndexName': 'user_id-PK-index',
    'KeyConditionExpression': Key('user_id').eq(user_id) & Key('PK').begins_with('LIST#'),
    'FilterExpression': Attr('status').eq('TODO'),
}

# A single Query call returns only one page of results, and the filter is
# applied after the page is read. Keep requesting pages until the response no
# longer carries a LastEvaluatedKey, otherwise matching items are silently lost.
items = []
while True:
    response = table.query(**query_kwargs)
    items.extend(response['Items'])
    last_evaluated_key = response.get('LastEvaluatedKey')
    if not last_evaluated_key:
        break
    query_kwargs['ExclusiveStartKey'] = last_evaluated_key

# Explicit schema: every column is a string and only data_conclusao is nullable.
schema = StructType([
    StructField('PK', StringType(), False),
    StructField('SK', StringType(), False),
    StructField('user_id', StringType(), False),
    StructField('usuario', StringType(), False),
    StructField('nome', StringType(), False),
    StructField('data', StringType(), False),
    StructField('data_conclusao', StringType(), True),
    StructField('status', StringType(), False),
    StructField('tipo_tarefa', StringType(), False)
])

# Build a Spark DataFrame from the returned items using the schema above.
dynamodb_dataframe = spark.createDataFrame(items, schema=schema)

dynamodb_dataframe.show()


+-------------+--------------------+--------------------+-------------+--------------------+-------------------+--------------+------+------------------+
|           PK|                  SK|             user_id|      usuario|                nome|               data|data_conclusao|status|       tipo_tarefa|
+-------------+--------------------+--------------------+-------------+--------------------+-------------------+--------------+------+------------------+
|LIST#20230827|ITEM#e6c62e96-f06...|035c6ada-4091-703...|Jeferson Klau|    Laboriosam dolor|2023-08-27 09:30:37|          NULL|  TODO|Tarefa a Ser Feita|
|LIST#20230827|ITEM#ea5e8b60-daa...|035c6ada-4091-703...|Jeferson Klau|    Laboriosam dolor|2023-08-27 09:30:37|          NULL|  TODO|Tarefa a Ser Feita|
|LIST#20230827|ITEM#ecfebe7d-adb...|035c6ada-4091-703...|Jeferson Klau|    Laboriosam dolor|2023-08-27 09:30:37|          NULL|  TODO|Tarefa a Ser Feita|
|LIST#20230912|ITEM#e2653dec-c01...|035c6ada-4091-703...|Jeferson Klau|Asper