In [1]:
from pushcart.sources.rest_api.request import batch_request
from pushcart.sources.rest_api.spark import request_dataframe
from pushcart.metadata import Metadata
from pyspark.sql import SparkSession

import pyspark.sql.functions as F


In [2]:
# Plain-Python call: fetch page 2 of the reqres.in demo "users" endpoint.
result = batch_request(
    method="GET",
    url="https://reqres.in/api/users",
    params={"page": 2},
)
# Bare last expression so the notebook renders the returned payload.
result

INFO:httpx:HTTP Request: GET https://reqres.in/api/users?page=2 "HTTP/1.1 200 OK"


[{'page': 2,
  'per_page': 6,
  'total': 12,
  'total_pages': 2,
  'data': [{'id': 7,
    'email': 'michael.lawson@reqres.in',
    'first_name': 'Michael',
    'last_name': 'Lawson',
    'avatar': 'https://reqres.in/img/faces/7-image.jpg'},
   {'id': 8,
    'email': 'lindsay.ferguson@reqres.in',
    'first_name': 'Lindsay',
    'last_name': 'Ferguson',
    'avatar': 'https://reqres.in/img/faces/8-image.jpg'},
   {'id': 9,
    'email': 'tobias.funke@reqres.in',
    'first_name': 'Tobias',
    'last_name': 'Funke',
    'avatar': 'https://reqres.in/img/faces/9-image.jpg'},
   {'id': 10,
    'email': 'byron.fields@reqres.in',
    'first_name': 'Byron',
    'last_name': 'Fields',
    'avatar': 'https://reqres.in/img/faces/10-image.jpg'},
   {'id': 11,
    'email': 'george.edwards@reqres.in',
    'first_name': 'George',
    'last_name': 'Edwards',
    'avatar': 'https://reqres.in/img/faces/11-image.jpg'},
   {'id': 12,
    'email': 'rachel.howell@reqres.in',
    'first_name': 'Rachel',
    '

In [3]:
# Reuse the active Spark session if there is one, otherwise start a new one
# with default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

23/07/28 11:24:51 WARN Utils: Your hostname, laptop resolves to a loopback address: 127.0.1.1; using 192.168.2.13 instead (on interface enp0s20f0u1u4u4)
23/07/28 11:24:51 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/28 11:24:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


23/07/28 11:25:02 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
# Spark DDL schema for the response payload, split across adjacent literals
# purely for readability; the pieces concatenate to the original one-line string.
data_schema = (
    "array<struct<"
    "per_page:int, total:int, total_pages:int, page:int, "
    "data:array<struct<"
    "last_name:string, avatar:string, id:int, first_name:string, email:string"
    ">>"
    ">>"
)

# Same request as the plain-Python call earlier in the notebook, but returned
# as a Spark DataFrame whose `result` column follows `data_schema`.
result_df = request_dataframe(
    method="GET",
    url="https://reqres.in/api/users",
    params={"page": 2},
    schema=data_schema,
)

# Print every cell in full (no column truncation).
result_df.show(truncate=False)

+---------------------------+-------+----+-----------+----+---------+------+-------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|url                        |headers|auth|params     |data|json_data|method|options|result                                                                                                                                                                                                                                                                                 

In [8]:
# Profile the response DataFrame with pushcart's Metadata helper.
# NOTE(review): infer_fraction=1.0 presumably means inference looks at every
# row rather than a sample — confirm against the pushcart documentation.
md = Metadata(result_df, infer_fraction=1.0)
# Bare last expression: the notebook displays what get_metadata() returns
# (rendered below as a DataGrid widget). Per the INFO log, this call attempts
# timestamp-format and JSON-schema inference on the url, data and method columns.
md.get_metadata()


INFO:pushcart.metadata:Attempting to infer timestamp format for url column.
INFO:pushcart.metadata:Attempting to infer JSON schema for url column.
INFO:pushcart.metadata:Attempting to infer timestamp format for data column.
INFO:pushcart.metadata:Attempting to infer JSON schema for data column.
INFO:pushcart.metadata:Attempting to infer timestamp format for method column.
INFO:pushcart.metadata:Attempting to infer JSON schema for method column.


DataGrid(auto_fit_columns=True, auto_fit_params={'area': 'all'}, corner_renderer=None, default_renderer=TextRe…

In [9]:
# Generate the PySpark transformation for the profiled DataFrame; the INFO log
# below prints the generated `df = (df.withColumn(...)...)` chain.
# NOTE(review): `code` presumably holds that same source text — verify the return type.
code = md.generate_code()

INFO:pushcart.metadata:Excluding technical columns: None
INFO:pushcart.metadata:
df = (df
	.withColumn("data", F.from_json(F.col("data"), schema="struct<>"))
	.withColumn("result", F.explode("result"))
	.withColumn("result_per_page", F.col("result.per_page").cast("int"))
	.withColumn("result_total", F.col("result.total").cast("int"))
	.withColumn("result_total_pages", F.col("result.total_pages").cast("int"))
	.withColumn("result_page", F.col("result.page").cast("int"))
	.withColumn("result_data", F.explode("result.data"))
	.withColumn("result_data_last_name", F.col("result_data.last_name").cast("string"))
	.withColumn("result_data_avatar", F.col("result_data.avatar").cast("string"))
	.withColumn("result_data_id", F.col("result_data.id").cast("int"))
	.withColumn("result_data_first_name", F.col("result_data.first_name").cast("string"))
	.withColumn("result_data_email", F.col("result_data.email").cast("string"))
	.select(['url', 'headers', 'auth', 'params', 'data', 'json_data', 'method',

In [10]:
# Apply the transformation printed by md.generate_code() to the request
# DataFrame: explode the nested response and lift each leaf into its own column.
df2 = (
    result_df
    # Parse the `data` column as JSON against an empty struct schema.
    .withColumn("data", F.from_json(F.col("data"), schema="struct<>"))
    # Stage 1: one row per element of the top-level `result` array,
    # then surface its scalar paging fields.
    .withColumn("result", F.explode(F.col("result")))
    .withColumn("result_per_page", F.col("result.per_page").cast("int"))
    .withColumn("result_total", F.col("result.total").cast("int"))
    .withColumn("result_total_pages", F.col("result.total_pages").cast("int"))
    .withColumn("result_page", F.col("result.page").cast("int"))
    # Stage 2: one row per entry of `result.data`, then surface that
    # entry's fields.
    .withColumn("result_data", F.explode(F.col("result.data")))
    .withColumn("result_data_last_name", F.col("result_data.last_name").cast("string"))
    .withColumn("result_data_avatar", F.col("result_data.avatar").cast("string"))
    .withColumn("result_data_id", F.col("result_data.id").cast("int"))
    .withColumn("result_data_first_name", F.col("result_data.first_name").cast("string"))
    .withColumn("result_data_email", F.col("result_data.email").cast("string"))
    # Fix the final column order: original request columns first, then the
    # flattened response columns.
    .select(
        "url",
        "headers",
        "auth",
        "params",
        "data",
        "json_data",
        "method",
        "options",
        "result",
        "result_per_page",
        "result_total",
        "result_total_pages",
        "result_page",
        "result_data",
        "result_data_last_name",
        "result_data_avatar",
        "result_data_id",
        "result_data_first_name",
        "result_data_email",
    )
)

# Print every cell in full (no column truncation).
df2.show(truncate=False)

                                                                                

+---------------------------+-------+----+-----------+----+---------+------+-------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+------------+------------------+-----------+-------------------------------------------------------------------------------------------+---------------------+----------------------------------------+--------------+----------------------+--------------------------+
|url                        |headers|auth|params     |data|json_data|method|options|r

                                                                                