In [1]:
from pyspark.sql import SparkSession
import requests
import json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, LongType
from pyspark.sql.functions import col, lit, when, avg, count

In [2]:
# Obtain a Spark session for this notebook; getOrCreate() reuses an already
# running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Fetch API Data with PySpark DataFrame")
    .getOrCreate()
)

24/12/05 18:27:05 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Endpoint that is queried for the user records loaded below.
api_url = "https://dummyjson.com/users"

### Fetch data from the given API URL and return it as a list of dictionaries.

In [4]:
def fetch_data(api_url, timeout=10):
    """
    Fetch the user records served at the given API URL.

    Args:
        api_url: URL the GET request is sent to.
        timeout: Number of seconds to wait for the server before giving up.
            It is forwarded to ``requests.get``. Defaults to 10.

    Returns:
        The value stored under the ``"users"`` key of the JSON response body,
        or an empty list when the response has no such key.

    Raises:
        Exception: If the response status code is not 200.
        requests.exceptions.RequestException: If the request itself fails,
            for example on a connection error or when ``timeout`` expires.
    """
    # requests waits indefinitely when no timeout is given, which would hang
    # the notebook cell if the server never answers.
    response = requests.get(api_url, timeout=timeout)

    if response.status_code != 200:
        raise Exception(f"Failed to fetch data. Status code: {response.status_code}")

    return response.json().get("users", [])

In [5]:
# Download the user records once; the cells below build the DataFrame from them.
data = fetch_data(api_url=api_url)

### Define schema explicitly

In [6]:
# Explicit schema for the columns kept from each user record; every field is
# nullable. "height" and "weight" are declared as strings here and are cast to
# float in a later cell.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("id", LongType()),
        ("firstName", StringType()),
        ("lastName", StringType()),
        ("age", IntegerType()),
        ("email", StringType()),
        ("gender", StringType()),
        ("phone", StringType()),
        ("height", StringType()),
        ("weight", StringType()),
        ("eyeColor", StringType()),
        ("birthDate", StringType()),
        ("bloodGroup", StringType()),
        ("image", StringType()),
    ]
])

### Create DataFrame using the explicit schema

In [7]:
# Build the DataFrame from the fetched records, applying the explicit schema.
df = spark.createDataFrame(data=data, schema=schema)
# Display the first rows without truncating long values such as e-mail
# addresses and image URLs.
df.show(truncate=False)

                                                                                

+---+---------+---------+---+---------------------------------+------+----------------+------+------+--------+----------+----------+-----------------------------------------+
|id |firstName|lastName |age|email                            |gender|phone           |height|weight|eyeColor|birthDate |bloodGroup|image                                    |
+---+---------+---------+---+---------------------------------+------+----------------+------+------+--------+----------+----------+-----------------------------------------+
|1  |Emily    |Johnson  |28 |emily.johnson@x.dummyjson.com    |female|+81 965-431-3024|193.24|63.16 |Green   |1996-5-30 |O-        |https://dummyjson.com/icon/emilys/128    |
|2  |Michael  |Williams |35 |michael.williams@x.dummyjson.com |male  |+49 258-627-6644|186.22|76.32 |Red     |1989-8-10 |B+        |https://dummyjson.com/icon/michaelw/128  |
|3  |Sophia   |Brown    |42 |sophia.brown@x.dummyjson.com     |female|+81 210-652-2785|177.72|52.6  |Hazel   |1982-11-6 |O-  

In [8]:
# Print the column names, types and nullability of the DataFrame to confirm
# that the explicit schema was applied.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- email: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- phone: string (nullable = true)
 |-- height: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- eyeColor: string (nullable = true)
 |-- birthDate: string (nullable = true)
 |-- bloodGroup: string (nullable = true)
 |-- image: string (nullable = true)



In [9]:
# Convert the string-typed measurement columns to float so they can be used in
# arithmetic and aggregations.
df = (
    df
    .withColumn("height", col("height").cast("float"))
    .withColumn("weight", col("weight").cast("float"))
)

In [10]:
# Re-display the rows and the schema to check the result of casting "height"
# and "weight" to float.
df.show(truncate=False)
df.printSchema()

+---+---------+---------+---+---------------------------------+------+----------------+------+------+--------+----------+----------+-----------------------------------------+
|id |firstName|lastName |age|email                            |gender|phone           |height|weight|eyeColor|birthDate |bloodGroup|image                                    |
+---+---------+---------+---+---------------------------------+------+----------------+------+------+--------+----------+----------+-----------------------------------------+
|1  |Emily    |Johnson  |28 |emily.johnson@x.dummyjson.com    |female|+81 965-431-3024|193.24|63.16 |Green   |1996-5-30 |O-        |https://dummyjson.com/icon/emilys/128    |
|2  |Michael  |Williams |35 |michael.williams@x.dummyjson.com |male  |+49 258-627-6644|186.22|76.32 |Red     |1989-8-10 |B+        |https://dummyjson.com/icon/michaelw/128  |
|3  |Sophia   |Brown    |42 |sophia.brown@x.dummyjson.com     |female|+81 210-652-2785|177.72|52.6  |Hazel   |1982-11-6 |O-  

In [11]:
# Show name, age and body measurements for users older than 30.
(
    df
    .select("firstName", "lastName", "age", "height", "weight")
    .where(col("age") > 30)
    .show()
)

+---------+---------+---+------+------+
|firstName| lastName|age|height|weight|
+---------+---------+---+------+------+
|  Michael| Williams| 35|186.22| 76.32|
|   Sophia|    Brown| 42|177.72|  52.6|
|    James|    Davis| 45|193.31|  62.1|
|Alexander|    Jones| 38|153.89| 77.42|
|    Ethan| Martinez| 33|159.19| 68.81|
| Isabella| Anderson| 31|150.56|  50.1|
|     Noah|Hernandez| 40|188.62| 69.49|
|Charlotte|    Lopez| 36|178.92| 82.46|
|  William| Gonzalez| 32|173.21| 82.41|
|   Evelyn|  Sanchez| 37|184.08| 83.15|
|    Logan|   Torres| 31|190.04| 72.43|
|  Jackson|    Evans| 34|162.57| 74.37|
|   Elijah|  Stewart| 33|195.33| 81.64|
|    Chloe|  Morales| 39|185.07| 63.97|
|   Evelyn| Gonzalez| 35|168.94| 58.47|
|   Daniel|     Cook| 41|186.21| 83.72|
|    Henry|     Hill| 38|180.25| 95.84|
|  Addison|   Wright| 32|179.32| 76.93|
+---------+---------+---+------+------+



In [12]:
# Number of users per gender.
gender_count = (
    df
    .groupBy("gender")
    .count()
)
gender_count.show()

+------+-----+
|gender|count|
+------+-----+
|female|   17|
|  male|   13|
+------+-----+



In [13]:
# Add a "BMI" column: weight / height^2, with height divided by 100 first
# (presumably converting centimetres to metres -- confirm the units of the API).
height_scaled = col("height") / 100
df = df.withColumn("BMI", col("weight") / (height_scaled ** 2))

In [14]:
# Per-gender averages of age and BMI.
avg_stats = (
    df
    .groupBy("gender")
    .agg(
        avg(col("age")).alias("avg_age"),
        avg(col("BMI")).alias("avg_BMI"),
    )
)

In [15]:
# Display the per-gender average age and average BMI computed above.
avg_stats.show()

+------+------------------+------------------+
|gender|           avg_age|           avg_BMI|
+------+------------------+------------------+
|female|30.470588235294116|20.474549256956323|
|  male| 35.30769230769231|24.908442895659554|
+------+------------------+------------------+



In [16]:
# Stop the Spark session now that the analysis is finished; cells that use
# `spark` or `df` must not be run after this point without recreating it.
spark.stop()