In [0]:
from pyspark.sql.functions import struct, collect_list, to_json,map_from_entries,collect_list
from pyspark.sql.types import *

In [0]:
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("custname", StringType(), True),
    StructField("item", StringType(), True),
    StructField("quantity", IntegerType(), True)
])

# Sample data with repeating customers
data = [
    (1, 'John', 'ProductA', 5),
    (2, 'Alice', 'ProductB', 3),
    (3, 'Bob', 'ProductC', 8),
    (1, 'John', 'ProductD', 2),
    (2, 'Alice', 'ProductE', 6),
    (1, 'John', 'ProductF', 4)
]

# Create DataFrame using the defined schema and data
df = spark.createDataFrame(data, schema=schema)
df.display()

id,custname,item,quantity
1,John,ProductA,5
2,Alice,ProductB,3
3,Bob,ProductC,8
1,John,ProductD,2
2,Alice,ProductE,6
1,John,ProductF,4


In [0]:
df2 = df.groupBy("id", "custname").agg(map_from_entries(collect_list(struct("item", "quantity"))).alias("Purchases"))
df2.show(truncate=False)



+---+--------+---------------------------------------------+
|id |custname|Purchases                                    |
+---+--------+---------------------------------------------+
|1  |John    |{ProductA -> 5, ProductD -> 2, ProductF -> 4}|
|2  |Alice   |{ProductB -> 3, ProductE -> 6}               |
|3  |Bob     |{ProductC -> 8}                              |
+---+--------+---------------------------------------------+



In [0]:
my_format = [row.asDict(recursive=True) for row in df2.collect() ]
print(my_format)

[{'id': 1, 'custname': 'John', 'Purchases': {'ProductA': 5, 'ProductF': 4, 'ProductD': 2}}, {'id': 2, 'custname': 'Alice', 'Purchases': {'ProductE': 6, 'ProductB': 3}}, {'id': 3, 'custname': 'Bob', 'Purchases': {'ProductC': 8}}]


In [0]:
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("custname", StringType(), True),
    StructField("item", StringType(), True)
])

# Sample data with repeating customers
data = [
    (1, 'John', 'ProductA'),
    (2, 'Alice', 'ProductB'),
    (3, 'Bob', 'ProductC'),
    (1, 'John', 'ProductD'),
    (2, 'Alice', 'ProductE'),
    (1, 'John', 'ProductF')
]

# Create DataFrame using the defined schema and data
df = spark.createDataFrame(data, schema=schema)
df.display()
df2 = df.groupBy("id", "custname").agg(map_from_entries(collect_list(struct("item"))).alias("Purchases"))
df2.show(truncate=False)

id,custname,item
1,John,ProductA
2,Alice,ProductB
3,Bob,ProductC
1,John,ProductD
2,Alice,ProductE
1,John,ProductF


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-4382232317541909>:20[0m
[1;32m     18[0m df [38;5;241m=[39m spark[38;5;241m.[39mcreateDataFrame(data, schema[38;5;241m=[39mschema)
[1;32m     19[0m df[38;5;241m.[39mdisplay()
[0;32m---> 20[0m df2 [38;5;241m=[39m df[38;5;241m.[39mgroupBy([38;5;124m"[39m[38;5;124mid[39m[38;5;124m"[39m, [38;5;124m"[39m[38;5;124mcustname[39m[38;5;124m"[39m)[38;5;241m.[39magg(map_from_entries(collect_list(struct([38;5;124m"[39m[38;5;124mitem[39m[38;5;124m"[39m)))[38;5;241m.[39malias([38;5;124m"[39m[38;5;124mPurchases[39m[38;5;124m"[39m))
[1;32m     21[0m df2[38;5;241m.[39mshow(truncate[38;5;241m=[39m[38;5;28;01mFalse[39;00m)

File [0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:48[0m, in [0;36m_wrap_function.<locals>.wrapper[0;34m(*args, **kw

In [0]:
from pyspark.sql.functions import struct, collect_list, expr
data = [
    (1, 'John', 'ProductA', 5),
    (2, 'Alice', 'ProductB', 3),
    (3, 'Bob', 'ProductC', 8),
    (1, 'John', 'ProductD', 2),
    (2, 'Alice', 'ProductE', 6),
    (1, 'John', 'ProductF', 4)
]
schema = ["id", "custname", "item", "quantity"]
df = spark.createDataFrame(data, schema=schema)

In [0]:
# Group by 'custname' and aggregate purchases as a map
df2 = df.groupBy("id", "custname").agg(
    expr("map_from_entries(collect_list(struct(item, quantity)))").alias("Purchases"))
# Show the aggregated DataFrame
df2.show(truncate=False)

+---+--------+---------------------------------------------+
|id |custname|Purchases                                    |
+---+--------+---------------------------------------------+
|1  |John    |{ProductA -> 5, ProductD -> 2, ProductF -> 4}|
|2  |Alice   |{ProductB -> 3, ProductE -> 6}               |
|3  |Bob     |{ProductC -> 8}                              |
+---+--------+---------------------------------------------+

