In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import *

spark = SparkSession.builder.appName("lab4_2").getOrCreate()

In [0]:
data = [
    (1, '2011-01-01', 500), (1, '2011-01-15', 50), (1, '2011-01-22', 250),
    (1, '2011-01-24', 75), (1, '2011-01-26', 125), (1, '2011-01-28', 175),
    (2, '2011-01-01', 500), (2, '2011-01-15', 50), (2, '2011-01-22', 25),
    (2, '2011-01-23', 125), (2, '2011-01-26', 200), (2, '2011-01-29', 250),
    (3, '2011-01-01', 500), (3, '2011-01-15', 50), (3, '2011-01-22', 5000),
    (3, '2011-01-25', 550), (3, '2011-01-27', 95), (3, '2011-01-30', 2500)
]

df = spark.createDataFrame(data, ["AccountId", "TranDate", "TranAmt"])

display(df.limit(10))

AccountId,TranDate,TranAmt
1,2011-01-01,500
1,2011-01-15,50
1,2011-01-22,250
1,2011-01-24,75
1,2011-01-26,125
1,2011-01-28,175
2,2011-01-01,500
2,2011-01-15,50
2,2011-01-22,25
2,2011-01-23,125


In [0]:
data = [
    (1, 'George', 800),
    (2, 'Sam', 950),
    (3, 'Diane', 1100),
    (4, 'Nicholas', 1250),
    (5, 'Samuel', 1250),
    (6, 'Patricia', 1300),
    (7, 'Brian', 1500),
    (8, 'Thomas', 1600),
    (9, 'Fran', 2450),
    (10, 'Debbie', 2850),
    (11, 'Mark', 2975),
    (12, 'James', 3000),
    (13, 'Cynthia', 3000),
    (14, 'Christopher', 5000)
]

# Utworzenie DataFrame
df_logical = spark.createDataFrame(data, ["RowID", "FName", "Salary"])

# Wyświetlenie danych
display(df_logical.limit(10))

RowID,FName,Salary
1,George,800
2,Sam,950
3,Diane,1100
4,Nicholas,1250
5,Samuel,1250
6,Patricia,1300
7,Brian,1500
8,Thomas,1600
9,Fran,2450
10,Debbie,2850


In [0]:
windowSpec = Window.partitionBy("AccountId").orderBy("TranDate")

df_with_running_total = df.withColumn(
    "RunTotalAmt", 
    sum("TranAmt").over(windowSpec)
)

result = df_with_running_total.orderBy("AccountId", "TranDate")

display(result.limit(10))

AccountId,TranDate,TranAmt,RunTotalAmt
1,2011-01-01,500,500
1,2011-01-15,50,550
1,2011-01-22,250,800
1,2011-01-24,75,875
1,2011-01-26,125,1000
1,2011-01-28,175,1175
2,2011-01-01,500,500
2,2011-01-15,50,550
2,2011-01-22,25,575
2,2011-01-23,125,700


In [0]:
result = df.withColumn("RunAvg", avg("TranAmt").over(windowSpec)) \
           .withColumn("RunTranQty", count("*").over(windowSpec)) \
           .withColumn("RunSmallAmt", min("TranAmt").over(windowSpec)) \
           .withColumn("RunLargeAmt", max("TranAmt").over(windowSpec)) \
           .withColumn("RunTotalAmt", sum("TranAmt").over(windowSpec)) \
           .orderBy("AccountId", "TranDate")

display(result.limit(10))

AccountId,TranDate,TranAmt,RunAvg,RunTranQty,RunSmallAmt,RunLargeAmt,RunTotalAmt
1,2011-01-01,500,500.0,1,500,500,500
1,2011-01-15,50,275.0,2,50,500,550
1,2011-01-22,250,266.6666666666667,3,50,500,800
1,2011-01-24,75,218.75,4,50,500,875
1,2011-01-26,125,200.0,5,50,500,1000
1,2011-01-28,175,195.83333333333331,6,50,500,1175
2,2011-01-01,500,500.0,1,500,500,500
2,2011-01-15,50,275.0,2,50,500,550
2,2011-01-22,25,191.66666666666663,3,25,500,575
2,2011-01-23,125,175.0,4,25,500,700


In [0]:
sliding_window = windowSpec.rowsBetween(-2, Window.currentRow)

result = df.withColumn("SlideAvg", avg("TranAmt").over(sliding_window)) \
           .withColumn("SlideQty", count("*").over(sliding_window)) \
           .withColumn("SlideMin", min("TranAmt").over(sliding_window)) \
           .withColumn("SlideMax", max("TranAmt").over(sliding_window)) \
           .withColumn("SlideTotal", sum("TranAmt").over(sliding_window)) \
           .withColumn("RN", row_number().over(windowSpec)) \
           .orderBy("AccountId", "TranDate", "RN")

display(result.limit(10))

AccountId,TranDate,TranAmt,SlideAvg,SlideQty,SlideMin,SlideMax,SlideTotal,RN
1,2011-01-01,500,500.0,1,500,500,500,1
1,2011-01-15,50,275.0,2,50,500,550,2
1,2011-01-22,250,266.6666666666667,3,50,500,800,3
1,2011-01-24,75,125.0,3,50,250,375,4
1,2011-01-26,125,150.0,3,75,250,450,5
1,2011-01-28,175,125.0,3,75,175,375,6
2,2011-01-01,500,500.0,1,500,500,500,1
2,2011-01-15,50,275.0,2,50,500,550,2
2,2011-01-22,25,191.66666666666663,3,25,500,575,3
2,2011-01-23,125,66.66666666666667,3,25,125,200,4


In [0]:
rows_window = Window.orderBy("Salary").rowsBetween(Window.unboundedPreceding, Window.currentRow)

range_window = Window.orderBy("Salary").rangeBetween(Window.unboundedPreceding, Window.currentRow)

result = df_logical.withColumn("SumByRows", sum("Salary").over(rows_window)) \
           .withColumn("SumByRange", sum("Salary").over(range_window)) \
           .orderBy("RowID")

display(result.limit(10))

RowID,FName,Salary,SumByRows,SumByRange
1,George,800,800,800
2,Sam,950,1750,1750
3,Diane,1100,2850,2850
4,Nicholas,1250,4100,5350
5,Samuel,1250,5350,5350
6,Patricia,1300,6650,6650
7,Brian,1500,8150,8150
8,Thomas,1600,9750,9750
9,Fran,2450,12200,12200
10,Debbie,2850,15050,15050


In [0]:
#użycie funkcji okienkowych
# z nieznanego mi powodu wyskakuje NameError: name 'last_value' is not defined, to samo dla first_value
window_spec = Window.partitionBy("Salary").orderBy("RowID")

window_spec_ordered = Window.orderBy("Salary")
#.withColumn("FirstRowID", first_value("RowID").over(window_spec_ordered)) \
#.withColumn("LastRowID", last_value("RowID").over(window_spec_ordered)) \
result = df_logical.withColumn("NextRowID", lead("RowID").over(window_spec)) \
           .withColumn("PrevRowID", lag("RowID").over(window_spec)) \
           .withColumn("RowNum", row_number().over(window_spec_ordered)) \
           .orderBy("RowID")

display(result.limit(10))

RowID,FName,Salary,NextRowID,PrevRowID,RowNum
1,George,800,,,1
2,Sam,950,,,2
3,Diane,1100,,,3
4,Nicholas,1250,5.0,,4
5,Samuel,1250,,4.0,5
6,Patricia,1300,,,6
7,Brian,1500,,,7
8,Thomas,1600,,,8
9,Fran,2450,,,9
10,Debbie,2850,,,10
