In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, DateType, StringType, FloatType
from pyspark.sql.functions import *
from pyspark.sql.functions import monotonically_increasing_id, row_number
from pyspark.sql.window import Window


spark = SparkSession.builder.master("local[1]")\
          .appName("task3")\
          .getOrCreate()

df_trans = spark.read.option("header",True) \
    .option("delimiter", ";")\
     .csv("work/transactions.csv")

In [2]:
df_trans = df_trans.withColumn("amount", df_trans["amount"].cast(FloatType()))

In [3]:
df_trans = df_trans.orderBy(df_trans.amount.asc())

In [4]:
df_trans.head(5)

[Row(id='449549', amount=-1000.0, account_type='Business', transaction_date='2011-02-24', country='BD'),
 Row(id='183734', amount=-1000.0, account_type='Professional', transaction_date='2021-11-04', country='GY'),
 Row(id='72842', amount=-1000.0, account_type='Business', transaction_date='2020-05-04', country='YE'),
 Row(id='324722', amount=-999.989990234375, account_type='Business', transaction_date='2018-11-09', country='QA'),
 Row(id='416224', amount=-999.989990234375, account_type='Personal', transaction_date='2021-03-23', country='AU')]

In [5]:
df_trans.tail(5)

[Row(id='259119', amount=9999.98046875, account_type='Personal', transaction_date='2017-03-11', country='UG'),
 Row(id='317820', amount=9999.98046875, account_type='Professional', transaction_date='2019-11-07', country='EE'),
 Row(id='303197', amount=9999.98046875, account_type='Personal', transaction_date='2016-11-11', country='UA'),
 Row(id='426326', amount=9999.990234375, account_type='Business', transaction_date='2015-08-07', country='NL'),
 Row(id='281332', amount=9999.990234375, account_type='Professional', transaction_date='2019-05-17', country='GT')]

In [6]:
class_list = []
df_count = df_trans.count()

for i in range(df_count):
    if i < df_count * 0.25:
        class_list.append("low")
    elif i < df_count * 0.75:
        class_list.append("average")
    else:
        class_list.append("high")

In [7]:
class_list[0]

'low'

In [8]:
class_list[df_count-1]

'high'

In [9]:
class_list[df_count//2]

'average'

In [10]:
classDf = spark.createDataFrame(class_list, "string").toDF("level")

In [11]:
df_trans = df_trans.withColumn('row_index', row_number().over(Window.orderBy(monotonically_increasing_id())))
classDf = classDf.withColumn('row_index', row_number().over(Window.orderBy(monotonically_increasing_id())))

final_df = df_trans.join(classDf, on=["row_index"]).drop("row_index")

In [12]:
final_df.head(5)

[Row(id='449549', amount=-1000.0, account_type='Business', transaction_date='2011-02-24', country='BD', level='low'),
 Row(id='183734', amount=-1000.0, account_type='Professional', transaction_date='2021-11-04', country='GY', level='low'),
 Row(id='72842', amount=-1000.0, account_type='Business', transaction_date='2020-05-04', country='YE', level='low'),
 Row(id='324722', amount=-999.989990234375, account_type='Business', transaction_date='2018-11-09', country='QA', level='low'),
 Row(id='69322', amount=-999.989990234375, account_type='Professional', transaction_date='2019-04-06', country='SE', level='low')]

In [13]:
final_df.tail(5)

[Row(id='259119', amount=9999.98046875, account_type='Personal', transaction_date='2017-03-11', country='UG', level='high'),
 Row(id='317820', amount=9999.98046875, account_type='Professional', transaction_date='2019-11-07', country='EE', level='high'),
 Row(id='303197', amount=9999.98046875, account_type='Personal', transaction_date='2016-11-11', country='UA', level='high'),
 Row(id='426326', amount=9999.990234375, account_type='Business', transaction_date='2015-08-07', country='NL', level='high'),
 Row(id='281332', amount=9999.990234375, account_type='Professional', transaction_date='2019-05-17', country='GT', level='high')]

In [None]:
final_df.groupBy("level").count().show()