## Calculation of financial indicators

### Install necessary packages

In [1]:
!pip install delta-spark==3.2.0



### Import necessary packages

In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, DateType
from pyspark.sql.functions import when, col, sum, round

### Initialize Spark session with Delta Lake support

In [3]:
# Session options that enable Delta Lake: the Delta package coordinates, the
# Delta SQL extension, and the Delta catalog implementation.
delta_session_options = {
    "spark.jars.packages": "io.delta:delta-spark_2.12:3.2.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Apply the options one by one in declaration order, then create the session
# (or reuse an already running one).
session_builder = SparkSession.builder.appName("CalculationOfFinancialIndicators")
for option_key, option_value in delta_session_options.items():
    session_builder = session_builder.config(option_key, option_value)

spark = session_builder.getOrCreate()

### Data schema

In [6]:
# Explicit schema for the transactions data; every column is declared
# non-nullable.
# NOTE(review): `amount` is single-precision FloatType — confirm that is
# precise enough for monetary values.
schema = (
    StructType()
    .add("transaction_id", IntegerType(), nullable=False)
    .add("user_id", IntegerType(), nullable=False)
    .add("amount", FloatType(), nullable=False)
    .add("currency", StringType(), nullable=False)
    .add("date", DateType(), nullable=False)
)

### Load data from CSV file

In [8]:
# Read the transactions file using the explicit schema; the first line of the
# file is treated as a header row.
transactions_path = "/home/jovyan/work/data/transactions.csv"
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv(transactions_path)
)
df.show()

+--------------+-------+-------+--------+----------+
|transaction_id|user_id| amount|currency|      date|
+--------------+-------+-------+--------+----------+
|             1|   1001|  150.5|     USD|2024-12-25|
|             2|   1002| 230.75|     EUR|2024-12-25|
|             3|   1003| 450.25|     GBP|2024-12-26|
|             4|   1004| 9000.5|     UAH|2024-12-26|
|             5|   1005|1200.75|     JPY|2024-12-28|
|             6|   1006|  250.0|     USD|2024-12-29|
|             7|   1007|  300.5|     EUR|2024-12-30|
|             8|   1008| 500.25|     GBP|2024-12-31|
|             9|   1009| 7500.6|     UAH|2024-12-30|
|            10|   1010|  950.3|     JPY|2024-12-28|
|            11|   1001|  350.2|     USD|2024-12-29|
|            12|   1002|  420.1|     EUR|2024-12-28|
|            13|   1003|  870.6|     GBP|2024-12-25|
|            14|   1004| 1500.7|     UAH|2024-12-26|
|            15|   1005|2500.45|     JPY|2024-12-25|
|            16|   1006| 390.55|     USD|2024-

### Currency conversion to USD

In [15]:
# Multipliers applied to `amount` to express it in USD, keyed by currency code.
currency_exchange_rates = {
    "USD": 1.0,
    "EUR": 1.1,
    "GBP": 1.3,
    "UAH": 0.027,
    "JPY": 0.007
}

# Build the conditional expression directly from the rate table, so that
# adding a currency to the dict is enough to have it converted (no branch to
# copy by hand). `when` and `round` are the pyspark.sql.functions versions
# imported at the top of the notebook; `round` operates on a Column here.
# There is no otherwise() branch: a row whose currency is not in the table
# gets a null `amount_usd`.
amount_usd_expr = None
for currency_code, usd_rate in currency_exchange_rates.items():
    is_this_currency = col("currency") == currency_code
    converted_amount = round(col("amount") * usd_rate, 2)
    if amount_usd_expr is None:
        # First entry starts the chain.
        amount_usd_expr = when(is_this_currency, converted_amount)
    else:
        # Subsequent entries extend it with another branch.
        amount_usd_expr = amount_usd_expr.when(is_this_currency, converted_amount)

df_usd = df.withColumn("amount_usd", amount_usd_expr)

df_usd.show()

+--------------+-------+-------+--------+----------+----------+
|transaction_id|user_id| amount|currency|      date|amount_usd|
+--------------+-------+-------+--------+----------+----------+
|             1|   1001|  150.5|     USD|2024-12-25|     150.5|
|             2|   1002| 230.75|     EUR|2024-12-25|    253.83|
|             3|   1003| 450.25|     GBP|2024-12-26|    585.33|
|             4|   1004| 9000.5|     UAH|2024-12-26|    243.01|
|             5|   1005|1200.75|     JPY|2024-12-28|      8.41|
|             6|   1006|  250.0|     USD|2024-12-29|     250.0|
|             7|   1007|  300.5|     EUR|2024-12-30|    330.55|
|             8|   1008| 500.25|     GBP|2024-12-31|    650.33|
|             9|   1009| 7500.6|     UAH|2024-12-30|    202.52|
|            10|   1010|  950.3|     JPY|2024-12-28|      6.65|
|            11|   1001|  350.2|     USD|2024-12-29|     350.2|
|            12|   1002|  420.1|     EUR|2024-12-28|    462.11|
|            13|   1003|  870.6|     GBP

### Filtering data for the last two days of 2024

In [16]:
# Keep only the transactions dated 2024-12-30 or 2024-12-31.
is_dec_30 = col("date") == "2024-12-30"
is_dec_31 = col("date") == "2024-12-31"

df_filtered = df_usd.where(is_dec_30 | is_dec_31)

df_filtered.show()

+--------------+-------+------+--------+----------+----------+
|transaction_id|user_id|amount|currency|      date|amount_usd|
+--------------+-------+------+--------+----------+----------+
|             7|   1007| 300.5|     EUR|2024-12-30|    330.55|
|             8|   1008|500.25|     GBP|2024-12-31|    650.33|
|             9|   1009|7500.6|     UAH|2024-12-30|    202.52|
|            19|   1009|8800.8|     UAH|2024-12-30|    237.62|
|            20|   1010| 310.9|     JPY|2024-12-30|      2.18|
|            21|   1001|520.25|     USD|2024-12-31|    520.25|
|            27|   1007|550.45|     EUR|2024-12-30|     605.5|
|            28|   1008|670.95|     GBP|2024-12-31|    872.24|
|            30|   1010|340.75|     JPY|2024-12-30|      2.39|
+--------------+-------+------+--------+----------+----------+



### Aggregating by user_id

In [20]:
# Per-user total of the USD amounts, rounded to two decimals.
# `sum` and `round` are the pyspark.sql.functions versions imported at the top
# of the notebook, not the Python built-ins.
total_amount_usd = round(sum("amount_usd"), 2).alias("total_amount_usd")

per_user = df_filtered.groupBy("user_id")
aggregated_df = per_user.agg(total_amount_usd)

aggregated_df.show()

+-------+----------------+
|user_id|total_amount_usd|
+-------+----------------+
|   1008|         1522.57|
|   1010|            4.57|
|   1001|          520.25|
|   1007|          936.05|
|   1009|          440.14|
+-------+----------------+



### Save processed data to Delta table

In [21]:
# Persist the per-user totals as a Delta table; "overwrite" mode replaces any
# data already stored under this table name.
output_table_name = "total_amount_usd_by_user_id_last_two_days_2024"
(
    aggregated_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(output_table_name)
)

### Read saved data from Delta table

In [22]:
# Load the Delta table back by name and display its contents.
saved_table_name = "total_amount_usd_by_user_id_last_two_days_2024"
delta_reader = spark.read.format("delta")
delta_df = delta_reader.table(saved_table_name)
delta_df.show()

+-------+----------------+
|user_id|total_amount_usd|
+-------+----------------+
|   1008|         1522.57|
|   1010|            4.57|
|   1001|          520.25|
|   1007|          936.05|
|   1009|          440.14|
+-------+----------------+

