In [8]:
# Import required libraries
# NOTE(review): the two wildcard imports below copy every public name of
# `pyspark.sql.functions` and `pyspark.sql.types` into the notebook namespace.
# The functions module defines names such as `sum`, `min`, `max` and `round`,
# so the Python builtins of the same name are presumably shadowed after this
# cell runs -- confirm before relying on the builtins. Later cells reference
# Spark functions through the explicit `F` alias, and `DoubleType` through the
# types wildcard.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *
# NOTE(review): `pd`, `plt` and `sns` are not referenced in the cells visible
# in this notebook excerpt -- verify whether they are still needed.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Confirmation message printed once every import above has succeeded.
print("All libraries successfully imported!")


All libraries successfully imported!


In [2]:
# Environment diagnostics
import os
import shutil
import subprocess
import sys


def probe_command_version(command, timeout=30):
    """Run a version command and return the banner it prints.

    Args:
        command: Argument list such as ``['java', '-version']``. It is run
            WITHOUT a shell so that every element reaches the program as its
            own argument on every platform (with ``shell=True`` and a list,
            POSIX hands the extra elements to the shell instead of the program).
        timeout: Seconds to wait before ``subprocess.TimeoutExpired`` is raised.

    Returns:
        The text written to stderr (where ``java -version`` prints) or, when
        stderr is empty, stdout. ``None`` when the executable cannot be found
        on PATH or exits with a non-zero status.
    """
    # shutil.which honours PATH (and PATHEXT on Windows), so wrapper scripts
    # such as java.cmd are still found without going through a shell.
    executable = shutil.which(command[0])
    if executable is None:
        return None
    result = subprocess.run(
        [executable, *command[1:]],
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    if result.returncode != 0:
        return None
    return result.stderr or result.stdout


print("🔍 Environment Diagnostics:")
print(f"Python executable: {sys.executable}")
print(f"Python version: {sys.version}")
print(f"Current working directory: {os.getcwd()}")

# Check if Java is available
print("\n🔍 Java check:")
try:
    java_banner = probe_command_version(['java', '-version'])
    if java_banner is not None:
        print("✅ Java is available")
        print(java_banner)
    else:
        print("❌ Java not found in PATH")
except Exception as e:
    # Diagnostics are best-effort: report the problem and keep going.
    print(f"❌ Error checking Java: {e}")

# Check PySpark installation
print("\n🔍 PySpark check:")
try:
    import pyspark
    print(f"✅ PySpark version: {pyspark.__version__}")
    print(f"PySpark location: {pyspark.__file__}")
except ImportError as e:
    print(f"❌ PySpark import error: {e}")

# Environment variables that influence how PySpark locates Java and Spark.
print("\n🔍 Environment variables:")
for var in ['JAVA_HOME', 'SPARK_HOME', 'PYTHONPATH']:
    value = os.environ.get(var, 'Not set')
    print(f"{var}: {value}")

🔍 Environment Diagnostics:
Python executable: c:\Users\juraj\Desktop\ucoudify_pyspark\pyspark_env_final\Scripts\python.exe
Python version: 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
Current working directory: c:\Users\juraj\Desktop\ucoudify_pyspark\Tryout-Data-Dev-2025Q3-PySpark\Data

🔍 Java check:
✅ Java is available
openjdk version "11.0.28" 2025-07-15
OpenJDK Runtime Environment Temurin-11.0.28+6 (build 11.0.28+6)
OpenJDK 64-Bit Server VM Temurin-11.0.28+6 (build 11.0.28+6, mixed mode)


🔍 PySpark check:
✅ PySpark version: 3.5.1
PySpark location: c:\Users\juraj\Desktop\ucoudify_pyspark\pyspark_env_final\Lib\site-packages\pyspark\__init__.py

🔍 Environment variables:
JAVA_HOME: C:\Program Files\Eclipse Adoptium\jdk-11.0.28.6-hotspot\
SPARK_HOME: Not set
PYTHONPATH: Not set
✅ Java is available
openjdk version "11.0.28" 2025-07-15
OpenJDK Runtime Environment Temurin-11.0.28+6 (build 11.0.28+6)
OpenJDK 64-Bit Server VM Temurin-11.0.28+6 (build 11.0.

In [3]:
# Initialize Spark Session (this may take a minute on first run)

print("Initializing Spark Session...")

# Build (or reuse) the session with adaptive query execution, adaptive
# partition coalescing and the Kryo serializer configured.
spark = (
    SparkSession.builder
    .appName("Account Balance Report")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Report the session details, one per line.
print("Spark Session initialized successfully!")
print(
    f"Spark Version: {spark.version}",
    f"Application Name: {spark.sparkContext.appName}",
    f"Spark UI: {spark.sparkContext.uiWebUrl}",
    sep="\n",
)

Initializing Spark Session...
Spark Session initialized successfully!
Spark Version: 3.5.1
Application Name: Account Balance Report
Spark UI: http://BENDIKFILMSXPS:4040
Spark Session initialized successfully!
Spark Version: 3.5.1
Application Name: Account Balance Report
Spark UI: http://BENDIKFILMSXPS:4040


In [None]:
# Load Account Information (cast dates; treat 'NULL' as null)
# Note: CSV files use semicolon (;) as separator and comma (,) as decimal separator
from pyspark.sql import functions as F  # explicit alias for clarity and linting

account_info_df = (
    spark.read
    .option("header", "true")
    .option("sep", ";")
    .option("inferSchema", "true")
    .option("nullValue", "NULL")
    .csv("AccountInformation.csv")
)

# Cast date columns from dd/MM/yyyy to proper DateType.
# cache() is lazy, so it is registered BEFORE the first action: count() below
# then populates the cache and show() (plus every later cell) reuses it instead
# of re-reading and re-parsing the CSV.
info_prepared_df = (
    account_info_df
    .withColumn("OpeningDate", F.to_date(F.col("OpeningDate"), "dd/MM/yyyy"))
    .withColumn("ClosingDate", F.to_date(F.col("ClosingDate"), "dd/MM/yyyy"))
    .cache()
)

print("Account Information loaded and date columns cast to DateType")
print("Schema:")
info_prepared_df.printSchema()
print(f"Row count: {info_prepared_df.count()}")
info_prepared_df.show()

Account Information loaded and date columns cast to DateType
Schema:
root
 |-- Account: integer (nullable = true)
 |-- Customer: string (nullable = true)
 |-- OpeningDate: date (nullable = true)
 |-- ClosingDate: date (nullable = true)

Row count: 4
+-------+--------+-----------+-----------+
|Account|Customer|OpeningDate|ClosingDate|
+-------+--------+-----------+-----------+
|1000001|    John| 2024-11-30|       NULL|
|1000002|   Betty| 2023-08-24|       NULL|
|1000003| Jessica| 2024-05-14|       NULL|
|1000004|    Josh| 2025-06-02| 2025-06-02|
+-------+--------+-----------+-----------+

+-------+--------+-----------+-----------+
|Account|Customer|OpeningDate|ClosingDate|
+-------+--------+-----------+-----------+
|1000001|    John| 2024-11-30|       NULL|
|1000002|   Betty| 2023-08-24|       NULL|
|1000003| Jessica| 2024-05-14|       NULL|
|1000004|    Josh| 2025-06-02| 2025-06-02|
+-------+--------+-----------+-----------+



In [None]:
# Load Account Entries
# Note: This data has European number format (comma as decimal separator)

# --- Step 1: Load the AccountEntries.csv data from the file ---
# Every column is read as text (no inferSchema) because the 'Amount' column
# uses '.' as a thousands separator and ',' as a decimal separator, which
# automatic parsing would not handle.
print("--> Loading AccountEntries.csv...")
account_entries_df = (
    spark.read
    .option("header", "true")
    .option("sep", ";")
    .csv("AccountEntries.csv")
)

print("--> Data loaded successfully. Starting preparation and cleaning...")

# --- Step 2: Clean and transform the data into the correct types ---
# '3.322.909,90' -> '3322909,90' (drop thousands separators) ...
amount_without_thousands = F.regexp_replace(F.col("Amount"), "\\.", "")
# ... -> '3322909.90' (decimal comma becomes a decimal point).
amount_with_decimal_point = F.regexp_replace(amount_without_thousands, ",", ".")

# cache() is lazy, so it is attached before the first action: count() below
# fills the cache and show() and all later cells read from it.
entries_prepared_df = (
    account_entries_df
    .withColumn("Date", F.to_date(F.col("Date"), "dd/MM/yyyy"))
    # A string type name is used for the cast so this cell does not depend on
    # the wildcard import of pyspark.sql.types.
    .withColumn("Amount", amount_with_decimal_point.cast("double"))
    .withColumn("Account", F.col("Account").cast("integer"))
    .cache()
)

# --- Step 3: Verify the result ---
print("\n--> Preparation complete. Verifying the result:")
print("\nFinal Schema:")
entries_prepared_df.printSchema()

print(f"\nRow count: {entries_prepared_df.count()}")
print("\nSample Data:")
entries_prepared_df.show()

--> Loading AccountEntries.csv...
--> Data loaded successfully. Starting preparation and cleaning...

--> Preparation complete. Verifying the result:

Final Schema:
root
 |-- Date: date (nullable = true)
 |-- Account: integer (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Text: string (nullable = true)


--> Preparation complete. Verifying the result:

Final Schema:
root
 |-- Date: date (nullable = true)
 |-- Account: integer (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Text: string (nullable = true)


Row count: 16

Sample Data:
+----------+-------+------------+--------+---------------+
|      Date|Account|      Amount|Currency|           Text|
+----------+-------+------------+--------+---------------+
|2025-03-23|1000001|    -1243.34|     EUR|           Rent|
|2025-03-12|1000002|      5434.4|     EUR|       Services|
|2025-03-04|1000003|     -2323.4|     EUR|      Utilities|


DataFrame[Date: date, Account: int, Amount: double, Currency: string, Text: string]

In [9]:
# Final code for calculations

# --- Step 1: Calculate Opening Balance (Simplified Approach) ---
# In the provided data the entries whose text is exactly 'Opening balance'
# carry the starting balance for 2025, so they are summed per account.
# This is a simplification for the test task.
print("--> Step 1: Calculating Opening Balance (Simplified Approach)...")
opening_balance_df = (
    entries_prepared_df
    .where(F.col("Text") == "Opening balance")
    .groupBy("Account")
    .agg(F.sum("Amount").alias("OpeningBalance2025"))
)

# --- (Alternative) Step 1: Robust Calculation of Opening Balance ---
# NOTE FOR REVIEWER: with a full transaction history, the opening balance
# would instead be the sum of every transaction dated *before* the target
# year (2025). The sketch below shows that approach; it stays commented out
# because the simplified test data does not require it.
#
# print("--> Step 1 (Robust): Calculating Opening Balance from historical data...")
# opening_balance_robust_df = (
#     entries_prepared_df
#     .where(F.year(F.col("Date")) < 2025)
#     .groupBy("Account")
#     .agg(F.sum("Amount").alias("OpeningBalance2025"))
# )

--> Step 1: Calculating Opening Balance (Simplified Approach)...


In [10]:
# --- Step 2: Calculate Year-To-Date (YTD) Transactions for 2025 ---
# 'Opening balance' entries are excluded because they are not part of the
# year's transactional activity.
print("--> Step 2: Calculating Sum of Transactions for 2025 (YTD)...")
ytd_transactions_df = (
    entries_prepared_df
    # Null-safe comparison: a plain `!=` evaluates to NULL for rows whose Text
    # is NULL (e.g. an empty CSV field), and filter() would silently drop those
    # transactions from the total.
    .filter(~F.col("Text").eqNullSafe("Opening balance"))
    # Restrict to the reporting year so entries from any other year cannot
    # leak into the 2025 figure.
    .filter(F.year(F.col("Date")) == 2025)
    .groupBy("Account")
    .agg(F.sum("Amount").alias("SumTransactionsYTD"))
)

--> Step 2: Calculating Sum of Transactions for 2025 (YTD)...


In [11]:
# --- Step 3: Calculate End-of-Month cumulative sums for 2025 ---
print("--> Step 3: Calculating monthly running totals for 2025...")

# Month number -> abbreviation used in the report column names.
month_map = {
    1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr", 5: "May", 6: "Jun",
    7: "Jul", 8: "Aug", 9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"
}
month_numbers = list(month_map)
balance_columns = [f"BalanceEndOf_{month_map[m]}" for m in month_numbers]

# Define the window specification: partition by account, order by month
window_spec = Window.partitionBy("Account").orderBy("Month")

# Running total of the transactions *within* 2025 (opening balance excluded),
# pivoted to one column per month.
monthly_balances_df = (
    entries_prepared_df
    # Null-safe comparison so rows with a NULL Text are kept as transactions
    # (a plain `!=` yields NULL for them and filter() would drop them).
    .filter(~F.col("Text").eqNullSafe("Opening balance"))
    # Only the reporting year: otherwise the same month number from different
    # years would be merged into one bucket.
    .filter(F.year(F.col("Date")) == 2025)
    .withColumn("Month", F.month(F.col("Date")))
    .groupBy("Account", "Month")
    .agg(F.sum("Amount").alias("MonthlySum"))
    .withColumn("RunningTotal2025", F.sum("MonthlySum").over(window_spec))
    .groupBy("Account")
    # Explicit pivot values guarantee all 12 columns exist, in month order,
    # even for months that have no transactions.
    .pivot("Month", month_numbers)
    .agg(F.first("RunningTotal2025"))
    # Rename '1'..'12' to 'BalanceEndOf_Jan'..'BalanceEndOf_Dec' in one step.
    .toDF("Account", *balance_columns)
)

# --- Step 4: Assemble the Final Report by Joining All Components ---
print("--> Step 4: Assembling the final report by joining all parts...")

final_report_df = (
    info_prepared_df
    .join(opening_balance_df, "Account", "left")
    .join(ytd_transactions_df, "Account", "left")
    .join(monthly_balances_df, "Account", "left")
)

# --- Step 5: Final Calculations and Cleanup ---
print("--> Step 5: Performing final calculations and cleanup...")

# Replace nulls from joins with 0 for key financial columns
final_report_df = final_report_df.na.fill(0, ["OpeningBalance2025", "SumTransactionsYTD"])

# For each month: take the most recent non-null running total up to and
# including that month (forward fill for months without transactions), fall
# back to 0 when nothing has happened yet, then add the opening balance to get
# the end-of-month balance. One select keeps the query plan flat instead of
# stacking a withColumn per month.
balance_exprs = []
for position, column_name in enumerate(balance_columns):
    latest_first = [F.col(c) for c in reversed(balance_columns[: position + 1])]
    running_total = F.coalesce(*latest_first, F.lit(0.0))
    balance_exprs.append(
        (F.col("OpeningBalance2025") + running_total).alias(column_name)
    )

# Columns that are not monthly balances pass through unchanged, keeping the
# original column order (account info, opening balance, YTD sum, then months).
passthrough_columns = [c for c in final_report_df.columns if c not in balance_columns]
final_report_df = final_report_df.select(*passthrough_columns, *balance_exprs)

--> Step 3: Calculating monthly running totals for 2025...
--> Step 4: Assembling the final report by joining all parts...
--> Step 5: Performing final calculations and cleanup...
--> Step 4: Assembling the final report by joining all parts...
--> Step 5: Performing final calculations and cleanup...


In [15]:
# --- Display the Final Report using Pandas for professional formatting ---
print("\n--> FINAL REPORT (Formatted for Presentation via Pandas):")

# Monthly balance columns that are actually present, in calendar order.
balance_column_names = [
    f"BalanceEndOf_{month_map[i]}"
    for i in range(1, 13)
    if f"BalanceEndOf_{month_map[i]}" in final_report_df.columns
]
amount_column_names = ["OpeningBalance2025", *balance_column_names, "SumTransactionsYTD"]

# Round inside Spark first so the values handed to Pandas are already at
# two decimals.
final_report_rounded_df = final_report_df.select(
    F.col("Account"),
    F.col("Customer"),
    *[F.round(F.col(name), 2).alias(name) for name in amount_column_names],
)

# Convert the final, small Spark DataFrame to a Pandas DataFrame
pandas_df = final_report_rounded_df.toPandas()

# Define the columns that need financial formatting
financial_columns = [name for name in pandas_df.columns if name not in ['Account', 'Customer']]

# Render the financial columns as strings with thousands separators and two
# decimals. The loop variable is deliberately NOT called `col`: at notebook
# scope that would rebind the wildcard-imported pyspark.sql.functions.col for
# every later cell.
for column_name in financial_columns:
    pandas_df[column_name] = pandas_df[column_name].map("{:,.2f}".format)

# Display the formatted Pandas DataFrame via the notebook's rich display
display(pandas_df)


--> FINAL REPORT (Formatted for Presentation via Pandas):


Unnamed: 0,Account,Customer,OpeningBalance2025,BalanceEndOf_Jan,BalanceEndOf_Feb,BalanceEndOf_Mar,BalanceEndOf_Apr,BalanceEndOf_May,BalanceEndOf_Jun,BalanceEndOf_Jul,BalanceEndOf_Aug,BalanceEndOf_Sep,BalanceEndOf_Oct,BalanceEndOf_Nov,BalanceEndOf_Dec,SumTransactionsYTD
0,1000001,John,3322909.9,3346333.3,3623080.7,3621837.36,3621837.36,3621837.36,3621837.36,3621837.36,3621837.36,3621837.36,3621837.36,3621837.36,3621837.36,298927.46
1,1000002,Betty,38383992.4,38383680.0,38384334.4,38389768.8,38389768.8,38389768.8,38389768.8,38389768.8,38389768.8,38389768.8,38389768.8,38389768.8,38389768.8,5776.4
2,1000003,Jessica,5584843.9,5596967.23,5591321.83,5588998.43,5588998.43,5588998.43,5588998.43,5588998.43,5588998.43,5588998.43,5588998.43,5588998.43,5588998.43,4154.53
3,1000004,Josh,988786.9,986432.6,1462888.2,1459465.3,1459465.3,1459465.3,1459465.3,1459465.3,1459465.3,1459465.3,1459465.3,1459465.3,1459465.3,470678.4
