In [None]:
# PySpark Account Balance Report

## Task Description
Create a report with the following information:
- Account number
- Customer  
- Opening balance 2025
- Balance end of January, February, March, ... through December
- Sum of transactions YTD (Year-to-Date)

## Data Sources
1. **Account Entries** - transaction data (AccountEntries.csv)
2. **Account Information** - account details (AccountInformation.csv)

In [None]:
# Import required libraries
# NOTE(review): the two wildcard imports below place every public name from
# pyspark.sql.functions and pyspark.sql.types into the notebook namespace.
# pyspark.sql.functions exports names such as sum, min, max, abs and round, so
# after this cell those names refer to the Spark column functions instead of
# the Python builtins. A later cell uses the wildcard-imported DoubleType, so
# the imports are kept as they are.
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Printed only once every import above has succeeded.
print("All libraries successfully imported!")


All libraries successfully imported!


In [2]:
# Environment diagnostics
# Prints the interpreter in use, the working directory, whether Java and
# PySpark are available, and the environment variables relevant to Spark.
import os
import subprocess
import sys


def run_version_check(command):
    """Run ``command`` and report whether it exited successfully.

    Args:
        command: Argument list to execute, e.g. ``['java', '-version']``.

    Returns:
        A ``(ok, output)`` tuple. ``ok`` is True when the process exit code
        is 0. ``output`` is the captured stderr text, or the captured stdout
        text when stderr is empty.

    Raises:
        FileNotFoundError: if the executable cannot be found.
    """
    # The argument list is handed directly to the executable. Combining a list
    # with shell=True is wrong on POSIX: only the first item is run by the
    # shell, so '-version' would never reach java.
    completed = subprocess.run(command, capture_output=True, text=True)
    return completed.returncode == 0, completed.stderr or completed.stdout


print("🔍 Environment Diagnostics:")
print(f"Python executable: {sys.executable}")
print(f"Python version: {sys.version}")
print(f"Current working directory: {os.getcwd()}")

# Check if Java is available
print("\n🔍 Java check:")
try:
    java_ok, java_output = run_version_check(['java', '-version'])
except FileNotFoundError:
    # Without a shell, a missing executable surfaces as FileNotFoundError.
    print("❌ Java not found in PATH")
except Exception as e:
    print(f"❌ Error checking Java: {e}")
else:
    if java_ok:
        print("✅ Java is available")
        # 'java -version' writes its banner to stderr.
        print(java_output)
    else:
        print(f"❌ 'java -version' reported an error: {java_output}")

# Check PySpark installation
print("\n🔍 PySpark check:")
try:
    import pyspark
    print(f"✅ PySpark version: {pyspark.__version__}")
    print(f"PySpark location: {pyspark.__file__}")
except ImportError as e:
    print(f"❌ PySpark import error: {e}")

print("\n🔍 Environment variables:")
for var in ['JAVA_HOME', 'SPARK_HOME', 'PYTHONPATH']:
    value = os.environ.get(var, 'Not set')
    print(f"{var}: {value}")

🔍 Environment Diagnostics:
Python executable: c:\Users\juraj\Desktop\ucoudify_pyspark\pyspark_env_final\Scripts\python.exe
Python version: 3.11.9 (tags/v3.11.9:de54cf5, Apr  2 2024, 10:12:12) [MSC v.1938 64 bit (AMD64)]
Current working directory: c:\Users\juraj\Desktop\ucoudify_pyspark\Tryout-Data-Dev-2025Q3-PySpark\Data

🔍 Java check:
✅ Java is available
openjdk version "11.0.28" 2025-07-15
OpenJDK Runtime Environment Temurin-11.0.28+6 (build 11.0.28+6)
OpenJDK 64-Bit Server VM Temurin-11.0.28+6 (build 11.0.28+6, mixed mode)


🔍 PySpark check:
✅ PySpark version: 3.5.1
PySpark location: c:\Users\juraj\Desktop\ucoudify_pyspark\pyspark_env_final\Lib\site-packages\pyspark\__init__.py

🔍 Environment variables:
JAVA_HOME: C:\Program Files\Eclipse Adoptium\jdk-11.0.28.6-hotspot\
SPARK_HOME: Not set
PYTHONPATH: Not set
✅ Java is available
openjdk version "11.0.28" 2025-07-15
OpenJDK Runtime Environment Temurin-11.0.28+6 (build 11.0.28+6)
OpenJDK 64-Bit Server VM Temurin-11.0.28+6 (build 11.0.

In [None]:
# Create the SparkSession used by the rest of the notebook, or reuse one that
# already exists. The first start-up may take a minute.

print("Initializing Spark Session...")

spark = (
    SparkSession.builder
    .appName("Account Balance Report")
    # Adaptive query execution, including coalescing of shuffle partitions
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # Kryo as the serializer
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# Report the key facts about the running session
print("Spark Session initialized successfully!")
print(f"Spark Version: {spark.version}")
print(f"Application Name: {spark.sparkContext.appName}")
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")

Initializing Spark Session...
Spark Session initialized successfully!
Spark Version: 3.5.1
Application Name: Account Balance Report
Spark UI: http://BENDIKFILMSXPS:4040
Spark Session initialized successfully!
Spark Version: 3.5.1
Application Name: Account Balance Report
Spark UI: http://BENDIKFILMSXPS:4040


In [20]:
# Load Account Information
# The file is semicolon-delimited with a header row; the literal text 'NULL'
# is read as a missing value, and column types are inferred by Spark.
from pyspark.sql import functions as F  # explicit alias for clarity and linting

account_info_df = (
    spark.read
    .option("header", "true")
    .option("sep", ";")
    .option("inferSchema", "true")
    .option("nullValue", "NULL")
    .csv("AccountInformation.csv")
    # Parse the dd/MM/yyyy date strings into DateType columns
    .withColumn("OpeningDate", F.to_date(F.col("OpeningDate"), "dd/MM/yyyy"))
    .withColumn("ClosingDate", F.to_date(F.col("ClosingDate"), "dd/MM/yyyy"))
)

# Show the resulting schema, row count and contents for a visual check
print("Account Information loaded and date columns cast to DateType")
print("Schema:")
account_info_df.printSchema()
print(f"Row count: {account_info_df.count()}")
account_info_df.show()

Account Information loaded and date columns cast to DateType
Schema:
root
 |-- Account: integer (nullable = true)
 |-- Customer: string (nullable = true)
 |-- OpeningDate: date (nullable = true)
 |-- ClosingDate: date (nullable = true)

Row count: 4
+-------+--------+-----------+-----------+
|Account|Customer|OpeningDate|ClosingDate|
+-------+--------+-----------+-----------+
|1000001|    John| 2024-11-30|       NULL|
|1000002|   Betty| 2023-08-24|       NULL|
|1000003| Jessica| 2024-05-14|       NULL|
|1000004|    Josh| 2025-06-02| 2025-06-02|
+-------+--------+-----------+-----------+

+-------+--------+-----------+-----------+
|Account|Customer|OpeningDate|ClosingDate|
+-------+--------+-----------+-----------+
|1000001|    John| 2024-11-30|       NULL|
|1000002|   Betty| 2023-08-24|       NULL|
|1000003| Jessica| 2024-05-14|       NULL|
|1000004|    Josh| 2025-06-02| 2025-06-02|
+-------+--------+-----------+-----------+



In [24]:
# Load Account Entries
# Amounts use the European number format: '.' separates thousands and ','
# marks the decimals (e.g. '3.322.909,90').

# --- Step 1: Read AccountEntries.csv ---
# inferSchema is deliberately left off so every column arrives as text; the
# Amount format above would not be parsed correctly by automatic inference.
print("--> Loading AccountEntries.csv...")
account_entries_df = (
    spark.read
    .option("header", "true")
    .option("sep", ";")
    .csv("AccountEntries.csv")
)

print("--> Data loaded successfully. Starting preparation and cleaning...")

# --- Step 2: Convert each column to its proper type ---
entries_prepared_df = (
    account_entries_df
    # dd/MM/yyyy text -> DateType
    .withColumn("Date", F.to_date(F.col("Date"), "dd/MM/yyyy"))
    # '3.322.909,90' -> '3322909,90' -> '3322909.90' -> double
    .withColumn(
        "Amount",
        F.regexp_replace(
            F.regexp_replace(F.col("Amount"), "\\.", ""),  # drop thousands separators
            ",",
            ".",  # decimal comma becomes a decimal point
        ).cast(DoubleType()),
    )
    # Account numbers become integers
    .withColumn("Account", F.col("Account").cast("integer"))
)

# --- Step 3: Inspect the outcome ---
print("\n--> Preparation complete. Verifying the result:")
print("\nFinal Schema:")
entries_prepared_df.printSchema()

print(f"\nRow count: {entries_prepared_df.count()}")
print("\nSample Data:")
entries_prepared_df.show()

--> Loading AccountEntries.csv...
--> Data loaded successfully. Starting preparation and cleaning...

--> Preparation complete. Verifying the result:

Final Schema:
root
 |-- Date: date (nullable = true)
 |-- Account: integer (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Text: string (nullable = true)


Row count: 16

Sample Data:
--> Data loaded successfully. Starting preparation and cleaning...

--> Preparation complete. Verifying the result:

Final Schema:
root
 |-- Date: date (nullable = true)
 |-- Account: integer (nullable = true)
 |-- Amount: double (nullable = true)
 |-- Currency: string (nullable = true)
 |-- Text: string (nullable = true)


Row count: 16

Sample Data:
+----------+-------+------------+--------+---------------+
|      Date|Account|      Amount|Currency|           Text|
+----------+-------+------------+--------+---------------+
|2025-03-23|1000001|    -1243.34|     EUR|           Rent|
|2025-03-12|1000002|  