In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, min, sum, count

# Initialize Spark session
spark = SparkSession.builder.appName("TransactionsAnalysis").getOrCreate()

# Load the CSV file
file_path = "/FileStore/tables/txn.csv"  # Replace with your file path
txn_df = spark.read.option("header", "true").csv(file_path, inferSchema=True)

# Display the schema and a sample of the data
txn_df.printSchema()
txn_df.show(5)


root
 |-- Account No: string (nullable = true)
 |-- TRANSACTION DETAILS: string (nullable = true)
 |-- VALUE DATE: string (nullable = true)
 |--  WITHDRAWAL AMT : double (nullable = true)
 |--  DEPOSIT AMT : double (nullable = true)
 |-- BALANCE AMT: double (nullable = true)

+-------------+--------------------+----------+----------------+-------------+-----------+
|   Account No| TRANSACTION DETAILS|VALUE DATE| WITHDRAWAL AMT | DEPOSIT AMT |BALANCE AMT|
+-------------+--------------------+----------+----------------+-------------+-----------+
|409000611074'|TRF FROM  Indiafo...| 29-Jun-17|            null|    1000000.0|  1000000.0|
|409000611074'|TRF FROM  Indiafo...|  5-Jul-17|            null|    1000000.0|  2000000.0|
|409000611074'|FDRL/INTERNAL FUN...| 18-Jul-17|            null|     500000.0|  2500000.0|
|409000611074'|TRF FRM  Indiafor...|  1-Aug-17|            null|    3000000.0|  5500000.0|
|409000611074'|FDRL/INTERNAL FUN...| 16-Aug-17|            null|     500000.0|  600000

In [0]:
#Maximum Withdrawal Amount in Transactions
max_withdrawal = txn_df.agg(max(" WITHDRAWAL AMT ").alias("MaxWithdrawal")).collect()[0][0]
print(f"Maximum withdrawal amount: {max_withdrawal}")


Maximum withdrawal amount: 459447546.4


In [0]:
#Minimum Withdrawal Amount of an Account
min_withdrawal_per_account = txn_df.groupBy("Account No").agg(min(" WITHDRAWAL AMT ").alias("MinWithdrawal"))
min_withdrawal_per_account.show()


+-------------+-------------+
|   Account No|MinWithdrawal|
+-------------+-------------+
|409000438611'|          0.2|
|     1196711'|         0.25|
|     1196428'|         0.25|
|409000493210'|         0.01|
|409000611074'|        120.0|
|409000425051'|         1.25|
|409000405747'|         21.0|
|409000493201'|          2.1|
|409000438620'|         0.34|
|409000362497'|         0.97|
+-------------+-------------+



In [0]:
#Maximum Deposit Amount of an Account
max_deposit_per_account = txn_df.groupBy("Account No").agg(max(" DEPOSIT AMT ").alias("MaxDeposit"))
max_deposit_per_account.show()


+-------------+-------------+
|   Account No|   MaxDeposit|
+-------------+-------------+
|409000438611'|     1.7025E8|
|     1196711'|        5.0E8|
|     1196428'|2.119594422E8|
|409000493210'|        1.5E7|
|409000611074'|    3000000.0|
|409000425051'|        1.5E7|
|409000405747'|      2.021E8|
|409000493201'|    1000000.0|
|409000438620'|      5.448E8|
|409000362497'|        2.0E8|
+-------------+-------------+



In [0]:
#Minimum Deposit Amount of an Account
min_deposit_per_account = txn_df.groupBy("Account No").agg(min(" DEPOSIT AMT ").alias("MinDeposit"))
min_deposit_per_account.show()


+-------------+----------+
|   Account No|MinDeposit|
+-------------+----------+
|409000438611'|      0.03|
|     1196711'|      1.01|
|     1196428'|       1.0|
|409000493210'|      0.01|
|409000611074'|    1320.0|
|409000425051'|       1.0|
|409000405747'|     500.0|
|409000493201'|       0.9|
|409000438620'|      0.07|
|409000362497'|      0.03|
+-------------+----------+



In [0]:
#Sum of Balance in Every Bank Account
total_balance_per_account = txn_df.groupBy("Account No").agg(sum("BALANCE AMT").alias("TotalBalance"))
total_balance_per_account.show()


+-------------+--------------------+
|   Account No|        TotalBalance|
+-------------+--------------------+
|409000438611'|-2.49486577068339...|
|     1196711'|-1.60476498101275E13|
|     1196428'| -8.1418498130721E13|
|409000493210'|-3.27584952132095...|
|409000611074'|       1.615533622E9|
|409000425051'|-3.77211841164998...|
|409000405747'|-2.43108047067000...|
|409000493201'|1.0420831829499985E9|
|409000438620'|-7.12291867951358...|
|409000362497'| -5.2860004792808E13|
+-------------+--------------------+



In [0]:
#Number of Transactions on Each Date
transactions_per_date = txn_df.groupBy("VALUE DATE").agg(count("*").alias("TransactionCount"))
transactions_per_date.show()


+----------+----------------+
|VALUE DATE|TransactionCount|
+----------+----------------+
| 23-Dec-16|             143|
|  7-Feb-19|              98|
| 21-Jul-15|              80|
|  9-Sep-15|              91|
| 17-Jan-15|              16|
| 18-Nov-17|              53|
| 21-Feb-18|              77|
| 20-Mar-18|              71|
| 19-Apr-18|              71|
| 21-Jun-16|              97|
| 17-Oct-17|             101|
|  3-Jan-18|              70|
|  8-Jun-18|             223|
| 15-Dec-18|              62|
|  8-Aug-16|              97|
| 17-Dec-16|              74|
|  3-Sep-15|              83|
| 21-Jan-16|              76|
|  4-May-18|              92|
|  7-Sep-17|              94|
+----------+----------------+
only showing top 20 rows



In [0]:
#List of Customers with Withdrawal Amount More Than 1 Lakh
high_withdrawals = txn_df.filter(col(" WITHDRAWAL AMT ") > 100000).select("Account No", " WITHDRAWAL AMT ")
high_withdrawals.show()


+-------------+----------------+
|   Account No| WITHDRAWAL AMT |
+-------------+----------------+
|409000611074'|        133900.0|
|409000611074'|        195800.0|
|409000611074'|        143800.0|
|409000611074'|        331650.0|
|409000611074'|        129000.0|
|409000611074'|        230013.0|
|409000611074'|        367900.0|
|409000611074'|        108000.0|
|409000611074'|        141000.0|
|409000611074'|        206000.0|
|409000611074'|        242300.0|
|409000611074'|        113250.0|
|409000611074'|        206900.0|
|409000611074'|        276000.0|
|409000611074'|        171000.0|
|409000611074'|        189800.0|
|409000611074'|        271323.0|
|409000611074'|        200600.0|
|409000611074'|        176900.0|
|409000611074'|        150050.0|
+-------------+----------------+
only showing top 20 rows

