In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("OnlineBankingAnalysis").getOrCreate()
loan_df = spark.read.csv("/FileStore/tables/loan.csv", header=True, inferSchema=True)
credit_df = spark.read.csv("/FileStore/tables/credit_card.csv", header=True, inferSchema=True)
txn_df = spark.read.csv("/FileStore/tables/txn.csv", header=True, inferSchema=True)

In [0]:
# Perform cleaning operations
loan_df = loan_df.dropna().dropDuplicates()
credit_df = credit_df.dropna().dropDuplicates()
txn_df = txn_df.dropna().dropDuplicates()

In [0]:
# Number of loans in each category
loan_df.groupBy("Loan Category").count().show()

+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   61|
|        TRAVELLING|   48|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   72|
|  EDUCATIONAL LOAN|   17|
|        AUTOMOBILE|   53|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   25|
|           DINNING|   11|
|          SHOPPING|   30|
|       RESTAURANTS|   37|
|       ELECTRONICS|   13|
|          BUILDING|    6|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   13|
+------------------+-----+



In [0]:
# Number of people who have taken more than 1 lakh loan
loan_df.filter(loan_df["Loan Amount"] > 100000).count()

Out[14]: 0

In [0]:
# Number of people with income greater than 60,000 rupees
loan_df.filter(loan_df["income"] > 60000).count()

Out[15]: 192

In [0]:
# Number of people with expenditure over 50,000 a month
loan_df.filter(loan_df["Expenditure"] > 50000).count()

Out[19]: 6

In [0]:
# Number of members eligible for a credit card
loan_df.filter((loan_df["Income"] > 60000) & (loan_df["Loan Amount"] < 100000)).count()

Out[20]: 0

In [0]:
# Credit card users in Spain
credit_df.filter(credit_df["Geography"] == "Spain").count()

Out[21]: 2477

In [0]:
# Number of members who are eligible and active in the bank
credit_df.filter((credit_df["creditscore"] > 650) & (credit_df["isactivemember"] == 1)).count()

Out[22]: 2655

In [0]:
# Maximum withdrawal amount in transactions
from pyspark.sql.functions import col
txn_df_filtered = txn_df.filter(col(" WITHDRAWAL AMT ").isNotNull())
max_withdrawal_row = txn_df_filtered.agg({" WITHDRAWAL AMT ": "max"}).collect()[0]
max_withdrawal_amt = max_withdrawal_row[0]
formatted_max_withdrawal_amt = f"₹{max_withdrawal_amt:,.1f}"
print("Maximum Withdrawal Amount:", formatted_max_withdrawal_amt)


Maximum Withdrawal Amount: ₹459,447,546.4


In [0]:
# Minimum withdrawal amount of an account
from pyspark.sql.functions import col
txn_df_filtered = txn_df.filter(col(" WITHDRAWAL AMT ").isNotNull())
min_withdrawal_row= txn_df_filtered.agg({" WITHDRAWAL AMT ": "min"}).collect()[0]
min_withdrawal_amt = min_withdrawal_row[0]
formatted_min_withdrawal_amt = f"₹{min_withdrawal_amt:,.2f}"
print("Minimum Withdrawal Amount:", formatted_min_withdrawal_amt)

Minimum Withdrawal Amount: ₹0.01


In [0]:
# Maximum deposit amount of an account
from pyspark.sql.functions import col
txn_df_filtered = txn_df.filter(col(" DEPOSIT AMT ").isNotNull())
max_deposit_row = txn_df_filtered.agg({" DEPOSIT AMT ": "max"}).collect()[0]
max_deposit_amt = max_deposit_row[0]
formatted_max_deposit_amt = f"₹{max_deposit_amt:,.1f}"
print("Maximum Deposit Amount:", formatted_max_deposit_amt)

Maximum Deposit Amount: ₹544,800,000.0


In [0]:
# Minimum deposit amount of an account
from pyspark.sql.functions import col
txn_df_filtered = txn_df.filter(col(" DEPOSIT AMT ").isNotNull())
min_deposit_row = txn_df_filtered.agg({" DEPOSIT AMT ": "min"}).collect()[0]
min_deposit_amt = min_deposit_row[0]
formatted_min_deposit_amt = f"₹{min_deposit_amt:,.2f}"
print("Minimum Deposit Amount:", formatted_min_deposit_amt)

Minimum Deposit Amount: ₹0.01


In [0]:
# Sum of balance in every bank account
from pyspark.sql.functions import col, format_number
txn_df = txn_df.withColumn("BALANCE AMT", col("BALANCE AMT").cast("double"))
txn_df_filtered = txn_df.filter((col("BALANCE AMT") > 0) & (col("BALANCE AMT").isNotNull()))
result_df = txn_df_filtered.groupBy("Account no").agg({"BALANCE AMT": "sum"})
result_df = result_df.withColumnRenamed("sum(BALANCE AMT)", "Total Balance")
result_df = result_df.withColumn("Formatted Balance", format_number("Total Balance", 2))
result_df.show(truncate=False)

+-------------+--------------------+-----------------+
|Account no   |Total Balance       |Formatted Balance|
+-------------+--------------------+-----------------+
|409000611074'|1.615533622E9       |1,615,533,622.00 |
|409000425051'|8.649102501000117E8 |864,910,250.10   |
|409000493201'|1.0420831829499985E9|1,042,083,182.95 |
+-------------+--------------------+-----------------+



In [0]:
# Number of transactions on each date
from pyspark.sql.functions import col
txn_df_filtered = txn_df.filter(col("VALUE DATE").isNotNull())
txn_df_filtered.groupBy("VALUE DATE").count().show()

+----------+-----+
|VALUE DATE|count|
+----------+-----+
| 23-Dec-16|  143|
|  7-Feb-19|   98|
| 21-Jul-15|   80|
|  9-Sep-15|   91|
| 17-Jan-15|   16|
| 18-Nov-17|   53|
| 21-Feb-18|   77|
| 20-Mar-18|   71|
| 19-Apr-18|   71|
| 21-Jun-16|   97|
| 17-Oct-17|  101|
|  3-Jan-18|   70|
|  8-Jun-18|  223|
| 15-Dec-18|   62|
|  8-Aug-16|   97|
| 17-Dec-16|   74|
|  3-Sep-15|   83|
| 21-Jan-16|   76|
|  4-May-18|   92|
|  7-Sep-17|   94|
+----------+-----+
only showing top 20 rows



In [0]:
# List of customers with withdrawal amount more than 1 lakh
from pyspark.sql.functions import col
txn_df_filtered = txn_df.filter(col(" WITHDRAWAL AMT ").isNotNull())
txn_df.filter(txn_df[" WITHDRAWAL AMT "] > 100000).select("Account No").show()

+-------------+
|   Account No|
+-------------+
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
|409000611074'|
+-------------+
only showing top 20 rows

