In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Start SparkSession
spark = SparkSession.builder \
    .appName("Online Banking Analysis") \
    .getOrCreate()
# Load loan.csv
loan_df = spark.read.csv("/content/loan.csv", header=True, inferSchema=True)

# Load credit card.csv
credit_df = spark.read.csv("/content/credit card.csv", header=True, inferSchema=True)

# Load txn.csv
txn_df = spark.read.csv("/content/txn.csv", header=True, inferSchema=True)

# Display schema
loan_df.printSchema()
credit_df.printSchema()
txn_df.printSchema()

# Show first few records
loan_df.show(5)
credit_df.show(5)
txn_df.show(5)



root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: string (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)

root
 |-- RowNumber: integer (nullable = true)
 |-- CustomerId: integer (nullable = true)
 |-- Surname: string (nullable = true)
 |-- CreditScore: integer (nullable = true)
 |-- Geography: string (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Tenure: integer (nullable = true)
 |-- Balance: dou

In [4]:
# 1. Number of loans in each category
loan_df.groupBy("Loan_Category").count().show()



+------------------+-----+
|     Loan_Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [3]:
for col_name in loan_df.columns:
    loan_df = loan_df.withColumnRenamed(col_name, col_name.strip().replace(" ", "_").replace("-", "_"))


In [6]:
# Filtering customers who took a loan greater than 1 lakh
loan_df.filter(loan_df["Loan_Amount"] > 100000).count()

0

In [7]:
# Filtering rows where income is greater than 60,000
loan_df.filter(col("Income") > 60000).count()

198

In [8]:
# Applying multiple conditions using logical AND (&)
loan_df.filter((col("Returned_Cheque") >= 2) & (col("Income") < 50000)).count()

137

In [9]:
# Filtering by returned cheques and marital status
loan_df.filter((col("Returned_Cheque") >= 2) & (col("Marital_Status") == "Single")).count()

0

In [10]:
# Filtering high monthly spenders
loan_df.filter(col("Expenditure") > 50000).count()

6

In [11]:
# Example rule: Eligible if debt record is good and they use banking services frequently
loan_df.filter((col("Use_Frequency") > 10) & (col("Debt_Record") == "Good")).count()

0

In [12]:
for col_name in credit_df.columns:
    credit_df = credit_df.withColumnRenamed(col_name, col_name.strip().replace(" ", "_").replace("-", "_"))


In [13]:
for col_name in txn_df.columns:
    txn_df = txn_df.withColumnRenamed(col_name, col_name.strip().replace(" ", "_").replace("-", "_"))


In [15]:
# 1. Show number of credit card users located in Spain

credit_df.filter(col("Geography") == "Spain").count()



2477

In [16]:
# Example threshold: Credit Score >= 650 is eligible
credit_df.filter((col("CreditScore") >= 650) & (col("IsActiveMember") == 1)).count()


2672

In [17]:
credit_df.filter((col("CreditScore") >= 650) & (col("IsActiveMember") == 1)).show()


+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|RowNumber|CustomerId|  Surname|CreditScore|Geography|Gender|Age|Tenure|  Balance|NumOfProducts|IsActiveMember|EstimatedSalary|Exited|
+---------+----------+---------+-----------+---------+------+---+------+---------+-------------+--------------+---------------+------+
|        5|  15737888| Mitchell|        850|    Spain|Female| 43|     2|125510.82|            1|             1|        79084.1|     0|
|        7|  15592531| Bartlett|        822|   France|  Male| 50|     7|      0.0|            2|             1|        10062.8|     0|
|       10|  15592389|       H?|        684|   France|  Male| 27|     2|134603.88|            1|             1|       71725.73|     0|
|       20|  15568982|      Hao|        726|   France|Female| 24|     6|      0.0|            2|             1|       54724.03|     0|
|       21|  15577657| McDonald|        732|   France| 

In [18]:
# Assuming 'TransactionType' column has values like 'withdrawal' or 'deposit'
from pyspark.sql.functions import col, max

txn_df.filter(col("TransactionType") == "withdrawal") \
      .agg(max("Amount").alias("Max_Withdrawal")).show()


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `TransactionType` cannot be resolved. Did you mean one of the following? [`Account_No`, `BALANCE_AMT`, `DEPOSIT_AMT`, `VALUE_DATE`, `WITHDRAWAL_AMT`].;
'Filter ('TransactionType = withdrawal)
+- Project [Account_No#894, TRANSACTION_DETAILS#901, VALUE_DATE#908, WITHDRAWAL_AMT#915, DEPOSIT_AMT#922, BALANCE AMT#112 AS BALANCE_AMT#929]
   +- Project [Account_No#894, TRANSACTION_DETAILS#901, VALUE_DATE#908, WITHDRAWAL_AMT#915,  DEPOSIT AMT #111 AS DEPOSIT_AMT#922, BALANCE AMT#112]
      +- Project [Account_No#894, TRANSACTION_DETAILS#901, VALUE_DATE#908,  WITHDRAWAL AMT #110 AS WITHDRAWAL_AMT#915,  DEPOSIT AMT #111, BALANCE AMT#112]
         +- Project [Account_No#894, TRANSACTION_DETAILS#901, VALUE DATE#109 AS VALUE_DATE#908,  WITHDRAWAL AMT #110,  DEPOSIT AMT #111, BALANCE AMT#112]
            +- Project [Account_No#894, TRANSACTION DETAILS#108 AS TRANSACTION_DETAILS#901, VALUE DATE#109,  WITHDRAWAL AMT #110,  DEPOSIT AMT #111, BALANCE AMT#112]
               +- Project [Account No#107 AS Account_No#894, TRANSACTION DETAILS#108, VALUE DATE#109,  WITHDRAWAL AMT #110,  DEPOSIT AMT #111, BALANCE AMT#112]
                  +- Relation [Account No#107,TRANSACTION DETAILS#108,VALUE DATE#109, WITHDRAWAL AMT #110, DEPOSIT AMT #111,BALANCE AMT#112] csv


In [19]:
 #1. Maximum Withdrawal Amount
txn_df.select(max("WITHDRAWAL_AMT").alias("Max_Withdrawal_Amount")).show()

+---------------------+
|Max_Withdrawal_Amount|
+---------------------+
|        4.594475464E8|
+---------------------+



In [20]:
#Minimum Withdrawal Amount
txn_df.select(min("WITHDRAWAL_AMT").alias("Min_Withdrawal_Amount")).show()

+---------------------+
|Min_Withdrawal_Amount|
+---------------------+
|                 0.01|
+---------------------+



In [21]:
#Maximum Deposit Amount
txn_df.select(max("DEPOSIT_AMT").alias("Max_Deposit_Amount")).show()


+------------------+
|Max_Deposit_Amount|
+------------------+
|           5.448E8|
+------------------+



In [22]:
# Minimum Deposit Amount
txn_df.select(min("DEPOSIT_AMT").alias("Min_Deposit_Amount")).show()

+------------------+
|Min_Deposit_Amount|
+------------------+
|              0.01|
+------------------+



In [23]:
 #Sum of Balance Amount in Each Bank Account

txn_df.groupBy("Account_No") \
      .sum("BALANCE_AMT") \
      .withColumnRenamed("sum(BALANCE_AMT)", "Total_Balance") \
      .show()


+-------------+--------------------+
|   Account_No|       Total_Balance|
+-------------+--------------------+
|409000438611'|-2.49486577068339...|
|     1196711'|-1.60476498101275E13|
|     1196428'| -8.1418498130721E13|
|409000493210'|-3.27584952132095...|
|409000611074'|       1.615533622E9|
|409000425051'|-3.77211841164998...|
|409000405747'|-2.43108047067000...|
|409000362497'| -5.2860004792808E13|
|409000493201'|1.0420831829499985E9|
|409000438620'|-7.12291867951358...|
+-------------+--------------------+



In [24]:
# Number of Transactions Happening on Each Date

txn_df.groupBy("VALUE_DATE") \
      .agg(count("*").alias("Transaction_Count")) \
      .orderBy("VALUE_DATE") \
      .show()


+----------+-----------------+
|VALUE_DATE|Transaction_Count|
+----------+-----------------+
|  1-Apr-17|                1|
|  1-Aug-15|               75|
|  1-Aug-16|               85|
|  1-Aug-17|               65|
|  1-Aug-18|              144|
|  1-Dec-15|               96|
|  1-Dec-16|              106|
|  1-Dec-17|               45|
|  1-Dec-18|               97|
|  1-Feb-16|               97|
|  1-Feb-17|               81|
|  1-Feb-18|               87|
|  1-Feb-19|               79|
|  1-Jan-15|                3|
|  1-Jan-16|               59|
|  1-Jan-18|               53|
|  1-Jan-19|               57|
|  1-Jul-15|               25|
|  1-Jul-16|              111|
|  1-Jul-17|              243|
+----------+-----------------+
only showing top 20 rows



In [28]:
# Customers With Withdrawal Amount Greater Than ₹1,00,000

txn_df.filter(col("WITHDRAWAL_AMT") > 100000) \
      .select("Account_No", "WITHDRAWAL_AMT", "VALUE_DATE") \
      .orderBy(col("WITHDRAWAL_AMT").desc()) \
      .show()

+-------------+--------------+----------+
|   Account_No|WITHDRAWAL_AMT|VALUE_DATE|
+-------------+--------------+----------+
|     1196711'| 4.594475464E8| 26-Jun-18|
|     1196711'| 4.482072231E8| 26-May-17|
|409000438620'|         4.0E8|  8-Mar-16|
|409000425051'|        3.54E8| 31-Oct-18|
|     1196711'| 2.671403184E8| 20-Jun-15|
|409000438611'|         2.4E8| 31-Mar-16|
|     1196711'|       2.021E8|  3-May-16|
|409000438620'|         2.0E8| 11-Mar-16|
|409000438620'|         2.0E8| 18-Mar-16|
|     1196711'|         2.0E8| 21-Oct-15|
|     1196711'|         2.0E8|  3-Oct-15|
|409000405747'|         1.7E8| 30-Jan-16|
|     1196711'|        1.54E8| 24-Sep-15|
|     1196711'|         1.5E8| 22-Aug-15|
|     1196711'|         1.5E8| 17-Oct-15|
|     1196711'|         1.5E8|  7-Apr-16|
|     1196428'|         1.5E8| 13-Apr-16|
|409000362497'| 1.413662392E8| 16-Aug-16|
|409000362497'| 1.317762365E8| 14-Sep-16|
|409000362497'| 1.316962119E8| 10-Oct-16|
+-------------+--------------+----