In [0]:
from pyspark.sql import SparkSession

# Initialize Spark Session
spark = SparkSession.builder.appName("Local File Loading").getOrCreate()


loan_df = spark.read.csv("/FileStore/tables/loan.csv", header=True, inferSchema=True)
credit_df = spark.read.csv("/FileStore/tables/credit_card-1.csv", header=True, inferSchema=True)
txn_df = spark.read.csv("/FileStore/tables/txn.csv", header=True, inferSchema=True)

# Display the first few rows of each dataset
loan_df.show(5)
credit_df.show(5)
txn_df.show(5)


+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| Dishonour of Bill|
+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|    IB14001| 30|  MALE|BANK MANAGER|        SINGLE|          4| 50000|      22199|            6|      HOUSING| 10,00,000 |      5|      42,898|               6|                 9|
|    IB14008| 44|  MALE|   PROFESSOR|       MARRIED|          6| 51000|      19999|            4|     SHOPPING|     50,000|      3|      33,999|               1|                 5|
|    IB14012| 30|FEMALE|     DENTIST|        SINGLE|          3| 58450|      27675|            

In [0]:
# Number of loans in each category
loan_df.groupBy("`Loan Category`").count().show()

+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [0]:
# Number of people with income > 60,000
loan_df.filter(loan_df["Income"] > 60000).count()

Out[13]: 198

In [0]:
# People with 2+ returned cheques and income < 50,000
loan_df.filter((loan_df["` Returned Cheque`"] >= 2) & (loan_df["Income"] < 50000)).count()

Out[16]: 137

In [0]:
# People with monthly expenditure > 50,000
loan_df.filter(loan_df["Expenditure"] > 50000).count()

Out[20]: 6

In [0]:
# People with 2+ returned cheques and are single
loan_df.filter((loan_df["` Returned Cheque`"] >= 2) & (loan_df["`Marital Status`"] == "Single")).count()

Out[19]: 0

In [0]:
# Credit card users in Spain
credit_df.filter(credit_df["`Geography`"] == "Spain").count()

Out[26]: 2477

In [0]:
# Assuming "eligible" is determined by a CreditScore > 650
credit_df.filter((credit_df["CreditScore"] > 650) & (credit_df["IsActiveMember"] == 1)).count()

Out[28]: 2655

In [0]:
# Maximum withdrawal amount
txn_df.select("` WITHDRAWAL AMT `").agg({"` WITHDRAWAL AMT `": "max"}).show()



+---------------------+
|max( WITHDRAWAL AMT )|
+---------------------+
|        4.594475464E8|
+---------------------+



In [0]:
# Minimum withdrawal amount
txn_df.select("` WITHDRAWAL AMT `").agg({"` WITHDRAWAL AMT `": "min"}).show()

+---------------------+
|min( WITHDRAWAL AMT )|
+---------------------+
|                 0.01|
+---------------------+



In [0]:
# Maximum deposit amount
txn_df.select("` DEPOSIT AMT `").agg({"` DEPOSIT AMT `": "max"}).show()

+------------------+
|max( DEPOSIT AMT )|
+------------------+
|           5.448E8|
+------------------+



In [0]:
# Minimum deposit amount
txn_df.select("` DEPOSIT AMT `").agg({"` DEPOSIT AMT `": "min"}).show()

+------------------+
|min( DEPOSIT AMT )|
+------------------+
|              0.01|
+------------------+



In [0]:
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Sum of Balances from CSV") \
    .getOrCreate()

# Load the CSV file
file_path = "/FileStore/tables/txn.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)
total_balance = df.agg({"balance amt": "sum"}).collect()[0][0]
print(f"Total balance across all accounts: {total_balance}")

Total balance across all accounts: -163245212011488.44


In [0]:
# Number of transactions per date
txn_df.groupBy("VALUE DATE").count().show()

+----------+-----+
|VALUE DATE|count|
+----------+-----+
| 23-Dec-16|  143|
|  7-Feb-19|   98|
| 21-Jul-15|   80|
|  9-Sep-15|   91|
| 17-Jan-15|   16|
| 18-Nov-17|   53|
| 21-Feb-18|   77|
| 20-Mar-18|   71|
| 19-Apr-18|   71|
| 21-Jun-16|   97|
| 17-Oct-17|  101|
|  3-Jan-18|   70|
|  8-Jun-18|  223|
| 15-Dec-18|   62|
|  8-Aug-16|   97|
| 17-Dec-16|   74|
|  3-Sep-15|   83|
| 21-Jan-16|   76|
|  4-May-18|   92|
|  7-Sep-17|   94|
+----------+-----+
only showing top 20 rows



In [0]:
# Customers with withdrawal > 1 lakh
txn_df.filter(txn_df["` WITHDRAWAL AMT `"] > 100000).select("Account No").distinct().show()

+-------------+
|   Account No|
+-------------+
|409000438611'|
|     1196711'|
|     1196428'|
|409000493210'|
|409000611074'|
|409000425051'|
|409000405747'|
|409000493201'|
|409000438620'|
|409000362497'|
+-------------+

