In [0]:
# Import necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, lit

# Initialize Spark session
spark = SparkSession.builder.appName("LoanDataAnalysis").getOrCreate()

# Load the CSV file
file_path = "/FileStore/tables/Loan/loan.csv"  # Replace with your actual file path
loan_df = spark.read.option("header", "true").csv(file_path, inferSchema=True)

# Display the schema and a sample of the data
loan_df.printSchema()
loan_df.show(5)


root
 |-- Customer_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Occupation: string (nullable = true)
 |-- Marital Status: string (nullable = true)
 |-- Family Size: integer (nullable = true)
 |-- Income: integer (nullable = true)
 |-- Expenditure: integer (nullable = true)
 |-- Use Frequency: integer (nullable = true)
 |-- Loan Category: string (nullable = true)
 |-- Loan Amount: string (nullable = true)
 |-- Overdue: integer (nullable = true)
 |--  Debt Record: string (nullable = true)
 |--  Returned Cheque: integer (nullable = true)
 |--  Dishonour of Bill: integer (nullable = true)

+-----------+---+------+------------+--------------+-----------+------+-----------+-------------+-------------+-----------+-------+------------+----------------+------------------+
|Customer_ID|Age|Gender|  Occupation|Marital Status|Family Size|Income|Expenditure|Use Frequency|Loan Category|Loan Amount|Overdue| Debt Record| Returned Cheque| D

In [0]:
#Number of loans in each category
loan_category_count = loan_df.groupBy("Loan Category").count()
loan_category_count.show()


+------------------+-----+
|     Loan Category|count|
+------------------+-----+
|           HOUSING|   67|
|        TRAVELLING|   53|
|       BOOK STORES|    7|
|       AGRICULTURE|   12|
|         GOLD LOAN|   77|
|  EDUCATIONAL LOAN|   20|
|        AUTOMOBILE|   60|
|          BUSINESS|   24|
|COMPUTER SOFTWARES|   35|
|           DINNING|   14|
|          SHOPPING|   35|
|       RESTAURANTS|   41|
|       ELECTRONICS|   14|
|          BUILDING|    7|
|        RESTAURANT|   20|
|   HOME APPLIANCES|   14|
+------------------+-----+



In [0]:
#Number of people who have taken more than 1 lakh loan
high_loan_count = loan_df.filter(col("Loan Amount") > 100000).count()
print(f"Number of people who have taken more than 1 lakh loan: {high_loan_count}")



Number of people who have taken more than 1 lakh loan: 0


In [0]:
#Number of people with income greater than 60,000 rupees
high_income_count = loan_df.filter(col("Income") > 60000).count()
print(f"Number of people with income greater than 60,000 rupees: {high_income_count}")


Number of people with income greater than 60,000 rupees: 198


In [0]:
#Number of people with 2 or more returned cheques and income less than 50,000
cheques_and_income_count = loan_df.filter((col(" Returned Cheque") >= 2) & (col("Income") < 50000)).count()
print(f"Number of people with 2 or more returned cheques and income less than 50,000: {cheques_and_income_count}")


Number of people with 2 or more returned cheques and income less than 50,000: 137


In [0]:
#Number of people with 2 or more returned cheques and are single
cheques_and_single_count = loan_df.filter((col(" Returned Cheque") >= 2) & (col("Marital Status") == "Single")).count()
print(f"Number of people with 2 or more returned cheques and are single: {cheques_and_single_count}")


Number of people with 2 or more returned cheques and are single: 0


In [0]:
#Number of people with expenditure over 50,000 a month
high_expenditure_count = loan_df.filter(col("Expenditure") > 50000).count()
print(f"Number of people with expenditure over 50,000 a month: {high_expenditure_count}")


Number of people with expenditure over 50,000 a month: 6


In [0]:
#Number of members eligible for a credit card
credit_card_eligible = loan_df.filter((col("Income") > 50000) & (col(" Returned Cheque") == 0)).count()
print(f"Number of members eligible for a credit card: {credit_card_eligible}")


Number of members eligible for a credit card: 22
