# Credit Card Default Prediction
####The data set consists of 2000 samples from each of two categories. Five variables are

1. Income
2. Age
3. Loan
4. Loan to Income (engineered feature)
5. Default

In [0]:
dbutils.fs.ls("dbfs:/FileStore/tables/")

[FileInfo(path='dbfs:/FileStore/tables/Admission_Chance.csv', name='Admission_Chance.csv', size=12905, modificationTime=1720190058000),
 FileInfo(path='dbfs:/FileStore/tables/Cancer.csv', name='Cancer.csv', size=125204, modificationTime=1720190099000),
 FileInfo(path='dbfs:/FileStore/tables/Credit_Default.csv', name='Credit_Default.csv', size=101152, modificationTime=1720190106000),
 FileInfo(path='dbfs:/FileStore/tables/Customer_Purchase.csv', name='Customer_Purchase.csv', size=1489, modificationTime=1720190113000),
 FileInfo(path='dbfs:/FileStore/tables/Fish.csv', name='Fish.csv', size=6349, modificationTime=1720190119000),
 FileInfo(path='dbfs:/FileStore/tables/Ice_Cream.csv', name='Ice_Cream.csv', size=4872, modificationTime=1720190124000),
 FileInfo(path='dbfs:/FileStore/tables/Test1.csv', name='Test1.csv', size=108, modificationTime=1720158698000),
 FileInfo(path='dbfs:/FileStore/tables/Test2.csv', name='Test2.csv', size=192, modificationTime=1720158698000),
 FileInfo(path='dbfs:

In [0]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.sql.functions import col

In [0]:
spark = SparkSession.builder.appName('Credit Card Default Prediction').getOrCreate() 

In [0]:
spark

In [0]:
df_pyspark = spark.read.csv('dbfs:/FileStore/tables/Credit_Default.csv',header=True,inferSchema=True)

In [0]:
df_pyspark.printSchema()

root
 |-- Income: double (nullable = true)
 |-- Age: double (nullable = true)
 |-- Loan: double (nullable = true)
 |-- Loan to Income: double (nullable = true)
 |-- Default: integer (nullable = true)



In [0]:
df_pyspark

DataFrame[Income: double, Age: double, Loan: double, Loan to Income: double, Default: int]

In [0]:
df_pyspark.show()

+-----------+-----------+-----------+--------------+-------+
|     Income|        Age|       Loan|Loan to Income|Default|
+-----------+-----------+-----------+--------------+-------+
| 66155.9251|59.01701507|8106.532131|   0.122536751|      0|
|34415.15397| 48.1171531|6564.745018|   0.190751581|      0|
|57317.17006|63.10804949|8020.953296|     0.1399398|      0|
| 42709.5342|45.75197235| 6103.64226|   0.142910532|      0|
|66952.68885|18.58433593|8770.099235|     0.1309895|      1|
|24904.06414| 57.4716071|15.49859844|    6.22332E-4|      0|
|48430.35961|26.80913242|5722.581981|   0.118161047|      0|
|24500.14198|32.89754832| 2971.00331|   0.121264738|      1|
|40654.89254|55.49685254| 4755.82528|   0.116980392|      0|
|25075.87277|39.77637806|1409.230371|   0.056198657|      0|
|64131.41537|25.67957535|4351.028971|   0.067845516|      0|
|59436.84712|60.47193585|9254.244538|   0.155698779|      0|
|61050.34608|26.35504385|5893.264659|   0.096531224|      0|
|27267.99546|61.57677582

In [0]:
# Drop rows with null values if necessary
df_pyspark= df_pyspark.dropna()

In [0]:
# Assemble features into a vector
feature_cols = df_pyspark.columns[:4]  # Excluding 'diagnosis'
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_assembled = assembler.transform(df_pyspark)

In [0]:
df_assembled.select("features","Default").show()

+--------------------+-------+
|            features|Default|
+--------------------+-------+
|[66155.9251,59.01...|      0|
|[34415.15397,48.1...|      0|
|[57317.17006,63.1...|      0|
|[42709.5342,45.75...|      0|
|[66952.68885,18.5...|      1|
|[24904.06414,57.4...|      0|
|[48430.35961,26.8...|      0|
|[24500.14198,32.8...|      1|
|[40654.89254,55.4...|      0|
|[25075.87277,39.7...|      0|
|[64131.41537,25.6...|      0|
|[59436.84712,60.4...|      0|
|[61050.34608,26.3...|      0|
|[27267.99546,61.5...|      0|
|[63061.96017,39.2...|      0|
|[50501.72669,28.2...|      0|
|[43548.65471,39.5...|      0|
|[43378.17519,60.8...|      0|
|[20542.36507,61.6...|      0|
|[58887.35755,26.0...|      0|
+--------------------+-------+
only showing top 20 rows



In [0]:
# Split data into training and testing sets
train_data, test_data = df_assembled.randomSplit([0.8, 0.2], seed=42)

In [0]:
# Initialize logistic regression model
lr = LogisticRegression(featuresCol="features", labelCol="Default")

# Fit the model
lr_model = lr.fit(train_data)

In [0]:
# Make predictions
predictions = lr_model.transform(test_data)

# Evaluate the model using BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol="Default")
accuracy_binary = evaluator.evaluate(predictions)

# Compute confusion matrix
confusion_matrix = predictions.groupBy('Default').pivot('prediction').count().na.fill(0).orderBy('Default')
confusion_matrix.show()

# Compute classification report (precision, recall, f1-score)
tp = predictions.filter((col("Default") == 1.0) & (col("prediction") == 1.0)).count()
tn = predictions.filter((col("Default") == 0.0) & (col("prediction") == 0.0)).count()
fp = predictions.filter((col("Default") == 0.0) & (col("prediction") == 1.0)).count()
fn = predictions.filter((col("Default") == 1.0) & (col("prediction") == 0.0)).count()

precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Binary Classification Accuracy: {accuracy_binary}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1_score}")

+-------+---+---+
|Default|0.0|1.0|
+-------+---+---+
|      0|294|  8|
|      1| 16| 40|
+-------+---+---+

Binary Classification Accuracy: 0.9804280983916746
Precision: 0.8333333333333334
Recall: 0.7142857142857143
F1-score: 0.7692307692307692


In [0]:
# Save the trained logistic regression model
model_path = "./Internship_Sem-6_models/Credit_Card_Default_Prediction_model"
lr_model.save(model_path)

In [0]:
dbutils.fs.ls("dbfs:/Internship_Sem-6_models/Credit_Card_Default_Prediction_model")

[FileInfo(path='dbfs:/Internship_Sem-6_models/Credit_Card_Default_Prediction_model/data/', name='data/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/Internship_Sem-6_models/Credit_Card_Default_Prediction_model/metadata/', name='metadata/', size=0, modificationTime=0)]