In [30]:
import os
os.environ["JAVA_HOME"] = "C:\\jdk-22.0.2"
os.environ["SPARK_HOME"] = "C:\\spark-3.5.2-bin-hadoop3\\spark-3.5.2-bin-hadoop3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_PYTHON"] = "python"

In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, lit, regexp_extract
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from sklearn.metrics import roc_curve, auc
import pandas as pd
import numpy as np

In [32]:
spark = SparkSession.builder.appName('Pyspark_Loan').getOrCreate()

In [33]:
df = spark.read.csv('C:\\01-FaculdadeSemestreAtual\\Processamento_de_Dados_Massivos\\Loan_Approval\\train.csv',header = 'True',inferSchema='True')
final_test = spark.read.csv('C:\\01-FaculdadeSemestreAtual\\Processamento_de_Dados_Massivos\\Loan_Approval\\test.csv',header = 'True',inferSchema='True')

In [34]:
df.show(5)

+---+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+
| id|person_age|person_income|person_home_ownership|person_emp_length|loan_intent|loan_grade|loan_amnt|loan_int_rate|loan_percent_income|cb_person_default_on_file|cb_person_cred_hist_length|loan_status|
+---+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+
|  0|        37|        35000|                 RENT|              0.0|  EDUCATION|         B|     6000|        11.49|               0.17|                        N|                        14|          0|
|  1|        22|        56000|                  OWN|              6.0|    MEDICAL|         C|     4000|        13.35|               0.07|                        N|                         

In [35]:
df.count()

58645

In [36]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- person_age: integer (nullable = true)
 |-- person_income: integer (nullable = true)
 |-- person_home_ownership: string (nullable = true)
 |-- person_emp_length: double (nullable = true)
 |-- loan_intent: string (nullable = true)
 |-- loan_grade: string (nullable = true)
 |-- loan_amnt: integer (nullable = true)
 |-- loan_int_rate: double (nullable = true)
 |-- loan_percent_income: double (nullable = true)
 |-- cb_person_default_on_file: string (nullable = true)
 |-- cb_person_cred_hist_length: integer (nullable = true)
 |-- loan_status: integer (nullable = true)



In [37]:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+
| id|person_age|person_income|person_home_ownership|person_emp_length|loan_intent|loan_grade|loan_amnt|loan_int_rate|loan_percent_income|cb_person_default_on_file|cb_person_cred_hist_length|loan_status|
+---+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+
|  0|         0|            0|                    0|                0|          0|         0|        0|            0|                  0|                        0|                         0|          0|
+---+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-------------------+-------------------------+-------------------------

In [38]:
df.limit(100).toPandas().sample(10)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
30,30,30,48000,RENT,12.0,EDUCATION,B,2000,9.62,0.04,N,8,0
57,57,37,60000,MORTGAGE,11.0,VENTURE,A,2000,7.66,0.03,N,17,0
68,68,31,62900,MORTGAGE,2.0,MEDICAL,D,18000,14.09,0.24,N,5,1
63,63,22,50000,RENT,5.0,PERSONAL,C,4200,15.23,0.08,Y,2,0
65,65,25,90000,MORTGAGE,2.0,VENTURE,A,15000,9.38,0.17,N,4,0
94,94,22,33000,RENT,4.0,PERSONAL,B,9800,10.62,0.28,N,4,0
38,38,30,54000,RENT,0.0,MEDICAL,B,12500,11.71,0.24,N,10,1
52,52,27,70000,RENT,9.0,EDUCATION,A,20000,6.54,0.29,N,9,0
82,82,24,75000,RENT,3.0,PERSONAL,B,15000,11.49,0.2,N,2,0
17,17,29,60000,OWN,13.0,MEDICAL,A,15000,6.62,0.25,N,9,0


In [39]:
df.groupby('loan_intent').count().show()

+-----------------+-----+
|      loan_intent|count|
+-----------------+-----+
|DEBTCONSOLIDATION| 9133|
|          VENTURE|10011|
|         PERSONAL|10016|
|        EDUCATION|12271|
|  HOMEIMPROVEMENT| 6280|
|          MEDICAL|10934|
+-----------------+-----+



In [40]:
df.groupby('loan_grade').count().show()

+----------+-----+
|loan_grade|count|
+----------+-----+
|         F|  149|
|         E| 1009|
|         B|20400|
|         D| 5034|
|         C|11036|
|         A|20984|
|         G|   33|
+----------+-----+



In [41]:
df.groupBy('loan_status').count().show()

+-----------+-----+
|loan_status|count|
+-----------+-----+
|          1| 8350|
|          0|50295|
+-----------+-----+



In [42]:
# Definir as colunas categóricas se deseja codificar
categorical_columns = ["loan_grade", "person_home_ownership", "loan_intent", 'cb_person_default_on_file']

# Criar os indexadores
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in categorical_columns]

# Criar os OneHotEncoders
encoders = [OneHotEncoder(inputCol=column + "_index", outputCol=column + "_onehot") for column in ["person_home_ownership", "loan_intent"]]

# Criar um pipeline que aplica primeiro os indexadores e depois os encoders
pipeline = Pipeline(stages=indexers + encoders)

# Ajustar e transformar o DataFrame com o pipeline
df_encoded = pipeline.fit(df).transform(df)

# Mostrar o resultado
df_encoded.show()


+---+----------+-------------+---------------------+-----------------+-----------------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+----------------+---------------------------+-----------------+-------------------------------+----------------------------+------------------+
| id|person_age|person_income|person_home_ownership|person_emp_length|      loan_intent|loan_grade|loan_amnt|loan_int_rate|loan_percent_income|cb_person_default_on_file|cb_person_cred_hist_length|loan_status|loan_grade_index|person_home_ownership_index|loan_intent_index|cb_person_default_on_file_index|person_home_ownership_onehot|loan_intent_onehot|
+---+----------+-------------+---------------------+-----------------+-----------------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+----------------+---------------------------+-----------------+-------------------------

In [43]:
# Agora, são elimindas as colunas que não são úteis
df_encoded = df_encoded.drop("id","person_home_ownership","loan_intent","loan_grade","cb_person_default_on_file", 'person_home_ownership_index', 'loan_intent_index')

# Antes de modelar, todas as colunas são colocadas em único vetor
feature = VectorAssembler(inputCols = df_encoded.columns[1:],outputCol="features")
feature_vector=feature.transform(df_encoded)
feature_vector.limit(3).toPandas()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status,loan_grade_index,cb_person_default_on_file_index,person_home_ownership_onehot,loan_intent_onehot,features
0,37,35000,0.0,6000,11.49,0.17,14,0,1.0,0.0,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(35000.0, 0.0, 6000.0, 11.49, 0.17, 14.0, 0.0,..."
1,22,56000,6.0,4000,13.35,0.07,2,0,2.0,0.0,"(0.0, 0.0, 1.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(56000.0, 6.0, 4000.0, 13.35, 0.07, 2.0, 0.0, ..."
2,29,28800,8.0,6000,8.9,0.21,10,0,0.0,0.0,"(0.0, 0.0, 1.0)","(0.0, 0.0, 1.0, 0.0, 0.0)","(28800.0, 8.0, 6000.0, 8.9, 0.21, 10.0, 0.0, 0..."


In [44]:
# Seleciona as colunas usadas no treinamento
loan_df = feature_vector.select(['features','loan_status'])
# Split the dataset to train_df and test_df
train_df,test_df = loan_df.randomSplit([0.7,0.3])

In [45]:
loan_df.groupBy('loan_status').count().show()

+-----------+-----+
|loan_status|count|
+-----------+-----+
|          1| 8350|
|          0|50295|
+-----------+-----+



In [46]:
test_df.groupBy('loan_status').count().show()

+-----------+-----+
|loan_status|count|
+-----------+-----+
|          1| 2505|
|          0|15091|
+-----------+-----+



In [47]:
train_df.groupBy('loan_status').count().show()

+-----------+-----+
|loan_status|count|
+-----------+-----+
|          1| 5845|
|          0|35204|
+-----------+-----+



In [48]:
lr = LogisticRegression(labelCol="loan_status")

model = lr.fit(train_df)
model_predictions= model.transform(test_df)

print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='accuracy').evaluate(model_predictions))
print('Precision: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='weightedPrecision').evaluate(model_predictions))
print('Recall: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedRecall').evaluate(model_predictions))
print('F1-Score: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='f1').evaluate(model_predictions))

Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1-Score:  1.0


In [49]:
rf = RandomForestClassifier(labelCol='loan_status')

model = rf.fit(train_df)
model_predictions= model.transform(test_df)


print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='accuracy').evaluate(model_predictions))
print('Precision: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='weightedPrecision').evaluate(model_predictions))
print('Recall: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedRecall').evaluate(model_predictions))
print('F1-Score: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='f1').evaluate(model_predictions))

Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1-Score:  1.0


In [50]:
final_test.show()

+-----+----------+-------------+---------------------+-----------------+-----------------+----------+---------+-------------+-------------------+-------------------------+--------------------------+
|   id|person_age|person_income|person_home_ownership|person_emp_length|      loan_intent|loan_grade|loan_amnt|loan_int_rate|loan_percent_income|cb_person_default_on_file|cb_person_cred_hist_length|
+-----+----------+-------------+---------------------+-----------------+-----------------+----------+---------+-------------+-------------------+-------------------------+--------------------------+
|58645|        23|        69000|                 RENT|              3.0|  HOMEIMPROVEMENT|         F|    25000|        15.76|               0.36|                        N|                         2|
|58646|        26|        96000|             MORTGAGE|              6.0|         PERSONAL|         C|    10000|        12.68|                0.1|                        Y|                         4|
|5864

In [52]:
# Transformando o final_test da mesma maneira que foi feito no treino
final_test_transformed = pipeline.fit(final_test).transform(final_test)

# Agora, são elimindas as colunas que não são úteis
final_test_transformed = final_test_transformed.drop("id","person_home_ownership","loan_intent","loan_grade","cb_person_default_on_file", 'person_home_ownership_index', 'loan_intent_index')

# Antes de modelar, todas as colunas são colocadas em único vetor
feature = VectorAssembler(inputCols = final_test_transformed.columns[1:],outputCol="features")
feature_vector=feature.transform(final_test_transformed)

In [53]:
feature_vector.show()

+----------+-------------+-----------------+---------+-------------+-------------------+--------------------------+----------------+-------------------------------+----------------------------+------------------+--------------------+
|person_age|person_income|person_emp_length|loan_amnt|loan_int_rate|loan_percent_income|cb_person_cred_hist_length|loan_grade_index|cb_person_default_on_file_index|person_home_ownership_onehot|loan_intent_onehot|            features|
+----------+-------------+-----------------+---------+-------------+-------------------+--------------------------+----------------+-------------------------------+----------------------------+------------------+--------------------+
|        23|        69000|              3.0|    25000|        15.76|               0.36|                         2|             5.0|                            0.0|               (3,[0],[1.0])|         (5,[],[])|(16,[0,1,2,3,4,5,...|
|        26|        96000|              6.0|    10000|        12

In [64]:
feature_vector = feature_vector.withColumn("loan_status", lit(None).cast(IntegerType()))

In [None]:
# Usando o modelo treinado para prever no final_test
predictions = model.transform(feature_vector)


print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='accuracy').evaluate(predictions))
print('Precision: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='weightedPrecision').evaluate(predictions))
print('Recall: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedRecall').evaluate(predictions))
print('F1-Score: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='f1').evaluate(predictions))