In [17]:
import os
os.environ["JAVA_HOME"] = "C:\\jdk-22.0.2"
os.environ["SPARK_HOME"] = "C:\\spark-3.5.2-bin-hadoop3\\spark-3.5.2-bin-hadoop3"
os.environ["PYSPARK_DRIVER_PYTHON"] = "jupyter"
os.environ["PYSPARK_PYTHON"] = "python"

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col, lit, regexp_extract
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import BinaryClassificationMetrics as metric
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from sklearn.metrics import roc_curve, auc
import pandas as pd
import numpy as np

In [19]:
spark = SparkSession.builder.appName('Pyspark_Loan').getOrCreate()

In [20]:
df = spark.read.csv('C:\\01-FaculdadeSemestreAtual\\Processamento_de_Dados_Massivos\\Loan_Approval\\train.csv',header = 'True',inferSchema='True')
final_test = spark.read.csv('C:\\01-FaculdadeSemestreAtual\\Processamento_de_Dados_Massivos\\Loan_Approval\\test.csv',header = 'True',inferSchema='True')

In [21]:
df.show(5)

+---+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+
| id|person_age|person_income|person_home_ownership|person_emp_length|loan_intent|loan_grade|loan_amnt|loan_int_rate|loan_percent_income|cb_person_default_on_file|cb_person_cred_hist_length|loan_status|
+---+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+
|  0|        37|        35000|                 RENT|              0.0|  EDUCATION|         B|     6000|        11.49|               0.17|                        N|                        14|          0|
|  1|        22|        56000|                  OWN|              6.0|    MEDICAL|         C|     4000|        13.35|               0.07|                        N|                         

In [22]:
df.count()

58645

In [23]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- person_age: integer (nullable = true)
 |-- person_income: integer (nullable = true)
 |-- person_home_ownership: string (nullable = true)
 |-- person_emp_length: double (nullable = true)
 |-- loan_intent: string (nullable = true)
 |-- loan_grade: string (nullable = true)
 |-- loan_amnt: integer (nullable = true)
 |-- loan_int_rate: double (nullable = true)
 |-- loan_percent_income: double (nullable = true)
 |-- cb_person_default_on_file: string (nullable = true)
 |-- cb_person_cred_hist_length: integer (nullable = true)
 |-- loan_status: integer (nullable = true)



In [24]:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+---+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+
| id|person_age|person_income|person_home_ownership|person_emp_length|loan_intent|loan_grade|loan_amnt|loan_int_rate|loan_percent_income|cb_person_default_on_file|cb_person_cred_hist_length|loan_status|
+---+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+
|  0|         0|            0|                    0|                0|          0|         0|        0|            0|                  0|                        0|                         0|          0|
+---+----------+-------------+---------------------+-----------------+-----------+----------+---------+-------------+-------------------+-------------------------+-------------------------

In [25]:
df.limit(100).toPandas().sample(10)

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
93,93,27,70000,MORTGAGE,0.0,EDUCATION,C,6000,12.53,0.09,Y,8,0
79,79,24,90000,MORTGAGE,8.0,PERSONAL,A,10000,5.99,0.11,N,2,0
74,74,23,100000,MORTGAGE,7.0,EDUCATION,A,6000,7.74,0.06,N,3,0
53,53,23,48000,RENT,7.0,MEDICAL,A,8000,8.9,0.17,N,4,1
12,12,25,33000,MORTGAGE,1.0,EDUCATION,B,4000,10.75,0.12,N,3,0
20,20,28,45000,OWN,0.0,PERSONAL,A,3200,7.88,0.07,N,5,0
85,85,30,74000,MORTGAGE,14.0,DEBTCONSOLIDATION,D,7000,15.62,0.09,N,7,1
39,39,22,32000,RENT,6.0,EDUCATION,D,8000,15.58,0.25,N,3,1
83,83,37,75000,RENT,0.0,PERSONAL,A,10000,6.99,0.13,N,15,0
60,60,26,54000,RENT,1.0,PERSONAL,A,9600,9.32,0.18,N,4,1


In [26]:
df.groupby('loan_intent').count().show()

+-----------------+-----+
|      loan_intent|count|
+-----------------+-----+
|DEBTCONSOLIDATION| 9133|
|          VENTURE|10011|
|         PERSONAL|10016|
|        EDUCATION|12271|
|  HOMEIMPROVEMENT| 6280|
|          MEDICAL|10934|
+-----------------+-----+



In [27]:
df.groupby('loan_grade').count().show()

+----------+-----+
|loan_grade|count|
+----------+-----+
|         F|  149|
|         E| 1009|
|         B|20400|
|         D| 5034|
|         C|11036|
|         A|20984|
|         G|   33|
+----------+-----+



In [28]:
# Definir as colunas categóricas se deseja codificar
categorical_columns = ["loan_grade", "person_home_ownership", "loan_intent", 'cb_person_default_on_file']

# Criar os indexadores
indexers = [StringIndexer(inputCol=column, outputCol=column + "_index") for column in categorical_columns]

# Criar os OneHotEncoders
encoders = [OneHotEncoder(inputCol=column + "_index", outputCol=column + "_onehot") for column in ["person_home_ownership", "loan_intent"]]

# Criar um pipeline que aplica primeiro os indexadores e depois os encoders
pipeline = Pipeline(stages=indexers + encoders)

# Ajustar e transformar o DataFrame com o pipeline
df_encoded = pipeline.fit(df).transform(df)

# Mostrar o resultado
df_encoded.show()


+---+----------+-------------+---------------------+-----------------+-----------------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+----------------+---------------------------+-----------------+-------------------------------+----------------------------+------------------+
| id|person_age|person_income|person_home_ownership|person_emp_length|      loan_intent|loan_grade|loan_amnt|loan_int_rate|loan_percent_income|cb_person_default_on_file|cb_person_cred_hist_length|loan_status|loan_grade_index|person_home_ownership_index|loan_intent_index|cb_person_default_on_file_index|person_home_ownership_onehot|loan_intent_onehot|
+---+----------+-------------+---------------------+-----------------+-----------------+----------+---------+-------------+-------------------+-------------------------+--------------------------+-----------+----------------+---------------------------+-----------------+-------------------------

In [29]:
# Agora, são elimindas as colunas que não são úteis
df_encoded = df_encoded.drop("id","person_home_ownership","loan_intent","loan_grade","cb_person_default_on_file", 'person_home_ownership_index', 'loan_intent_index')

feature_columns = df_encoded.columns

# retirando a variácvel a ser predita
feature_columns.remove('loan_status')

# Antes de modelar, todas as colunas são colocadas em único vetor
feature = VectorAssembler(inputCols = feature_columns,outputCol="features")
feature_vector=feature.transform(df_encoded)
feature_vector.limit(3).toPandas()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status,loan_grade_index,cb_person_default_on_file_index,person_home_ownership_onehot,loan_intent_onehot,features
0,37,35000,0.0,6000,11.49,0.17,14,0,1.0,0.0,"(1.0, 0.0, 0.0)","(1.0, 0.0, 0.0, 0.0, 0.0)","(37.0, 35000.0, 0.0, 6000.0, 11.49, 0.17, 14.0..."
1,22,56000,6.0,4000,13.35,0.07,2,0,2.0,0.0,"(0.0, 0.0, 1.0)","(0.0, 1.0, 0.0, 0.0, 0.0)","(22.0, 56000.0, 6.0, 4000.0, 13.35, 0.07, 2.0,..."
2,29,28800,8.0,6000,8.9,0.21,10,0,0.0,0.0,"(0.0, 0.0, 1.0)","(0.0, 0.0, 1.0, 0.0, 0.0)","(29.0, 28800.0, 8.0, 6000.0, 8.9, 0.21, 10.0, ..."


In [30]:
# Seleciona as colunas usadas no treinamento
loan_df = feature_vector.select(['features','loan_status'])
# Split the dataset to train_df and test_df
train_df,test_df = loan_df.randomSplit([0.7,0.3])

In [31]:
lr = LogisticRegression(labelCol="loan_status")

model_lr = lr.fit(train_df)
model_predictions= model_lr.transform(test_df)

print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='accuracy').evaluate(model_predictions))
print('Precision: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='weightedPrecision').evaluate(model_predictions))
print('Recall: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedRecall').evaluate(model_predictions))
print('F1-Score: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='f1').evaluate(model_predictions))

Accuracy:  0.9000398701372672
Precision:  0.8904997678694603
Recall:  0.9000398701372672
F1-Score:  0.8899539568242163


In [32]:
# Ajustando os hiperparâmetros
lr = LogisticRegression(labelCol="loan_status")

# Definindo as faixas dos hiperparâmtros para o Grid Search
paramGrid = ParamGridBuilder().addGrid(lr.regParam, (0.01, 0.1))\
                              .addGrid(lr.maxIter, (5, 10))\
                              .addGrid(lr.tol, (1e-4, 1e-5))\
                              .addGrid(lr.elasticNetParam, (0.25,0.75))\
                              .build()

from pyspark.ml.tuning import CrossValidator

# Etapa de cross-validation usando 5 folds
cv = CrossValidator( estimator=lr
                           ,estimatorParamMaps=paramGrid
                           ,evaluator=MulticlassClassificationEvaluator(labelCol='loan_status')
                           ,numFolds=5)
model_lr_h = cv.fit(train_df)
model_predictions= model_lr_h.transform(test_df)

print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='accuracy').evaluate(model_predictions))
print('Precision: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='weightedPrecision').evaluate(model_predictions))
print('Recall: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedRecall').evaluate(model_predictions))
print('F1-Score: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='f1').evaluate(model_predictions))

Accuracy:  0.8963376431053142
Precision:  0.8862685315859369
Recall:  0.896337643105314
F1-Score:  0.8819604155807382


In [33]:
rf = RandomForestClassifier(labelCol='loan_status')

model_rf = rf.fit(train_df)
model_predictions= model_rf.transform(test_df)


print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='accuracy').evaluate(model_predictions))
print('Precision: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='weightedPrecision').evaluate(model_predictions))
print('Recall: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedRecall').evaluate(model_predictions))
print('F1-Score: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='f1').evaluate(model_predictions))

Accuracy:  0.9385999886085322
Precision:  0.9388798009023898
Recall:  0.9385999886085322
F1-Score:  0.9327447652592937


In [34]:
# Ajustando os hiperparâmetros (demora mais...)
rf = RandomForestClassifier(labelCol='loan_status')
paramGrid = ParamGridBuilder()\
                                .addGrid(rf.maxDepth, [5, 10, 20]) \
                                .addGrid(rf.maxBins, [20, 32, 50]) \
                                .addGrid(rf.numTrees, [20, 40, 60 ]) \
                                .addGrid(rf.impurity, ["gini", "entropy"]) \
                                .addGrid(rf.minInstancesPerNode, [1, 5, 10]) \
                                .build()


tvs = TrainValidationSplit(
    estimator=rf,
    estimatorParamMaps=paramGrid,
    evaluator=MulticlassClassificationEvaluator(labelCol='loan_status'),
    trainRatio=0.8
)

model_rf_h = tvs.fit(train_df)
model_predictions= model_rf_h.transform(test_df)


print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='accuracy').evaluate(model_predictions))
print('Precision: ', MulticlassClassificationEvaluator(labelCol='loan_status',metricName='weightedPrecision').evaluate(model_predictions))
print('Recall: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedRecall').evaluate(model_predictions))
print('F1-Score: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='f1').evaluate(model_predictions))

Accuracy:  0.948624480264282
Precision:  0.9487205389714849
Recall:  0.948624480264282
F1-Score:  0.9448386484999469


In [35]:
from pyspark.ml.classification import GBTClassifier

# Gradient Boosting Classifier
gbt = GBTClassifier(labelCol='loan_status')

# Treinando o modelo
model_gbt = gbt.fit(train_df)

# Fazendo previsões
model_predictions_gbt = model_gbt.transform(test_df)

# Avaliação
print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='accuracy').evaluate(model_predictions_gbt))
print('Precision: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedPrecision').evaluate(model_predictions_gbt))
print('Recall: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedRecall').evaluate(model_predictions_gbt))
print('F1-Score: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='f1').evaluate(model_predictions_gbt))

Accuracy:  0.9474853334852196
Precision:  0.9475619759249985
Recall:  0.9474853334852196
F1-Score:  0.943512875720298


In [36]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Gradient Boosting Classifier
gbt = GBTClassifier(labelCol='loan_status')

# Definindo a grade de hiperparâmetros
paramGrid_gbt = ParamGridBuilder()\
                                .addGrid(gbt.maxDepth, [5, 10, 20]) \
                                .addGrid(gbt.maxIter, [10, 20, 30]) \
                                .addGrid(gbt.stepSize, [0.01, 0.1, 0.3]) \
                                .build()

# Configuração de TrainValidationSplit
tvs_gbt = TrainValidationSplit(
    estimator=gbt,
    estimatorParamMaps=paramGrid_gbt,
    evaluator=MulticlassClassificationEvaluator(labelCol='loan_status'),
    trainRatio=0.8
)

# Ajustando o modelo
model_gbt_h = tvs_gbt.fit(train_df)

# Fazendo previsões
model_predictions_gbt_h = model_gbt_h.transform(test_df)

# Avaliação
print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='accuracy').evaluate(model_predictions_gbt_h))
print('Precision: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedPrecision').evaluate(model_predictions_gbt_h))
print('Recall: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='weightedRecall').evaluate(model_predictions_gbt_h))
print('F1-Score: ', MulticlassClassificationEvaluator(labelCol='loan_status', metricName='f1').evaluate(model_predictions_gbt_h))

Accuracy:  0.9484536082474226
Precision:  0.9481934306677513
Recall:  0.9484536082474228
F1-Score:  0.944835666060526


# Model Fit

In [47]:
#df_submission=final_test.drop('id')
df_submission_encoded = pipeline.fit(final_test).transform(final_test)

# Mostrar o resultado
df_submission_encoded.show()

+-----+----------+-------------+---------------------+-----------------+-----------------+----------+---------+-------------+-------------------+-------------------------+--------------------------+----------------+---------------------------+-----------------+-------------------------------+----------------------------+------------------+
|   id|person_age|person_income|person_home_ownership|person_emp_length|      loan_intent|loan_grade|loan_amnt|loan_int_rate|loan_percent_income|cb_person_default_on_file|cb_person_cred_hist_length|loan_grade_index|person_home_ownership_index|loan_intent_index|cb_person_default_on_file_index|person_home_ownership_onehot|loan_intent_onehot|
+-----+----------+-------------+---------------------+-----------------+-----------------+----------+---------+-------------+-------------------+-------------------------+--------------------------+----------------+---------------------------+-----------------+-------------------------------+-----------------------

In [48]:
df_submission_encoded = df_submission_encoded.drop(
 'person_home_ownership',
 'loan_intent',
 'loan_grade',
 'cb_person_default_on_file',
 'person_home_ownership_index',
 'loan_intent_index')

In [49]:
# Antes de modelar, todas as colunas são colocadas em único vetor
feature_columns = df_submission_encoded.columns

feature = VectorAssembler(inputCols = feature_columns[1:],outputCol="features")
feature_vector=feature.transform(df_submission_encoded)
feature_vector.limit(3).toPandas()

Unnamed: 0,id,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_grade_index,cb_person_default_on_file_index,person_home_ownership_onehot,loan_intent_onehot,features
0,58645,23,69000,3.0,25000,15.76,0.36,2,5.0,0.0,"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 0.0, 0.0)","(23.0, 69000.0, 3.0, 25000.0, 15.76, 0.36, 2.0..."
1,58646,26,96000,6.0,10000,12.68,0.1,4,2.0,1.0,"(0.0, 1.0, 0.0)","(0.0, 0.0, 1.0, 0.0, 0.0)","[26.0, 96000.0, 6.0, 10000.0, 12.68, 0.1, 4.0,..."
2,58647,26,30000,5.0,4000,17.19,0.13,2,4.0,1.0,"(1.0, 0.0, 0.0)","(0.0, 0.0, 0.0, 1.0, 0.0)","[26.0, 30000.0, 5.0, 4000.0, 17.19, 0.13, 2.0,..."


# Kagle submission

In [50]:
model_predictions= model_gbt_h.transform(feature_vector.select('features'))

In [51]:
model_predictions.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|35135|
|       1.0| 3963|
+----------+-----+



In [52]:
submission = feature_vector.select('id', 'features').join(model_predictions, 'features').select('id', 'probability')

In [53]:
submission.show()

+-----+--------------------+
|   id|         probability|
+-----+--------------------+
|58645|[0.02281190336471...|
|58646|[0.97059503704207...|
|58647|[0.61636551070259...|
|58648|[0.97582273464598...|
|58649|[0.89801901014930...|
|58650|[0.07325868076327...|
|58651|[0.98078374063663...|
|58652|[0.97840206760779...|
|58653|[0.73267263174067...|
|58654|[0.96115781786807...|
|58655|[0.94542249400807...|
|58656|[0.88956568272238...|
|58657|[0.84117706691961...|
|58658|[0.98090034942212...|
|58659|[0.96488133296817...|
|58660|[0.80249833081583...|
|58661|[0.97609383176999...|
|58662|[0.97318294383093...|
|58663|[0.96137694555046...|
|58664|[0.97465368779427...|
+-----+--------------------+
only showing top 20 rows



In [54]:
# Preparação do arquivo de submissão conforme orientado

from pyspark.ml.functions import vector_to_array

kaggle_submission = submission.withColumn("loan_status", vector_to_array("probability")[1]).drop('probability')


In [55]:
kaggle_submission.show()

+-----+--------------------+
|   id|         loan_status|
+-----+--------------------+
|58645|  0.9771880966352884|
|58646| 0.02940496295792061|
|58647|  0.3836344892974045|
|58648| 0.02417726535401654|
|58649| 0.10198098985069937|
|58650|  0.9267413192367256|
|58651|0.019216259363362687|
|58652|0.021597932392208974|
|58653| 0.26732736825932935|
|58654| 0.03884218213192869|
|58655| 0.05457750599192801|
|58656| 0.11043431727761088|
|58657| 0.15882293308038864|
|58658|  0.0190996505778791|
|58659| 0.03511866703182176|
|58660| 0.19750166918416445|
|58661|0.023906168230003355|
|58662| 0.02681705616906116|
|58663|  0.0386230544495304|
|58664|0.025346312205720145|
+-----+--------------------+
only showing top 20 rows



In [56]:
kaggle_submission.toPandas().to_csv('submission_gbt_h.csv', index=False)

## Tentei diversos modelos mas não consegui uma precisão maior que 60%

![image.png](attachment:image.png)