In [1]:
!pip install pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()




In [2]:
from google.colab import files
from pyspark.sql.functions import when, col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [3]:
print("Выберите файл loan_default.csv")
uploaded = files.upload()

Выберите файл loan_default.csv


Saving loan_default.csv to loan_default (1).csv


In [4]:
df = spark.read.csv('loan_default.csv', header=True, inferSchema=True)
df.show(5)
df.printSchema()

+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+--------------+-------------+-----------+-------------+-----------+-----------+-------+
|    LoanID|Age|Income|LoanAmount|CreditScore|MonthsEmployed|NumCreditLines|InterestRate|LoanTerm|DTIRatio|  Education|EmploymentType|MaritalStatus|HasMortgage|HasDependents|LoanPurpose|HasCoSigner|Default|
+----------+---+------+----------+-----------+--------------+--------------+------------+--------+--------+-----------+--------------+-------------+-----------+-------------+-----------+-----------+-------+
|I38PQUQS96| 56| 85994|     50587|        520|            80|             4|       15.23|      36|    0.44| Bachelor's|     Full-time|     Divorced|        Yes|          Yes|      Other|        Yes|      0|
|HPSK72WA7R| 69| 50432|    124440|        458|            15|             1|        4.81|      60|    0.68|   Master's|     Full-time|      Married|         No|           N

## Статистика числовых значений:

In [5]:
numeric_stats = df.describe(['Age', 'Income', 'LoanAmount', 'CreditScore',
                                   'MonthsEmployed', 'InterestRate', 'DTIRatio'])
numeric_stats.show()

+-------+------------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|summary|               Age|           Income|        LoanAmount|       CreditScore|    MonthsEmployed|      InterestRate|           DTIRatio|
+-------+------------------+-----------------+------------------+------------------+------------------+------------------+-------------------+
|  count|            255347|           255347|            255347|            255347|            255347|            255347|             255347|
|   mean|43.498306226429136|82499.30459727351|127578.86551242035| 574.2643461642392|59.541976212761455|13.492773480792918| 0.5002120643673071|
| stddev|14.990258418709619|38963.01372937807|  70840.7061415011|158.90386659497565| 34.64337563735208| 6.636443064944343|0.23091661543575906|
|    min|                18|            15000|              5000|               300|                 0|               2.0|                0.1|

## Сделаем переменные бинарными

In [6]:
df = df.withColumn("HasMortgage_bin", when(col("HasMortgage") == "Yes", 1).otherwise(0))
df = df.withColumn("HasDependents_bin", when(col("HasDependents") == "Yes", 1).otherwise(0))
df = df.withColumn("HasCoSigner_bin", when(col("HasCoSigner") == "Yes", 1).otherwise(0))

## Оставим только числовые значения

In [7]:
df = df.drop("LoanID")
numeric_cols = ['Age', 'Income', 'LoanAmount', 'CreditScore',
            'MonthsEmployed', 'InterestRate', 'DTIRatio',
            'HasMortgage_bin', 'HasDependents_bin', 'HasCoSigner_bin']

## Создаем векторы

In [8]:
assembler = VectorAssembler(inputCols=numeric_cols, outputCol="numeric_cols")

## Преобразовываем датасет для ML

In [9]:
ml_data = assembler.transform(df).select("numeric_cols", col("Default").alias("label"))

## Разделяем выборки на обучающую и тестовую

In [10]:
train_data, test_data = ml_data.randomSplit([0.8, 0.2], seed=69)

## Построим логистическую регрессию

In [11]:
log_reg = LogisticRegression(featuresCol="numeric_cols", labelCol="label", maxIter=10)

In [12]:
model = log_reg.fit(train_data) #обучение

In [13]:
predictions = model.transform(test_data) #предсказания

## Оценка модели

In [14]:
predictions.groupBy("label", "prediction").count().show() #матрица (ну скорее таблица) ошибок

+-----+----------+-----+
|label|prediction|count|
+-----+----------+-----+
|    1|       0.0| 5730|
|    0|       0.0|45577|
|    1|       1.0|  170|
|    0|       1.0|  112|
+-----+----------+-----+



In [15]:
auc = BinaryClassificationEvaluator(labelCol="label", metricName="areaUnderROC").evaluate(predictions)

In [16]:
accuracy = MulticlassClassificationEvaluator(labelCol="label", metricName="accuracy").evaluate(predictions)

In [17]:
precision = MulticlassClassificationEvaluator(labelCol="label", metricName="weightedPrecision").evaluate(predictions)

In [18]:
recall = MulticlassClassificationEvaluator(labelCol="label", metricName="weightedRecall").evaluate(predictions)

In [19]:
print(f"AUC-ROC: {auc:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")


AUC-ROC: 0.7424
Accuracy: 0.8868
Precision: 0.8557
Recall: 0.8868


In [20]:
spark.stop()