### Настройка рабочего места

In [1]:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._

import org.apache.spark.ml.feature.{VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{RandomForestClassifier, LogisticRegression, GBTClassifier}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}

In [2]:
// Spark сессия
val spark = SparkSession.builder()
  .appName("maxb_lab05")
  .config("spark.executor.memory", "16g")
  .config("spark.driver.memory", "8g")
  .config("spark.executor.instances", "20")
  .config("spark.executor.cores", "10")
  .getOrCreate()

spark = org.apache.spark.sql.SparkSession@db466c0


org.apache.spark.sql.SparkSession@db466c0

In [3]:
spark

org.apache.spark.sql.SparkSession@db466c0

### Загрузка данных

In [14]:
// Тренировочные данные
val df = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/labs/slaba05/lab05_train.csv")

df.show(numRows=1, truncate=10, vertical=true)

-RECORD 0---------------------------------
 _c0                         | 333149     
 ID                          | 479990     
 CR_PROD_CNT_IL              | 0          
 AMOUNT_RUB_CLO_PRC          | 0.0        
 PRC_ACCEPTS_A_EMAIL_LINK    | 0.0        
 APP_REGISTR_RGN_CODE        | null       
 PRC_ACCEPTS_A_POS           | 0.0        
 PRC_ACCEPTS_A_TK            | 0.0        
 TURNOVER_DYNAMIC_IL_1M      | 0.0        
 CNT_TRAN_AUT_TENDENCY1M     | null       
 SUM_TRAN_AUT_TENDENCY1M     | null       
 AMOUNT_RUB_SUP_PRC          | 0.0        
 PRC_ACCEPTS_A_AMOBILE       | 0.0        
 SUM_TRAN_AUT_TENDENCY3M     | null       
 CLNT_TRUST_RELATION         | null       
 PRC_ACCEPTS_TK              | 0.0        
 PRC_ACCEPTS_A_MTP           | 0.0        
 REST_DYNAMIC_FDEP_1M        | 0.0        
 CNT_TRAN_AUT_TENDENCY3M     | null       
 CNT_ACCEPTS_TK              | 0.0        
 APP_MARITAL_STATUS          | null       
 REST_DYNAMIC_SAVE_3M        | 0.0        
 CR_PROD_CN

lastException = null
df = [_c0: int, ID: int ... 115 more fields]


[_c0: int, ID: int ... 115 more fields]

### Удалить колонки с большим количеством пропусков

In [6]:
val totalRows = df.count()

val columnNames = df.columns

// Процент заполнения каждого столбца
val nullPercentages = columnNames.map { column =>
  val countEmpty = df.filter(col(column).isNull).count()
  (column, countEmpty  * 100.0 / totalRows.toDouble)
}.toMap

// Столбцы, удовлетворяющие условию < 90% пропусков
val filteredColumns = columnNames.filter(c => nullPercentages.getOrElse(c, 0.0) < 90.0)

// Датафрейм с очищенным набором столбцов
val cleanedDf = df.select(filteredColumns.map(col(_)):_*)

totalRows = 320764
columnNames = Array(_c0, ID, CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, CLNT_TRUST_RELATION, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, APP_MARITAL_STATUS, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, APP_KIND_OF_PROP_HABITATION, CLNT_JOB_POSITION_TYPE, AMOUNT_RUB_NAS_PRC, CLNT_JOB_POSITION, APP_DRIVING_LICENSE, TRANS_COUNT_SUP_PRC, APP_EDUCATION, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_N...


Array(_c0, ID, CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, CLNT_TRUST_RELATION, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, APP_MARITAL_STATUS, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, APP_KIND_OF_PROP_HABITATION, CLNT_JOB_POSITION_TYPE, AMOUNT_RUB_NAS_PRC, CLNT_JOB_POSITION, APP_DRIVING_LICENSE, TRANS_COUNT_SUP_PRC, APP_EDUCATION, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_N...

### Заполнение пропусков

In [7]:
/* Заполление числовых колонок средними значениями */

// Перечень числовых колонок
val columnsToFill = Seq(
    "CR_PROD_CNT_IL",
    "AMOUNT_RUB_CLO_PRC",
    "TURNOVER_DYNAMIC_IL_1M",
    "AMOUNT_RUB_SUP_PRC",
    "REST_DYNAMIC_FDEP_1M",
    "REST_DYNAMIC_SAVE_3M",
    "CR_PROD_CNT_VCU",
    "REST_AVG_CUR",
    "AMOUNT_RUB_NAS_PRC",
    "TRANS_COUNT_SUP_PRC",
    "TRANS_COUNT_NAS_PRC",
    "CR_PROD_CNT_TOVR",
    "CR_PROD_CNT_PIL",
    "TURNOVER_CC",
    "TRANS_COUNT_ATM_PRC",
    "AMOUNT_RUB_ATM_PRC",
    "TURNOVER_PAYM",
    "AGE",
    "CR_PROD_CNT_CC",
    "REST_DYNAMIC_FDEP_3M",
    "REST_DYNAMIC_IL_1M",
    "CR_PROD_CNT_CCFP",
    "REST_DYNAMIC_CUR_1M",
    "REST_AVG_PAYM",
    "LDEAL_GRACE_DAYS_PCT_MED",
    "REST_DYNAMIC_CUR_3M",
    "CNT_TRAN_SUP_TENDENCY3M",
    "TURNOVER_DYNAMIC_CUR_1M",
    "REST_DYNAMIC_PAYM_3M",
    "SUM_TRAN_SUP_TENDENCY3M",
    "REST_DYNAMIC_IL_3M",
    "CNT_TRAN_ATM_TENDENCY3M",
    "CNT_TRAN_ATM_TENDENCY1M",
    "TURNOVER_DYNAMIC_IL_3M",
    "SUM_TRAN_ATM_TENDENCY3M",
    "SUM_TRAN_ATM_TENDENCY1M",
    "REST_DYNAMIC_PAYM_1M",
    "TURNOVER_DYNAMIC_CUR_3M",
    "CLNT_SETUP_TENOR",
    "TURNOVER_DYNAMIC_PAYM_3M",
    "TURNOVER_DYNAMIC_PAYM_1M",
    "TRANS_AMOUNT_TENDENCY3M",
    "TRANS_CNT_TENDENCY3M",
    "REST_DYNAMIC_CC_1M",
    "TURNOVER_DYNAMIC_CC_1M",
    "REST_DYNAMIC_CC_3M",
    "TURNOVER_DYNAMIC_CC_3M"
)

// // Вычисляем средние
// val means = df.select(columnsToFill.map(avg(_)): _*).first()
val means = cleanedDf.select(columnsToFill.map(avg(_)): _*).first()

// Создаем map с средними значениями
val meanValues = columnsToFill.zipWithIndex.map {
  case (colName, idx) => (colName, means.getDouble(idx))
}.toMap

// Заполняем пропуски
val df2 = columnsToFill.foldLeft(cleanedDf) { (df, colName) =>
  df.withColumn(colName, coalesce(col(colName), lit(meanValues(colName))))
}

// df2.show(numRows=1, truncate=10, vertical=true)

columnsToFill = List(CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, TURNOVER_DYNAMIC_IL_1M, AMOUNT_RUB_SUP_PRC, REST_DYNAMIC_FDEP_1M, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, AMOUNT_RUB_NAS_PRC, TRANS_COUNT_SUP_PRC, TRANS_COUNT_NAS_PRC, CR_PROD_CNT_TOVR, CR_PROD_CNT_PIL, TURNOVER_CC, TRANS_COUNT_ATM_PRC, AMOUNT_RUB_ATM_PRC, TURNOVER_PAYM, AGE, CR_PROD_CNT_CC, REST_DYNAMIC_FDEP_3M, REST_DYNAMIC_IL_1M, CR_PROD_CNT_CCFP, REST_DYNAMIC_CUR_1M, REST_AVG_PAYM, LDEAL_GRACE_DAYS_PCT_MED, REST_DYNAMIC_CUR_3M, CNT_TRAN_SUP_TENDENCY3M, TURNOVER_DYNAMIC_CUR_1M, REST_DYNAMIC_PAYM_3M, SUM_TRAN_SUP_TENDENCY3M, REST_DYNAMIC_IL_3M, CNT_TRAN_ATM_TENDENCY3M, CNT_TRAN_ATM_TENDENCY1M, TURNOVER_DYNAMIC_IL_3M, SUM_TRAN_ATM_TENDENCY3M, SUM_TRAN_ATM_TENDENCY1M, REST_DYNAMIC_PAYM_1M, TURNOVER_DYNAMI...


List(CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, TURNOVER_DYNAMIC_IL_1M, AMOUNT_RUB_SUP_PRC, REST_DYNAMIC_FDEP_1M, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, AMOUNT_RUB_NAS_PRC, TRANS_COUNT_SUP_PRC, TRANS_COUNT_NAS_PRC, CR_PROD_CNT_TOVR, CR_PROD_CNT_PIL, TURNOVER_CC, TRANS_COUNT_ATM_PRC, AMOUNT_RUB_ATM_PRC, TURNOVER_PAYM, AGE, CR_PROD_CNT_CC, REST_DYNAMIC_FDEP_3M, REST_DYNAMIC_IL_1M, CR_PROD_CNT_CCFP, REST_DYNAMIC_CUR_1M, REST_AVG_PAYM, LDEAL_GRACE_DAYS_PCT_MED, REST_DYNAMIC_CUR_3M, CNT_TRAN_SUP_TENDENCY3M, TURNOVER_DYNAMIC_CUR_1M, REST_DYNAMIC_PAYM_3M, SUM_TRAN_SUP_TENDENCY3M, REST_DYNAMIC_IL_3M, CNT_TRAN_ATM_TENDENCY3M, CNT_TRAN_ATM_TENDENCY1M, TURNOVER_DYNAMIC_IL_3M, SUM_TRAN_ATM_TENDENCY3M, SUM_TRAN_ATM_TENDENCY1M, REST_DYNAMIC_PAYM_1M, TURNOVER_DYNAMI...

In [8]:
// Заполнение пропусков в текстовых колонках
val df3 = df2.na.fill("missing", Seq(
    "CLNT_TRUST_RELATION",
    "APP_MARITAL_STATUS",
    "APP_KIND_OF_PROP_HABITATION",
    "CLNT_JOB_POSITION_TYPE",
    "CLNT_JOB_POSITION",
    "APP_DRIVING_LICENSE",
    "APP_EDUCATION",
    "APP_TRAVEL_PASS",
    "APP_CAR",
    "APP_POSITION_TYPE",
    "APP_EMP_TYPE",
    "APP_COMP_TYPE",
    "PACK"
))

df3 = [_c0: int, ID: int ... 100 more fields]


[_c0: int, ID: int ... 100 more fields]

In [9]:
//Заполнить все оставшиеся null значением 0
val df4 = df3.na.fill(0)

//data_4.show(numRows = 20, truncate = 10, vertical=true)

df4 = [_c0: int, ID: int ... 100 more fields]


[_c0: int, ID: int ... 100 more fields]

In [10]:
// Вычисляем веса классов
val classWeights = df4
  .groupBy("TARGET")
  .count()
  .withColumn("weight", lit(1.0) / col("count"))
  .collect()
  .map(row => (row.getInt(0), row.getDouble(2)))
  .toMap

// Добавляем веса в данные
val balancedData = df4.withColumn("classWeight", 
  when(col("TARGET") === 0, classWeights(0))
    .otherwise(classWeights(1)))

classWeights = Map(1 -> 3.82321455880104E-5, 0 -> 3.394340954760224E-6)
balancedData = [_c0: int, ID: int ... 101 more fields]


[_c0: int, ID: int ... 101 more fields]

In [11]:
// Разделение на train и test
val Array(trainingData, testData) = balancedData.randomSplit(Array(0.8, 0.2), seed=42)

trainingData = [_c0: int, ID: int ... 101 more fields]
testData = [_c0: int, ID: int ... 101 more fields]


[_c0: int, ID: int ... 101 more fields]

In [15]:
// Категориальные признаки
val categoricalCols = Array(
    "CLNT_TRUST_RELATION",
    "APP_MARITAL_STATUS",
    "APP_KIND_OF_PROP_HABITATION",
    "CLNT_JOB_POSITION_TYPE",
    "APP_DRIVING_LICENSE",
    "APP_EDUCATION",
    "APP_TRAVEL_PASS",
    "APP_CAR",
    "APP_POSITION_TYPE",
    "APP_EMP_TYPE",
    "APP_COMP_TYPE",
    "PACK"
)

// Числовые признаки
val numericCols = Array(
    "_c0",
    "CR_PROD_CNT_IL",
    "AMOUNT_RUB_CLO_PRC",
    "PRC_ACCEPTS_A_EMAIL_LINK",
    "APP_REGISTR_RGN_CODE",
    "PRC_ACCEPTS_A_POS",
    "PRC_ACCEPTS_A_TK",
    "TURNOVER_DYNAMIC_IL_1M",
    "CNT_TRAN_AUT_TENDENCY1M",
    "SUM_TRAN_AUT_TENDENCY1M",
    "AMOUNT_RUB_SUP_PRC",
    "PRC_ACCEPTS_A_AMOBILE",
    "SUM_TRAN_AUT_TENDENCY3M",
    "PRC_ACCEPTS_TK",
    "PRC_ACCEPTS_A_MTP",
    "REST_DYNAMIC_FDEP_1M",
    "CNT_TRAN_AUT_TENDENCY3M",
    "CNT_ACCEPTS_TK",
    "REST_DYNAMIC_SAVE_3M",
    "CR_PROD_CNT_VCU",
    "REST_AVG_CUR",
    "CNT_TRAN_MED_TENDENCY1M",
    "AMOUNT_RUB_NAS_PRC",
    "TRANS_COUNT_SUP_PRC",
    "CNT_TRAN_CLO_TENDENCY1M",
    "SUM_TRAN_MED_TENDENCY1M",
    "PRC_ACCEPTS_A_ATM",
    "PRC_ACCEPTS_MTP",
    "TRANS_COUNT_NAS_PRC",
    "CNT_ACCEPTS_MTP",
    "CR_PROD_CNT_TOVR",
    "CR_PROD_CNT_PIL",
    "SUM_TRAN_CLO_TENDENCY1M",
    "TURNOVER_CC",
    "TRANS_COUNT_ATM_PRC",
    "AMOUNT_RUB_ATM_PRC",
    "TURNOVER_PAYM",
    "AGE",
    "CNT_TRAN_MED_TENDENCY3M",
    "CR_PROD_CNT_CC",
    "SUM_TRAN_MED_TENDENCY3M",
    "REST_DYNAMIC_FDEP_3M",
    "REST_DYNAMIC_IL_1M",
    "SUM_TRAN_CLO_TENDENCY3M",
    // "LDEAL_TENOR_MAX",  // много пропусков
    // "LDEAL_YQZ_CHRG",
    "CR_PROD_CNT_CCFP",
    // "DEAL_YQZ_IR_MAX",
    // "LDEAL_YQZ_COM",
    // "DEAL_YQZ_IR_MIN",
    "CNT_TRAN_CLO_TENDENCY3M",
    "REST_DYNAMIC_CUR_1M",
    "REST_AVG_PAYM",
    "LDEAL_TENOR_MIN",
    // "LDEAL_AMT_MONTH",
    "LDEAL_GRACE_DAYS_PCT_MED",
    "REST_DYNAMIC_CUR_3M",
    "CNT_TRAN_SUP_TENDENCY3M",
    "TURNOVER_DYNAMIC_CUR_1M",
    "REST_DYNAMIC_PAYM_3M",
    "SUM_TRAN_SUP_TENDENCY3M",
    "REST_DYNAMIC_IL_3M",
    "CNT_TRAN_ATM_TENDENCY3M",
    "CNT_TRAN_ATM_TENDENCY1M",
    "TURNOVER_DYNAMIC_IL_3M",
    "SUM_TRAN_ATM_TENDENCY3M",
    "DEAL_GRACE_DAYS_ACC_S1X1",
    // "AVG_PCT_MONTH_TO_PCLOSE",
    "DEAL_YWZ_IR_MIN",
    "SUM_TRAN_SUP_TENDENCY1M",
    "DEAL_YWZ_IR_MAX",
    "SUM_TRAN_ATM_TENDENCY1M",
    "REST_DYNAMIC_PAYM_1M",
    "CNT_TRAN_SUP_TENDENCY1M",
    "DEAL_GRACE_DAYS_ACC_AVG",
    "TURNOVER_DYNAMIC_CUR_3M",
    // "MAX_PCLOSE_DATE",
    // "LDEAL_YQZ_PC",
    "CLNT_SETUP_TENOR",
    "DEAL_GRACE_DAYS_ACC_MAX",
    "TURNOVER_DYNAMIC_PAYM_3M",
    // "LDEAL_DELINQ_PER_MAXYQZ",
    "TURNOVER_DYNAMIC_PAYM_1M",
    // "CLNT_SALARY_VALUE",
    "TRANS_AMOUNT_TENDENCY3M",
    // "MED_DEBT_PRC_YQZ",
    "TRANS_CNT_TENDENCY3M",
    // "LDEAL_USED_AMT_AVG_YQZ",
    "REST_DYNAMIC_CC_1M",
    "LDEAL_USED_AMT_AVG_YWZ",
    "TURNOVER_DYNAMIC_CC_1M",
    // "AVG_PCT_DEBT_TO_DEAL_AMT",
    "LDEAL_ACT_DAYS_ACC_PCT_AVG",
    "REST_DYNAMIC_CC_3M",
    "MED_DEBT_PRC_YWZ",
    "LDEAL_ACT_DAYS_PCT_TR3",
    "LDEAL_ACT_DAYS_PCT_AAVG",
    "LDEAL_DELINQ_PER_MAXYWZ",
    "TURNOVER_DYNAMIC_CC_3M",
    "LDEAL_ACT_DAYS_PCT_TR",
    "LDEAL_ACT_DAYS_PCT_TR4",
    "LDEAL_ACT_DAYS_PCT_CURR"
)

// Обработка категориальных признаков
val indexers = categoricalCols.map(col =>
  new StringIndexer()
    .setInputCol(col)
    .setOutputCol(col + "_index")
    .setHandleInvalid("keep")
)

val encoders = categoricalCols.map(col =>
  new OneHotEncoder()
    .setInputCol(col + "_index")
    .setOutputCol(col + "_encoded")
)

// Сборка фичей
val assembler = new VectorAssembler()
  .setInputCols(numericCols ++ categoricalCols.map(_ + "_encoded"))
  .setOutputCol("features")

// Нормализация
val scaler = new StandardScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setWithStd(true)
  .setWithMean(true)
  .setWithMean(true)

categoricalCols = Array(CLNT_TRUST_RELATION, APP_MARITAL_STATUS, APP_KIND_OF_PROP_HABITATION, CLNT_JOB_POSITION_TYPE, APP_DRIVING_LICENSE, APP_EDUCATION, APP_TRAVEL_PASS, APP_CAR, APP_POSITION_TYPE, APP_EMP_TYPE, APP_COMP_TYPE, PACK)
numericCols = Array(_c0, CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, AMOUNT_RUB_NAS_PRC, TRANS_COUNT_SUP_PRC, CNT_TRAN_CLO_TENDENCY1M, SUM...


Array(_c0, CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, AMOUNT_RUB_NAS_PRC, TRANS_COUNT_SUP_PRC, CNT_TRAN_CLO_TENDENCY1M, SUM...

### Модель v1: RandomForestClassifier (ROC-AUC: 0.7936)

In [13]:
val rf = new RandomForestClassifier()
  .setFeaturesCol("scaledFeatures")
  .setLabelCol("TARGET")
  .setWeightCol("classWeight")
  .setNumTrees(50)          // Количество деревьев
  .setMaxDepth(10)          // Максимальная глубина
  .setMinInstancesPerNode(5)// Минимальное количество образцов в узле
  .setSubsamplingRate(0.5)  // Доля данных для построения каждого дерева

// Создание пайплайна
val pipeline = new Pipeline()
  .setStages(indexers ++ encoders ++ Array(assembler, scaler, rf)) // или rf

val trainingData_1 = trainingData.repartition(200, $"ID")
trainingData_1.cache
trainingData_1.count

// Обучение модели
val model = pipeline.fit(trainingData_1)

trainingData_1.unpersist

java.lang.IllegalArgumentException: LDEAL_TENOR_MIN does not exist. Available: _c0, ID, CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, CLNT_TRUST_RELATION, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, APP_MARITAL_STATUS, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, APP_KIND_OF_PROP_HABITATION, CLNT_JOB_POSITION_TYPE, AMOUNT_RUB_NAS_PRC, CLNT_JOB_POSITION, APP_DRIVING_LICENSE, TRANS_COUNT_SUP_PRC, APP_EDUCATION, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_NAS_PRC, APP_TRAVEL_PASS, CNT_ACCEPTS_MTP, CR_PROD_CNT_TOVR, APP_CAR, CR_PROD_CNT_PIL, SUM_TRAN_CLO_TENDENCY1M, APP_POSITION_TYPE, TURNOVER_CC, TRANS_COUNT_ATM_PRC, AMOUNT_RUB_ATM_PRC, TURNOVER_PAYM, AGE, CNT_TRAN_MED_TENDENCY3M, CR_PROD_CNT_CC, SUM_TRAN_MED_TENDENCY3M, REST_DYNAMIC_FDEP_3M, REST_DYNAMIC_IL_1M, APP_EMP_TYPE, SUM_TRAN_CLO_TENDENCY3M, CR_PROD_CNT_CCFP, CNT_TRAN_CLO_TENDENCY3M, REST_DYNAMIC_CUR_1M, REST_AVG_PAYM, APP_COMP_TYPE, LDEAL_GRACE_DAYS_PCT_MED, REST_DYNAMIC_CUR_3M, CNT_TRAN_SUP_TENDENCY3M, TURNOVER_DYNAMIC_CUR_1M, REST_DYNAMIC_PAYM_3M, SUM_TRAN_SUP_TENDENCY3M, REST_DYNAMIC_IL_3M, CNT_TRAN_ATM_TENDENCY3M, CNT_TRAN_ATM_TENDENCY1M, TURNOVER_DYNAMIC_IL_3M, SUM_TRAN_ATM_TENDENCY3M, DEAL_GRACE_DAYS_ACC_S1X1, DEAL_YWZ_IR_MIN, SUM_TRAN_SUP_TENDENCY1M, DEAL_YWZ_IR_MAX, SUM_TRAN_ATM_TENDENCY1M, REST_DYNAMIC_PAYM_1M, CNT_TRAN_SUP_TENDENCY1M, DEAL_GRACE_DAYS_ACC_AVG, TURNOVER_DYNAMIC_CUR_3M, PACK, CLNT_SETUP_TENOR, DEAL_GRACE_DAYS_ACC_MAX, TURNOVER_DYNAMIC_PAYM_3M, TURNOVER_DYNAMIC_PAYM_1M, TRANS_AMOUNT_TENDENCY3M, TRANS_CNT_TENDENCY3M, REST_DYNAMIC_CC_1M, LDEAL_USED_AMT_AVG_YWZ, TURNOVER_DYNAMIC_CC_1M, LDEAL_ACT_DAYS_ACC_PCT_AVG, REST_DYNAMIC_CC_3M, MED_DEBT_PRC_YWZ, LDEAL_ACT_DAYS_PCT_TR3, LDEAL_ACT_DAYS_PCT_AAVG, LDEAL_DELINQ_PER_MAXYWZ, TURNOVER_DYNAMIC_CC_3M, LDEAL_ACT_DAYS_PCT_TR, LDEAL_ACT_DAYS_PCT_TR4, LDEAL_ACT_DAYS_PCT_CURR, TARGET, classWeight, CLNT_TRUST_RELATION_index, APP_MARITAL_STATUS_index, APP_KIND_OF_PROP_HABITATION_index, CLNT_JOB_POSITION_TYPE_index, APP_DRIVING_LICENSE_index, APP_EDUCATION_index, APP_TRAVEL_PASS_index, APP_CAR_index, APP_POSITION_TYPE_index, APP_EMP_TYPE_index, APP_COMP_TYPE_index, PACK_index, CLNT_TRUST_RELATION_encoded, APP_MARITAL_STATUS_encoded, APP_KIND_OF_PROP_HABITATION_encoded, CLNT_JOB_POSITION_TYPE_encoded, APP_DRIVING_LICENSE_encoded, APP_EDUCATION_encoded, APP_TRAVEL_PASS_encoded, APP_CAR_encoded, APP_POSITION_TYPE_encoded, APP_EMP_TYPE_encoded, APP_COMP_TYPE_encoded, PACK_encoded

### Модель v2: GBTClassifier (ROC-AUC: 0.8267)

In [16]:
import org.apache.spark.ml.classification.GBTClassifier

val gbt = new GBTClassifier()
  .setLabelCol("TARGET")
  .setFeaturesCol("scaledFeatures")
  .setWeightCol("classWeight")
  .setMaxIter(15)             // Количество деревьев (итераций)
  .setMaxDepth(7)             // Максимальная глубина деревьев
  .setMinInstancesPerNode(1)  // Минимум объектов в узле
  .setStepSize(0.1)           // Скорость обучения (shrinkage)
  .setFeatureSubsetStrategy("auto")

val pipeline = new Pipeline()
  .setStages(indexers ++ encoders ++ Array(assembler, scaler, gbt)) // или rf

// val trainingData_1 = trainingData.repartition(200, $"ID")
val trainingData_1 = df.repartition(200, $"ID")
trainingData_1.cache
trainingData_1.count

val model = pipeline.fit(trainingData_1)

trainingData_1.unpersist

java.lang.IllegalArgumentException: classWeight does not exist. Available: _c0, ID, CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, CLNT_TRUST_RELATION, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, APP_MARITAL_STATUS, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, APP_KIND_OF_PROP_HABITATION, CLNT_JOB_POSITION_TYPE, AMOUNT_RUB_NAS_PRC, CLNT_JOB_POSITION, APP_DRIVING_LICENSE, TRANS_COUNT_SUP_PRC, APP_EDUCATION, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_NAS_PRC, APP_TRAVEL_PASS, CNT_ACCEPTS_MTP, CR_PROD_CNT_TOVR, APP_CAR, CR_PROD_CNT_PIL, SUM_TRAN_CLO_TENDENCY1M, APP_POSITION_TYPE, TURNOVER_CC, TRANS_COUNT_ATM_PRC, AMOUNT_RUB_ATM_PRC, TURNOVER_PAYM, AGE, CNT_TRAN_MED_TENDENCY3M, CR_PROD_CNT_CC, SUM_TRAN_MED_TENDENCY3M, REST_DYNAMIC_FDEP_3M, REST_DYNAMIC_IL_1M, APP_EMP_TYPE, SUM_TRAN_CLO_TENDENCY3M, LDEAL_TENOR_MAX, LDEAL_YQZ_CHRG, CR_PROD_CNT_CCFP, DEAL_YQZ_IR_MAX, LDEAL_YQZ_COM, DEAL_YQZ_IR_MIN, CNT_TRAN_CLO_TENDENCY3M, REST_DYNAMIC_CUR_1M, REST_AVG_PAYM, LDEAL_TENOR_MIN, LDEAL_AMT_MONTH, APP_COMP_TYPE, LDEAL_GRACE_DAYS_PCT_MED, REST_DYNAMIC_CUR_3M, CNT_TRAN_SUP_TENDENCY3M, TURNOVER_DYNAMIC_CUR_1M, REST_DYNAMIC_PAYM_3M, SUM_TRAN_SUP_TENDENCY3M, REST_DYNAMIC_IL_3M, CNT_TRAN_ATM_TENDENCY3M, CNT_TRAN_ATM_TENDENCY1M, TURNOVER_DYNAMIC_IL_3M, SUM_TRAN_ATM_TENDENCY3M, DEAL_GRACE_DAYS_ACC_S1X1, AVG_PCT_MONTH_TO_PCLOSE, DEAL_YWZ_IR_MIN, SUM_TRAN_SUP_TENDENCY1M, DEAL_YWZ_IR_MAX, SUM_TRAN_ATM_TENDENCY1M, REST_DYNAMIC_PAYM_1M, CNT_TRAN_SUP_TENDENCY1M, DEAL_GRACE_DAYS_ACC_AVG, TURNOVER_DYNAMIC_CUR_3M, PACK, MAX_PCLOSE_DATE, LDEAL_YQZ_PC, CLNT_SETUP_TENOR, DEAL_GRACE_DAYS_ACC_MAX, TURNOVER_DYNAMIC_PAYM_3M, LDEAL_DELINQ_PER_MAXYQZ, TURNOVER_DYNAMIC_PAYM_1M, CLNT_SALARY_VALUE, TRANS_AMOUNT_TENDENCY3M, MED_DEBT_PRC_YQZ, TRANS_CNT_TENDENCY3M, LDEAL_USED_AMT_AVG_YQZ, REST_DYNAMIC_CC_1M, LDEAL_USED_AMT_AVG_YWZ, TURNOVER_DYNAMIC_CC_1M, AVG_PCT_DEBT_TO_DEAL_AMT, LDEAL_ACT_DAYS_ACC_PCT_AVG, REST_DYNAMIC_CC_3M, MED_DEBT_PRC_YWZ, LDEAL_ACT_DAYS_PCT_TR3, LDEAL_ACT_DAYS_PCT_AAVG, LDEAL_DELINQ_PER_MAXYWZ, TURNOVER_DYNAMIC_CC_3M, LDEAL_ACT_DAYS_PCT_TR, LDEAL_ACT_DAYS_PCT_TR4, LDEAL_ACT_DAYS_PCT_CURR, TARGET, CLNT_TRUST_RELATION_index, APP_MARITAL_STATUS_index, APP_KIND_OF_PROP_HABITATION_index, CLNT_JOB_POSITION_TYPE_index, APP_DRIVING_LICENSE_index, APP_EDUCATION_index, APP_TRAVEL_PASS_index, APP_CAR_index, APP_POSITION_TYPE_index, APP_EMP_TYPE_index, APP_COMP_TYPE_index, PACK_index, CLNT_TRUST_RELATION_encoded, APP_MARITAL_STATUS_encoded, APP_KIND_OF_PROP_HABITATION_encoded, CLNT_JOB_POSITION_TYPE_encoded, APP_DRIVING_LICENSE_encoded, APP_EDUCATION_encoded, APP_TRAVEL_PASS_encoded, APP_CAR_encoded, APP_POSITION_TYPE_encoded, APP_EMP_TYPE_encoded, APP_COMP_TYPE_encoded, PACK_encoded, features, scaledFeatures

### Оценка метрик

In [86]:
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

// Предсказания на тестовых данных
val predictions = model.transform(testData)

// Вычисление метрик
val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("TARGET")
  .setRawPredictionCol("rawPrediction")

val auc = evaluator.setMetricName("areaUnderROC").evaluate(predictions)
val auPR = evaluator.setMetricName("areaUnderPR").evaluate(predictions)

println(s"AUC = $auc")
println(s"Area Under PR Curve = $auPR")

lastException = null


java.lang.IllegalArgumentException: LDEAL_TENOR_MAX does not exist. Available: _c0, ID, CR_PROD_CNT_IL, AMOUNT_RUB_CLO_PRC, PRC_ACCEPTS_A_EMAIL_LINK, APP_REGISTR_RGN_CODE, PRC_ACCEPTS_A_POS, PRC_ACCEPTS_A_TK, TURNOVER_DYNAMIC_IL_1M, CNT_TRAN_AUT_TENDENCY1M, SUM_TRAN_AUT_TENDENCY1M, AMOUNT_RUB_SUP_PRC, PRC_ACCEPTS_A_AMOBILE, SUM_TRAN_AUT_TENDENCY3M, CLNT_TRUST_RELATION, PRC_ACCEPTS_TK, PRC_ACCEPTS_A_MTP, REST_DYNAMIC_FDEP_1M, CNT_TRAN_AUT_TENDENCY3M, CNT_ACCEPTS_TK, APP_MARITAL_STATUS, REST_DYNAMIC_SAVE_3M, CR_PROD_CNT_VCU, REST_AVG_CUR, CNT_TRAN_MED_TENDENCY1M, APP_KIND_OF_PROP_HABITATION, CLNT_JOB_POSITION_TYPE, AMOUNT_RUB_NAS_PRC, CLNT_JOB_POSITION, APP_DRIVING_LICENSE, TRANS_COUNT_SUP_PRC, APP_EDUCATION, CNT_TRAN_CLO_TENDENCY1M, SUM_TRAN_MED_TENDENCY1M, PRC_ACCEPTS_A_ATM, PRC_ACCEPTS_MTP, TRANS_COUNT_NAS_PRC, APP_TRAVEL_PASS, CNT_ACCEPTS_MTP, CR_PROD_CNT_TOVR, APP_CAR, CR_PROD_CNT_PIL, SUM_TRAN_CLO_TENDENCY1M, APP_POSITION_TYPE, TURNOVER_CC, TRANS_COUNT_ATM_PRC, AMOUNT_RUB_ATM_PRC, TURNOVER_PAYM, AGE, CNT_TRAN_MED_TENDENCY3M, CR_PROD_CNT_CC, SUM_TRAN_MED_TENDENCY3M, REST_DYNAMIC_FDEP_3M, REST_DYNAMIC_IL_1M, APP_EMP_TYPE, SUM_TRAN_CLO_TENDENCY3M, CR_PROD_CNT_CCFP, CNT_TRAN_CLO_TENDENCY3M, REST_DYNAMIC_CUR_1M, REST_AVG_PAYM, APP_COMP_TYPE, LDEAL_GRACE_DAYS_PCT_MED, REST_DYNAMIC_CUR_3M, CNT_TRAN_SUP_TENDENCY3M, TURNOVER_DYNAMIC_CUR_1M, REST_DYNAMIC_PAYM_3M, SUM_TRAN_SUP_TENDENCY3M, REST_DYNAMIC_IL_3M, CNT_TRAN_ATM_TENDENCY3M, CNT_TRAN_ATM_TENDENCY1M, TURNOVER_DYNAMIC_IL_3M, SUM_TRAN_ATM_TENDENCY3M, DEAL_GRACE_DAYS_ACC_S1X1, DEAL_YWZ_IR_MIN, SUM_TRAN_SUP_TENDENCY1M, DEAL_YWZ_IR_MAX, SUM_TRAN_ATM_TENDENCY1M, REST_DYNAMIC_PAYM_1M, CNT_TRAN_SUP_TENDENCY1M, DEAL_GRACE_DAYS_ACC_AVG, TURNOVER_DYNAMIC_CUR_3M, PACK, CLNT_SETUP_TENOR, DEAL_GRACE_DAYS_ACC_MAX, TURNOVER_DYNAMIC_PAYM_3M, TURNOVER_DYNAMIC_PAYM_1M, TRANS_AMOUNT_TENDENCY3M, TRANS_CNT_TENDENCY3M, REST_DYNAMIC_CC_1M, LDEAL_USED_AMT_AVG_YWZ, TURNOVER_DYNAMIC_CC_1M, LDEAL_ACT_DAYS_ACC_PCT_AVG, REST_DYNAMIC_CC_3M, MED_DEBT_PRC_YWZ, LDEAL_ACT_DAYS_PCT_TR3, LDEAL_ACT_DAYS_PCT_AAVG, LDEAL_DELINQ_PER_MAXYWZ, TURNOVER_DYNAMIC_CC_3M, LDEAL_ACT_DAYS_PCT_TR, LDEAL_ACT_DAYS_PCT_TR4, LDEAL_ACT_DAYS_PCT_CURR, TARGET, classWeight, CLNT_TRUST_RELATION_index, APP_MARITAL_STATUS_index, APP_KIND_OF_PROP_HABITATION_index, CLNT_JOB_POSITION_TYPE_index, APP_DRIVING_LICENSE_index, APP_EDUCATION_index, APP_TRAVEL_PASS_index, APP_CAR_index, APP_POSITION_TYPE_index, APP_EMP_TYPE_index, APP_COMP_TYPE_index, PACK_index, CLNT_TRUST_RELATION_encoded, APP_MARITAL_STATUS_encoded, APP_KIND_OF_PROP_HABITATION_encoded, CLNT_JOB_POSITION_TYPE_encoded, APP_DRIVING_LICENSE_encoded, APP_EDUCATION_encoded, APP_TRAVEL_PASS_encoded, APP_CAR_encoded, APP_POSITION_TYPE_encoded, APP_EMP_TYPE_encoded, APP_COMP_TYPE_encoded, PACK_encoded

### Инференс

In [29]:
// Загружаем данные
val test_data = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("/labs/slaba05/lab05_test.csv")

// test_data.show(numRows = 1, truncate = 10, vertical=true)

test_data = [_c0: int, ID: int ... 114 more fields]


[_c0: int, ID: int ... 114 more fields]

In [30]:
val test_data_1 = test_data.repartition(20, $"ID")

test_data_1 = [_c0: int, ID: int ... 114 more fields]


[_c0: int, ID: int ... 114 more fields]

In [31]:
// Заполнить null в string колонках
val test_data_2 = test_data_1.na.fill("missing", Seq(
    "CLNT_TRUST_RELATION",
    "APP_MARITAL_STATUS",
    "APP_KIND_OF_PROP_HABITATION",
    "CLNT_JOB_POSITION_TYPE",
    "CLNT_JOB_POSITION",
    "APP_DRIVING_LICENSE",
    "APP_EDUCATION",
    "APP_TRAVEL_PASS",
    "APP_CAR",
    "APP_POSITION_TYPE",
    "APP_EMP_TYPE",
    "APP_COMP_TYPE",
    "PACK"
))

test_data_2 = [_c0: int, ID: int ... 114 more fields]


[_c0: int, ID: int ... 114 more fields]

In [32]:
// Заполнить все оставшиеся null значением 0
val test_data_3 = test_data_2.na.fill(0)

// data_4.show(numRows = 20, truncate = 10, vertical=true)

test_data_3 = [_c0: int, ID: int ... 114 more fields]


[_c0: int, ID: int ... 114 more fields]

In [33]:
val predictions = model.transform(test_data_3)

// predictions.show(numRows = 2, truncate = 50, vertical=true)

predictions = [_c0: int, ID: int ... 143 more fields]


[_c0: int, ID: int ... 143 more fields]

In [34]:
// Оставить только указанные колонки
val clear_predictions = predictions.select("ID", "prediction")

// переименовать колонки
val lab05 = clear_predictions
  .withColumnRenamed("ID", "id")
  .withColumnRenamed("prediction", "target")

// lab05.show(numRows = 20, truncate = 10, vertical=true)

clear_predictions = [ID: int, prediction: double]
lab05 = [id: int, target: double]


[id: int, target: double]

### Сохранение результатов

In [35]:
import org.apache.hadoop.fs.{FileSystem, Path}

val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val hdfsPath = "hdfs:///user/maksim.burdasov/lab05_temp"
val homeDir = "file:///home/maksim.burdasov"

// Сохраняем в HDFS
lab05.coalesce(1)  // Гарантируем один файл
  .write
  .mode("overwrite")
  .option("sep", "\t")
  .option("header", "true")
  .csv(hdfsPath)

hdfsPath = hdfs:///user/maksim.burdasov/lab05_temp
homeDir = file:///home/maksim.burdasov


fs: org.apache.hadoop.fs.FileSystem = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1612436294_139, ugi=maksim.burdasov (auth:SIMPLE)]]


file:///home/maksim.burdasov

In [36]:
// Вывод инфы про сохраненные файлы
val hdfsPath = "hdfs:///user/maksim.burdasov/lab05_temp"
import org.apache.hadoop.fs.{FileSystem, Path}
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val savedFiles = fs.listStatus(new Path(hdfsPath))

println("Сохраненные файлы:")
savedFiles.foreach(f => println(f.getPath.getName))

Сохраненные файлы:
_SUCCESS
part-00000-10223083-054e-4c43-aa0f-197c5a98cf6d-c000.csv


hdfsPath = hdfs:///user/maksim.burdasov/lab05_temp


fs: org.apache.hadoop.fs.FileSystem = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1612436294_139, ugi=maksim.burdasov (auth:SIMPLE)]]
savedFiles: Array[org.apache.hadoop.fs.FileStatus] = Array(HdfsNamedFileStatus{path=hdfs://spark-master-1.newprolab.com:8020/user/maksim.burdasov/lab05_temp/_SUCCESS; isDirectory=false; length=0; replication=3; blocksize=134217728; modification_time=1746859074450; access_time=1746859074438; owner=maksim.burdasov; group=maksim.burdasov; permission=rw-r--r--; isSymlink=false; hasAcl=false; isEncrypted=false; isErasureCoded=false}, HdfsNamedFileStatus{path=hdfs://spark-master-1.newprolab.com:8020/user/maksim.burdasov/lab05_temp/part-00000-10223083-0...


hdfs:///user/maksim.burdasov/lab05_temp

In [37]:
// Копируем на локальную машину
val fs = FileSystem.get(spark.sparkContext.hadoopConfiguration)
val hdfsPath = "hdfs:///user/maksim.burdasov/lab05_temp"
val homeDir = "file:///data/home/maksim.burdasov"

fs.copyToLocalFile(
  false,
  new Path(s"$hdfsPath/part-00000-10223083-054e-4c43-aa0f-197c5a98cf6d-c000.csv"),
  new Path(s"$homeDir/lab05.csv"),
  true
)

hdfsPath = hdfs:///user/maksim.burdasov/lab05_temp
homeDir = file:///data/home/maksim.burdasov


fs: org.apache.hadoop.fs.FileSystem = DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-1612436294_139, ugi=maksim.burdasov (auth:SIMPLE)]]


file:///data/home/maksim.burdasov

In [None]:
// fs.close()