In [1]:
import numpy as np
from pyspark.sql import SparkSession
from pyspark.mllib.regression import LabeledPoint
from pyspark.ml.classification import LogisticRegression
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator,MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import MultilayerPerceptronClassifier

In [2]:
spark = SparkSession.builder \
    .enableHiveSupport().getOrCreate()
df = spark.read.options(header='True', inferSchema='True', delimiter=',') \
  .csv("final_data.csv")

                                                                                

In [3]:
df_assembler = VectorAssembler(inputCols=[ 'total_loan','year_of_loan','interest','monthly_payment','class', 'sub_class', 'work_type', 'employer_type', 'industry',
 'work_year',
 'house_exist',
 'house_loan_status',
 'censor_status',
 'marriage',
 'offsprings',
 'use',
 'post_code',
 'region',
 'debt_loan_ratio',
 'del_in_18month',
 'scoring_low',
 'scoring_high',
 'pub_dero_bankrup',
 'early_return',
 'early_return_amount',
 'early_return_amount_3mon',
 'recircle_b',
 'recircle_u',
 'initial_list_status',
 'title',
 'policy_code',
 'f0',
 'f1',
 'f2',
 'f3',
 'f4',
 'f5',
 'total_money'],
outputCol='features')
labeled_df = df_assembler.transform(df)

In [4]:
labeled_df = labeled_df["is_default","features"]

In [5]:
data_set = labeled_df.select(['features', 'is_default'])

In [6]:
train_df, test_df = data_set.randomSplit([0.8, 0.2])

In [7]:
log_reg = LogisticRegression(labelCol = 'is_default').fit(train_df)

                                                                                

In [8]:
test_result = log_reg.evaluate(test_df).predictions



In [9]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='is_default')
print('AUC：', evaluator.evaluate(test_result))

                                                                                

AUC： 0.8014095445033663


In [14]:
print(MulticlassClassificationEvaluator(labelCol="is_default",predictionCol="prediction", metricName="f1").evaluate(test_result))



0.7877607356910269




better lr

In [10]:
for i in range(0,40):
    threshold = 0.3+0.01*i
    log_reg_2 = LogisticRegression(threshold = threshold,labelCol = 'is_default').fit(train_df)
    test_result_2 = log_reg_2.evaluate(test_df).predictions
    evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='is_default')
    print('AUC：'+str(threshold), evaluator.evaluate(test_result_2))

                                                                                

AUC：0.3 0.8014114612612473


                                                                                

AUC：0.31 0.8014110444230913


                                                                                

AUC：0.32 0.8014116347984709


                                                                                

AUC：0.32999999999999996 0.8014071114940454


                                                                                

AUC：0.33999999999999997 0.8014132951747241


                                                                                

AUC：0.35 0.8014139378728856


                                                                                

AUC：0.36 0.8014123385398768


                                                                                

AUC：0.37 0.8014088233210338


                                                                                

AUC：0.38 0.8014119025167014


                                                                                

AUC：0.39 0.8014137782884031


                                                                                

AUC：0.4 0.8014138846780579


                                                                                

AUC：0.41 0.801413939616978


                                                                                

AUC：0.42 0.8014098907057678


                                                                                

AUC：0.43 0.801413739046317


                                                                                

AUC：0.44 0.8014094974128638


                                                                                

AUC：0.44999999999999996 0.8014103842840031


                                                                                

AUC：0.45999999999999996 0.8014104880575189


                                                                                

AUC：0.47 0.8014090317401117


                                                                                

AUC：0.48 0.8014140111247791


                                                                                

AUC：0.49 0.8014126158506163


                                                                                

AUC：0.5 0.8014120638452764


                                                                                

AUC：0.51 0.8014137407904098


                                                                                

AUC：0.52 0.8014100537784358


                                                                                

AUC：0.53 0.8014139335126536


                                                                                

AUC：0.54 0.8014117490365432


                                                                                

AUC：0.55 0.8014123873744726


                                                                                

AUC：0.56 0.8014103729474003


                                                                                

AUC：0.5700000000000001 0.8014097162964978


                                                                                

AUC：0.5800000000000001 0.8014140355420767


                                                                                

AUC：0.59 0.8014129821100844


                                                                                

AUC：0.6 0.8014134870249217


                                                                                

AUC：0.61 0.8014171897337303


                                                                                

AUC：0.62 0.8014143608153659


                                                                                

AUC：0.63 0.801412994318733


                                                                                

AUC：0.64 0.801412572248299


                                                                                

AUC：0.65 0.801412768458728


                                                                                

AUC：0.6599999999999999 0.8014164999450664


                                                                                

AUC：0.6699999999999999 0.8014135907984373


                                                                                

AUC：0.6799999999999999 0.8014130605942559


                                                                                

AUC：0.69 0.8014117019460404


In [21]:
log_reg_2 = LogisticRegression(threshold = 0.41,labelCol = 'is_default').fit(train_df)
test_result_2 = log_reg_2.evaluate(test_df).predictions
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='is_default')
print('AUC：', evaluator.evaluate(test_result_2))

                                                                                

AUC： 0.8006012437350225


dt

In [7]:
dt = DecisionTreeClassifier(labelCol="is_default", featuresCol="features",maxDepth=15)

In [8]:
dt_model = dt.fit(train_df)

21/12/19 14:07:32 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [9]:
dt_pred = dt_model.transform(test_df)

In [10]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='is_default')
print('AUC：', evaluator.evaluate(dt_pred))

                                                                                

AUC： 0.7556666089317783


rf

In [14]:
numTrees_list = [20,50,100,150,200,300]
maxDepth_list = [5,10,15,20,30,40,50]
impurity_list = ['gini', 'entropy']
maxBins_list = [24,32,40]
for numTrees in numTrees_list:
    for maxDepth in maxDepth_list:
        for impurity in impurity_list:
            for maxBins in maxBins_list:
                rf_classifier=RandomForestClassifier(labelCol='is_default', featuresCol="features",numTrees=numTrees,maxDepth=maxDepth,impurity=impurity,maxBins=maxBins).fit(train_df)  
                rf_predictions=rf_classifier.transform(test_df)
                evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='is_default')
                print('AUC：'+str(numTrees)+str(maxDepth)+impurity+str(maxBins), evaluator.evaluate(rf_predictions))

                                                                                

AUC：205gini24 0.8194834321545822


                                                                                

AUC：205gini32 0.8185329671994049


                                                                                

AUC：205gini40 0.8212252118992234


                                                                                

AUC：205entropy24 0.8110912137195835


                                                                                

AUC：205entropy32 0.8015992903530812


                                                                                

AUC：205entropy40 0.80740648874693


21/12/18 18:43:25 WARN DAGScheduler: Broadcasting large task binary with size 1027.9 KiB
21/12/18 18:43:27 WARN DAGScheduler: Broadcasting large task binary with size 1721.8 KiB
                                                                                

AUC：2010gini24 0.8542378731162448


21/12/18 18:43:47 WARN DAGScheduler: Broadcasting large task binary with size 1005.5 KiB
21/12/18 18:43:49 WARN DAGScheduler: Broadcasting large task binary with size 1686.1 KiB
                                                                                

AUC：2010gini32 0.8538066432844814


21/12/18 18:44:09 WARN DAGScheduler: Broadcasting large task binary with size 1001.0 KiB
21/12/18 18:44:11 WARN DAGScheduler: Broadcasting large task binary with size 1665.4 KiB
                                                                                

AUC：2010gini40 0.8541621647157527


21/12/18 18:44:32 WARN DAGScheduler: Broadcasting large task binary with size 1457.8 KiB
                                                                                

AUC：2010entropy24 0.8518293215721111


21/12/18 18:44:53 WARN DAGScheduler: Broadcasting large task binary with size 1487.6 KiB
                                                                                

AUC：2010entropy32 0.8508379340479095


21/12/18 18:45:14 WARN DAGScheduler: Broadcasting large task binary with size 1473.4 KiB
                                                                                

AUC：2010entropy40 0.8513615922015019


21/12/18 18:45:34 WARN DAGScheduler: Broadcasting large task binary with size 1027.9 KiB
21/12/18 18:45:36 WARN DAGScheduler: Broadcasting large task binary with size 1721.8 KiB
21/12/18 18:45:37 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
21/12/18 18:45:40 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
21/12/18 18:45:43 WARN DAGScheduler: Broadcasting large task binary with size 7.0 MiB
21/12/18 18:45:46 WARN DAGScheduler: Broadcasting large task binary with size 1290.1 KiB
21/12/18 18:45:47 WARN DAGScheduler: Broadcasting large task binary with size 10.5 MiB
21/12/18 18:45:51 WARN DAGScheduler: Broadcasting large task binary with size 1766.3 KiB
21/12/18 18:45:53 WARN DAGScheduler: Broadcasting large task binary with size 15.3 MiB
21/12/18 18:45:57 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
21/12/18 18:46:00 WARN DAGScheduler: Broadcasting large task binary with size 9.6 MiB
                                        

AUC：2015gini24 0.8604041114185099


21/12/18 18:46:28 WARN DAGScheduler: Broadcasting large task binary with size 1005.5 KiB
21/12/18 18:46:30 WARN DAGScheduler: Broadcasting large task binary with size 1686.1 KiB
21/12/18 18:46:33 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
21/12/18 18:46:37 WARN DAGScheduler: Broadcasting large task binary with size 4.4 MiB
21/12/18 18:46:41 WARN DAGScheduler: Broadcasting large task binary with size 6.8 MiB
21/12/18 18:46:45 WARN DAGScheduler: Broadcasting large task binary with size 1231.7 KiB
21/12/18 18:46:47 WARN DAGScheduler: Broadcasting large task binary with size 10.2 MiB
21/12/18 18:46:52 WARN DAGScheduler: Broadcasting large task binary with size 1689.2 KiB
21/12/18 18:46:55 WARN DAGScheduler: Broadcasting large task binary with size 14.7 MiB
21/12/18 18:47:00 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
21/12/18 18:47:04 WARN DAGScheduler: Broadcasting large task binary with size 9.3 MiB
                                        

AUC：2015gini32 0.8605033121800783


21/12/18 18:47:23 WARN DAGScheduler: Broadcasting large task binary with size 1001.0 KiB
21/12/18 18:47:24 WARN DAGScheduler: Broadcasting large task binary with size 1665.4 KiB
21/12/18 18:47:26 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
21/12/18 18:47:29 WARN DAGScheduler: Broadcasting large task binary with size 4.3 MiB
21/12/18 18:47:32 WARN DAGScheduler: Broadcasting large task binary with size 6.6 MiB
21/12/18 18:47:34 WARN DAGScheduler: Broadcasting large task binary with size 1212.1 KiB
21/12/18 18:47:36 WARN DAGScheduler: Broadcasting large task binary with size 10.0 MiB
21/12/18 18:47:39 WARN DAGScheduler: Broadcasting large task binary with size 1663.3 KiB
21/12/18 18:47:42 WARN DAGScheduler: Broadcasting large task binary with size 14.5 MiB
21/12/18 18:47:45 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
21/12/18 18:47:49 WARN DAGScheduler: Broadcasting large task binary with size 9.2 MiB
                                        

AUC：2015gini40 0.8604185365677196


21/12/18 18:48:09 WARN DAGScheduler: Broadcasting large task binary with size 1457.8 KiB
21/12/18 18:48:10 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
21/12/18 18:48:12 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
21/12/18 18:48:15 WARN DAGScheduler: Broadcasting large task binary with size 5.9 MiB
21/12/18 18:48:17 WARN DAGScheduler: Broadcasting large task binary with size 1086.7 KiB
21/12/18 18:48:19 WARN DAGScheduler: Broadcasting large task binary with size 8.9 MiB
21/12/18 18:48:22 WARN DAGScheduler: Broadcasting large task binary with size 1500.3 KiB
21/12/18 18:48:23 WARN DAGScheduler: Broadcasting large task binary with size 12.9 MiB
21/12/18 18:48:27 WARN DAGScheduler: Broadcasting large task binary with size 1956.5 KiB
21/12/18 18:48:29 WARN DAGScheduler: Broadcasting large task binary with size 7.6 MiB
                                                                                

AUC：2015entropy24 0.8608131940107278


21/12/18 18:48:49 WARN DAGScheduler: Broadcasting large task binary with size 1487.6 KiB
21/12/18 18:48:51 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
21/12/18 18:48:54 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
21/12/18 18:48:57 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
21/12/18 18:48:59 WARN DAGScheduler: Broadcasting large task binary with size 1058.0 KiB
21/12/18 18:49:00 WARN DAGScheduler: Broadcasting large task binary with size 8.7 MiB
21/12/18 18:49:03 WARN DAGScheduler: Broadcasting large task binary with size 1442.3 KiB
21/12/18 18:49:05 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB
21/12/18 18:49:09 WARN DAGScheduler: Broadcasting large task binary with size 1885.2 KiB
21/12/18 18:49:11 WARN DAGScheduler: Broadcasting large task binary with size 7.5 MiB
                                                                                

AUC：2015entropy32 0.8607346375257043


21/12/18 18:49:31 WARN DAGScheduler: Broadcasting large task binary with size 1473.4 KiB
21/12/18 18:49:32 WARN DAGScheduler: Broadcasting large task binary with size 2.4 MiB
21/12/18 18:49:35 WARN DAGScheduler: Broadcasting large task binary with size 3.8 MiB
21/12/18 18:49:39 WARN DAGScheduler: Broadcasting large task binary with size 5.8 MiB
21/12/18 18:49:41 WARN DAGScheduler: Broadcasting large task binary with size 1056.9 KiB
21/12/18 18:49:43 WARN DAGScheduler: Broadcasting large task binary with size 8.8 MiB
21/12/18 18:49:46 WARN DAGScheduler: Broadcasting large task binary with size 1438.5 KiB
21/12/18 18:49:48 WARN DAGScheduler: Broadcasting large task binary with size 12.6 MiB
21/12/18 18:49:52 WARN DAGScheduler: Broadcasting large task binary with size 1858.0 KiB
21/12/18 18:49:54 WARN DAGScheduler: Broadcasting large task binary with size 7.5 MiB
                                                                                

AUC：2015entropy40 0.860563475797644


21/12/18 18:50:13 WARN DAGScheduler: Broadcasting large task binary with size 1028.0 KiB
21/12/18 18:50:14 WARN DAGScheduler: Broadcasting large task binary with size 1721.8 KiB
21/12/18 18:50:16 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
21/12/18 18:50:19 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
21/12/18 18:50:22 WARN DAGScheduler: Broadcasting large task binary with size 7.0 MiB
21/12/18 18:50:24 WARN DAGScheduler: Broadcasting large task binary with size 1290.1 KiB
21/12/18 18:50:26 WARN DAGScheduler: Broadcasting large task binary with size 10.5 MiB
21/12/18 18:50:29 WARN DAGScheduler: Broadcasting large task binary with size 1766.3 KiB
21/12/18 18:50:31 WARN DAGScheduler: Broadcasting large task binary with size 15.3 MiB
21/12/18 18:50:35 WARN DAGScheduler: Broadcasting large task binary with size 2.3 MiB
21/12/18 18:50:38 WARN DAGScheduler: Broadcasting large task binary with size 21.5 MiB
21/12/18 18:50:43 WARN DAGScheduler: Br

Py4JJavaError: An error occurred while calling o3380.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 787.0 failed 1 times, most recent failure: Lost task 0.0 in stage 787.0 (TID 1289) (192.168.0.105 executor driver): java.lang.OutOfMemoryError: GC overhead limit exceeded
	at org.apache.spark.ml.tree.impl.DTStatsAggregator$$Lambda$3574/1330471226.get$Lambda(Unknown Source)
	at java.lang.invoke.LambdaForm$DMH/245672235.invokeStatic_L_L(LambdaForm$DMH)
	at java.lang.invoke.LambdaForm$MH/2068562923.linkToTargetMethod(LambdaForm$MH)
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:54)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22(RandomForest.scala:651)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22$adapted(RandomForest.scala:647)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$3571/1318815440.apply(Unknown Source)
	at scala.Array$.tabulate(Array.scala:418)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$21(RandomForest.scala:647)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$3552/332149708.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.RDD$$Lambda$2621/1999655490.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2377/167528220.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2403)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2352)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2351)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2351)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1109)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1109)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2591)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2533)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2522)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:898)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2214)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2235)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2254)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2279)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1030)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1029)
	at org.apache.spark.rdd.PairRDDFunctions.$anonfun$collectAsMap$1(PairRDDFunctions.scala:737)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:414)
	at org.apache.spark.rdd.PairRDDFunctions.collectAsMap(PairRDDFunctions.scala:736)
	at org.apache.spark.ml.tree.impl.RandomForest$.findBestSplits(RandomForest.scala:663)
	at org.apache.spark.ml.tree.impl.RandomForest$.runBagged(RandomForest.scala:208)
	at org.apache.spark.ml.tree.impl.RandomForest$.run(RandomForest.scala:302)
	at org.apache.spark.ml.classification.RandomForestClassifier.$anonfun$train$1(RandomForestClassifier.scala:161)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:138)
	at org.apache.spark.ml.classification.RandomForestClassifier.train(RandomForestClassifier.scala:46)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
	at sun.reflect.GeneratedMethodAccessor154.invoke(Unknown Source)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
	at org.apache.spark.ml.tree.impl.DTStatsAggregator$$Lambda$3574/1330471226.get$Lambda(Unknown Source)
	at java.lang.invoke.LambdaForm$DMH/245672235.invokeStatic_L_L(LambdaForm$DMH)
	at java.lang.invoke.LambdaForm$MH/2068562923.linkToTargetMethod(LambdaForm$MH)
	at org.apache.spark.ml.tree.impl.DTStatsAggregator.<init>(DTStatsAggregator.scala:54)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22(RandomForest.scala:651)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$22$adapted(RandomForest.scala:647)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$3571/1318815440.apply(Unknown Source)
	at scala.Array$.tabulate(Array.scala:418)
	at org.apache.spark.ml.tree.impl.RandomForest$.$anonfun$findBestSplits$21(RandomForest.scala:647)
	at org.apache.spark.ml.tree.impl.RandomForest$$$Lambda$3552/332149708.apply(Unknown Source)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:863)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:863)
	at org.apache.spark.rdd.RDD$$Lambda$2621/1999655490.apply(Unknown Source)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:373)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:337)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:131)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:506)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2377/167528220.apply(Unknown Source)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1462)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:509)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [11]:
rf_classifier=RandomForestClassifier(labelCol='is_default', featuresCol="features",numTrees=20,maxDepth=5,impurity='entropy').fit(train_df)  

                                                                                

In [12]:
rf_predictions=rf_classifier.transform(test_df)

In [13]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='is_default')
print('AUC：', evaluator.evaluate(rf_predictions))



AUC： 0.8015992903530811


                                                                                

gbt

In [15]:
gbt_classifier = GBTClassifier(labelCol='is_default', featuresCol="features", maxIter=15).fit(train_df)

                                                                                

In [16]:
gbt_predictions=gbt_classifier.transform(test_df)

In [17]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='is_default')
print('AUC：', evaluator.evaluate(rf_predictions))

                                                                                

AUC： 0.8131785792222486


MLPC

In [7]:
layers = [38,50,2]

In [8]:
MLPC_trainer = MultilayerPerceptronClassifier(labelCol='is_default', featuresCol="features", maxIter=100, layers=layers, blockSize=128, seed=1234)

In [9]:
MLPC_model = MLPC_trainer.fit(train_df)

21/12/19 17:05:56 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/12/19 17:05:56 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [10]:
MLPC_predictions = MLPC_model.transform(test_df)

In [11]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction',labelCol='is_default')
print('AUC：', evaluator.evaluate(MLPC_predictions))

                                                                                

AUC： 0.641615563116166
