In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('myproj').getOrCreate()
data = spark.read.csv('/FileStore/tables/UCI_Credit_Card.csv',inferSchema=True,header=True)
data.printSchema()

Data Dictionary
1.	ID: ID of each client
2.	LIMIT_BAL: Amount of given credit in NT dollars (includes individual and family/supplementary credit)
3.	SEX: Gender (1=male, 2=female)
4.	EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)
5.	MARRIAGE: Marital status (1=married, 2=single, 3=others)
6.	Age: Age in years
7.	PAY_0 ------ PAY_6 (6 features): Repayment status from April, 2005 to September, 2005(-1=pay duly, 0=not delay, other number = the number of months for payment delay)
8.	BILL_AMT1 ------ BILL_AMT6 (6 features): Amount of bill statement from April, 2005 to September 2005(NT dollar)
9.	PAY_AMT1: Amount of previous payment from April, 2005 to September 2005(NT dollar)
10.	Default. Payment. next. Month: default payment (1=yes, 2=no)

In [0]:
data = data.withColumnRenamed("PAY_0", "PAY_1")
data = data.withColumnRenamed("default.payment.next.month", "Default")

Change data types

In [0]:
from pyspark.sql.types import StructField,StringType,IntegerType,DoubleType,StructType
# if the second element is True means it could have null cells
df_schema = StructType([
    StructField('ID', StringType(), True),
    StructField('LIMIT_BAL', DoubleType(), True),
    StructField('SEX', StringType(), True),
    StructField('EDUCATION', StringType(), True),
    StructField('MARRIAGE', StringType(), True),
    StructField('AGE', IntegerType(), True),
    StructField('PAY_1', IntegerType(), True),
    StructField('PAY_2', IntegerType(), True),
    StructField('PAY_3', IntegerType(), True),
    StructField('PAY_4', IntegerType(), True),
    StructField('PAY_5', IntegerType(), True),
    StructField('PAY_6', IntegerType(), True),
    StructField('BILL_AMT1', DoubleType(), True),
    StructField('BILL_AMT2', DoubleType(), True),
    StructField('BILL_AMT3', DoubleType(), True),
    StructField('BILL_AMT4', DoubleType(), True),
    StructField('BILL_AMT5', DoubleType(), True),
    StructField('BILL_AMT6', DoubleType(), True),
    StructField('PAY_AMT1', DoubleType(), True),
    StructField('PAY_AMT2', DoubleType(), True),
    StructField('PAY_AMT3', DoubleType(), True),
    StructField('PAY_AMT4', DoubleType(), True),
    StructField('PAY_AMT5', DoubleType(), True),
    StructField('PAY_AMT6', DoubleType(), True),
    StructField('Default', IntegerType(), True),
    ])
data = spark.createDataFrame(data.collect(),schema = df_schema)
data.printSchema()

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.functions import when, count, col
df = data
na_report=df.select([count(when(isnull(c), c)).alias(c) for c in df.columns])# dimensions of the dataframe
print("Number of Rows: ",df.count() ,"   Number of Columns: ", len(df.columns))
# we should handle the missing values by imputation unless too many of them are emtpy
# na.drop() method is not recommended for avoiding biased except for the dependent variable
na_report.show()

In [0]:
df.describe().show()

Drop abnormal data in "EDUCATION" and "MARRIAGE" comlumns

In [0]:
df.select('MARRIAGE','EDUCATION').describe().show()

In [0]:
df = df.filter((df.MARRIAGE != '3')&(df.MARRIAGE != '0') &(df.EDUCATION != '0') &  (df.EDUCATION != '5') & (df.EDUCATION != '6'))

In [0]:
df.select('MARRIAGE','EDUCATION').describe().show()

Visualizations

In [0]:
df.select("EDUCATION","Default","ID").groupBy("EDUCATION","Default").agg(count("ID")).orderBy('EDUCATION').display()

EDUCATION,Default,count(ID)
1,0,8508
1,1,2023
2,0,10577
2,1,3285
3,1,1206
3,0,3564
4,0,113
4,1,7


In [0]:
df.select("SEX","Default","ID").groupBy("SEX","Default").agg(count("ID")).display()

SEX,Default,count(ID)
1,0,8795
1,1,2821
2,0,13967
2,1,3700


In [0]:
df.select("AGE","Default","ID").groupBy("AGE","Default").agg(count("ID")).orderBy('AGE').display()

AGE,Default,count(ID)
21,1,14
21,0,50
22,0,383
22,1,164
23,1,246
23,0,669
24,0,815
24,1,300
25,1,298
25,0,873


In [0]:
df.select("EDUCATION","ID").groupBy("EDUCATION").agg(count("ID")).orderBy('EDUCATION').display()

EDUCATION,count(ID)
1,10531
2,13862
3,4770
4,120


In [0]:
df.select("MARRIAGE","ID").groupBy("MARRIAGE").agg(count("ID")).display()

MARRIAGE,count(ID)
1,13477
2,15806


In [0]:
df.select("MARRIAGE","ID","Default").groupBy("Default","MARRIAGE").agg(count("id")).orderBy('MARRIAGE').display()

Default,MARRIAGE,count(id)
0,1,10285
1,1,3192
0,2,12477
1,2,3329


In [0]:
df.select("LIMIT_BAL","ID","Default").groupBy("Default","LIMIT_BAL").agg(count("id")).orderBy('LIMIT_BAL').display()

Default,LIMIT_BAL,count(id)
1,10000.0,189
0,10000.0,284
0,16000.0,1
1,20000.0,679
0,20000.0,1231
0,30000.0,1003
1,30000.0,554
1,40000.0,92
0,40000.0,132
0,50000.0,2397


In [0]:
df.select("Default","ID").groupBy("Default").agg(count("ID")).display()

Default,count(ID)
1,6521
0,22762


Check outliers

In [0]:
quantiles = {
    c: dict(
        zip(["q1", "q3"], df.approxQuantile(c, [0.25, 0.75], 0))
    )
    for c in ["LIMIT_BAL", "AGE","PAY_1","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6","BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6","PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6"]
}
quantiles

In [0]:
for i in quantiles:
    iqr = quantiles[i]['q3'] - quantiles[i]['q1']
    quantiles[i]['lower_bound'] = quantiles[i]['q1'] - (iqr * 1.5)
    quantiles[i]['upper_bound'] = quantiles[i]['q3'] + (iqr * 1.5)
print(quantiles)

In [0]:

import pyspark.sql.functions as f
df_clean=df.select(
    "*",
    *[
        f.when(
            f.col(c).between(quantiles[c]['lower_bound'], quantiles[c]['upper_bound']),
            0
        ).otherwise(1).alias(c+"_out") 
        for c in ["LIMIT_BAL", "AGE","PAY_1","PAY_2","PAY_3","PAY_4","PAY_5","PAY_6","BILL_AMT1","BILL_AMT2","BILL_AMT3","BILL_AMT4","BILL_AMT5","BILL_AMT6","PAY_AMT1","PAY_AMT2","PAY_AMT3","PAY_AMT4","PAY_AMT5","PAY_AMT6"]
    ]
)

In [0]:
from pyspark.sql.functions import col
df_clean=df_clean.withColumn("outliers", col("LIMIT_BAL_out")+col("AGE_out")+col("PAY_1_out")+col("BILL_AMT1_out")+col("PAY_AMT1_out")+col("PAY_2_out")+col("BILL_AMT2_out")+col("PAY_AMT2_out")+col("PAY_3_out")+col("BILL_AMT3_out")+col("PAY_AMT3_out")+col("PAY_4_out")+col("BILL_AMT4_out")+col("PAY_AMT4_out")+col("PAY_5_out")+col("BILL_AMT5_out")+col("PAY_AMT5_out")+col("PAY_6_out")+col("BILL_AMT6_out")+col("PAY_AMT6_out"))

In [0]:
df_clean.select("outliers","ID").groupBy("outliers").agg(count("id")).orderBy('outliers').display()

outliers,count(id)
0,12827
1,5797
2,3040
3,2001
4,1397
5,1044
6,1635
7,454
8,259
9,188


Data processing

Dealing with imbalanced labels（Try to improve the performance---Oversampling）

In [0]:
majority = df.filter((df.Default == 0))
minority = df.filter((df.Default == 1))
 
majority_count = majority.count()
minority_count = minority.count() 
ratio = majority_count / minority_count
print("ratio: {}".format(ratio))
sampled_minority_df = minority.sample(withReplacement=True, fraction=ratio)
 
df = sampled_minority_df.unionAll(majority)
df.show()

In [0]:
df.select('Default').describe().show()

In [0]:
df.select("Default","ID").groupBy("Default").agg(count("ID")).display()

Default,count(ID)
1,22779
0,22762


For statistical imputation we convert the pyspark dataframe into pandas to use functions of this library

In [0]:
from sklearn.impute import SimpleImputer
import pandas as pd

In [0]:
sample=df.toPandas()

In [0]:
samplecopy=sample
samplecopy

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,Default
0,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
2,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
3,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
4,1,20000.0,2,2,1,24,2,2,-1,-1,-2,-2,3913.0,3102.0,689.0,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45536,29991,140000.0,1,2,1,41,0,0,0,0,0,0,138325.0,137142.0,139110.0,138262.0,49675.0,46121.0,6000.0,7000.0,4228.0,1505.0,2000.0,2000.0,0
45537,29993,10000.0,1,3,1,43,0,0,0,-2,-2,-2,8802.0,10400.0,0.0,0.0,0.0,0.0,2000.0,0.0,0.0,0.0,0.0,0.0,0
45538,29994,100000.0,1,1,2,38,0,-1,-1,0,0,0,3042.0,1427.0,102996.0,70626.0,69473.0,55004.0,2000.0,111784.0,4000.0,3000.0,2000.0,2000.0,0
45539,29996,220000.0,1,3,1,39,0,0,0,0,0,0,188948.0,192815.0,208365.0,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0


Working with Categorical Columns

In [0]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [0]:
# OneHot encoding of categorical variable to be able to enter into machine learning algorithms
# we vectorize them to be able to use it
# Indexer return the numrical equivalent it's could be used for label encoding
# OneHotEncoder is for using returning dummy variables equivalent of the categorical variables
SEX_indexer = StringIndexer(inputCol='SEX',outputCol='SEXIndex')
SEX_encoder = OneHotEncoder(inputCol='SEXIndex',outputCol='SEXVec')

EDUCATION_indexer = StringIndexer(inputCol='EDUCATION',outputCol='EDUCATIONIndex')
EDUCATION_encoder = OneHotEncoder(inputCol='EDUCATIONIndex',outputCol='EDUCATIONVec')

MARRIAGE_indexer = StringIndexer(inputCol='MARRIAGE',outputCol='MARRIAGEIndex')
MARRIAGE_encoder = OneHotEncoder(inputCol='MARRIAGEIndex',outputCol='MARRIAGEVec')

In [0]:
input_cols_OneHot= ['LIMIT_BAL', 'SEXVec', 'EDUCATIONVec', 'MARRIAGEVec', 'AGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
assembler_OneHot = VectorAssembler(inputCols= input_cols_OneHot ,outputCol='features')

input_cols_Label= ['LIMIT_BAL', 'SEXIndex', 'EDUCATIONIndex', 'MARRIAGEIndex', 'AGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
assembler_Label = VectorAssembler(inputCols=input_cols_Label,outputCol='features')

- LogisticRegression

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier

with pipeline we define the steps that we want to do

In [0]:
from pyspark.ml import Pipeline

In [0]:
log_reg_df = LogisticRegression(featuresCol='features',labelCol='Default')
rf_df = RandomForestClassifier(featuresCol='features',labelCol='Default')

In [0]:
# a pipeline for logistic regression
pipeline_lr_OneHot = Pipeline(stages=[SEX_indexer,EDUCATION_indexer,MARRIAGE_indexer,
                            SEX_encoder,EDUCATION_encoder,MARRIAGE_encoder,
                           assembler_OneHot,log_reg_df])

pipeline_lr_Label = Pipeline(stages=[SEX_indexer,EDUCATION_indexer,MARRIAGE_indexer,
                           assembler_Label,log_reg_df])

pipeline_rf_Label = Pipeline(stages=[SEX_indexer,EDUCATION_indexer,MARRIAGE_indexer,
                           assembler_Label,rf_df])

In [0]:
# random split with a seed number to reproduce the results
train_df, test_df = df.randomSplit([0.8,.2],110)

In [0]:
fit_model_OneHot = pipeline_lr_OneHot.fit(train_df)
fit_model_Label = pipeline_lr_Label.fit(train_df)
fit_rf_Label = pipeline_rf_Label.fit(train_df)

In [0]:
train_OneHot_results = fit_model_OneHot.transform(train_df)
test_OneHot_results = fit_model_OneHot.transform(test_df)

train_Label_results = fit_model_Label.transform(train_df)
test_Label_results = fit_model_Label.transform(test_df)

- AUC
- Accuracy

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
AUC_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Default')
Accu_evaluator = MulticlassClassificationEvaluator()
Accu_evaluator.setPredictionCol('prediction')
Accu_evaluator.setLabelCol('Default')

In [0]:
test_Label_results.select('Default','prediction').show(5)

In [0]:
AUC_test_Label = AUC_eval.evaluate(test_Label_results)
AUC_train_Label = AUC_eval.evaluate(train_Label_results)

AUC_test_OneHot = AUC_eval.evaluate(test_OneHot_results)
AUC_train_OneHot = AUC_eval.evaluate(train_OneHot_results)

In [0]:
Accu_test_Label = Accu_evaluator.evaluate(test_Label_results, {Accu_evaluator.metricName: "accuracy"})
Accu_train_Label = Accu_evaluator.evaluate(train_Label_results, {Accu_evaluator.metricName: "accuracy"})

Accu_test_OneHot = Accu_evaluator.evaluate(test_OneHot_results, {Accu_evaluator.metricName: "accuracy"})
Accu_train_OneHot = Accu_evaluator.evaluate(train_OneHot_results, {Accu_evaluator.metricName: "accuracy"})

In [0]:
print("Test AUC with OneHot Encoding", AUC_test_OneHot)
print("\n")
print("Train AUC with OneHot Encoding", AUC_train_OneHot)

In [0]:
print("Accuracy: Test with OneHot Encoding", Accu_test_OneHot)
print("\n")
print("Accuracy: Train with OneHot Encoding", Accu_train_OneHot)

In [0]:
print("Test AUC with Label Encoding", AUC_test_Label)
print("\n")
print("Train AUC with Label Encoding", AUC_train_Label)

In [0]:
print("Accuracy: Test with Label Encoding", Accu_test_Label)
print("\n")
print("Accuracy: Train with Label Encoding", Accu_train_Label)

Feature Selection
- More generalizable more 
- avoid over fitting
- prediction in train set and test set would be closer but it's not guaranteed

In [0]:
fit_rf_Label.stages[-1].featureImportances

In [0]:
input_cols_Label

In [0]:
df2=fit_rf_Label.transform(train_df)

In [0]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    list_extract = []
    for i in dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]:
        list_extract = list_extract + dataset.schema[featuresCol].metadata["ml_attr"]["attrs"][i]
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return(varlist.sort_values('score', ascending = False))

In [0]:
#with this you can extract the features, since the outcome is a pandas dataframe, we can use pandas related functions and methods
# you can check it with this command: type()
ExtractFeatureImp(fit_rf_Label.stages[-1].featureImportances, df2, "features")

Unnamed: 0,idx,name,vals,score
2,5,PAY_1,,0.354374
3,6,PAY_2,,0.182674
4,7,PAY_3,,0.117484
5,8,PAY_4,,0.072501
0,0,LIMIT_BAL,,0.053582
7,10,PAY_6,,0.043897
6,9,PAY_5,,0.043617
15,18,PAY_AMT2,,0.038
14,17,PAY_AMT1,,0.024279
8,11,BILL_AMT1,,0.017907


In [0]:
features=ExtractFeatureImp(fit_rf_Label.stages[-1].featureImportances, df2, "features")

features=features[features.score>0.00036].iloc[:,1].tolist()
vars=features+['Default']
vars

In [0]:
train_df_selected=fit_model_Label.transform(train_df).select(vars)
test_df_selected=fit_model_Label.transform(test_df).select(vars)

In [0]:
# we don't need to apply whole of pipeline just the last step which training the model is enough
assembler = VectorAssembler(inputCols= features, outputCol="features")
datatrain = assembler.transform(train_df_selected)
datatest = assembler.transform(test_df_selected)

log_reg_df = LogisticRegression(featuresCol='features',labelCol='Default')

fit_new = log_reg_df.fit(datatrain)

In [0]:
train_new_results = fit_new.transform(datatrain)
test_new_results = fit_new.transform(datatest)

In [0]:
AUC_test_new = AUC_eval.evaluate(test_new_results)
AUC_train_new = AUC_eval.evaluate(train_new_results)

Accu_test_new = Accu_evaluator.evaluate(test_new_results, {Accu_evaluator.metricName: "accuracy"})
Accu_train_new = Accu_evaluator.evaluate(train_new_results, {Accu_evaluator.metricName: "accuracy"})

In [0]:
print("Test AUC with Label Encoding & feature selection", AUC_test_new)
print("Train AUC with Label Encoding & feature selection", AUC_train_new)
print("\n")
print("Accuracy: Test with Label Encoding & feature selection", Accu_test_new)
print("Accuracy: Train with Label Encoding & feature selection", Accu_train_new)

- Tuning the algorithms (CrossValidator) for LogisticRegression
- As we indicate 3 values for regParam, 3 values for maxIter, and 3 values for elasticNetParam, this grid will have 3 x 3 x 3 = 27 parameter settings for CrossValidator to choose from. We will create a 5-fold cross validator.

In [0]:
import pyspark.ml.tuning as tune
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [0]:
lc_df = LogisticRegression(featuresCol='features',labelCol='Default')

In [0]:
# Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lc_df.regParam, [0.01, 0.5, 2.0])
             .addGrid(lc_df.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lc_df.maxIter, [1, 5, 10])
             .build())

In [0]:
cv = CrossValidator(estimator=lc_df, estimatorParamMaps=paramGrid, evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Default'),numFolds=5)

In [0]:
cvModel = cv.fit(datatrain)

In [0]:
train_new0_results = cvModel.transform(datatrain)
test_new0_results = cvModel.transform(datatest)

AUC_test_new0 = AUC_eval.evaluate(test_new0_results)
AUC_train_new0 = AUC_eval.evaluate(train_new0_results)

Accu_test_new0 = Accu_evaluator.evaluate(test_new0_results, {Accu_evaluator.metricName: "accuracy"})
Accu_train_new0 = Accu_evaluator.evaluate(train_new0_results, {Accu_evaluator.metricName: "accuracy"})

In [0]:
print("Test AUC with Label Encoding & feature selection", AUC_test_new0)
print("Train AUC with Label Encoding & feature selection", AUC_train_new0)
print("\n")
print("Accuracy: Test with Label Encoding & feature selection", Accu_test_new0)
print("Accuracy: Train with Label Encoding & feature selection", Accu_train_new0)

Trying other algorithms
- Decision tree
- Random forest
- Gradient boosting tree
- SVM

In [0]:
from pyspark.ml.classification import DecisionTreeClassifier

In [0]:
dt_df = DecisionTreeClassifier(featuresCol='features',labelCol='Default')

fit_new1 = dt_df.fit(datatrain)

In [0]:
train_new1_results = fit_new1.transform(datatrain)
test_new1_results = fit_new1.transform(datatest)

In [0]:
AUC_test_new1 = AUC_eval.evaluate(test_new1_results)
AUC_train_new1 = AUC_eval.evaluate(train_new1_results)

Accu_test_new1 = Accu_evaluator.evaluate(test_new1_results, {Accu_evaluator.metricName: "accuracy"})
Accu_train_new1 = Accu_evaluator.evaluate(train_new1_results, {Accu_evaluator.metricName: "accuracy"})

In [0]:
print("Test AUC with Label Encoding & feature selection", AUC_test_new1)
print("Train AUC with Label Encoding & feature selection", AUC_train_new1)
print("\n")
print("Accuracy: Test with Label Encoding & feature selection", Accu_test_new1)
print("Accuracy: Train with Label Encoding & feature selection", Accu_train_new1)

In [0]:
from pyspark.ml.classification import RandomForestClassifier

In [0]:
rf_df = RandomForestClassifier(featuresCol='features',labelCol='Default')
fit_new2 = rf_df.fit(datatrain)

In [0]:
train_new2_results = fit_new2.transform(datatrain)
test_new2_results = fit_new2.transform(datatest)

In [0]:
AUC_test_new2 = AUC_eval.evaluate(test_new2_results)
AUC_train_new2 = AUC_eval.evaluate(train_new2_results)

Accu_test_new2 = Accu_evaluator.evaluate(test_new2_results, {Accu_evaluator.metricName: "accuracy"})
Accu_train_new2 = Accu_evaluator.evaluate(train_new2_results, {Accu_evaluator.metricName: "accuracy"})

In [0]:
print("Test AUC with Label Encoding & feature selection", AUC_test_new2)
print("Train AUC with Label Encoding & feature selection", AUC_train_new2)
print("\n")
print("Accuracy: Test with Label Encoding & feature selection", Accu_test_new2)
print("Accuracy: Train with Label Encoding & feature selection", Accu_train_new2)

In [0]:
from pyspark.ml.classification import GBTClassifier

In [0]:
gbt_df = GBTClassifier(featuresCol='features',labelCol='Default')
fit_new3 = gbt_df.fit(datatrain)

In [0]:
train_new3_results = fit_new3.transform(datatrain)
test_new3_results = fit_new3.transform(datatest)

AUC_test_new3 = AUC_eval.evaluate(test_new3_results)
AUC_train_new3 = AUC_eval.evaluate(train_new3_results)

Accu_test_new3 = Accu_evaluator.evaluate(test_new3_results, {Accu_evaluator.metricName: "accuracy"})
Accu_train_new3 = Accu_evaluator.evaluate(train_new3_results, {Accu_evaluator.metricName: "accuracy"})

In [0]:
print("Test AUC with Label Encoding & feature selection", AUC_test_new3)
print("Train AUC with Label Encoding & feature selection", AUC_train_new3)
print("\n")
print("Accuracy: Test with Label Encoding & feature selection", Accu_test_new3)
print("Accuracy: Train with Label Encoding & feature selection", Accu_train_new3)

In [0]:
from pyspark.ml.classification import LinearSVC

In [0]:
svm_df = LinearSVC(featuresCol='features',labelCol='Default')
fit_new4 = svm_df.fit(datatrain)

In [0]:
train_new4_results = fit_new4.transform(datatrain)
test_new4_results = fit_new4.transform(datatest)

AUC_test_new4 = AUC_eval.evaluate(test_new4_results)
AUC_train_new4 = AUC_eval.evaluate(train_new4_results)

Accu_test_new4 = Accu_evaluator.evaluate(test_new4_results, {Accu_evaluator.metricName: "accuracy"})
Accu_train_new4 = Accu_evaluator.evaluate(train_new4_results, {Accu_evaluator.metricName: "accuracy"})

In [0]:
print("Test AUC with Label Encoding & feature selection", AUC_test_new4)
print("Train AUC with Label Encoding & feature selection", AUC_train_new4)
print("\n")
print("Accuracy: Test with Label Encoding & feature selection", Accu_test_new4)
print("Accuracy: Train with Label Encoding & feature selection", Accu_train_new4)

As we indicate 4 values for maxDepth and 3 values for maxBin, this grid will have 4 x 3 = 12 parameter settings for CrossValidator to choose from. We will create a 5-fold CrossValidator.

In [0]:
# Create ParamGrid for Cross Validation
paramGrid1 = (ParamGridBuilder()
             .addGrid(dt_df.maxDepth, [1, 2, 6, 10])
             .addGrid(dt_df.maxBins, [20, 40, 80])
             .build())

In [0]:
# Create 5-fold CrossValidator
cv1 = CrossValidator(estimator=dt_df, estimatorParamMaps=paramGrid1, evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Default'),numFolds=5)

In [0]:
# Run cross validations
cvModel1 = cv1.fit(datatrain)

In [0]:
train_new01_results = cvModel1.transform(datatrain)
test_new01_results = cvModel1.transform(datatest)

AUC_test_new01 = AUC_eval.evaluate(test_new01_results)
AUC_train_new01 = AUC_eval.evaluate(train_new01_results)

Accu_test_new01 = Accu_evaluator.evaluate(test_new01_results, {Accu_evaluator.metricName: "accuracy"})
Accu_train_new01 = Accu_evaluator.evaluate(train_new01_results, {Accu_evaluator.metricName: "accuracy"})

In [0]:
print("Test AUC with Label Encoding & feature selection", AUC_test_new01)
print("Train AUC with Label Encoding & feature selection", AUC_train_new01)
print("\n")
print("Accuracy: Test with Label Encoding & feature selection", Accu_test_new01)
print("Accuracy: Train with Label Encoding & feature selection", Accu_train_new01)

As we indicate 4 values for maxDepth, 3 values for maxBin, and 2 values for numTrees, this grid will have 4 x 3 x 2 = 24 parameter settings for CrossValidator to choose from. We will create a 5-fold CrossValidator.

In [0]:
# Create ParamGrid for Cross Validation
paramGrid2 = (ParamGridBuilder()
             .addGrid(rf_df.maxDepth, [1, 2, 6, 10])
             .addGrid(rf_df.maxBins, [20, 40, 80])
             .addGrid(rf_df.numTrees, [5, 20])
             .build())

In [0]:
# Create 5-fold CrossValidator
cv2 = CrossValidator(estimator=rf_df, estimatorParamMaps=paramGrid2, evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol='Default'),numFolds=5)

In [0]:
# Run cross validations
cvModel2 = cv2.fit(datatrain)

In [0]:
train_new02_results = cvModel2.transform(datatrain)
test_new02_results = cvModel2.transform(datatest)

AUC_test_new02 = AUC_eval.evaluate(test_new02_results)
AUC_train_new02 = AUC_eval.evaluate(train_new02_results)

Accu_test_new02 = Accu_evaluator.evaluate(test_new02_results, {Accu_evaluator.metricName: "accuracy"})
Accu_train_new02 = Accu_evaluator.evaluate(train_new02_results, {Accu_evaluator.metricName: "accuracy"})

In [0]:
print("Test AUC with Label Encoding & feature selection", AUC_test_new02)
print("Train AUC with Label Encoding & feature selection", AUC_train_new02)
print("\n")
print("Accuracy: Test with Label Encoding & feature selection", Accu_test_new02)
print("Accuracy: Train with Label Encoding & feature selection", Accu_train_new02)

The best model is Random forest（tuning with the ParamGridBuilder and the CrossValidator.）
- Test AUC with Label Encoding & feature selection 0.761141312513927

- Accuracy: Test with Label Encoding & feature selection 0.7606781871535703