Working with Categorical Columns

In [0]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,
                                OneHotEncoder,StringIndexer)

In [0]:
# Categorical variables have to be turned into numbers before they can be
# fed to a machine-learning algorithm.
#   * StringIndexer assigns each category a numeric index; that index can be
#     used directly as a label encoding.
#   * OneHotEncoder converts the index into a dummy-variable vector
#     (the one-hot equivalent of the categorical variable).

# Stage 1: category -> numeric index
SEX_indexer = StringIndexer(inputCol='SEX', outputCol='SEXIndex')
EDUCATION_indexer = StringIndexer(inputCol='EDUCATION', outputCol='EDUCATIONIndex')
MARRIAGE_indexer = StringIndexer(inputCol='MARRIAGE', outputCol='MARRIAGEIndex')

# Stage 2: numeric index -> one-hot vector
SEX_encoder = OneHotEncoder(inputCol='SEXIndex', outputCol='SEXVec')
EDUCATION_encoder = OneHotEncoder(inputCol='EDUCATIONIndex', outputCol='EDUCATIONVec')
MARRIAGE_encoder = OneHotEncoder(inputCol='MARRIAGEIndex', outputCol='MARRIAGEVec')

In [0]:
# Feature columns for the one-hot variant: the three categorical variables
# enter the feature vector as their OneHotEncoder outputs (*Vec).
input_cols_OneHot = [
    'LIMIT_BAL',
    'SEXVec', 'EDUCATIONVec', 'MARRIAGEVec',
    'AGE',
    'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
assembler_OneHot = VectorAssembler(outputCol='features', inputCols=input_cols_OneHot)

# Feature columns for the label-encoded variant: same columns in the same
# order, but the categorical variables enter as their StringIndexer outputs
# (*Index) instead of the one-hot vectors.
input_cols_Label = [
    'LIMIT_BAL',
    'SEXIndex', 'EDUCATIONIndex', 'MARRIAGEIndex',
    'AGE',
    'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
    'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
assembler_Label = VectorAssembler(outputCol='features', inputCols=input_cols_Label)

- LogisticRegression

In [0]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import RandomForestClassifier

A Pipeline defines the ordered sequence of steps (stages) that we want to apply to the data.

In [0]:
from pyspark.ml import Pipeline

In [0]:
# Estimators shared by the pipelines below. Both read the assembled
# 'features' vector and learn to predict the 'Default' label column.
log_reg_df = LogisticRegression(featuresCol='features', labelCol='Default')

# Random forests are stochastic (bootstrap samples and feature subsets).
# Pin the seed so the feature importances derived from this model, and the
# importance cut-off applied to them later, can be reproduced between runs.
# 110 matches the seed used for the train/test split.
rf_df = RandomForestClassifier(featuresCol='features', labelCol='Default', seed=110)

In [0]:
# Logistic regression on one-hot encoded categoricals:
# index -> one-hot encode -> assemble -> fit.
pipeline_lr_OneHot = Pipeline(stages=[
    SEX_indexer, EDUCATION_indexer, MARRIAGE_indexer,
    SEX_encoder, EDUCATION_encoder, MARRIAGE_encoder,
    assembler_OneHot,
    log_reg_df,
])

# Logistic regression on label-encoded categoricals:
# index -> assemble -> fit (no one-hot step).
pipeline_lr_Label = Pipeline(stages=[
    SEX_indexer, EDUCATION_indexer, MARRIAGE_indexer,
    assembler_Label,
    log_reg_df,
])

# Random forest on label-encoded categoricals.
pipeline_rf_Label = Pipeline(stages=[
    SEX_indexer, EDUCATION_indexer, MARRIAGE_indexer,
    assembler_Label,
    rf_df,
])

In [0]:
# 80/20 random train/test split; the seed is fixed so the split can be reproduced.
train_df, test_df = df.randomSplit(weights=[0.8, 0.2], seed=110)

In [0]:
# Fit all three pipelines on the training split only, in the same order as
# they were defined (LR one-hot, LR label-encoded, RF label-encoded).
fit_model_OneHot, fit_model_Label, fit_rf_Label = (
    pipeline.fit(train_df)
    for pipeline in (pipeline_lr_OneHot, pipeline_lr_Label, pipeline_rf_Label)
)

In [0]:
# Score both splits with the one-hot logistic regression pipeline ...
train_OneHot_results, test_OneHot_results = (
    fit_model_OneHot.transform(split) for split in (train_df, test_df)
)

# ... and with the label-encoded logistic regression pipeline.
train_Label_results, test_Label_results = (
    fit_model_Label.transform(split) for split in (train_df, test_df)
)

- AUC
- Accuracy

In [0]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [0]:
# AUC has to be computed from a continuous score, not from the hard 0/1
# decision: feeding the 'prediction' column yields a ROC curve with a single
# operating point and therefore a misleading "AUC". Spark classifiers write
# their per-class scores to 'rawPrediction', which is what the evaluator
# expects in rawPredictionCol.
AUC_eval = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Default')

# Accuracy compares the hard prediction with the label. The metric itself is
# chosen at evaluate() time through a param-map override of metricName.
Accu_evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='Default')

In [0]:
# Peek at the first five (label, prediction) pairs of the label-encoded model on the test split.
test_Label_results.select(['Default', 'prediction']).show(n=5)

In [0]:
# AUC of the label-encoded logistic regression (test first, then train).
AUC_test_Label, AUC_train_Label = (
    AUC_eval.evaluate(results)
    for results in (test_Label_results, train_Label_results)
)

# AUC of the one-hot encoded logistic regression (test first, then train).
AUC_test_OneHot, AUC_train_OneHot = (
    AUC_eval.evaluate(results)
    for results in (test_OneHot_results, train_OneHot_results)
)

In [0]:
# Accuracy of the label-encoded logistic regression (test first, then train).
# The param map passed to evaluate() overrides the evaluator's metric for this call.
Accu_test_Label, Accu_train_Label = (
    Accu_evaluator.evaluate(results, {Accu_evaluator.metricName: "accuracy"})
    for results in (test_Label_results, train_Label_results)
)

# Accuracy of the one-hot encoded logistic regression (test first, then train).
Accu_test_OneHot, Accu_train_OneHot = (
    Accu_evaluator.evaluate(results, {Accu_evaluator.metricName: "accuracy"})
    for results in (test_OneHot_results, train_OneHot_results)
)

In [0]:
# Report the AUC of the one-hot encoded logistic regression on both splits.
print("Test AUC with OneHot Encoding", AUC_test_OneHot)
print("\n")  # visual separator: the newline character plus print's own line ending
print("Train AUC with OneHot Encoding", AUC_train_OneHot)

In [0]:
# Report the accuracy of the one-hot encoded logistic regression on both splits.
print("Accuracy: Test with OneHot Encoding", Accu_test_OneHot)
print("\n")  # visual separator: the newline character plus print's own line ending
print("Accuracy: Train with OneHot Encoding", Accu_train_OneHot)

In [0]:
# Report the AUC of the label-encoded logistic regression on both splits.
print("Test AUC with Label Encoding", AUC_test_Label)
print("\n")  # visual separator: the newline character plus print's own line ending
print("Train AUC with Label Encoding", AUC_train_Label)

In [0]:
# Report the accuracy of the label-encoded logistic regression on both splits.
print("Accuracy: Test with Label Encoding", Accu_test_Label)
print("\n")  # visual separator: the newline character plus print's own line ending
print("Accuracy: Train with Label Encoding", Accu_train_Label)

Feature Selection
- Makes the model more generalizable
- Helps to avoid overfitting
- Performance on the train set and the test set should be closer, although this is not guaranteed

In [0]:
# The last pipeline stage is the fitted random forest; display its per-feature importances.
fit_rf_Label.stages[-1].featureImportances

In [0]:
# Display the columns that were assembled into the feature vector of the label-encoded pipelines.
input_cols_Label

In [0]:
# Run the fitted random-forest pipeline over the training split; the result is
# passed to ExtractFeatureImp below, which reads the metadata of its 'features' column.
df2=fit_rf_Label.transform(train_df)

In [0]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    """Pair every assembled feature with its importance score.

    Args:
        featureImp: Importance values, indexable by feature position.
        dataset: Object whose ``schema[featuresCol].metadata["ml_attr"]["attrs"]``
            maps attribute groups to lists of attribute records; every record
            must carry an ``idx`` entry holding the feature's position.
        featuresCol: Name of the feature-vector column to inspect.

    Returns:
        A pandas DataFrame with one row per attribute record plus a ``score``
        column, sorted by ``score`` in descending order.

    Note:
        Relies on pandas being available as ``pd`` in the enclosing namespace.
    """
    attr_groups = dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]

    # Flatten the per-group attribute lists into one list, keeping group order.
    attr_records = [record for group in attr_groups.values() for record in group]

    def importance_at(position):
        return featureImp[position]

    varlist = pd.DataFrame(attr_records)
    varlist['score'] = varlist['idx'].apply(importance_at)
    ranked = varlist.sort_values('score', ascending=False)
    return ranked

In [0]:
# Extract the features together with their importance scores. The result is a
# pandas DataFrame (check with type()), so the usual pandas functions and
# methods can be applied to it.
ExtractFeatureImp(fit_rf_Label.stages[-1].featureImportances, df2, "features")

Unnamed: 0,idx,name,vals,score
2,5,PAY_1,,0.354374
3,6,PAY_2,,0.182674
4,7,PAY_3,,0.117484
5,8,PAY_4,,0.072501
0,0,LIMIT_BAL,,0.053582
7,10,PAY_6,,0.043897
6,9,PAY_5,,0.043617
15,18,PAY_AMT2,,0.038
14,17,PAY_AMT1,,0.024279
8,11,BILL_AMT1,,0.017907


In [0]:
# Rank the assembled features by random-forest importance.
feature_importances = ExtractFeatureImp(fit_rf_Label.stages[-1].featureImportances, df2, "features")

# Keep only the features whose importance exceeds the cut-off.
# The 'name' column is selected by label rather than by position
# (.iloc[:, 1]): the column order of the importance table depends on the key
# order of the metadata records it was built from, so a positional lookup
# could silently return the wrong column.
importance_threshold = 0.00036
features = feature_importances.loc[
    feature_importances.score > importance_threshold, 'name'
].tolist()

# Selected feature columns plus the label column.
# NOTE: `vars` shadows the built-in vars(); the name is kept because later
# cells read it.
vars = features + ['Default']
vars

In [0]:
# Run each split through the fitted label-encoding pipeline (this is what
# creates the *Index columns) and keep only the selected features and the label.
train_df_selected, test_df_selected = (
    fit_model_Label.transform(split).select(vars)
    for split in (train_df, test_df)
)

In [0]:
# The whole pipeline does not have to be re-applied: the selected columns are
# already indexed, so re-assembling the feature vector and training the model
# (the last step) is enough.
log_reg_df = LogisticRegression(labelCol='Default', featuresCol='features')

assembler = VectorAssembler(outputCol="features", inputCols=features)
datatrain, datatest = (
    assembler.transform(split) for split in (train_df_selected, test_df_selected)
)

fit_new = log_reg_df.fit(datatrain)

In [0]:
# Score both splits with the logistic regression trained on the selected features.
train_new_results, test_new_results = (
    fit_new.transform(split) for split in (datatrain, datatest)
)

In [0]:
# AUC after feature selection (test first, then train).
AUC_test_new, AUC_train_new = (
    AUC_eval.evaluate(results)
    for results in (test_new_results, train_new_results)
)

# Accuracy after feature selection (test first, then train).
Accu_test_new, Accu_train_new = (
    Accu_evaluator.evaluate(results, {Accu_evaluator.metricName: "accuracy"})
    for results in (test_new_results, train_new_results)
)

In [0]:
# Report AUC and accuracy of the logistic regression refitted on the selected features.
print("Test AUC with Label Encoding & feature selection", AUC_test_new)
print("Train AUC with Label Encoding & feature selection", AUC_train_new)
print("\n")  # visual separator: the newline character plus print's own line ending
print("Accuracy: Test with Label Encoding & feature selection", Accu_test_new)
print("Accuracy: Train with Label Encoding & feature selection", Accu_train_new)

- Tuning the algorithms (CrossValidator) for LogisticRegression
- Since we specify 3 values for regParam, 3 values for maxIter, and 3 values for elasticNetParam, the grid contains 3 x 3 x 3 = 27 parameter settings for CrossValidator to choose from. We will create a 5-fold cross-validator.

In [0]:
import pyspark.ml.tuning as tune
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [0]:
# Fresh logistic-regression estimator whose hyper-parameters are tuned by the cross-validator below.
lc_df = LogisticRegression(featuresCol='features',labelCol='Default')

In [0]:
# Build the hyper-parameter grid for cross-validation: three candidate values
# for each of regParam, elasticNetParam and maxIter.
_grid_builder = ParamGridBuilder()
_grid_builder.addGrid(lc_df.regParam, [0.01, 0.5, 2.0])
_grid_builder.addGrid(lc_df.elasticNetParam, [0.0, 0.5, 1.0])
_grid_builder.addGrid(lc_df.maxIter, [1, 5, 10])
paramGrid = _grid_builder.build()

In [0]:
# 5-fold cross-validation over the parameter grid, selecting the model by AUC.
# The evaluator must read the continuous 'rawPrediction' scores: ranking the
# candidate models by an AUC computed from the hard 0/1 'prediction' column
# would compare single-threshold results instead of real ROC curves.
# The seed fixes the fold assignment so the selected model can be reproduced;
# 110 matches the seed used for the train/test split.
cv = CrossValidator(
    estimator=lc_df,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Default'),
    numFolds=5,
    seed=110,
)

In [0]:
# Run the cross-validated grid search on the training data with the selected features.
cvModel = cv.fit(datatrain)

In [0]:
# Score both splits with the model chosen by cross-validation.
train_new0_results, test_new0_results = (
    cvModel.transform(split) for split in (datatrain, datatest)
)

# AUC of the cross-validated model (test first, then train).
AUC_test_new0, AUC_train_new0 = (
    AUC_eval.evaluate(results)
    for results in (test_new0_results, train_new0_results)
)

# Accuracy of the cross-validated model (test first, then train).
Accu_test_new0, Accu_train_new0 = (
    Accu_evaluator.evaluate(results, {Accu_evaluator.metricName: "accuracy"})
    for results in (test_new0_results, train_new0_results)
)

In [0]:
# Report AUC and accuracy of the cross-validated logistic regression.
# The labels mention cross-validation so this output can be told apart from
# the report of the untuned feature-selection model printed earlier.
print("Test AUC with Label Encoding, feature selection & cross-validation", AUC_test_new0)
print("Train AUC with Label Encoding, feature selection & cross-validation", AUC_train_new0)
print("\n")  # visual separator: the newline character plus print's own line ending
print("Accuracy: Test with Label Encoding, feature selection & cross-validation", Accu_test_new0)
print("Accuracy: Train with Label Encoding, feature selection & cross-validation", Accu_train_new0)