In [1]:
import os
import pandas as pd
import sklearn.ensemble
from sklearn.metrics import roc_auc_score

import argparse

# Load the featurized data selected by active learning

In [2]:
# Name of the target (label) column; the training and test cells both select it by this name.
label_column_name = "flagged"

In [3]:
# Read the training split and drop the 'rev_id' column so it is not carried downstream.
training_set = pd.read_csv('training_set_01.csv').drop('rev_id', axis='columns')

# Feature matrix: every column up to and including 'V511' (label-based slices are inclusive).
X_train = training_set.loc[:, :'V511']

# Target vector: the column named by label_column_name.
y_train = training_set[label_column_name]

In [4]:
# Read the held-out split and drop the 'rev_id' column, mirroring the training cell.
test_set = pd.read_csv('test_set_01.csv').drop('rev_id', axis='columns')

# Feature matrix: every column up to and including 'V511' (label-based slices are inclusive).
X_test = test_set.loc[:, :'V511']

# Target vector: the column named by label_column_name.
y_test = test_set[label_column_name]

# Train default (un-tuned) scikit-learn random forest

In [24]:
%%time

# Baseline random forest: library defaults, with a fixed seed so the fit is repeatable.
# Alternative configurations kept here for reference (not executed):
#   sklearn.ensemble.RandomForestClassifier(n_estimators=101, max_depth=8, n_jobs=-1, random_state=1)
#   sklearn.ensemble.RandomForestClassifier(n_estimators=500, criterion='entropy', random_state=1)
classifier_model = sklearn.ensemble.RandomForestClassifier(random_state=1)

# Keep the value returned by fit() under its own name for the scoring cell.
fitted_model = classifier_model.fit(X=X_train, y=y_train)

CPU times: user 63.4 ms, sys: 132 µs, total: 63.5 ms
Wall time: 62 ms


# Test the model

In [25]:
# Per-class probabilities predicted for the test rows.
pred_prob = fitted_model.predict_proba(X=X_test)

# Use the second probability column (index 1) as the score fed to the AUC computation.
scores = pred_prob[:, 1]

auc = roc_auc_score(y_true=y_test, y_score=scores)

print('AUC:', auc)

AUC: 0.8286844255777241


# Beginning of PySpark code

In [12]:
# Only needed when the notebook is NOT running in a PySpark kernel
# (presumably such a kernel already provides `spark` -- confirm for your environment).
import pyspark

# Build (or reuse) a session configured with the MMLSpark 0.13 package coordinate.
spark = (
    pyspark.sql.SparkSession.builder
    .appName("MyApp")
    .config("spark.jars.packages", "Azure:mmlspark:0.13")
    .getOrCreate()
)

# Imported only after the session exists, keeping the original statement order.
import mmlspark

In [13]:
import pyspark

import pandas as pd
import mmlspark
from pyspark.sql.types import IntegerType, StringType, FloatType, StructType, StructField

import os, urllib


In [14]:
# Create Spark DataFrames from the pandas frames loaded earlier.
# `tune` is the input to hyperparameter tuning; `test` is scored by the best model afterwards.
tune = spark.createDataFrame(data=training_set)
test = spark.createDataFrame(data=test_set)


# Tune logistic regression, random forest, and gradient-boosted trees (GBT)

In [15]:
from mmlspark import TuneHyperparameters
from mmlspark.TrainClassifier import TrainClassifier
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier

# Candidate estimators: Logistic Regression, Random Forest, and Gradient Boosted Trees.
# They stay bound to their own names because the hyperparameter cell refers to each one.
logReg, randForest, gbt = LogisticRegression(), RandomForestClassifier(), GBTClassifier()

smlmodels = [logReg, randForest, gbt]

# Wrap each Spark ML estimator in a TrainClassifier bound to the label column.
mmlmodels = [
    TrainClassifier(model=estimator, labelCol=label_column_name)
    for estimator in smlmodels
]

In [11]:
from mmlspark import HyperparamBuilder
from mmlspark import RangeHyperParam
from mmlspark import DiscreteHyperParam
from mmlspark import RandomSpace

# Define the hyperparameters to tune: one entry per (estimator, parameter) pair.
paramBuilder = (
    HyperparamBuilder()
    # Logistic regression: real-valued regParam range.
    .addHyperparam(logReg, logReg.regParam, RangeHyperParam(0.1, 0.3, isDouble=True))
    # Random forest: number of trees, tree depth, bin count and impurity measure.
    .addHyperparam(randForest, randForest.numTrees, RangeHyperParam(50, 1000))
    .addHyperparam(randForest, randForest.maxDepth, RangeHyperParam(3, 30))
    .addHyperparam(randForest, randForest.maxBins, RangeHyperParam(100, 1000))
    .addHyperparam(randForest, randForest.impurity, DiscreteHyperParam(['gini', 'entropy']))
    # Gradient boosted trees: bin count and tree depth.
    .addHyperparam(gbt, gbt.maxBins, RangeHyperParam(100, 1000))
    .addHyperparam(gbt, gbt.maxDepth, RangeHyperParam(3, 30))
)

# Wrap the built parameter map in a RandomSpace; the tuning cell calls .space() on it.
randomSpace = RandomSpace(paramBuilder.build())

In [12]:
%%time
# Tune over the wrapped models with 3 folds, selecting on AUC.
# numRuns is the number of candidate models times a multiplier (currently 1).
bestModel = TuneHyperparameters(
    evaluationMetric="AUC",
    models=mmlmodels,
    numFolds=3,
    numRuns=len(mmlmodels) * 1,
    parallelism=1,
    paramSpace=randomSpace.space(),
    seed=0,
).fit(tune)

CPU times: user 537 ms, sys: 295 ms, total: 833 ms
Wall time: 15min 50s


In [13]:
# Print the parameters of the best model

# The info string is fetched from the wrapped JVM object; its entries are
# separated by ', ' and are printed here one per line.
bestModelInfo = bestModel._java_obj.getBestModelInfo()

print('\n'.join(bestModelInfo.split(', ')))

aggregationDepth: 2
elasticNetParam: 0.0
family: auto
featuresCol: TrainClassifier_4408b3941f51ab92cc84_features
fitIntercept: true
labelCol: flagged
maxIter: 100
predictionCol: prediction
probabilityCol: probability
rawPredictionCol: rawPrediction
regParam: 0.2461935574753314
standardization: true
threshold: 0.5
tol: 1.0E-6


# Test the best model on the held-out test set

In [14]:
from mmlspark import ComputeModelStatistics

# Score the held-out Spark DataFrame with the tuned model.
prediction = bestModel.transform(test)

# Compute the evaluation statistics for the scored rows.
metrics = ComputeModelStatistics().transform(prediction)

# Display the statistics vertically (one metric per row) with a blank header for the value column.
metrics.toPandas().T.rename(columns={0: ''})

Unnamed: 0,Unnamed: 1
evaluation_type,Classification
predicted_class_as_0.0_actual_is_0.0,8454
predicted_class_as_0.0_actual_is_1.0,523
predicted_class_as_1.0_actual_is_0.0,203
predicted_class_as_1.0_actual_is_1.0,820
accuracy,0.9274
precision,0.801564
recall,0.610573
AUC,0.929847
