In [None]:
import pandas as pd
import random


In [9]:

# Generate data
n = 100
data = {
    'user_id': [x for x in range(0, n)],
    'month_interaction_count': [random.randrange(25, 50) for x in range(0, n)],
    'week_interaction_count': [random.randrange(6, 15) for x in range(0, n)],
    'day_interaction_count': [random.randrange(0, 4) for x in range(0, n)],
    
    'cancelled_within_week': [random.randrange(0, 1+1) for x in range(0, n)],
    
    
}


df = pd.DataFrame(data)

df#.show()


Unnamed: 0,user_id,month_interaction_count,week_interaction_count,day_interaction_count,cancelled_within_week
0,0,43,7,1,0
1,1,41,10,2,1
2,2,43,14,2,1
3,3,34,8,1,1
4,4,46,8,2,1
...,...,...,...,...,...
95,95,47,6,2,0
96,96,29,6,3,1
97,97,37,8,0,1
98,98,41,10,3,1


In [10]:

# Get features column
# Get label column 
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(
    inputCols=["month_interaction_count", "week_interaction_count", "day_interaction_count"],
    outputCol="features"
)

df = assembler.transform(df)
df.show()

AssertionError: 

In [14]:
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator


# choose model
lr = LogisticRegression()

# make a grid search for all the parameters
### regularization
### threshold
### maxIter - tries 1 value
paramGrid = ParamGridBuilder().addGrid(lr.regParam, [0.1, 0.01])\
    .addGrid(lr.elasticNetParam, [0,0, 0.5, 1.0])\
    .addGrid(lr.threshold, [0.4, 0.5, 0.6])\
    .addGrid(lr.maxIter, [10])\
    .build()

# Test the estimator (lr) and Splits data into 80/20 split
tvs = TrainValidationSplit(
    estimator=lr, 
    estimatorParamMaps=paramGrid, 
    evaluator=BinaryClassificationEvaluator(), 
    trainRatio=.8
)

# Begins grid search
model = tvs.fit(df)


# Print results
for x,y in zip(model.validationMetrics, model.getEstimatorParamMaps()):
    for key, value in y.items():
        print(key.name, value)
    print("---------------------------------")
    print(f"Accuracy: {x} \n\n")

AssertionError: 

In [12]:
# 18 iterations, 76% max

In [13]:
# Now: Use Bayesian Optimization!

In [15]:
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK, Trials

train_df, test_df = df.randomSplit([.8, .2])

def train(params):
    regParam = float(params['regParam'])
    elasticNetParam = float(params['elasticNetParam'])
    threshold = float(params['threshold'])
    
    lr = LogisticRegression(
        maxIter=10,
        regParam=regParam,
        elasticNetParam=elasticNetParam,
        threshold=threshold
    )

    lrmodel = lr.fit(train_df)
    testEvaluation = lrmodel.evaluate(test_df)
    print("Accuracy:", testEvaluation.accuracy, "Hyperparameters", params)
    return {'loss': -testEvaluation.accuracy, 'status': STATUS_OK} # return negative so we can minimize the loss function


search_space = {
    'elasticNetParam': hp.uniform('elasticNetParam', 0, 1), # distribution between 0-1
    'regParam': hp.uniform('regParam', .01, .1), 
    'threshold': hp.uniform('threshold', .4, .6), 
}

algo = tpe.suggest # Tree of Parzen Estimators (Bayesian Method!)

best_hyperparameters = fmin( #fmin() = "Function to minimize"
    fn=train, # Function user-defined
    space=search_space,
    algo=algo,
    max_evals=9 # note: earlier we had 18, only going to give them 9 this time!
)

print("Best hyperparameters:", best_hyperparameters)

ModuleNotFoundError: No module named 'hyperopt'

In [None]:
# 9 iterations, 85% max