In [1]:
# Display the SparkContext handle as the cell output. `sc` is not created anywhere
# in this notebook, so it is presumably injected by the PySpark kernel/shell --
# TODO confirm; on a plain Python kernel this cell would raise NameError.
sc

In [2]:
from sklearn import datasets
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import  pyspark.sql.functions as F
import random
from pyspark.sql.types import *
from sklearn.model_selection import train_test_split

# Create/Load the Dataset

In [4]:
# Synthetic binary-classification problem: 10,000 samples with 4 features,
# 2 of which are informative. random_state pins the generated data.
X, y = datasets.make_classification(
    n_samples=10000,
    n_features=4,
    n_informative=2,
    n_classes=2,
    shuffle=True,
    random_state=1,
)

In [5]:
# Wrap the feature matrix in a pandas frame and append the labels as a
# trailing 'target' column.
train = pd.DataFrame(X).assign(target=y)

In [6]:
# Hand the pandas frame to Spark so it becomes a distributed DataFrame.
train_sp = spark.createDataFrame(data=train)

In [7]:
# Give the four feature columns and the label column explicit string names.
train_sp = train_sp.toDF('c0', 'c1', 'c2', 'c3', 'target')

In [8]:
# Preview the first 20 rows (show's default) of the Spark training frame.
train_sp.show(n=20)

# Replicate the Dataset n times (n = 99 here: replication ids 1-99)

In [10]:
# One row per replication id (1..99 inclusive, i.e. 99 ids); cross-joining the
# training data against this frame yields one full copy of the data per id.
replication_df = spark.createDataFrame(
    pd.DataFrame({'replication_id': list(range(1, 100))})
)

In [11]:
# Cartesian product: every training row is paired with every replication_id.
replicated_train_df = train_sp.crossJoin(other=replication_df)

In [12]:
# Preview the first 20 rows (show's default) of the replicated frame.
replicated_train_df.show(n=20)

# Create Pandas UDF to run Model

In [14]:
# 0. Declare the schema for the output of our function: one summary row per
#    replication with the sampled hyperparameters and the hold-out accuracy.
outSchema = StructType([
    StructField('replication_id', IntegerType(), True),
    StructField('Accuracy', DoubleType(), True),
    StructField('num_trees', IntegerType(), True),
    StructField('depth', IntegerType(), True),
    StructField('criterion', StringType(), True),
])

# decorate our function with pandas_udf decorator so Spark passes it one pandas
# DataFrame per group.
# NOTE(review): PandasUDFType.GROUPED_MAP is deprecated in Spark 3.x in favour of
# groupby(...).applyInPandas(func, schema); it is kept because the call site
# uses .apply(run_model).
@F.pandas_udf(outSchema, F.PandasUDFType.GROUPED_MAP)
def run_model(pdf):
    """Train and score one random forest on a single replication of the data.

    Parameters
    ----------
    pdf : pandas.DataFrame
        All rows of one group. Must contain the feature columns
        'c0', 'c1', 'c2', 'c3', the label column 'target' and a
        'replication_id' column (read from the first row).

    Returns
    -------
    pandas.DataFrame
        A single row with columns 'replication_id', 'Accuracy', 'num_trees',
        'depth' and 'criterion', matching ``outSchema``.
    """
    # Cast the numpy scalar to a built-in int so it can seed the RNG and is
    # written to the result as a plain integer.
    replication_id = int(pdf['replication_id'].iloc[0])

    # 1. Get hyperparam values
    # Use a private RNG seeded by the replication id: the draw is reproducible
    # across runs and does not depend on the worker process's global `random`
    # state, while still differing from one replication to the next.
    rng = random.Random(replication_id)
    num_trees = rng.randrange(50, 500)
    depth = rng.randrange(2, 10)
    criterion = rng.choice(['gini', 'entropy'])

    # 2. Train test split (fixed seed: every replication uses the same split)
    X = pdf[['c0', 'c1', 'c2', 'c3']]
    y = pdf['target']
    Xtrain, Xcv, ytrain, ycv = train_test_split(X, y, test_size=0.33, random_state=42)

    # 3. Create model using the pandas dataframe
    clf = RandomForestClassifier(
        n_estimators=num_trees,
        max_depth=depth,
        criterion=criterion,
        random_state=replication_id,  # make the fitted forest reproducible too
    )
    clf.fit(Xtrain, ytrain)

    # 4. Evaluate the model (accuracy_score expects y_true first, then y_pred)
    accuracy = float(accuracy_score(ycv, clf.predict(Xcv)))

    # 5. return results as pandas DF
    return pd.DataFrame(
        {
            'replication_id': replication_id,
            'Accuracy': accuracy,
            'num_trees': num_trees,
            'depth': depth,
            'criterion': criterion,
        },
        index=[0],
    )



# Run the model

In [16]:
# Fit one model per replication_id group, then list the runs from best to
# worst hold-out accuracy.
results = (
    replicated_train_df
    .groupBy("replication_id")
    .apply(run_model)
)
results.orderBy(F.col("Accuracy").desc()).show()