In [1]:
import os
import logging
from ucimlrepo import fetch_ucirepo 
from pyspark.sql import SparkSession
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline



spark_master = os.environ.get("SPARK_MASTER_URL")

In [2]:
# initialize spark session
spark_master = os.environ.get("SPARK_MASTER_URL")
spark = SparkSession.builder \
    .appName("Random-Forest-Classifier") \
    .master(spark_master) \
    .getOrCreate()

logger = logging.getLogger("py4j")
logger.setLevel(logging.ERROR)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/22 07:50:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# fetch dataset 
default_of_credit_card_clients = fetch_ucirepo(id=350) 
X = spark.createDataFrame(default_of_credit_card_clients.data.features)
y = spark.createDataFrame(default_of_credit_card_clients.data.targets)

In [4]:
# combine X and y
X = X.withColumn("id", monotonically_increasing_id())
y = y.withColumn("id", monotonically_increasing_id())
df = X.join(y, on="id", how="inner").drop("id")

# split training and testing data
train, test = df.randomSplit([.7,.3], seed=42)


In [5]:
# list of numerical and categorical columns
target = ['Y']
num_feat = ['X1','X5','X12','X13','X14','X15','X16','X17','X18','X19','X20','X21','X22','X23']
cat_feat = [col for col in df.columns if col not in num_feat+target]
cat_feat_indexed = [f"{col}_i" for col in cat_feat]
cat_feat_encoded = [f"{col}_e" for col in cat_feat_indexed]



In [6]:
# Feature Transformations
string_indexer = StringIndexer(inputCols=cat_feat, outputCols=cat_feat_indexed, handleInvalid='keep')
hot_encoder = OneHotEncoder(inputCols=cat_feat_indexed, outputCols=cat_feat_encoded, handleInvalid='keep')
vector_assembler_1 = VectorAssembler(inputCols=num_feat+cat_feat_encoded,outputCol="features")

# Target Transformations
string_indexer_target = StringIndexer(inputCol='Y',outputCol='Y_i',handleInvalid='keep')
hot_encoder_target = OneHotEncoder(inputCol='Y_i', outputCol='Y_i_e',handleInvalid='keep')


In [7]:
# Vector Indexing (might be better for trees)
vector_assembler_2 = VectorAssembler(inputCols=num_feat+cat_feat,\
                                     outputCol="features")
vector_indexer = VectorIndexer(maxCategories=15,inputCol="features",\
                               outputCol="indexed_features")

# Target
vector_indexer_target = VectorIndexer(maxCategories=15,inputCol='Y',\
                               outputCol="indexed_target")

In [10]:
# pipeline
pipeline1 = Pipeline(stages=[string_indexer, \
                            hot_encoder, \
                            vector_assembler_1, \
                            string_indexer_target, \
                            hot_encoder_target])
train_1 = pipeline1.fit(train).transform(train)

                                                                                

In [12]:
# pipeline
pipeline2 = Pipeline(stages=[vector_assembler_2,\
                            vector_indexer,\
                            vector_indexer_target])
train_2 = pipeline2.fit(train).transform(train)

                                                                                

IllegalArgumentException: requirement failed: Column Y must be of type class org.apache.spark.ml.linalg.VectorUDT:struct<type:tinyint,size:int,indices:array<int>,values:array<double>> but was actually class org.apache.spark.sql.types.LongType$:bigint.