In [6]:
import os
import logging
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, VectorIndexer
from pyspark.ml import Pipeline




In [8]:
# Notebook configuration. SEED selects the per-seed sub-directory that is
# appended to the TRAIN_PATH / TEST_PATH base directories.
SEED = 9
spark_master = os.environ.get("SPARK_MASTER_URL")

# os.environ.get() returns None for an unset variable, and os.path.join(None, ...)
# then raises a TypeError that never mentions which variable is missing.
# Check up front so the error names the variable(s) to set.
_missing_env = [name for name in ("TRAIN_PATH", "TEST_PATH") if name not in os.environ]
if _missing_env:
    raise KeyError(
        "required environment variable(s) not set: " + ", ".join(_missing_env)
    )

train_path = os.path.join(os.environ["TRAIN_PATH"], str(SEED))
test_path = os.path.join(os.environ["TEST_PATH"], str(SEED))

In [9]:
# Create (or reuse) the Spark session, pointing it at the master URL taken
# from the SPARK_MASTER_URL environment variable.
spark_master = os.environ.get("SPARK_MASTER_URL")
spark = (
    SparkSession.builder
    .appName("Random-Forest-Classifier")
    .master(spark_master)
    .getOrCreate()
)

# Restrict the "py4j" logger to records at ERROR level or above.
logger = logging.getLogger("py4j")
logger.setLevel(logging.ERROR)


In [10]:
# Explicit read schema: features X1..X23 followed by the label Y, every
# column declared as a nullable LongType.
schema = StructType(
    [StructField(f"X{i}", LongType(), True) for i in range(1, 24)]
    + [StructField("Y", LongType(), True)]
)


In [11]:
# Read the train and test DataFrames from their saved Parquet directories,
# applying the explicit schema defined above.
#
# Previously a missing path silently skipped both reads, leaving `train` and
# `test` undefined so that a later cell failed with an unrelated NameError.
# Fail here instead, naming the path(s) that could not be found.
# NOTE(review): os.path.exists only inspects the local filesystem; if these
# paths are ever remote URIs (hdfs://, s3a://, ...) this check needs replacing.
_missing_paths = [p for p in (train_path, test_path) if not os.path.exists(p)]
if _missing_paths:
    raise FileNotFoundError(f"Parquet path(s) not found: {_missing_paths}")

train = spark.read.schema(schema).parquet(train_path)
test = spark.read.schema(schema).parquet(test_path)


In [None]:
# Column roles: the label, the numerical features, and every remaining
# column of `train` treated as categorical.
target = ['Y']
num_feat = [
    'X1', 'X5',
    'X12', 'X13', 'X14', 'X15', 'X16', 'X17',
    'X18', 'X19', 'X20', 'X21', 'X22', 'X23',
]

# Anything that is neither numerical nor the target is categorical.
_non_categorical = set(num_feat + target)
cat_feat = [name for name in train.columns if name not in _non_categorical]

# Derived column names: "<col>_i" for the indexed form, "<col>_i_e" for the encoded form.
cat_feat_indexed = [name + "_i" for name in cat_feat]
cat_feat_encoded = [name + "_e" for name in cat_feat_indexed]



In [None]:
# Feature transformations: index each categorical column, one-hot encode the
# indices, then assemble the numerical and encoded columns into "features".
# Every indexer/encoder below is configured with handleInvalid='keep'.
string_indexer = StringIndexer(
    inputCols=cat_feat,
    outputCols=cat_feat_indexed,
    handleInvalid='keep',
)
hot_encoder = OneHotEncoder(
    inputCols=cat_feat_indexed,
    outputCols=cat_feat_encoded,
    handleInvalid='keep',
)
vector_assembler_1 = VectorAssembler(
    inputCols=num_feat + cat_feat_encoded,
    outputCol="features",
)

# Target transformations: the same index-then-encode treatment applied to 'Y'.
string_indexer_target = StringIndexer(
    inputCol='Y',
    outputCol='Y_i',
    handleInvalid='keep',
)
hot_encoder_target = OneHotEncoder(
    inputCol='Y_i',
    outputCol='Y_i_e',
    handleInvalid='keep',
)


In [None]:
# Vector indexing (might be better for trees): assemble the raw numerical and
# categorical columns into a single vector, then let VectorIndexer decide
# which features are categorical using maxCategories=15.
vector_assembler_2 = VectorAssembler(
    inputCols=num_feat + cat_feat,
    outputCol="features",
)
vector_indexer = VectorIndexer(
    maxCategories=15,
    inputCol="features",
    outputCol="indexed_features",
)

# Target: VectorIndexer only accepts a Vector input column, but 'Y' is a plain
# LongType column, so fitting it on 'Y' fails schema validation. Index the
# label with StringIndexer instead (maxCategories has no equivalent there).
# The variable name and the "indexed_target" output column are kept so the
# pipeline cell that lists `vector_indexer_target` as a stage keeps working.
vector_indexer_target = StringIndexer(inputCol='Y', outputCol="indexed_target")

In [None]:
# One-hot pipeline: index + encode the categorical features, assemble the
# feature vector, then index + encode the target.
pipeline1 = Pipeline(stages=[
    string_indexer,
    hot_encoder,
    vector_assembler_1,
    string_indexer_target,
    hot_encoder_target,
])

# Fit on the training set, then apply the fitted stages to that same set.
pipeline1_model = pipeline1.fit(train)
train_1 = pipeline1_model.transform(train)

In [None]:
# Vector-indexing pipeline: assemble the raw columns, index the feature
# vector, then apply the target indexing stage.
pipeline2 = Pipeline(stages=[
    vector_assembler_2,
    vector_indexer,
    vector_indexer_target,
])

# Fit on the training set, then apply the fitted stages to that same set.
pipeline2_model = pipeline2.fit(train)
train_2 = pipeline2_model.transform(train)

In [13]:
# Stop the Spark session created earlier in this notebook.
spark.stop()