In [1]:
# Create the notebook's Spark entry point; getOrCreate() reuses an
# already-running session instead of starting a second one.
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('dt_rf_gbt')
    .getOrCreate()
)

In [2]:
# Pull every column of `dog_food_csv` into a DataFrame via Spark SQL
df = spark.sql(
    sqlQuery='SELECT * FROM dog_food_csv',
)

In [3]:
# Previewing the loaded DataFrame (show() with no arguments prints the leading rows)
df.show()

In [4]:
# Converting the dataset to features and label
from pyspark.ml.feature import VectorAssembler

# Choose the feature columns by name rather than by position: the label is
# selected by name ('Spoiled') below, so excluding it by name keeps the two in
# agreement even if the label is not the last column of the table. A positional
# slice would otherwise leak the label into the feature vector and silently
# drop a real feature.
feature_cols = [name for name in df.columns if name != 'Spoiled']
assembler = VectorAssembler(inputCols = feature_cols,
                            outputCol = 'features')

# Keep only the assembled feature vector and the label for model training
final_df = assembler.transform(df).select('features', 'Spoiled')

In [5]:
# Instantiating the models
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

# Fixed seed so that the randomised parts of training (e.g. the forest's
# bootstrapping and feature subsampling) repeat across kernel restarts;
# without it the fitted models, and hence the feature importances shown
# later, are not guaranteed to be reproducible.
SEED = 42

# All three estimators predict the 'Spoiled' column and read the default
# 'features' input column.
dtc = DecisionTreeClassifier(labelCol = 'Spoiled', seed = SEED)
# A larger forest (250 trees) than the library default
rfc = RandomForestClassifier(labelCol = 'Spoiled', numTrees = 250, seed = SEED)
gbtc = GBTClassifier(labelCol = 'Spoiled', seed = SEED)

In [6]:
# Train every estimator on the same assembled DataFrame, in the order
# decision tree -> random forest -> gradient-boosted trees.
dtc_model, rfc_model, gbtc_model = [
    estimator.fit(final_df) for estimator in (dtc, rfc, gbtc)
]

In [7]:
# Inspecting feature importances to judge which features matter most.
# Nothing is selected or filtered here; the values are only displayed.
# Decision tree: print the label without a trailing newline, then let the
# notebook render the importances as the cell's output.
print('DTC:', end='')
dtc_model.featureImportances

In [8]:
# Random forest: print the label without a trailing newline, then let the
# notebook render the fitted model's feature importances as the cell's output.
print('RFC:', end='')
rfc_model.featureImportances

In [9]:
# Gradient-boosted trees: print the label without a trailing newline, then let
# the notebook render the fitted model's feature importances as the cell's output.
print('GBTC:', end='')
gbtc_model.featureImportances

In [10]:
# Displaying the original DataFrame again.
# NOTE(review): presumably repeated so the column order can be read next to
# the importance indices shown above; confirm, or drop if redundant.
df.show()