In [1]:
import pyspark
from pyspark.sql.types import DoubleType, StringType, StructField, StructType, IntegerType
from pyspark.ml.classification import DecisionTreeClassifier, DecisionTreeClassificationModel
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from distutils.version import LooseVersion
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
schema = schema = StructType([
    StructField('Date', StringType(), True),
    StructField('Location', StringType(), True),
    StructField('MinTemp', DoubleType(), True),
    StructField('MaxTemp', DoubleType(), True),
    StructField('Rainfall', DoubleType(), True),
    StructField('Evaporation', DoubleType(), True),
    StructField('Sunshine', DoubleType(), True),
    StructField('WindGustDir', StringType(), True),
    StructField('WindGustSpeed', IntegerType(), True),
    StructField('WindDir9am', StringType(), True),
    StructField('WindDir3pm', StringType(), True),
    StructField('WindSpeed9am', IntegerType(), True),
    StructField('WindSpeed3pm', IntegerType(), True),
    StructField('Humidity9am', IntegerType(), True),
    StructField('Humidity3pm', IntegerType(), True),
    StructField('Pressure9am', DoubleType(), True),
    StructField('Pressure3pm', DoubleType(), True),
    StructField('Cloud9am', IntegerType(), True),
    StructField('Cloud3pm', IntegerType(), True),
    StructField('Temp9am', DoubleType(), True),
    StructField('Temp3pm', DoubleType(), True),
    StructField('RainToday', StringType(), True),
    StructField('RainTomorrow', StringType(), True)
])


dataset = spark.read.csv('weatherAUS.csv', header=True, schema=schema)
dataset = dataset.na.drop("any")

In [3]:
#Step 1
(trainingData, testingData) = dataset.randomSplit([0.8,0.2], seed=12345)

In [4]:
#Step 2
#cat_colmn = ['Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday', 'RainTomorrow']
categorical_columns = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
stage=[]
for cat_col in categorical_columns:
    stringIndexer = StringIndexer(inputCol=cat_col, outputCol=cat_col + "Index")
    if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
        from pyspark.ml.feature import OneHotEncoderEstimator
        encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCols()], outputCols=[cat_col+ 'classVec'])
    else:
        from pyspark.ml.feature import OneHotEncoder
        encoder = OneHotEncoder(inputCols=[stringIndexer.getOutputCol()], outputCols=[cat_col + 'classVec'])
        #print(cat_col)
        #print(encoder)
    stage += [stringIndexer, encoder]

  if LooseVersion(pyspark.__version__) < LooseVersion("3.0"):


In [5]:
label_stringIdx = StringIndexer(inputCol='RainTomorrow', outputCol='label')
stage += [label_stringIdx]

In [8]:
ignore = ['Date', 'Location','WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
assembler = VectorAssembler(
                            inputCols=[x for x in dataset.columns if not x in ignore],
                            outputCol='features')
assembler.transform(dataset)
#Date, Location, MinTemp, MaxTemp, Rainfall, Evaporation, Sunshine, WindGustDir, WindGustSpeed, WindDir9am, WindDir3pm, WindSpeed9am, WindSpeed3pm, Humidity9am, Humidity3pm, Pressure9am, Pressure3pm, Cloud9am, Cloud3pm, Temp9am, Temp3pm, RainToday, RainTomorrow

DataFrame[Date: string, Location: string, MinTemp: double, MaxTemp: double, Rainfall: double, Evaporation: double, Sunshine: double, WindGustDir: string, WindGustSpeed: int, WindDir9am: string, WindDir3pm: string, WindSpeed9am: int, WindSpeed3pm: int, Humidity9am: int, Humidity3pm: int, Pressure9am: double, Pressure3pm: double, Cloud9am: int, Cloud3pm: int, Temp9am: double, Temp3pm: double, RainToday: string, RainTomorrow: string, features: vector]

In [25]:
#numericCols = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']
#asemblerInput = [c + 'classVec' for c in categorical_columns] + numericCols
#asembler = VectorAssembler(inputCols=asemblerInput, outputCol='features')
#stage += [asembler]

In [26]:
#partialPipeline = Pipeline().setStages(stage)
#pipelineModel = partialPipeline.fit(dataset)
#preppedDataDF = pipelineModel.transform(dataset)

In [27]:
#lrmodel = LogisticRegression().fit(preppedDataDF)

#display(lrmodel,preppedDataDF, "ROC")

LogisticRegressionModel: uid=LogisticRegression_11c25549b31e, numClasses=3, numFeatures=3709

DataFrame[Date: string, Location: string, MinTemp: double, MaxTemp: double, Rainfall: double, Evaporation: double, Sunshine: double, WindGustDir: string, WindGustSpeed: int, WindDir9am: string, WindDir3pm: string, WindSpeed9am: int, WindSpeed3pm: int, Humidity9am: int, Humidity3pm: int, Pressure9am: double, Pressure3pm: double, Cloud9am: int, Cloud3pm: int, Temp9am: double, Temp3pm: double, RainToday: string, RainTomorrow: string, MinTempIndex: double, MinTempclassVec: vector, MaxTempIndex: double, MaxTempclassVec: vector, RainfallIndex: double, RainfallclassVec: vector, EvaporationIndex: double, EvaporationclassVec: vector, SunshineIndex: double, SunshineclassVec: vector, WindGustSpeedIndex: double, WindGustSpeedclassVec: vector, WindSpeed9amIndex: double, WindSpeed9amclassVec: vector, WindSpeed3pmIndex: double, WindSpeed3pmclassVec: vector, Humidity9amIndex: double, Humidity9amclassVec: vector, Humidity3pmIndex: double, Humidity3pmclassVec: vector, Pressure9amIndex: double, Pressure9

'ROC'

In [9]:
dt = DecisionTreeClassifier(labelCol="label")#, featuresCol='MinTemp', maxDepth=3)
dtModel = dt.fit(trainingData)

IllegalArgumentException: features does not exist. Available: Date, Location, MinTemp, MaxTemp, Rainfall, Evaporation, Sunshine, WindGustDir, WindGustSpeed, WindDir9am, WindDir3pm, WindSpeed9am, WindSpeed3pm, Humidity9am, Humidity3pm, Pressure9am, Pressure3pm, Cloud9am, Cloud3pm, Temp9am, Temp3pm, RainToday, RainTomorrow

In [None]:
paramGrid = (ParamGridBuilder().addGrid

In [10]:
https://stackoverflow.com/questions/32606294/create-feature-vector-programmatically-in-spark-ml-pyspark

SyntaxError: invalid syntax (226211973.py, line 1)

In [11]:
https://docs.databricks.com/_static/notebooks/binary-classification.html

SyntaxError: invalid syntax (3714101564.py, line 1)

In [None]:
https://docs.databricks.com/_static/notebooks/mllib-mlflow-integration.html