In [0]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

## Model building

In [0]:
# The training-set location is not hard-coded: it is read from the
# "train_data" notebook widget (per the original note, fed by ADF parameters).
train_data_path = dbutils.widgets.get("train_data")

# Read the Parquet training set found at that location.
train_data = spark.read.format("parquet").load(train_data_path)



## Linear Regression

In [0]:
# Linear-regression baseline: fit "quality" against the "scaled_features"
# vector column of the training set. LinearRegression is imported in the
# notebook's top import cell.
lr = LinearRegression(featuresCol="scaled_features", labelCol="quality")
lr_model = lr.fit(train_data)

# NOTE(review): in the recorded output of this cell every coefficient is
# ~1e-14 except the last one (~0.81). That pattern suggests the label itself
# may be present in "scaled_features" (leakage) — confirm in the
# feature-engineering step that builds the vector.
print("Coefficients: {}".format(lr_model.coefficients))
print("Intercept: {:.2f}".format(lr_model.intercept))

Coefficients: [-7.003280721090691e-14,-6.427689675616689e-15,5.842492157733723e-15,-2.1182478323304876e-14,-8.198562512768448e-15,7.064576272058961e-15,-9.495068460411978e-15,6.503040566376214e-14,-4.027475827968103e-14,-3.6911352113981356e-15,4.6042390481824355e-14,0.8075694397346824]
Intercept: 5.64


In [0]:
# Persist the fitted linear-regression model, replacing any earlier copy
# stored at the same path.
model_path = "/mnt/capstonejs/models/lr_model"
lr_writer = lr_model.write().overwrite()
lr_writer.save(model_path)

## Decision Tree

In [0]:
# Baseline decision tree with a fixed depth of 5, trained on the full
# training set. DecisionTreeClassifier is imported in the notebook's top
# import cell.
dt = DecisionTreeClassifier(
    featuresCol="scaled_features",
    labelCol="quality",
    maxDepth=5,
)
dt_model = dt.fit(train_data)

In [0]:
# Hyper-parameter grid for the decision tree (3 depths x 3 leaf sizes).
# The grid is keyed on the params of the *estimator* `dt` — the object that
# CrossValidator actually re-fits — rather than on the already-fitted
# `dt_model`, matching how the random-forest grid is built from `rf`.
param_grid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [5, 10, 15])
    .addGrid(dt.minInstancesPerNode, [1, 5, 10])
    .build()
)

# Multiclass evaluator on the "quality" label; metricName is left at the
# library default.
evaluator = MulticlassClassificationEvaluator(labelCol="quality")

# 5-fold cross-validation over the grid. The seed is fixed so that the fold
# assignment (and therefore the selected model) is reproducible across runs.
cv = CrossValidator(
    estimator=dt,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

In [0]:
# Run the decision-tree cross-validation on the training set.
cv_model_dt = cv.fit(train_data)

# Keep a handle on the model CrossValidator reports as best.
dt_best = cv_model_dt.bestModel



In [0]:
# Persist both decision trees, overwriting earlier copies: first the
# fixed-depth baseline ...
model_path = "/mnt/capstonejs/models/dt_model"
dt_writer = dt_model.write().overwrite()
dt_writer.save(model_path)

# ... then the winner of the cross-validated grid search.
model_path = "/mnt/capstonejs/models/dt_best"
dt_best_writer = dt_best.write().overwrite()
dt_best_writer.save(model_path)


## Random Forest

In [0]:
# NOTE(review): num_classes is not read anywhere in the visible notebook and
# computing it runs a Spark job; kept in case a later cell depends on it.
num_classes = train_data.select("quality").distinct().count()

# Random forest on the same feature/label columns as the other models.
# The seed is fixed so that tree construction is reproducible across runs.
rf = RandomForestClassifier(
    labelCol="quality",
    featuresCol="scaled_features",
    seed=42,
)

# Define parameter grid for grid search (3 forest sizes x 3 depths).
param_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20, 30])
    .addGrid(rf.maxDepth, [5, 10, 15])
    .build()
)

# Define evaluator; metricName is left at the library default.
evaluator = MulticlassClassificationEvaluator(labelCol="quality")

# Define CrossValidator with 5 folds. The seed is fixed so that the fold
# assignment (and therefore the selected model) is reproducible across runs.
cv = CrossValidator(
    estimator=rf,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    numFolds=5,
    seed=42,
)

# Fit CrossValidator
cv_model_rf = cv.fit(train_data)

# Get best model from CrossValidator
rf_model = cv_model_rf.bestModel

In [0]:
# Persist the best random-forest model from cross-validation, replacing any
# earlier copy stored at the same path.
model_path = "/mnt/capstonejs/models/rf_model"
rf_writer = rf_model.write().overwrite()
rf_writer.save(model_path)