In [1]:
import os

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Switch the kernel's working directory to the project's src/ folder; the
# later cells import the us_used_cars_ml_pipeline package and its log output
# shows YAML files being loaded via paths relative to that folder.
os.chdir("../src/")
# Echo the resulting working directory as a sanity check.
%pwd

'/home/jovyan/nfs-home/scalable_ml_pipelines/src'

In [3]:
import socket

# IP address this host's own hostname resolves to; it is passed to Spark as
# spark.driver.host when the session is built.
# NOTE(review): gethostbyname(gethostname()) can yield a loopback address on
# hosts whose hostname maps to 127.0.0.1 — confirm it resolves to an address
# the executors can reach in this environment.
LOCAL_IP = socket.gethostbyname(socket.gethostname())

In [4]:
# Kubernetes namespace, passed to Spark as spark.kubernetes.namespace.
name_space = "eabraham-373705"

# Kubernetes API server used as the Spark master URL.
kubernetes_master_url = "k8s://https://10.32.7.103:6443"

# Resource settings. Values are strings because they are handed to Spark as
# configuration values.
driver_cores = "8"
executor_cores = "8"  # passed as spark.executor.cores
driver_memory = "30g"
executor_memory = "30g"  # passed as spark.executor.memory
executor_memory_overhead = "2g"

# Limits.
# NOTE(review): cpu_limit and memory_limit are not referenced by any cell
# visible in this notebook — confirm whether they are still needed.
cpu_limit = "3"  # NOTE(review): was annotated "12 cores", which contradicts the value — confirm
memory_limit = "32g"  # up to 32 GB
executor_limit = "8"  # passed as spark.kubernetes.executor.limit.cores

In [5]:
from pyspark.sql import SparkSession

APP_NAME = 'scalables_executor'

# Number of executor pods to request. Kept as a string because it is passed
# straight through as a Spark configuration value.
EXECUTOR_INSTANCES = "2"

# Build (or reuse) the Spark session that runs against the Kubernetes cluster.
# getOrCreate() returns an already-running session if the kernel has one, in
# which case the settings below are not re-applied; restart the kernel to
# change them.
#
# The chain is parenthesised rather than joined with backslashes so that
# stray trailing whitespace cannot break a line continuation.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(kubernetes_master_url)
    # Driver networking: advertise this host's IP and bind on all interfaces.
    .config("spark.driver.host", LOCAL_IP)
    .config("spark.driver.bindAddress", "0.0.0.0")
    # Executor sizing.
    .config("spark.executor.instances", EXECUTOR_INSTANCES)
    .config("spark.executor.cores", executor_cores)
    .config("spark.executor.memory", executor_memory)
    # Apply the overhead declared in the settings cell; previously it was
    # defined but never passed, so Spark fell back to its default overhead.
    .config("spark.executor.memoryOverhead", executor_memory_overhead)
    .config("spark.memory.fraction", "0.8")
    .config("spark.memory.storageFraction", "0.2")
    # Kubernetes scheduling, identity and labels.
    .config("spark.kubernetes.executor.limit.cores", executor_limit)
    .config("spark.kubernetes.namespace", name_space)
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    .config("spark.kubernetes.driver.label.appname", APP_NAME)
    .config("spark.kubernetes.executor.label.appname", APP_NAME)
    # Executor pods are left in place after they terminate and must be
    # cleaned up manually.
    .config("spark.kubernetes.executor.deleteOnTermination", "false")
    # Executor container image; always re-pulled so a rebuilt tag is picked up.
    .config("spark.kubernetes.container.image.pullPolicy", "Always")
    .config("spark.kubernetes.container.image", "node03.st:5000/pyspark-hdfs-jupyter:eabraham-373705-v4-executor")
    # Scratch space: an emptyDir volume mounted read-write at /tmp/spark on
    # both driver and executors. Spark warns at start-up that spark.local.dir
    # is overridden by the cluster manager's SPARK_LOCAL_DIRS.
    .config("spark.local.dir", "/tmp/spark")
    .config("spark.kubernetes.driver.volumes.emptyDir.spark-local-dir-tmp-spark.mount.path", "/tmp/spark")
    .config("spark.kubernetes.driver.volumes.emptyDir.spark-local-dir-tmp-spark.mount.readOnly", "false")
    .config("spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.path", "/tmp/spark")
    .config("spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.readOnly", "false")
    .getOrCreate()
)


23/12/17 14:49:36 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/17 14:49:37 WARN spark.SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [6]:
# Echo the session object as the cell output to confirm it was created.
spark

In [7]:
from us_used_cars_ml_pipeline.config.configuration import ConfigurationManager
from us_used_cars_ml_pipeline.components.models_tuning import ModelsTuning

# Build the project's configuration manager and fetch the settings for the
# models-tuning stage. This cell's log output shows config.yaml, params.yaml
# and schema.yaml being loaded.
config_manager = ConfigurationManager()
models_tuning_config = config_manager.get_models_tuning_config()

# Instantiate the tuning stage with that configuration; the stage itself is
# started in a later cell via run_stage(spark).
models_tuning = ModelsTuning(models_tuning_config)

[2023-12-17 14:49:44,563: 145: numexpr.utils: INFO: utils:  Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.]
[2023-12-17 14:49:44,564: 157: numexpr.utils: INFO: utils:  NumExpr defaulting to 8 threads.]
[2023-12-17 14:49:45,166: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/config/config.yaml loaded successfully]
[2023-12-17 14:49:45,169: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/params.yaml loaded successfully]
[2023-12-17 14:49:45,170: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/schema.yaml loaded successfully]


In [8]:
# Display the resolved tuning configuration (data/model paths, metric, tuner,
# seed, folds, models and per-model feature counts) before launching the run.
models_tuning_config

ModelsTuningConfig(path_to_prepared_data='hdfs:///home/eabraham-373705/data/prepared/prepared_data.parquet', path_to_importances='us_used_cars_ml_pipeline/tests/importances/', path_to_feature_selection_scores='us_used_cars_ml_pipeline/tests/feature_selection_scores/', path_to_parameters_grid='us_used_cars_ml_pipeline/constants/grid_search/', path_to_best_models='hdfs:///home/eabraham-373705/models/best/', path_to_best_params='us_used_cars_ml_pipeline/tests/models_tuning/best_params/', path_to_scores='us_used_cars_ml_pipeline/tests/models_tuning/scores/', metric='r2', tuner='CVTuner', seed=42, test_ratio=0.05, n_folds=5, models=BoxList(['LinearRegression', 'RandomForestRegressor', 'GBTRegressor']), n_feats=ConfigBox({'LinearRegression': 70, 'RandomForestRegressor': 80, 'GBTRegressor': 70}))

In [9]:
%%time
# Run the models-tuning stage on the Spark session. Per the configuration and
# log output, this tunes LinearRegression, RandomForestRegressor and
# GBTRegressor with CVTuner; the logged run took roughly 40 minutes
# (14:49 -> 15:29), so expect a long-running cell.
models_tuning.run_stage(spark)

                                                                                

[2023-12-17 14:49:49,323: 153: us_used_cars_ml_pipeline_logger: INFO: models_tuning:  Prepared data has been read]
[2023-12-17 14:49:49,325: 156: us_used_cars_ml_pipeline_logger: INFO: models_tuning:  STARTING]


Tuning models:   0%|          | 0/3 [00:00<?, ?it/s]

[2023-12-17 14:49:49,443: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/LinearRegression.yaml loaded successfully]
[2023-12-17 14:49:50,054: 127: us_used_cars_ml_pipeline_logger: INFO: models_tuning:  1.1.1. Best set of features for LinearRegression model has been selected]
[2023-12-17 14:49:50,055: 137: us_used_cars_ml_pipeline_logger: INFO: models_tuning:  2.1.2. CVTuner has been initialized.]
[2023-12-17 14:49:50,071: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/grid_search/LinearRegression.yaml loaded successfully]
[2023-12-17 14:49:50,124: 130: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.1.3. Cross Validation based tuner has been initialized]


23/12/17 14:49:50 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/12/17 14:50:13 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/12/17 14:50:13 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
23/12/17 14:50:13 WARN netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
23/12/17 14:50:13 WARN netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK

[2023-12-17 14:55:08,667: 134: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.1.4. Tuner has been fitted]
[2023-12-17 14:55:08,669: 138: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.1.5. Scores and best params set have been extracted from tuner]
[2023-12-17 14:55:08,683: 66: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file saved at: us_used_cars_ml_pipeline/tests/models_tuning/scores/CVTuner/LinearRegression.yaml]
[2023-12-17 14:55:08,684: 142: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.1.6. Scores for LinearRegression model have been saved]
[2023-12-17 14:55:08,689: 66: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file saved at: us_used_cars_ml_pipeline/tests/models_tuning/best_params/CVTuner/LinearRegression.yaml]
[2023-12-17 14:55:08,690: 146: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.1.7. Best set of parameters for LinearRegression model has been saved]
[2023-12-17 14:55:08,709: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: u



[2023-12-17 14:55:08,877: 127: us_used_cars_ml_pipeline_logger: INFO: models_tuning:  1.2.1. Best set of features for RandomForestRegressor model has been selected]
[2023-12-17 14:55:08,878: 137: us_used_cars_ml_pipeline_logger: INFO: models_tuning:  2.2.2. CVTuner has been initialized.]
[2023-12-17 14:55:08,889: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/grid_search/RandomForestRegressor.yaml loaded successfully]
[2023-12-17 14:55:08,917: 130: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.2.3. Cross Validation based tuner has been initialized]




[2023-12-17 15:14:27,591: 134: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.2.4. Tuner has been fitted]
[2023-12-17 15:14:27,594: 138: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.2.5. Scores and best params set have been extracted from tuner]
[2023-12-17 15:14:27,604: 66: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file saved at: us_used_cars_ml_pipeline/tests/models_tuning/scores/CVTuner/RandomForestRegressor.yaml]
[2023-12-17 15:14:27,604: 142: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.2.6. Scores for RandomForestRegressor model have been saved]
[2023-12-17 15:14:27,609: 66: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file saved at: us_used_cars_ml_pipeline/tests/models_tuning/best_params/CVTuner/RandomForestRegressor.yaml]
[2023-12-17 15:14:27,610: 146: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.2.7. Best set of parameters for RandomForestRegressor model has been saved]
[2023-12-17 15:14:27,631: 44: us_used_cars_ml_pipeline_logger: INFO: c

                                                                                

[2023-12-17 15:14:27,770: 127: us_used_cars_ml_pipeline_logger: INFO: models_tuning:  1.3.1. Best set of features for GBTRegressor model has been selected]
[2023-12-17 15:14:27,771: 137: us_used_cars_ml_pipeline_logger: INFO: models_tuning:  2.3.2. CVTuner has been initialized.]
[2023-12-17 15:14:27,781: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/grid_search/GBTRegressor.yaml loaded successfully]
[2023-12-17 15:14:27,800: 130: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.3.3. Cross Validation based tuner has been initialized]


                                                                                

[2023-12-17 15:29:09,291: 134: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.3.4. Tuner has been fitted]
[2023-12-17 15:29:09,293: 138: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.3.5. Scores and best params set have been extracted from tuner]
[2023-12-17 15:29:09,301: 66: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file saved at: us_used_cars_ml_pipeline/tests/models_tuning/scores/CVTuner/GBTRegressor.yaml]
[2023-12-17 15:29:09,302: 142: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.3.6. Scores for GBTRegressor model have been saved]
[2023-12-17 15:29:09,307: 66: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file saved at: us_used_cars_ml_pipeline/tests/models_tuning/best_params/CVTuner/GBTRegressor.yaml]
[2023-12-17 15:29:09,308: 146: us_used_cars_ml_pipeline_logger: INFO: tuners:  2.3.7. Best set of parameters for GBTRegressor model has been saved]
[2023-12-17 15:29:09,311: 158: us_used_cars_ml_pipeline_logger: INFO: models_tuning:  COMPLETED]
CPU times