In [1]:
import os

import warnings
# Silence every Python-level warning for the rest of the kernel session
# (no category or module filter is given, so nothing is exempt).
# This only affects the Python `warnings` module: the JVM-side "WARN ..." log
# lines printed by Spark further down are not controlled by this call.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical warnings
# raised by the pipeline code — consider narrowing it by category or module.
warnings.filterwarnings("ignore")

In [2]:
# Switch the kernel's working directory to the sibling `src/` directory.
# Later cells import the `us_used_cars_ml_pipeline` package and load YAML files
# through relative paths such as `us_used_cars_ml_pipeline/config/config.yaml`,
# so they presumably rely on this directory being the cwd — verify if the
# notebook is moved.
# The path is relative, so the result depends on where the kernel was started.
os.chdir("../src/")
# Echo the resulting working directory as the cell output.
%pwd

'/home/jovyan/nfs-home/scalable_ml_pipelines/src'

In [3]:
import socket


def detect_local_ip() -> str:
    """Return the IPv4 address this host should advertise to Spark executors.

    The hostname is resolved first, exactly as before. On hosts whose hostname
    is mapped to a loopback address (for example ``127.0.1.1`` in
    ``/etc/hosts``) that lookup yields an address executors cannot connect
    back to, so in that case the address of the outbound network interface is
    used instead.

    Returns:
        A dotted-quad IPv4 address. If the fallback probe cannot determine a
        routable address, the originally resolved address is returned
        unchanged.

    Raises:
        socket.gaierror: if the hostname cannot be resolved at all (same as
            the plain ``gethostbyname`` call).
    """
    resolved = socket.gethostbyname(socket.gethostname())
    if not resolved.startswith("127."):
        return resolved

    # Loopback result: ask the kernel which local address it would route from.
    # connect() on a UDP socket only selects a route; no packet is sent.
    probe = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        probe.connect(("10.255.255.255", 1))
        return probe.getsockname()[0]
    except OSError:
        # No usable route: keep the resolved value rather than failing.
        return resolved
    finally:
        probe.close()


# Address handed to `spark.driver.host` when the Spark session is built.
LOCAL_IP = detect_local_ip()

In [4]:
# Kubernetes namespace; passed to `spark.kubernetes.namespace` by the session builder.
name_space = "eabraham-373705"

# Spark master URL pointing at the Kubernetes API server ("k8s://" + API server URL).
kubernetes_master_url = "k8s://https://10.32.7.103:6443"

# Resource settings, kept as strings because they are handed to `.config(...)` as-is.
# NOTE(review): in the visible notebook only `executor_cores` and `executor_memory`
# are passed to the session builder; `driver_cores`, `driver_memory` and
# `executor_memory_overhead` are defined but never referenced — confirm whether
# they are meant to be applied.
driver_cores = "8"
executor_cores = "8"
driver_memory = "30g"
executor_memory = "30g"
executor_memory_overhead = "2g"

# Limits.
# NOTE(review): only `executor_limit` is referenced by the session builder (as
# `spark.kubernetes.executor.limit.cores`); `cpu_limit` and `memory_limit` are
# not used anywhere in the visible notebook.
cpu_limit = "3"  # NOTE(review): this comment previously said "12 cores", which does not match "3" — confirm the intended value
memory_limit = "32g"  # up to 32 GB
executor_limit = "8"

In [5]:
from pyspark.sql import SparkSession

# Application name; also attached to the driver/executor pods as the `appname` label.
APP_NAME = "scalables_executor"

# Build (or reuse) the Spark session against the Kubernetes master.
# The chain is parenthesized so that settings can be grouped and annotated;
# keys, values and their order are unchanged.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(kubernetes_master_url)
    # Driver networking: advertise this host's address, listen on all interfaces.
    .config("spark.driver.host", LOCAL_IP)
    .config("spark.driver.bindAddress", "0.0.0.0")
    # Executor count, sizing and memory split.
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", executor_cores)
    .config("spark.executor.memory", executor_memory)
    .config("spark.memory.fraction", "0.8")
    .config("spark.memory.storageFraction", "0.2")
    .config("spark.kubernetes.executor.limit.cores", executor_limit)
    # Kubernetes namespace, service account and pod labels.
    .config("spark.kubernetes.namespace", name_space)
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    .config("spark.kubernetes.driver.label.appname", APP_NAME)
    .config("spark.kubernetes.executor.label.appname", APP_NAME)
    # Executor pods are not deleted when they terminate.
    .config("spark.kubernetes.executor.deleteOnTermination", "false")
    # Executor container image, always re-pulled from the registry.
    .config("spark.kubernetes.container.image.pullPolicy", "Always")
    .config("spark.kubernetes.container.image", "node03.st:5000/pyspark-hdfs-jupyter:eabraham-373705-v4-executor")
    # Scratch space: /tmp/spark backed by an emptyDir volume on driver and executors.
    # Spark logs a warning that `spark.local.dir` is overridden by the value set
    # by the cluster manager (SPARK_LOCAL_DIRS).
    .config("spark.local.dir", "/tmp/spark")
    .config("spark.kubernetes.driver.volumes.emptyDir.spark-local-dir-tmp-spark.mount.path", "/tmp/spark")
    .config("spark.kubernetes.driver.volumes.emptyDir.spark-local-dir-tmp-spark.mount.readOnly", "false")
    .config("spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.path", "/tmp/spark")
    .config("spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.readOnly", "false")
    .getOrCreate()
)


23/12/18 18:35:26 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/18 18:35:26 WARN spark.SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [6]:
# Display the session object as the cell output to confirm it was created.
spark

In [7]:
from us_used_cars_ml_pipeline.config.configuration import ConfigurationManager
from us_used_cars_ml_pipeline.components.stacking_regressor_modeling import StackingRegressorModeling

# Build the project's configuration manager. The captured log output shows
# config.yaml, params.yaml and schema.yaml being loaded from relative paths
# around this point, so the working directory must already be `src/`.
config_manager = ConfigurationManager()
# Configuration object for the stacking-regressor stage (displayed in the next cell).
stacking_regressor_modeling_config = config_manager.get_stacking_regressor_modeling_config()

# Stage object constructed from that configuration; it is executed later via `run_stage(...)`.
stacking_regressor_modeling = StackingRegressorModeling(stacking_regressor_modeling_config)

[2023-12-18 18:35:34,108: 145: numexpr.utils: INFO: utils:  Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.]
[2023-12-18 18:35:34,109: 157: numexpr.utils: INFO: utils:  NumExpr defaulting to 8 threads.]
[2023-12-18 18:35:34,759: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/config/config.yaml loaded successfully]
[2023-12-18 18:35:34,763: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/params.yaml loaded successfully]
[2023-12-18 18:35:34,766: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/schema.yaml loaded successfully]


In [8]:
# Display the resolved stage configuration (paths, fold count, seed, base and meta model settings).
stacking_regressor_modeling_config

StackingRegressorModelingConfig(path_to_prepared_data='hdfs:///home/eabraham-373705/data/prepared/', path_to_importances='us_used_cars_ml_pipeline/tests/importances/', path_to_parameters_grid='us_used_cars_ml_pipeline/constants/grid_search/', path_to_best_params='us_used_cars_ml_pipeline/tests/models_tuning/best_params/CVTuner/', path_to_predictions='hdfs:///home/eabraham-373705/data/predictions', path_to_stacking_models='hdfs:///home/eabraham-373705/models/stacking_models', n_folds=5, seed=42, base_models_names=BoxList(['LinearRegression', 'RandomForestRegressor', 'GBTRegressor']), n_feats=ConfigBox({'LinearRegression': 70, 'RandomForestRegressor': 80, 'GBTRegressor': 70}), meta_model_name='LinearRegression', meta_model_params=ConfigBox({'labelCol': 'price', 'predictionCol': 'prediction', 'featuresCol': 'first_level_predictions', 'maxIter': 100, 'regParam': 1, 'elasticNetParam': 0.5}), tuner='CVTuner', tuner_config=ConfigBox({'metric': 'r2', 'seed': 42, 'n_folds': 5, 'path_to_paramete

In [9]:
%%time
# Execute the stacking-regressor stage on the Spark session created above.
# The logged output reports eight parts (PART I through PART VIII), including a
# 35-combination meta-model parameter search, with a wall time of about six minutes.
# NOTE(review): the meaning of `is_new_data=False` is not visible in this
# notebook — confirm against `StackingRegressorModeling.run_stage`.
# Keep `%%time` as the first line: a cell magic must open the cell.
stacking_regressor_modeling.run_stage(spark, is_new_data=False)

[2023-12-17 16:13:02,770: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/grid_search/LinearRegression.yaml loaded successfully]
[2023-12-17 16:13:02,774: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/models_tuning/best_params/CVTuner/LinearRegression.yaml loaded successfully]
[2023-12-17 16:13:02,779: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/grid_search/RandomForestRegressor.yaml loaded successfully]
[2023-12-17 16:13:02,783: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/models_tuning/best_params/CVTuner/RandomForestRegressor.yaml loaded successfully]
[2023-12-17 16:13:02,789: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/grid_search/GBTRegressor.yaml loaded successfully]
[2023-12-17 16:13:02,793: 44: us_used_cars_ml_pipeline_logger: 

                                                                                

[2023-12-17 16:13:06,881: 142: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  Prepared data has been loaded]
[2023-12-17 16:13:07,225: 146: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  Fold column has been created]
[2023-12-17 16:13:07,226: 149: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART I. STARTING]


Training base models:   0%|          | 0/3 [00:00<?, ?it/s]

[2023-12-17 16:13:07,286: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/LinearRegression.yaml loaded successfully]


23/12/17 16:13:08 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Training LinearRegression models:   0%|          | 0/5 [00:00<?, ?it/s]

23/12/17 16:13:23 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/12/17 16:13:23 WARN netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

[2023-12-17 16:13:33,388: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/RandomForestRegressor.yaml loaded successfully]


                                                                                

Training RandomForestRegressor models:   0%|          | 0/5 [00:00<?, ?it/s]

                                                                                

[2023-12-17 16:14:13,849: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/GBTRegressor.yaml loaded successfully]


                                                                                

Training GBTRegressor models:   0%|          | 0/5 [00:00<?, ?it/s]

                                                                                

[2023-12-17 16:15:45,956: 151: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART I. COMPLETED]
[2023-12-17 16:15:45,961: 154: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART II. STARTING]


Predicting using base models:   0%|          | 0/3 [00:00<?, ?it/s]

[2023-12-17 16:15:45,997: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/LinearRegression.yaml loaded successfully]


                                                                                

LinearRegression models:   0%|          | 0/5 [00:00<?, ?it/s]



[2023-12-17 16:15:55,071: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/RandomForestRegressor.yaml loaded successfully]


                                                                                

RandomForestRegressor models:   0%|          | 0/5 [00:00<?, ?it/s]

                                                                                

[2023-12-17 16:16:11,016: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/GBTRegressor.yaml loaded successfully]


23/12/17 16:16:11 WARN execution.CacheManager: Asked to cache already cached data.


GBTRegressor models:   0%|          | 0/5 [00:00<?, ?it/s]



[2023-12-17 16:16:16,636: 156: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART II. COMPLETED]
[2023-12-17 16:16:16,637: 159: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART III. STARTING]


                                                                                

  0%|          | 0/3 [00:00<?, ?it/s]

                                                                                

[2023-12-17 16:16:21,790: 161: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART III. COMPLETED]
[2023-12-17 16:16:21,792: 164: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART IV. STARTING]


Training base models:   0%|          | 0/3 [00:00<?, ?it/s]

[2023-12-17 16:16:21,827: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/LinearRegression.yaml loaded successfully]


                                                                                

[2023-12-17 16:16:28,773: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/RandomForestRegressor.yaml loaded successfully]


                                                                                

[2023-12-17 16:16:42,933: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/GBTRegressor.yaml loaded successfully]


23/12/17 16:16:43 WARN execution.CacheManager: Asked to cache already cached data.
                                                                                

[2023-12-17 16:17:01,826: 166: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART IV. COMPLETED]
[2023-12-17 16:17:01,828: 169: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART V. STARTING]


Training base models:   0%|          | 0/3 [00:00<?, ?it/s]

[2023-12-17 16:17:01,895: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/LinearRegression.yaml loaded successfully]


23/12/17 16:17:02 WARN execution.CacheManager: Asked to cache already cached data.

[2023-12-17 16:17:06,559: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/RandomForestRegressor.yaml loaded successfully]


23/12/17 16:17:06 WARN execution.CacheManager: Asked to cache already cached data.
                                                                                

[2023-12-17 16:17:12,291: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/tests/importances/GBTRegressor.yaml loaded successfully]


23/12/17 16:17:12 WARN execution.CacheManager: Asked to cache already cached data.
                                                                                

[2023-12-17 16:17:17,210: 171: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART V. COMPLETED]
[2023-12-17 16:17:17,211: 174: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART VI. STARTING]


  0%|          | 0/3 [00:00<?, ?it/s]

                                                                                

[2023-12-17 16:17:22,075: 176: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART VI. COMPLETED]
[2023-12-17 16:17:22,247: 181: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  Data with predictions has been loaded]
[2023-12-17 16:17:22,248: 184: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART VII. STARTING]
[2023-12-17 16:17:22,279: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/grid_search/meta_model/LinearRegression.yaml loaded successfully]


  0%|          | 0/35 [00:00<?, ?it/s]

23/12/17 16:17:23 WARN netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
23/12/17 16:17:23 WARN netlib.LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK

[2023-12-17 16:17:26,536: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0, 'regParam': 0.0001} -> r2: 0.7783355600566575]


                                                                                

[2023-12-17 16:17:29,206: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0, 'regParam': 0.001} -> r2: 0.7783355624168347]


[Stage 1080:===>                                                  (1 + 15) / 16]

[2023-12-17 16:17:31,948: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0, 'regParam': 0.01} -> r2: 0.7783355860182011]




[2023-12-17 16:17:34,833: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0, 'regParam': 0.1} -> r2: 0.7783358219923369]


                                                                                

[2023-12-17 16:17:37,293: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0, 'regParam': 1} -> r2: 0.7783381777821271]




[2023-12-17 16:17:40,306: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0, 'regParam': 10} -> r2: 0.7783613422920752]


                                                                                

[2023-12-17 16:17:42,780: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0, 'regParam': 100} -> r2: 0.778555345974673]




[2023-12-17 16:17:45,973: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.25, 'regParam': 0.0001} -> r2: 0.7783355759632404]


                                                                                

[2023-12-17 16:17:48,879: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.25, 'regParam': 0.001} -> r2: 0.7783355762136623]


                                                                                

[2023-12-17 16:17:51,367: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.25, 'regParam': 0.01} -> r2: 0.7783355675041337]




[2023-12-17 16:17:54,716: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.25, 'regParam': 0.1} -> r2: 0.778335625662456]


                                                                                

[2023-12-17 16:17:57,583: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.25, 'regParam': 1} -> r2: 0.7783361579732738]




[2023-12-17 16:18:00,517: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.25, 'regParam': 10} -> r2: 0.778341186264713]


                                                                                

[2023-12-17 16:18:03,049: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.25, 'regParam': 100} -> r2: 0.7783383734512731]




[2023-12-17 16:18:06,264: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.5, 'regParam': 0.0001} -> r2: 0.7783355611259637]


                                                                                

[2023-12-17 16:18:08,739: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.5, 'regParam': 0.001} -> r2: 0.7783355745275726]


                                                                                

[2023-12-17 16:18:11,289: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.5, 'regParam': 0.01} -> r2: 0.7783355475212124]


                                                                                

[2023-12-17 16:18:13,868: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.5, 'regParam': 0.1} -> r2: 0.7783358665404304]




[2023-12-17 16:18:16,568: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.5, 'regParam': 1} -> r2: 0.7783341755715355]




[2023-12-17 16:18:19,954: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.5, 'regParam': 10} -> r2: 0.7783208666545118]


                                                                                

[2023-12-17 16:18:22,367: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.5, 'regParam': 100} -> r2: 0.7781048761269813]


                                                                                

[2023-12-17 16:18:25,056: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.75, 'regParam': 0.0001} -> r2: 0.7783355609261371]


[Stage 1220:===>                                                  (1 + 15) / 16]

[2023-12-17 16:18:27,928: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.75, 'regParam': 0.001} -> r2: 0.7783355578909835]


                                                                                

[2023-12-17 16:18:31,397: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.75, 'regParam': 0.01} -> r2: 0.7783355165791247]




[2023-12-17 16:18:34,238: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.75, 'regParam': 0.1} -> r2: 0.7783359666859345]


                                                                                

[2023-12-17 16:18:37,565: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.75, 'regParam': 1} -> r2: 0.7783321714693108]


[Stage 1248:===>                                                  (1 + 15) / 16]

[2023-12-17 16:18:40,583: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.75, 'regParam': 10} -> r2: 0.7783003796960505]




[2023-12-17 16:18:43,560: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 0.75, 'regParam': 100} -> r2: 0.7778544740041459]


                                                                                

[2023-12-17 16:18:45,921: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 1, 'regParam': 0.0001} -> r2: 0.7783355607263116]


                                                                                

[2023-12-17 16:18:48,763: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 1, 'regParam': 0.001} -> r2: 0.7783355558927183]


                                                                                

[2023-12-17 16:18:51,710: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 1, 'regParam': 0.01} -> r2: 0.7783354965886762]


                                                                                

[2023-12-17 16:18:54,161: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 1, 'regParam': 0.1} -> r2: 0.7783350028681215]


                                                                                

[2023-12-17 16:18:56,993: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 1, 'regParam': 1} -> r2: 0.7783301662123067]


[Stage 1297:===>                                                  (1 + 15) / 16]

[2023-12-17 16:18:59,849: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 1, 'regParam': 10} -> r2: 0.7782797249467981]


                                                                                

[2023-12-17 16:19:02,826: 337: us_used_cars_ml_pipeline_logger: INFO: tuners:  {'elasticNetParam': 1, 'regParam': 100} -> r2: 0.7775867798578073]
[2023-12-17 16:19:02,846: 66: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file saved at: us_used_cars_ml_pipeline/tests/meta_model_tuning/scores/LinearRegression.yaml]
[2023-12-17 16:19:02,847: 347: us_used_cars_ml_pipeline_logger: INFO: tuners:  Scores for LinearRegression model have been saved]
[2023-12-17 16:19:02,851: 66: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file saved at: us_used_cars_ml_pipeline/tests/meta_model_tuning/best_params/LinearRegression.yaml]
[2023-12-17 16:19:02,852: 351: us_used_cars_ml_pipeline_logger: INFO: tuners:  Best set of parameters for LinearRegression model has been saved]
[2023-12-17 16:19:02,853: 186: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART VII. COMPLETED]
[2023-12-17 16:19:02,854: 189: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling: 



[2023-12-17 16:19:07,332: 191: us_used_cars_ml_pipeline_logger: INFO: stacking_regressor_modeling:  PART VIII. COMPLETED]
CPU times: user 2.35 s, sys: 776 ms, total: 3.12 s
Wall time: 6min 4s


                                                                                