In [1]:
import os, socket
from pyspark.sql import SparkSession


import warnings
warnings.filterwarnings("ignore")

In [2]:
# Run the rest of the notebook from the project's src/ directory: later cells
# import `us_used_cars_ml_pipeline`, and the pipeline's log output shows it
# opening its YAML files through paths relative to that directory.
os.chdir("../src/")
# Echo the resulting working directory as the cell output.
%pwd

'/home/jovyan/nfs-home/scalable_ml_pipelines/src'

In [3]:
import socket


def _detect_driver_ip() -> str:
    """Return the IPv4 address that executors should use to reach this driver.

    The hostname lookup is tried first and its answer is returned unchanged
    whenever it is a non-loopback address. If the lookup fails or yields a
    127.x.x.x address (common when /etc/hosts maps the hostname to loopback),
    the address of the interface the kernel would route outbound traffic
    through is used instead, because a loopback address is unreachable from
    executor pods.

    Returns:
        Dotted-quad IPv4 address as a string.

    Raises:
        socket.gaierror: the hostname could not be resolved and no routed
            interface address could be determined either.
    """
    lookup_error = None
    try:
        resolved = socket.gethostbyname(socket.gethostname())
    except socket.gaierror as exc:
        resolved, lookup_error = None, exc

    if resolved is not None and not resolved.startswith("127."):
        return resolved

    routed = None
    try:
        # connect() on a UDP socket sends no packet; it only makes the kernel
        # pick the local source address for that destination.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
            probe.connect(("10.254.254.254", 1))
            routed = probe.getsockname()[0]
    except OSError:
        # No usable route; fall through to the hostname-lookup result.
        pass

    if routed and routed != "0.0.0.0" and not routed.startswith("127."):
        return routed
    if resolved is not None:
        # Only a loopback answer is available; keep the hostname-lookup result.
        return resolved
    raise lookup_error


# Address advertised to executors via `spark.driver.host`.
LOCAL_IP = _detect_driver_ip()

In [4]:
# --- Cluster coordinates ------------------------------------------------------
# Kubernetes namespace passed to `spark.kubernetes.namespace`.
name_space = "eabraham-373705"
# Kubernetes API endpoint passed to `SparkSession.builder.master()`.
kubernetes_master_url = "k8s://https://10.32.7.103:6443"

# --- Resource requests (kept as strings, the form Spark config values take) ---
driver_cores, executor_cores = "8", "8"
driver_memory, executor_memory = "30g", "30g"
executor_memory_overhead = "2g"

# --- Resource limits ----------------------------------------------------------
# NOTE(review): `cpu_limit` was previously annotated "12 cores" although its
# value is "3"; confirm which of the two is intended.
cpu_limit = "3"
memory_limit = "32g"  # up to 32 GB
# Passed to `spark.kubernetes.executor.limit.cores` by the session builder.
executor_limit = "8"

# NOTE(review): driver_cores, driver_memory, executor_memory_overhead,
# cpu_limit and memory_limit are not referenced by the session-builder cell
# visible in this notebook -- confirm whether they should be wired in.

In [5]:
from pyspark.sql import SparkSession

APP_NAME = 'scalables_executor'

# Create (or reuse) the Spark session against the Kubernetes master.
# The chain is parenthesised so that each group of options can carry a comment.
spark = (
    SparkSession.builder
    .appName(APP_NAME)
    .master(kubernetes_master_url)
    # Driver networking: advertise LOCAL_IP, listen on every interface.
    .config("spark.driver.host", LOCAL_IP)
    .config("spark.driver.bindAddress", "0.0.0.0")
    # Executor sizing and memory-manager fractions.
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", executor_cores)
    .config("spark.executor.memory", executor_memory)
    .config("spark.memory.fraction", "0.7")
    .config("spark.memory.storageFraction", "0.3")
    .config("spark.kubernetes.executor.limit.cores", executor_limit)
    # Kubernetes placement, identity and pod labels.
    .config("spark.kubernetes.namespace", name_space)
    .config("spark.kubernetes.authenticate.driver.serviceAccountName", "spark")
    .config("spark.kubernetes.driver.label.appname", APP_NAME)
    .config("spark.kubernetes.executor.label.appname", APP_NAME)
    # Executor pod lifecycle and container image.
    .config("spark.kubernetes.executor.deleteOnTermination", "false")
    .config("spark.kubernetes.container.image.pullPolicy", "Always")
    .config("spark.kubernetes.container.image", "node03.st:5000/pyspark-hdfs-jupyter:eabraham-373705-v4-executor")
    # Scratch space: an emptyDir volume mounted read-write at /tmp/spark on the
    # driver and executor sides. The recorded driver log warns that
    # spark.local.dir is overridden by the cluster manager's SPARK_LOCAL_DIRS.
    .config("spark.local.dir", "/tmp/spark")
    .config("spark.kubernetes.driver.volumes.emptyDir.spark-local-dir-tmp-spark.mount.path", "/tmp/spark")
    .config("spark.kubernetes.driver.volumes.emptyDir.spark-local-dir-tmp-spark.mount.readOnly", "false")
    .config("spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.path", "/tmp/spark")
    .config("spark.kubernetes.executor.volumes.emptyDir.spark-local-dir-tmp-spark.mount.readOnly", "false")
    .getOrCreate()
)


23/12/15 14:05:18 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/15 14:05:18 WARN spark.SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [6]:
# Echo the session object as this cell's result.
spark

In [7]:
from us_used_cars_ml_pipeline.config.configuration import ConfigurationManager
from us_used_cars_ml_pipeline.components.data_cleaning import CleanData

[2023-12-15 14:05:25,640: 145: numexpr.utils: INFO: utils:  Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.]
[2023-12-15 14:05:25,641: 157: numexpr.utils: INFO: utils:  NumExpr defaulting to 8 threads.]


In [8]:
# Build the configuration for the data-cleaning stage. Per the recorded log,
# running this cell loads config.yaml, params.yaml and schema.yaml from the
# us_used_cars_ml_pipeline package directory.
config_manager = ConfigurationManager()
data_cleaning_config = config_manager.get_clean_data_config()

# Stage object that the next cell drives via `run_stage`.
data_cleaning = CleanData(data_cleaning_config)

# Echo the resolved configuration (paths, seed, fold count, ...) as the cell output.
data_cleaning_config

[2023-12-15 14:05:26,441: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/config/config.yaml loaded successfully]
[2023-12-15 14:05:26,444: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/params.yaml loaded successfully]
[2023-12-15 14:05:26,447: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/schema.yaml loaded successfully]


CleanDataConfig(path_to_cleaned_data='hdfs:///home/eabraham-373705/data/cleaned/', path_to_raw_data='hdfs:///home/eabraham-373705/data/raw/raw_data.parquet', features_to_encode='us_used_cars_ml_pipeline/constants/features_to_encode.yaml', features_with_nans='us_used_cars_ml_pipeline/constants/features_with_nans.yaml', features_to_drop='us_used_cars_ml_pipeline/constants/features_to_drop.yaml', glove_model='us_used_cars_ml_pipeline/models/glove/glove-twitter-25.bin', data_types='us_used_cars_ml_pipeline/constants/data_types.yaml', popular_options='us_used_cars_ml_pipeline/constants/popular_options.yaml', label_encodings='us_used_cars_ml_pipeline/constants/label_encodings', kfold_encodings='hdfs:///home/eabraham-373705/data/encodings/kfold_encodings', rare_classes='us_used_cars_ml_pipeline/constants/rare_classes.yaml', seed=42, n_folds=5, rare_classes_count=BoxList([1, 2, 3, 4, 5, 6, 7]))

In [9]:
%%time
# Run the data-cleaning stage on the Spark session created above. The recorded
# log shows it proceeding through PART I, PART II and PART III, with a wall
# time of 6min 38s.
# NOTE(review): `is_new_data=False` presumably selects the path for the
# original dataset rather than newly arrived data -- confirm against CleanData.run_stage.
data_cleaning.run_stage(spark, is_new_data=False)

                                                                                

[2023-12-15 14:05:44,914: 821: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  Raw data has been read]
[2023-12-15 14:05:44,915: 824: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  PART I. STARTING]
[2023-12-15 14:05:44,921: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/data_types.yaml loaded successfully]
[2023-12-15 14:05:44,928: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/features_with_nans.yaml loaded successfully]
[2023-12-15 14:05:44,931: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/popular_options.yaml loaded successfully]
[2023-12-15 14:05:44,935: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/features_to_encode.yaml loaded successfully]
[2023-12-15 14:05:45,147: 624: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  1. Rows with nans in specified colu

  0%|          | 0/43 [00:00<?, ?it/s]

[2023-12-15 14:05:46,146: 645: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  3. Values in specified columns have been converted]
[2023-12-15 14:05:46,316: 650: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  4. Features power and torque have been splitted]
[2023-12-15 14:05:46,849: 654: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  5. Popular options features have been extracted]


  0%|          | 0/11 [00:00<?, ?it/s]

                                                                                

[2023-12-15 14:05:53,490: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/body_type.json]


                                                                                

[2023-12-15 14:05:56,373: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/fleet.json]


                                                                                

[2023-12-15 14:05:58,202: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/frame_damaged.json]




[2023-12-15 14:06:00,152: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/franchise_dealer.json]


                                                                                

[2023-12-15 14:06:02,390: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/fuel_type.json]


                                                                                

[2023-12-15 14:06:05,008: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/has_accidents.json]




[2023-12-15 14:06:07,483: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/is_new.json]




[2023-12-15 14:06:10,224: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/listing_color.json]


                                                                                

[2023-12-15 14:06:13,069: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/salvage.json]




[2023-12-15 14:06:16,094: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/transmission.json]




[2023-12-15 14:06:19,584: 109: us_used_cars_ml_pipeline_logger: INFO: common:  json file saved at: us_used_cars_ml_pipeline/constants/label_encodings/wheel_system.json]
[2023-12-15 14:06:19,654: 659: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  6. Specified features have been encoded using Label Encoder]


23/12/15 14:06:19 WARN util.package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

[2023-12-15 14:06:31,499: 664: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  7. Features with nans, for which modeling is required, have been saved]
[2023-12-15 14:06:31,500: 826: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  PART I. COMPLETED]




[2023-12-15 14:06:56,027: 830: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  Data has been cached]
[2023-12-15 14:06:56,028: 833: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  PART II. STARTING]
[2023-12-15 14:06:56,034: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/features_with_nans.yaml loaded successfully]
[2023-12-15 14:06:56,038: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/features_to_encode.yaml loaded successfully]
[2023-12-15 14:06:56,041: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/features_to_drop.yaml loaded successfully]
[2023-12-15 14:06:56,042: 482: gensim.utils: INFO: utils:  loading KeyedVectors object from us_used_cars_ml_pipeline/models/glove/glove-twitter-25.bin]


                                                                                

[2023-12-15 14:06:56,647: 521: gensim.utils: INFO: utils:  loading vectors from us_used_cars_ml_pipeline/models/glove/glove-twitter-25.bin.vectors.npy with mmap=None]
[2023-12-15 14:06:56,703: 448: gensim.utils: INFO: utils:  KeyedVectors lifecycle event {'fname': 'us_used_cars_ml_pipeline/models/glove/glove-twitter-25.bin', 'datetime': '2023-12-15T14:06:56.703005', 'gensim': '4.3.2', 'python': '3.8.8 | packaged by conda-forge | (default, Feb 20 2021, 16:22:27) \n[GCC 9.3.0]', 'platform': 'Linux-4.18.0-240.15.1.el8_3.x86_64-x86_64-with-glibc2.10', 'event': 'loaded'}]


  0%|          | 0/38 [00:00<?, ?it/s]

[2023-12-15 14:06:58,793: 741: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  1. Columns to indicate nans presence have been created]
[2023-12-15 14:06:58,794: 744: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  2. No need to merge data]




[2023-12-15 14:07:02,533: 749: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  3. Data has been cached]


                                                                                

  0%|          | 0/4 [00:00<?, ?it/s]

[2023-12-15 14:07:18,690: 754: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  4. Specified features have been encoded using glove-twitter-25 model]


                                                                                

[2023-12-15 14:09:23,401: 758: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  5. Data has been saved]
[2023-12-15 14:09:23,550: 762: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  6. Data has been uncached]
[2023-12-15 14:09:23,607: 835: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  PART II. COMPLETED]
[2023-12-15 14:09:23,608: 838: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  PART III. STARTING]
[2023-12-15 14:09:23,621: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/features_with_nans.yaml loaded successfully]
[2023-12-15 14:09:23,625: 44: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file: us_used_cars_ml_pipeline/constants/features_to_encode.yaml loaded successfully]
[2023-12-15 14:09:23,854: 795: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  1. Time-related features have been computed]


  0%|          | 0/12 [00:00<?, ?it/s]

[2023-12-15 14:09:23,868: 132: us_used_cars_ml_pipeline_logger: INFO: common:  json file loaded successfully from: us_used_cars_ml_pipeline/constants/label_encodings/wheel_system.json]
[2023-12-15 14:09:23,920: 805: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  2. Nans have been replaced with median value or top class]


  0%|          | 0/7 [00:00<?, ?it/s]

                                                                                

[2023-12-15 14:09:53,157: 453: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  1. Rare classes have been extracted]
[2023-12-15 14:09:55,197: 66: us_used_cars_ml_pipeline_logger: INFO: common:  yaml file saved at: us_used_cars_ml_pipeline/constants/rare_classes.yaml]
[2023-12-15 14:09:55,198: 457: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  2. Rare classes names have been saved]
[2023-12-15 14:09:55,371: 461: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  3. Fold column has been created]


  0%|          | 0/7 [00:00<?, ?it/s]

[2023-12-15 14:09:56,740: 476: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  4. Rare classes have been replaced]




[2023-12-15 14:10:00,052: 479: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  5. Temporary data has been cached]


                                                                                

  0%|          | 0/7 [00:00<?, ?it/s]

[2023-12-15 14:10:00,063: 18: us_used_cars_ml_pipeline_logger: INFO: encoders:  Starting K-Fold Mean Target Encoding for column 'engine_cylinders'...]


                                                                                

[2023-12-15 14:10:06,082: 18: us_used_cars_ml_pipeline_logger: INFO: encoders:  Starting K-Fold Mean Target Encoding for column 'exterior_color'...]


                                                                                

[2023-12-15 14:10:19,850: 18: us_used_cars_ml_pipeline_logger: INFO: encoders:  Starting K-Fold Mean Target Encoding for column 'interior_color'...]


                                                                                

[2023-12-15 14:10:32,555: 18: us_used_cars_ml_pipeline_logger: INFO: encoders:  Starting K-Fold Mean Target Encoding for column 'city'...]


                                                                                

[2023-12-15 14:10:45,200: 18: us_used_cars_ml_pipeline_logger: INFO: encoders:  Starting K-Fold Mean Target Encoding for column 'model_name'...]


                                                                                

[2023-12-15 14:10:56,447: 18: us_used_cars_ml_pipeline_logger: INFO: encoders:  Starting K-Fold Mean Target Encoding for column 'make_name'...]


                                                                                

[2023-12-15 14:11:00,636: 18: us_used_cars_ml_pipeline_logger: INFO: encoders:  Starting K-Fold Mean Target Encoding for column 'transmission_display'...]


                                                                                

[2023-12-15 14:11:05,414: 484: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  6. Encodings have been calculated and saved]
[2023-12-15 14:11:05,426: 487: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  7. Temporary data has been uncached]


  0%|          | 0/7 [00:00<?, ?it/s]

[2023-12-15 14:11:05,511: 497: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.1. Rare classes have been replaced]
[2023-12-15 14:11:05,641: 500: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.2. Encodings have been read]
[2023-12-15 14:11:05,667: 503: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.3. engine_cylinders column has been encoded with encodings]
[2023-12-15 14:11:05,707: 509: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.4. Average encodings have been calculated]




[2023-12-15 14:11:08,029: 512: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.5. Average encodings have been saved]


                                                                                

[2023-12-15 14:11:08,417: 497: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.1. Rare classes have been replaced]
[2023-12-15 14:11:08,609: 500: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.2. Encodings have been read]
[2023-12-15 14:11:08,640: 503: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.3. exterior_color column has been encoded with encodings]
[2023-12-15 14:11:08,687: 509: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.4. Average encodings have been calculated]


                                                                                

[2023-12-15 14:11:13,279: 512: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.5. Average encodings have been saved]
[2023-12-15 14:11:13,918: 497: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.1. Rare classes have been replaced]
[2023-12-15 14:11:14,125: 500: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.2. Encodings have been read]
[2023-12-15 14:11:14,154: 503: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.3. interior_color column has been encoded with encodings]
[2023-12-15 14:11:14,202: 509: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.4. Average encodings have been calculated]


                                                                                

[2023-12-15 14:11:21,949: 512: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.5. Average encodings have been saved]
[2023-12-15 14:11:22,035: 497: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.1. Rare classes have been replaced]
[2023-12-15 14:11:22,173: 500: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.2. Encodings have been read]
[2023-12-15 14:11:22,255: 503: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.3. city column has been encoded with encodings]
[2023-12-15 14:11:22,318: 509: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.4. Average encodings have been calculated]


                                                                                

[2023-12-15 14:11:32,491: 512: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.5. Average encodings have been saved]
[2023-12-15 14:11:32,591: 497: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.1. Rare classes have been replaced]
[2023-12-15 14:11:32,733: 500: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.2. Encodings have been read]
[2023-12-15 14:11:32,763: 503: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.3. model_name column has been encoded with encodings]
[2023-12-15 14:11:32,817: 509: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.4. Average encodings have been calculated]


                                                                                

[2023-12-15 14:11:43,296: 512: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.5. Average encodings have been saved]
[2023-12-15 14:11:43,391: 497: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.1. Rare classes have been replaced]
[2023-12-15 14:11:43,490: 500: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.2. Encodings have been read]
[2023-12-15 14:11:43,523: 503: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.3. make_name column has been encoded with encodings]
[2023-12-15 14:11:43,579: 509: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.4. Average encodings have been calculated]




[2023-12-15 14:11:53,002: 512: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.5. Average encodings have been saved]


                                                                                

[2023-12-15 14:11:53,090: 497: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.1. Rare classes have been replaced]
[2023-12-15 14:11:53,173: 500: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.2. Encodings have been read]
[2023-12-15 14:11:53,206: 503: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.3. transmission_display column has been encoded with encodings]
[2023-12-15 14:11:53,261: 509: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.4. Average encodings have been calculated]




[2023-12-15 14:12:04,773: 512: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  8.5. Average encodings have been saved]
[2023-12-15 14:12:04,802: 809: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  3. Specified features have been encoded using k-fold mean target encoder]




[2023-12-15 14:12:19,105: 840: us_used_cars_ml_pipeline_logger: INFO: data_cleaning:  PART III. COMPLETED]
CPU times: user 9.2 s, sys: 2.05 s, total: 11.3 s
Wall time: 6min 38s


