## 5.0 Feature Ablation

In [None]:
#Author: Tan Xin Hui

### 5.1 Determining Features

In [4]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

class RandomForestModeler:
    def __init__(self, train_path: str, test_path: str):
        self.spark = SparkSession.builder.appName("RandomForestModeler").getOrCreate()
        self.train_path = train_path
        self.test_path = test_path
        self.train_df = self.load_data(self.train_path)
        self.test_df = self.load_data(self.test_path)

    def load_data(self, path: str):
        """
        Load dataset from the given HDFS path.
        """
        return self.spark.read.csv(path, header=True, inferSchema=True)

    def prepare_data(self, df, feature_cols, label_col):
        """
        Prepares the dataset by assembling features into a single vector column.
        
        :param df: DataFrame containing the dataset.
        :param feature_cols: List of column names to be used as features.
        :param label_col: Column name to be used as the label.
        """
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        df_prepared = assembler.transform(df).select("features", label_col)
        return df_prepared

    def train_random_forest(self, feature_cols, label_col, num_trees=10):
        """
        Trains a Random Forest classifier on the training data.
        
        :param feature_cols: List of columns used as features.
        :param label_col: Column used as the label.
        :param num_trees: Number of trees in the Random Forest.
        """
        # Prepare training data
        train_data = self.prepare_data(self.train_df, feature_cols, label_col)

        # Initialize the Random Forest Classifier
        rf = RandomForestClassifier(featuresCol="features", labelCol=label_col, numTrees=num_trees)
        
        # Train the model
        self.rf_model = rf.fit(train_data)
        print("Random Forest model trained.")

    def evaluate_model(self, feature_cols, label_col):
        """
        Evaluates the trained Random Forest model on the test data using accuracy, F1 score, precision, and recall.
        
        :param feature_cols: List of columns used as features.
        :param label_col: Column used as the label.
        """
        # Prepare test data
        test_data = self.prepare_data(self.test_df, feature_cols, label_col)

        # Make predictions
        predictions = self.rf_model.transform(test_data)

        # Initialize evaluators for different metrics
        accuracy_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="accuracy")
        f1_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="f1")
        precision_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="weightedPrecision")
        recall_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="weightedRecall")

        # Compute metrics
        accuracy = accuracy_evaluator.evaluate(predictions)
        f1_score = f1_evaluator.evaluate(predictions)
        precision = precision_evaluator.evaluate(predictions)
        recall = recall_evaluator.evaluate(predictions)

        print(f"Accuracy: {accuracy * 100:.2f}%")
        print(f"F1 Score: {f1_score:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        
        return {
            "accuracy": accuracy,
            "f1_score": f1_score,
            "precision": precision,
            "recall": recall
        }

    def close(self):
        """
        Stops the Spark session.
        """
        self.spark.stop()
        print("Spark session stopped.")

if __name__ == "__main__":
    train_path = "/user/student/train_data"
    test_path = "/user/student/test_data"

    # Instantiate the RandomForestModeler
    rf_modeler = RandomForestModeler(train_path, test_path)

    # Specify features and label columns
    feature_columns = ['Carat','Color_encoded'] 
    label_column = 'price_label'  

    # Train the Random Forest model
    rf_modeler.train_random_forest(feature_cols=feature_columns, label_col=label_column, num_trees=20)
    
    #Metrics
    metrics = rf_modeler.evaluate_model(feature_cols=feature_columns, label_col=label_column)

24/09/10 13:30:16 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Random Forest model trained.
Accuracy: 88.78%
F1 Score: 0.8871
Precision: 0.8868
Recall: 0.8878


#### HDFS Reader

In [6]:
#check saved train_data dataset
class HDFSDataReader:
    def __init__(self, hdfs_path: str):
        self.spark = SparkSession.builder \
            .appName("ReadHDFS") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()
        self.hdfs_path = hdfs_path

    def read_data(self) -> DataFrame:
        self.df = self.spark.read.json(self.hdfs_path)
        return self.df
    
    def read_csv(self, path: str) -> DataFrame:
        self.df = self.spark.read.csv(path, header=True, inferSchema=True)
        return self.df

    def show_data(self, num_rows=5):
        if hasattr(self, 'df'):
            self.df.show(num_rows)
        else:
            print("DataFrame not loaded yet. Call read_data() or read_csv() first.")

### 5.2 Logarithmic and Root Features

In [11]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, log, sqrt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
    
class GradientBoostingRegressor:
    def __init__(self, train_df: DataFrame, test_df: DataFrame, feature_cols: list, label_col: str):
        self.train_df = self.create_engineered_features(train_df)
        self.test_df = self.create_engineered_features(test_df)
        self.feature_cols = feature_cols
        self.label_col = label_col
        
        # Assemble features into a single vector column
        self.feature_assembler = VectorAssembler(inputCols=self.feature_cols, outputCol='features')
        self.train_df = self.feature_assembler.transform(self.train_df)
        self.test_df = self.feature_assembler.transform(self.test_df)

    def create_engineered_features(self, df: DataFrame) -> DataFrame:
        # Example of novel features
        df = df.withColumn('Carat_log', log(col('Carat') + 1))
        df = df.withColumn('Carat_sqrt', sqrt(col('Carat')))
        df = df.withColumn('Price_log', log(col('Price') + 1))
        
        return df
    
    def train(self, max_iter: int = 100):
        gbt = GBTRegressor(
            featuresCol='features',
            labelCol=self.label_col,
            maxIter=max_iter
        )
        
        # Fit the model
        self.model = gbt.fit(self.train_df)
        print("Gradient Boosting Regressor model trained.")
    
    def evaluate(self):
        if self.model is None:
            raise ValueError("Model has not been trained.")
        
        # Make predictions
        predictions = self.model.transform(self.test_df)
        
        # Initialize evaluators for different metrics
        evaluator_rmse = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='rmse')
        evaluator_mae = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='mae')
        evaluator_r2 = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='r2')
        
        # Compute metrics
        rmse = evaluator_rmse.evaluate(predictions)
        mae = evaluator_mae.evaluate(predictions)
        r2 = evaluator_r2.evaluate(predictions)
        
        # Print metrics
        print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
        print(f"Mean Absolute Error (MAE) on test data = {mae}")
        print(f"R-squared on test data = {r2}")

        # Show some predictions
        predictions.select('features', self.label_col, 'prediction').show(10)
        
        return {
            "rmse": rmse,
            "mae": mae,
            "r2": r2
        }

if __name__ == "__main__":
    train_path = "/user/student/train_data"
    test_path = "/user/student/test_data"

    train_reader = HDFSDataReader(train_path)
    hdfs_reader = HDFSDataReader(train_path)
    
    train_df = train_reader.read_csv(train_path)
    test_df = hdfs_reader.read_csv(test_path)
    
    # Specify features and label columns
    feature_columns = ['Carat', 'Shape_encoded', 'Clarity_encoded', 'Color_encoded', 'Polish_encoded', 'Symmetry_encoded', 'Fluorescence_encoded', 'Carat_log','Carat_sqrt', 'Price_log']
    label_column = 'Price' 
    
    # Initialize GradientBoostingRegressor with train and test datasets
    gbt_regressor = GradientBoostingRegressor(train_df=train_df, test_df=test_df, feature_cols=feature_columns, label_col=label_column)
    
    # Train the Gradient Boosting model
    gbt_regressor.train(max_iter=100)
    
    # Evaluate the model
    metrics = gbt_regressor.evaluate()

Gradient Boosting Regressor model trained.
Root Mean Squared Error (RMSE) on test data = 101.25296827270557
Mean Absolute Error (MAE) on test data = 55.45587683662959
R-squared on test data = 0.999383244640879
+--------------------+------+------------------+
|            features| Price|        prediction|
+--------------------+------+------------------+
|(10,[0,6,7,8,9],[...|2696.0| 2697.464679658072|
|(10,[0,5,7,8,9],[...|2802.0|2776.7803370912925|
|(10,[0,7,8,9],[0....|2989.0| 2998.579181959588|
|(10,[0,7,8,9],[0....|2989.0| 2998.579181959588|
|(10,[0,5,7,8,9],[...|2991.0|2975.0576246914306|
|(10,[0,7,8,9],[0....|3081.0|3116.3934946364107|
|(10,[0,7,8,9],[0....|3290.0| 3273.023952194792|
|(10,[0,7,8,9],[0....|3314.0| 3367.529403755707|
|(10,[0,7,8,9],[0....|3314.0| 3367.529403755707|
|(10,[0,7,8,9],[0....|3314.0| 3367.529403755707|
+--------------------+------+------------------+
only showing top 10 rows



### 5.3 Interaction Features

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, log, sqrt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
    
class GradientBoostingRegressor:
    def __init__(self, train_df: DataFrame, test_df: DataFrame, feature_cols: list, label_col: str):
        self.train_df = self.create_interaction_features(train_df)
        self.test_df = self.create_interaction_features(test_df)
        self.feature_cols = feature_cols
        self.label_col = label_col
        
        # Assemble features into a single vector column
        self.feature_assembler = VectorAssembler(inputCols=self.feature_cols, outputCol='features')
        self.train_df = self.feature_assembler.transform(self.train_df)
        self.test_df = self.feature_assembler.transform(self.test_df)

    def create_interaction_features(self, df: DataFrame) -> DataFrame:
        df = df.withColumn('Carat_Color_Interaction', col('Carat') * col('Color_encoded'))
        df = df.withColumn('Carat_Clarity_Interaction', col('Carat') * col('Clarity_encoded'))
        df = df.withColumn('Carat_Polish_Interaction', col('Carat') * col('Polish_encoded'))
        df = df.withColumn('Carat_Symmetry_Interaction', col('Carat') * col('Symmetry_encoded'))
        df = df.withColumn('Carat_Fluorescence_Interaction', col('Carat') * col('Fluorescence_encoded'))
        
        return df
    
    def train(self, max_iter: int = 100):
        """
        Train a Gradient Boosted Trees Regressor model.
        """
        gbt = GBTRegressor(
            featuresCol='features',
            labelCol=self.label_col,
            maxIter=max_iter
        )
        
        # Fit the model
        self.model = gbt.fit(self.train_df)
        print("Gradient Boosting Regressor model trained.")
    
    def evaluate(self):
        """
        Evaluate the Gradient Boosting model using the test dataset.
        """
        if self.model is None:
            raise ValueError("Model has not been trained.")
        
        # Make predictions
        predictions = self.model.transform(self.test_df)
        
        # Initialize evaluators for different metrics
        evaluator_rmse = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='rmse')
        evaluator_mae = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='mae')
        evaluator_r2 = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='r2')
        
        # Compute metrics
        rmse = evaluator_rmse.evaluate(predictions)
        mae = evaluator_mae.evaluate(predictions)
        r2 = evaluator_r2.evaluate(predictions)
        
        # Print metrics
        print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
        print(f"Mean Absolute Error (MAE) on test data = {mae}")
        print(f"R-squared on test data = {r2}")

        # Show some predictions
        predictions.select('features', self.label_col, 'prediction').show(10)
        
        return {
            "rmse": rmse,
            "mae": mae,
            "r2": r2
        }

if __name__ == "__main__":
    train_path = "/user/student/train_data"
    test_path = "/user/student/test_data"

    train_df = hdfs_reader.read_csv(train_path)
    test_df = hdfs_reader.read_csv(test_path)
    
    # Specify features and label columns
    label_column = 'Price'  
    feature_columns = ['Carat', 'Color_encoded', 'Clarity_encoded', 'Polish_encoded', 'Symmetry_encoded', 'Fluorescence_encoded', 
                       'Carat_Color_Interaction', 'Carat_Clarity_Interaction', 'Carat_Polish_Interaction', 'Carat_Symmetry_Interaction', 
                       'Carat_Fluorescence_Interaction']
    
    # Initialize GradientBoostingRegressor with train and test datasets
    gbt_regressor = GradientBoostingRegressor(train_df=train_df, test_df=test_df, feature_cols=feature_columns, label_col=label_column)
    
    # Train the Gradient Boosting model
    gbt_regressor.train(max_iter=100)
    
    # Evaluate the model
    metrics = gbt_regressor.evaluate()