## 2.2 Data Preparation

In [1]:
#Author: Tan Xin Hui

In [8]:
from pyspark.sql import DataFrame, SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.sql import functions as F

class HDFSDataReader:
    def __init__(self, hdfs_path: str):
        self.spark = SparkSession.builder \
            .appName("ReadHDFS") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()
        self.hdfs_path = hdfs_path

    def read_data(self) -> DataFrame:
        self.df = self.spark.read.json(self.hdfs_path)
        return self.df

    def show_data(self, num_rows=5):
        if hasattr(self, 'df'):
            print("Data Understanding")
            self.df.show(num_rows)
        else:
            print("DataFrame not loaded yet. Call read_data() first.")

    def stop_spark(self):
        self.spark.stop()
        print("Spark session stopped.")

class LabelEncoder:
    def __init__(self, dataframe: DataFrame, categorical_columns: list):
        self.df = dataframe
        self.categorical_columns = categorical_columns

    def encode(self) -> DataFrame:
        for column in self.categorical_columns:
            indexer = StringIndexer(inputCol=column, outputCol=column + '_encoded', handleInvalid="keep")  # Handling nulls
            self.df = indexer.fit(self.df).transform(self.df)
        
        return self.df

    def show_encoded_columns(self, num_rows=10):
        encoded_columns = [col + '_encoded' for col in self.categorical_columns]
        print("2.2(b) Label Encoding: ")
        self.df.select(encoded_columns).show(num_rows)

    def detect_and_remove_outliers(self, column_name: str, z_threshold: float = 3.0) -> DataFrame:
        initial_count = self.df.count()
        stats = self.df.select(
            F.mean(column_name).alias('mean'),
            F.stddev(column_name).alias('stddev')
        ).collect()[0]
        mean = stats['mean']
        stddev = stats['stddev']
        self.df = self.df.withColumn(
            f'{column_name}_z_score',
            (F.col(column_name) - mean) / stddev
        )
        self.df = self.df.filter(F.abs(F.col(f'{column_name}_z_score')) < z_threshold)
        self.df = self.df.drop(f'{column_name}_z_score')
        final_count = self.df.count()
        print("2.2(c) Data Cleaning")
        print(f"Number of rows before removing outliers: {initial_count}")
        print(f"Number of rows after removing outliers: {final_count}")
        return self.df

    def add_price_per_carat(self) -> DataFrame:
        self.df = self.df.withColumn(
            'Price per Carat',
            (F.col('Price') / F.col('Carat'))
        ).withColumn(
            'Price per Carat',
            F.round(F.col('Price per Carat'), 2)
        )
        return self.df

    def show_price_per_carat(self, num_rows=10):
        print("2.2(d) Data Transformation")
        self.df.select('Price per Carat').show(num_rows)

    def show_final_table(self, num_rows=10):
        columns_to_show = self.df.columns
        print("Final DataFrame with Encoded Columns and Price per Carat:")
        self.df.select(columns_to_show).show(num_rows)

class HDFSDataSaver:
    def __init__(self, output_path: str, file_format: str = 'csv', mode: str = 'overwrite'):
        self.output_path = output_path
        self.file_format = file_format
        self.mode = mode

    def save(self, dataframe: DataFrame):
        try:
            if self.file_format == 'csv':
                # Add header=True to save column names
                dataframe.write.format(self.file_format).option("header", "true").mode(self.mode).save(self.output_path)
            else:
                dataframe.write.format(self.file_format).mode(self.mode).save(self.output_path)
            
            print(f"Data saved to {self.output_path} as {self.file_format}")
        except Exception as e:
            print(f"Error while saving DataFrame: {e}")

    def set_file_format(self, file_format: str):
        self.file_format = file_format

    def set_output_path(self, output_path: str):
        self.output_path = output_path

    def set_write_mode(self, mode: str):
        self.mode = mode
        
if __name__ == "__main__":
    hdfs_path = "/user/student/de-dir"  # Set the file path to read from
    hdfs_reader = HDFSDataReader(hdfs_path)
    df = hdfs_reader.read_data()
    
    if df.count() > 0:
        hdfs_reader.show_data(5)

        encoder = LabelEncoder(dataframe=df, categorical_columns=['Shape', 'Clarity', 'Color', 'Polish', 'Symmetry', 'Fluorescence'])
        df_encoded = encoder.encode()

        encoder.show_encoded_columns(10)
        df_cleaned = encoder.detect_and_remove_outliers('Price', z_threshold=3.0)
        df_with_price_per_carat = encoder.add_price_per_carat()
        encoder.show_price_per_carat(10)        
        encoder.show_final_table(10)
        
        saver = HDFSDataSaver(output_path="/user/student/processed_data", file_format='csv', mode='overwrite')
        saver.save(df_with_price_per_carat)
    else:
        print("No data found in the specified HDFS path.")

                                                                                

Data Understanding
+-----+-------+-----+------------+------+-------+-----+--------+
|Carat|Clarity|Color|Fluorescence|Polish|  Price|Shape|Symmetry|
+-----+-------+-----+------------+------+-------+-----+--------+
| 0.73|   VVS1|   H+|           N|    EX|10500.0|   RD|      EX|
| 0.58|   VVS1|   E+|           N|    EX|10749.0|   RD|      EX|
| 0.58|   VVS1|   E+|           N|    EX|10749.0|   RD|      EX|
| 0.58|   VVS1|   E+|           N|    EX|10749.0|   RD|      EX|
| 0.57|   VVS1|   E+|           N|    EX|10250.0|   RD|      EX|
+-----+-------+-----+------------+------+-------+-----+--------+
only showing top 5 rows



                                                                                

2.2(b) Label Encoding: 
+-------------+---------------+-------------+--------------+----------------+--------------------+
|Shape_encoded|Clarity_encoded|Color_encoded|Polish_encoded|Symmetry_encoded|Fluorescence_encoded|
+-------------+---------------+-------------+--------------+----------------+--------------------+
|          0.0|            0.0|          8.0|           0.0|             0.0|                 0.0|
|          0.0|            0.0|          3.0|           0.0|             0.0|                 0.0|
|          0.0|            0.0|          3.0|           0.0|             0.0|                 0.0|
|          0.0|            0.0|          3.0|           0.0|             0.0|                 0.0|
|          0.0|            0.0|          3.0|           0.0|             0.0|                 0.0|
|          0.0|            0.0|          8.0|           0.0|             0.0|                 0.0|
|          0.0|            0.0|          8.0|           0.0|             0.0|        

                                                                                

2.2(c) Data Cleaning
Number of rows before removing outliers: 1935
Number of rows after removing outliers: 1915
2.2(d) Data Transformation
+---------------+
|Price per Carat|
+---------------+
|       14383.56|
|       18532.76|
|       18532.76|
|       18532.76|
|       17982.46|
|       14383.56|
|       14383.56|
|       17982.46|
|       17982.46|
|       17982.46|
+---------------+
only showing top 10 rows

Final DataFrame with Encoded Columns and Price per Carat:
+-----+-------+-----+------------+------+-------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+
|Carat|Clarity|Color|Fluorescence|Polish|  Price|Shape|Symmetry|Shape_encoded|Clarity_encoded|Color_encoded|Polish_encoded|Symmetry_encoded|Fluorescence_encoded|Price per Carat|
+-----+-------+-----+------------+------+-------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+--------



Data saved to /user/student/processed_data as csv


                                                                                

## Save to HBase

In [39]:
import happybase
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.ml.feature import StringIndexer
from pyspark.sql import functions as F

class HDFSDataProcessor:
    def __init__(self, hdfs_path: str):
        self.spark = SparkSession.builder \
            .appName("HDFSDataProcessor") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()
        self.hdfs_path = hdfs_path

    def read_data(self) -> DataFrame:
        self.df = self.spark.read.json(self.hdfs_path)
        return self.df

    def encode_labels(self, categorical_columns: list) -> DataFrame:
        for column in categorical_columns:
            indexer = StringIndexer(inputCol=column, outputCol=column + '_encoded', handleInvalid="keep")
            self.df = indexer.fit(self.df).transform(self.df)
        return self.df

    def detect_and_remove_outliers(self, column_name: str, z_threshold: float = 3.0) -> DataFrame:
        initial_count = self.df.count()
        stats = self.df.select(
            F.mean(column_name).alias('mean'),
            F.stddev(column_name).alias('stddev')
        ).collect()[0]
        mean = stats['mean']
        stddev = stats['stddev']
        self.df = self.df.withColumn(
            f'{column_name}_z_score',
            (F.col(column_name) - mean) / stddev
        )
        self.df = self.df.filter(F.abs(F.col(f'{column_name}_z_score')) < z_threshold)
        self.df = self.df.drop(f'{column_name}_z_score')
        final_count = self.df.count()
        print("Data Cleaning")
        print(f"Number of rows before removing outliers: {initial_count}")
        print(f"Number of rows after removing outliers: {final_count}")
        return self.df

    def add_price_per_carat(self) -> DataFrame:
        self.df = self.df.withColumn(
            'Price per Carat',
            (F.col('Price') / F.col('Carat'))
        ).withColumn(
            'Price per Carat',
            F.round(F.col('Price per Carat'), 2)
        )
        return self.df

    def stop_spark(self):
        self.spark.stop()
        print("Spark session stopped.")

class HBaseDataSaver:
    def __init__(self, table_name: str, hbase_host: str = 'localhost'):
        self.table_name = table_name
        self.hbase_host = hbase_host
        self.connection = happybase.Connection(self.hbase_host)
        self.table = self.connection.table(self.table_name)

    def save(self, dataframe: DataFrame):
        try:
            dataframe = dataframe.withColumn('row_key', monotonically_increasing_id())
            row_key_col = 'row_key'
            columns = dataframe.columns
            
            for row in dataframe.collect():
                row_dict = row.asDict()
                row_key = str(row_dict.pop(row_key_col))
                data = {f'cf:{col}': str(value) for col, value in row_dict.items() if value is not None}
                self.table.put(row_key, data)
            print(f"Data saved to HBase table {self.table_name} successfully.")
        except Exception as e:
            print(f"Error while saving data to HBase: {e}")
            print(f"DataFrame columns: {dataframe.columns}")

    def close(self):
        try:
            self.connection.close()
            print("HBase connection closed.")
        except Exception as e:
            print(f"Error while closing HBase connection: {e}")

if __name__ == "__main__":
    try:
        # HDFS path
        hdfs_path = "/user/student/de-dir"

        # Initialize HDFS Data Processor
        processor = HDFSDataProcessor(hdfs_path)
        df = processor.read_data()

        if df.count() > 0:
            # Perform preprocessing
            categorical_columns = ['Shape', 'Clarity', 'Color', 'Polish', 'Symmetry', 'Fluorescence']
            df_encoded = processor.encode_labels(categorical_columns)
            df_cleaned = processor.detect_and_remove_outliers('Carat')
            df_with_price_per_carat = processor.add_price_per_carat()

            # Initialize HBase Data Saver
            hbase_saver = HBaseDataSaver(table_name='processed_data_hbase')
            hbase_saver.save(df_with_price_per_carat)
            hbase_saver.close()

    except Exception as e:
        print(f"Error in processing and saving data: {e}")

                                                                                

Data Cleaning
Number of rows before removing outliers: 1935
Number of rows after removing outliers: 1931


                                                                                

Data saved to HBase table processed_data_hbase successfully.
HBase connection closed.


### HBase Reader

In [44]:
import happybase
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import Row

class HBaseDataReader:
    def __init__(self, table_name: str, hbase_host: str = 'localhost'):
        self.table_name = table_name
        self.hbase_host = hbase_host
        self.connection = happybase.Connection(self.hbase_host)
        self.table = self.connection.table(self.table_name)
    
    def read_data(self):
        # Scan the table and collect data
        rows = self.table.scan()
        data = []
        for key, row in rows:
            # Convert each row to a dictionary
            row_dict = {col.decode('utf-8'): val.decode('utf-8') for col, val in row.items()}
            row_dict['row_key'] = key.decode('utf-8')
            data.append(row_dict)
        return data
    
    def to_dataframe(self, data: list) -> DataFrame:
        # Create a SparkSession
        spark = SparkSession.builder \
            .appName("HBase to DataFrame") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()
        
        # Convert the list of dictionaries to a DataFrame
        df = spark.createDataFrame([Row(**row) for row in data])
        return df
    
    def show_data(self, num_rows=5):
        # Read and convert data to DataFrame
        data = self.read_data()
        df = self.to_dataframe(data)
        # Show DataFrame
        df.show(num_rows)

    def close(self):
        self.connection.close()
        print("HBase connection closed.")

if __name__ == "__main__":
    try:
        # Initialize HBaseDataReader
        hbase_reader = HBaseDataReader(table_name='processed_data_hbase')
        
        # Show some rows from HBase
        hbase_reader.show_data(5)
        
    except Exception as e:
        print(f"Error in HBase reader: {e}")

    finally:
        # Ensure connection is closed
        hbase_reader.close()


24/09/10 21:57:36 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+--------+----------+------------------+--------+----------------+---------------+-----------------------+---------+-----------------+--------+------------------+--------+----------------+-----------+-------------------+------------+
|cf:Carat|cf:Clarity|cf:Clarity_encoded|cf:Color|cf:Color_encoded|cf:Fluorescence|cf:Fluorescence_encoded|cf:Polish|cf:Polish_encoded|cf:Price|cf:Price per Carat|cf:Shape|cf:Shape_encoded|cf:Symmetry|cf:Symmetry_encoded|     row_key|
+--------+----------+------------------+--------+----------------+---------------+-----------------------+---------+-----------------+--------+------------------+--------+----------------+-----------+-------------------+------------+
|    0.73|      VVS1|               0.0|      H+|             8.0|              N|                    0.0|       EX|              0.0| 10500.0|          14383.56|      RD|             0.0|         EX|                0.0|           0|
|    0.58|      VVS1|               0.0|      E+|             3.

### HBase Query

In [45]:
import happybase

def describe_table(hbase_host, table_name):
    connection = happybase.Connection(hbase_host)
    table = connection.table(table_name)
    families = table.families()
    print("Column Families:")
    for family, attributes in families.items():
        print(f"Family: {family.decode('utf-8')}, Attributes: {attributes}")
    connection.close()

# Call the function
describe_table('localhost', 'processed_data_hbase')


Column Families:
Family: cf, Attributes: {'name': b'cf:', 'max_versions': 1, 'compression': b'NONE', 'in_memory': False, 'bloom_filter_type': b'ROW', 'bloom_filter_vector_size': 0, 'bloom_filter_nb_hashes': 0, 'block_cache_enabled': True, 'time_to_live': 2147483647}


In [40]:
# Query 1: Get Specific Row: Retrieve a specific row by its row key.

connection = happybase.Connection('localhost')
table = connection.table('processed_data_hbase')

row = table.row(b'0')
print(row)

connection.close()

{b'cf:Carat': b'0.73', b'cf:Clarity': b'VVS1', b'cf:Clarity_encoded': b'0.0', b'cf:Color': b'H+', b'cf:Color_encoded': b'8.0', b'cf:Fluorescence': b'N', b'cf:Fluorescence_encoded': b'0.0', b'cf:Polish': b'EX', b'cf:Polish_encoded': b'0.0', b'cf:Price': b'10500.0', b'cf:Price per Carat': b'14383.56', b'cf:Shape': b'RD', b'cf:Shape_encoded': b'0.0', b'cf:Symmetry': b'EX', b'cf:Symmetry_encoded': b'0.0'}


In [41]:
# Get Specific Columns: Retrieve specific columns for a row.

connection = happybase.Connection('localhost')
table = connection.table('processed_data_hbase')

columns = [b'cf:Carat']
row = table.row(b'0', columns=columns)
print(row)

connection.close()


{b'cf:Carat': b'0.73'}


In [42]:
import happybase

# Establish connection to HBase
connection = happybase.Connection('localhost')
table = connection.table('processed_data_hbase')

# Define the filter
filter = 'SingleColumnValueFilter(' \
          "'cf', 'Price', >, 'binary:2000')"

# Scan the table with the filter and limit the number of rows returned
rows = table.scan(filter=filter, limit=10)

# Print the results
for key, data in rows:
    print(f"Row key: {key.decode('utf-8')}")
    for column, value in data.items():
        print(f"  {column.decode('utf-8')}: {value.decode('utf-8')}")
    print()

# Close the connection
connection.close()


Row key: 103079215104
  cf:Carat: 0.35
  cf:Clarity: VVS1
  cf:Clarity_encoded: 0.0
  cf:Color: E
  cf:Color_encoded: 1.0
  cf:Fluorescence: N
  cf:Fluorescence_encoded: 0.0
  cf:Polish: EX
  cf:Polish_encoded: 0.0
  cf:Price: 3866.0
  cf:Price per Carat: 11045.71
  cf:Shape: RD
  cf:Shape_encoded: 0.0
  cf:Symmetry: EX
  cf:Symmetry_encoded: 0.0

Row key: 103079215105
  cf:Carat: 0.31
  cf:Clarity: VVS1
  cf:Clarity_encoded: 0.0
  cf:Color: E
  cf:Color_encoded: 1.0
  cf:Fluorescence: N
  cf:Fluorescence_encoded: 0.0
  cf:Polish: EX
  cf:Polish_encoded: 0.0
  cf:Price: 3220.0
  cf:Price per Carat: 10387.1
  cf:Shape: RD
  cf:Shape_encoded: 0.0
  cf:Symmetry: EX
  cf:Symmetry_encoded: 0.0

Row key: 103079215106
  cf:Carat: 0.37
  cf:Clarity: VVS2
  cf:Clarity_encoded: 1.0
  cf:Color: D
  cf:Color_encoded: 0.0
  cf:Fluorescence: F
  cf:Fluorescence_encoded: 1.0
  cf:Polish: EX
  cf:Polish_encoded: 0.0
  cf:Price: 3184.0
  cf:Price per Carat: 8605.41
  cf:Shape: RD
  cf:Shape_encoded: 0.

In [43]:
# Count Rows:

connection = happybase.Connection('localhost')
table = connection.table('processed_data_hbase')

count = 0
for _ in table.scan():
    count += 1

print(f"Number of rows: {count}")

connection.close()


Number of rows: 1931


### HDFS Reader

In [55]:
#check saved processed_data dataset 
class HDFSDataReader:
    def __init__(self, hdfs_path: str):
        self.spark = SparkSession.builder \
            .appName("ReadHDFS") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()
        self.hdfs_path = hdfs_path

    def read_data(self) -> DataFrame:
        self.df = self.spark.read.json(self.hdfs_path)
        return self.df
    
    def read_csv(self, path: str) -> DataFrame:
        self.df = self.spark.read.csv(path, header=True, inferSchema=True)
        return self.df

    def show_data(self, num_rows=5):
        if hasattr(self, 'df'):
            print("Preprocessed Data")
            self.df.show(num_rows)
        else:
            print("DataFrame not loaded yet. Call read_data() or read_csv() first.")

if __name__ == "__main__":
    hdfs_reader = HDFSDataReader("/user/student/processed_data")
    df_saved = hdfs_reader.read_csv("/user/student/processed_data")
    hdfs_reader.show_data(5)

Preprocessed Data
+-----+-------+-----+------------+------+-------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+
|Carat|Clarity|Color|Fluorescence|Polish|  Price|Shape|Symmetry|Shape_encoded|Clarity_encoded|Color_encoded|Polish_encoded|Symmetry_encoded|Fluorescence_encoded|Price per Carat|
+-----+-------+-----+------------+------+-------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+
| 0.73|   VVS1|   H+|           N|    EX|10500.0|   RD|      EX|          0.0|            0.0|          8.0|           0.0|             0.0|                 0.0|       14383.56|
| 0.58|   VVS1|   E+|           N|    EX|10749.0|   RD|      EX|          0.0|            0.0|          3.0|           0.0|             0.0|                 0.0|       18532.76|
| 0.58|   VVS1|   E+|           N|    EX|10749.0|   RD|      EX|          0.0|            0.

In [26]:
df_saved.printSchema()

root
 |-- Carat: double (nullable = true)
 |-- Clarity: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Fluorescence: string (nullable = true)
 |-- Polish: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Shape: string (nullable = true)
 |-- Symmetry: string (nullable = true)
 |-- Shape_encoded: double (nullable = true)
 |-- Clarity_encoded: double (nullable = true)
 |-- Color_encoded: double (nullable = true)
 |-- Polish_encoded: double (nullable = true)
 |-- Symmetry_encoded: double (nullable = true)
 |-- Fluorescence_encoded: double (nullable = true)
 |-- Price per Carat: double (nullable = true)



## 2.3 Data Annotation from HDFS

In [53]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import when, col

class DataAnnotator:
    def __init__(self, hdfs_path: str):
        self.spark = SparkSession.builder \
            .appName("DataAnnotatorApp") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()
        
        # Read data from HDFS with header information
        self.df = self.spark.read.option("header", "true").csv(hdfs_path)
    
    def annotate_price(self, high_threshold: float, low_threshold: float):
        if self.df is not None:
            for column in self.df.columns:
                if column.endswith('_encoded'):
                    self.df = self.df.withColumn(column, col(column).cast('double'))
        
            self.df = self.df.withColumn(
                'price_label',
                when(col('Price').cast('double') > high_threshold, 2)
                .when(col('Price').cast('double') < low_threshold, 0)
                .otherwise(1)
            )
        else:
            print("DataFrame is not initialized.")
        return self.df

    def show_data(self, num_rows=10):
        if self.df is not None:
            self.df.show(num_rows)
        else:
            print("DataFrame is not initialized.")
    
    def close(self):
        if self.spark:
            self.spark.stop()
            print("Spark session stopped.")
        else:
            print("Spark session is not initialized.")

if __name__ == "__main__":
    hdfs_path = "/user/student/processed_data" #set file path to read from 

    annotator = DataAnnotator(hdfs_path)
    
    # thresholds for annotation
    high_threshold = 10000.0
    low_threshold = 5000.0
    
    # Annotate data
    df_annotated = annotator.annotate_price(high_threshold, low_threshold)
    annotator.show_data()

    #Save to HDFS 
    saver = HDFSDataSaver(output_path="/user/student/annotated_data", file_format='csv', mode='overwrite')
    saver.save(df_annotated)

24/09/10 22:07:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+-----+-------+-----+------------+------+-------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+-----------+
|Carat|Clarity|Color|Fluorescence|Polish|  Price|Shape|Symmetry|Shape_encoded|Clarity_encoded|Color_encoded|Polish_encoded|Symmetry_encoded|Fluorescence_encoded|Price per Carat|price_label|
+-----+-------+-----+------------+------+-------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+-----------+
| 0.73|   VVS1|   H+|           N|    EX|10500.0|   RD|      EX|          0.0|            0.0|          8.0|           0.0|             0.0|                 0.0|       14383.56|          2|
| 0.58|   VVS1|   E+|           N|    EX|10749.0|   RD|      EX|          0.0|            0.0|          3.0|           0.0|             0.0|                 0.0|       18532.76|          2|
| 0.58|   VVS1|   E+|           N|    EX|10749.0| 

                                                                                

In [38]:
df_annotated.printSchema()

root
 |-- Carat: string (nullable = true)
 |-- Clarity: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Fluorescence: string (nullable = true)
 |-- Polish: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Shape: string (nullable = true)
 |-- Symmetry: string (nullable = true)
 |-- Shape_encoded: double (nullable = true)
 |-- Clarity_encoded: double (nullable = true)
 |-- Color_encoded: double (nullable = true)
 |-- Polish_encoded: double (nullable = true)
 |-- Symmetry_encoded: double (nullable = true)
 |-- Fluorescence_encoded: double (nullable = true)
 |-- Price per Carat: string (nullable = true)
 |-- price_label: integer (nullable = false)



### HDFS Query

In [52]:
def run_query(query: str):
    # Initialize Spark session
    spark = SparkSession.builder \
        .appName("HDFSQueryApp") \
        .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
        .getOrCreate()

    # Path to the HDFS file
    hdfs_path = "/user/student/annotated_data"

    # Read the CSV file from HDFS
    df = spark.read.option("header", "true").csv(hdfs_path)
    
    # Show first 5 rows to check the column names
    df.show(5)
    
    # Print schema to inspect column names
    df.printSchema()

    # Register DataFrame as a temporary view
    df.createOrReplaceTempView("annotated_data")

    # Run the SQL query
    result_df = spark.sql(query)

    # Show the results
    result_df.show()

    # Stop Spark session
    spark.stop()

# Example query
query = """
SELECT Shape, MAX(Carat) as max_carat, MIN(Carat) as min_carat
FROM annotated_data
GROUP BY Shape
ORDER BY Shape
"""
run_query(query)


24/09/10 22:07:10 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+----+----+---+---+---+-------+---+---+----+----+---+-----+-----+-----+--------+---+
|0.73|VVS1| H+|  N|EX4|10500.0| RD|EX7|0.08|0.09|8.0|0.011|0.012|0.013|14383.56|  2|
+----+----+---+---+---+-------+---+---+----+----+---+-----+-----+-----+--------+---+
|0.58|VVS1| E+|  N| EX|10749.0| RD| EX| 0.0| 0.0|3.0|  0.0|  0.0|  0.0|18532.76|  2|
|0.58|VVS1| E+|  N| EX|10749.0| RD| EX| 0.0| 0.0|3.0|  0.0|  0.0|  0.0|18532.76|  2|
|0.58|VVS1| E+|  N| EX|10749.0| RD| EX| 0.0| 0.0|3.0|  0.0|  0.0|  0.0|18532.76|  2|
|0.57|VVS1| E+|  N| EX|10250.0| RD| EX| 0.0| 0.0|3.0|  0.0|  0.0|  0.0|17982.46|  2|
|0.73|VVS1| H+|  N| EX|10500.0| RD| EX| 0.0| 0.0|8.0|  0.0|  0.0|  0.0|14383.56|  2|
+----+----+---+---+---+-------+---+---+----+----+---+-----+-----+-----+--------+---+
only showing top 5 rows

root
 |-- 0.73: string (nullable = true)
 |-- VVS1: string (nullable = true)
 |-- H+: string (nullable = true)
 |-- N: string (nullable = true)
 |-- EX4: string (nullable = true)
 |-- 10500.0: string (nullable 

24/09/10 22:07:10 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 0.73, VVS1, H+, N, EX, 10500.0, RD, EX, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0, 14383.56, 2
 Schema: 0.73, VVS1, H+, N, EX4, 10500.0, RD, EX7, 0.08, 0.09, 8.0, 0.011, 0.012, 0.013, 14383.56, 2
Expected: EX4 but found: EX
CSV file: hdfs://localhost:9000/user/student/annotated_data/part-00000-a1004098-63d2-409a-b980-5def7b925555-c000.csv


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `Shape` cannot be resolved. Did you mean one of the following? [`2`, `8.0`, `EX4`, `EX7`, `H+`].; line 2 pos 7;
'Sort ['Shape ASC NULLS FIRST], true
+- 'Aggregate ['Shape], ['Shape, 'MAX('Carat) AS max_carat#12333, 'MIN('Carat) AS min_carat#12334]
   +- SubqueryAlias annotated_data
      +- View (`annotated_data`, [0.73#12219,VVS1#12220,H+#12221,N#12222,EX4#12223,10500.0#12224,RD#12225,EX7#12226,0.08#12227,0.09#12228,8.0#12229,0.011#12230,0.012#12231,0.013#12232,14383.56#12233,2#12234])
         +- Relation [0.73#12219,VVS1#12220,H+#12221,N#12222,EX4#12223,10500.0#12224,RD#12225,EX7#12226,0.08#12227,0.09#12228,8.0#12229,0.011#12230,0.012#12231,0.013#12232,14383.56#12233,2#12234] csv


## Data Annotation from HBase

In [47]:
import happybase
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import when, col, lit
from pyspark.sql.types import DoubleType, StringType, StructType, StructField

class DataAnnotator:
    def __init__(self, hbase_table: str, hbase_host: str = 'localhost'):
        """
        Initializes the DataAnnotator by reading data from HBase.

        :param hbase_table: The HBase table name
        :param hbase_host: The HBase host address
        """
        # Initialize Spark session
        self.spark = SparkSession.builder \
            .appName("DataAnnotatorApp") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()
        
        # Initialize HBase connection
        self.connection = happybase.Connection(hbase_host)
        self.table = self.connection.table(hbase_table)
        
        # Read data from HBase and convert it to DataFrame
        self.df = self._read_from_hbase()

    def _read_from_hbase(self) -> DataFrame:
        """
        Reads data from HBase table and converts it to a DataFrame.

        :return: DataFrame with the data from HBase
        """
        try:
            # Scan the HBase table
            rows = self.table.scan()
            
            # Extract rows into a list of dictionaries
            data = []
            for key, row in rows:
                row_dict = {
                    'row_key': key.decode('utf-8'),
                    'Carat': row.get(b'cf:Carat', b'').decode('utf-8'),
                    'Clarity': row.get(b'cf:Clarity', b'').decode('utf-8'),
                    'Color': row.get(b'cf:Color', b'').decode('utf-8'),
                    'Fluorescence': row.get(b'cf:Fluorescence', b'').decode('utf-8'),
                    'Polish': row.get(b'cf:Polish', b'').decode('utf-8'),
                    'Price': row.get(b'cf:Price', b'').decode('utf-8'),
                    'Shape': row.get(b'cf:Shape', b'').decode('utf-8'),
                    'Symmetry': row.get(b'cf:Symmetry', b'').decode('utf-8')
                }
                data.append(row_dict)
            
            # Define schema
            schema = StructType([
                StructField("row_key", StringType(), True),
                StructField("Carat", StringType(), True),
                StructField("Clarity", StringType(), True),
                StructField("Color", StringType(), True),
                StructField("Fluorescence", StringType(), True),
                StructField("Polish", StringType(), True),
                StructField("Price", StringType(), True),
                StructField("Shape", StringType(), True),
                StructField("Symmetry", StringType(), True),
            ])
            
            # Convert list of dictionaries to DataFrame
            df = self.spark.createDataFrame(data, schema)
            
            # Convert columns to appropriate data types
            df = df.withColumn("Carat", col("Carat").cast(DoubleType())) \
                   .withColumn("Price", col("Price").cast(DoubleType()))
            
            # Fill null values with defaults
            df = df.na.fill({
                'Carat': 0.0,
                'Price': 0.0,
                'Clarity': 'Unknown',
                'Color': 'Unknown',
                'Fluorescence': 'Unknown',
                'Polish': 'Unknown',
                'Shape': 'Unknown',
                'Symmetry': 'Unknown'
            })

            return df

        except Exception as e:
            print(f"Error reading from HBase: {e}")
            # Use a schema with the same structure to return an empty DataFrame
            schema = StructType([
                StructField("row_key", StringType(), True),
                StructField("Carat", DoubleType(), True),
                StructField("Clarity", StringType(), True),
                StructField("Color", StringType(), True),
                StructField("Fluorescence", StringType(), True),
                StructField("Polish", StringType(), True),
                StructField("Price", DoubleType(), True),
                StructField("Shape", StringType(), True),
                StructField("Symmetry", StringType(), True),
            ])
            return self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(), schema)

    def annotate_price(self, high_threshold: float, low_threshold: float):
        """
        Adds the 'price_label' column based on the thresholds.

        :param high_threshold: The price threshold for labeling 'High'
        :param low_threshold: The price threshold for labeling 'Low'
        :return: DataFrame with the new 'price_label' column
        """
        if self.df is not None:
            self.df = self.df.withColumn(
                'price_label',
                when(col('Price') > high_threshold, lit(2))
                .when(col('Price') < low_threshold, lit(0))
                .otherwise(lit(1))
            )
        else:
            print("DataFrame is not initialized.")
        return self.df

    def show_data(self, num_rows=10):
        """
        Shows the updated DataFrame with the annotated 'price_label' column.

        :param num_rows: Number of rows to display
        """
        if self.df is not None:
            self.df.show(num_rows)
        else:
            print("DataFrame is not initialized.")
    
    def close(self):
        """
        Stops the Spark session and closes the HBase connection.
        """
        if self.spark:
            self.spark.stop()
            print("Spark session stopped.")
        else:
            print("Spark session is not initialized.")
        
        if self.connection:
            self.connection.close()
            print("HBase connection closed.")

class HDFSDataSaver:
    def __init__(self, output_path: str, file_format: str = 'csv', mode: str = 'overwrite'):
        """
        Initializes the HDFSDataSaver.

        :param output_path: Path where the data should be saved
        :param file_format: Format of the saved file
        :param mode: Save mode
        """
        self.output_path = output_path
        self.file_format = file_format
        self.mode = mode

    def save(self, df: DataFrame):
        """
        Saves the DataFrame to HDFS.

        :param df: DataFrame to save
        """
        try:
            df.write \
                .format(self.file_format) \
                .mode(self.mode) \
                .save(self.output_path)
            print(f"Data saved to HDFS at {self.output_path}")
        except Exception as e:
            print(f"Error saving data to HDFS: {e}")

if __name__ == "__main__":
    # Define HBase table name and host
    hbase_table = "processed_data_hbase"
    hbase_host = "localhost"

    # Initialize DataAnnotator with HBase table
    annotator = DataAnnotator(hbase_table=hbase_table, hbase_host=hbase_host)
    
    # Define thresholds for annotation
    high_threshold = 10000.0
    low_threshold = 5000.0
    
    # Annotate data
    df_annotated = annotator.annotate_price(high_threshold, low_threshold)
    annotator.show_data()

    # Save to HDFS
    saver = HDFSDataSaver(output_path="/user/student/annotated_data", file_format='csv', mode='overwrite')
    saver.save(df_annotated)

    # Close connections
    annotator.close()


+------------+-----+-------+-----+------------+------+-------+-----+--------+-----------+
|     row_key|Carat|Clarity|Color|Fluorescence|Polish|  Price|Shape|Symmetry|price_label|
+------------+-----+-------+-----+------------+------+-------+-----+--------+-----------+
|           0| 0.73|   VVS1|   H+|           N|    EX|10500.0|   RD|      EX|          2|
|           1| 0.58|   VVS1|   E+|           N|    EX|10749.0|   RD|      EX|          2|
|          10| 0.73|   VVS1|   H+|           N|    EX|10500.0|   RD|      EX|          2|
|103079215104| 0.35|   VVS1|    E|           N|    EX| 3866.0|   RD|      EX|          0|
|103079215105| 0.31|   VVS1|    E|           N|    EX| 3220.0|   RD|      EX|          0|
|103079215106| 0.37|   VVS2|    D|           F|    EX| 3184.0|   RD|      EX|          0|
|103079215107|  0.5|   VVS1|   F+|           N|    EX| 7719.0|   RD|      EX|          1|
|103079215108|  0.5|   VVS1|   E+|           N|    EX| 8603.0|   RD|      EX|          1|
|103079215

                                                                                

Data saved to HDFS at /user/student/annotated_data
Spark session stopped.
HBase connection closed.


### HDFS Reader

In [54]:
#check saved annotated_data dataset 
class HDFSDataReader:
    def __init__(self, hdfs_path: str):
        self.spark = SparkSession.builder \
            .appName("ReadHDFS") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()
        self.hdfs_path = hdfs_path

    def read_data(self) -> DataFrame:
        self.df = self.spark.read.json(self.hdfs_path)
        return self.df

    def read_csv(self, path: str) -> DataFrame:
        self.df = self.spark.read.csv(path, header=True, inferSchema=True)
        return self.df

    def show_data(self, num_rows=5):
        if hasattr(self, 'df'):
            print("Preprocessed Data")
            self.df.show(num_rows)
        else:
            print("DataFrame not loaded yet. Call read_data() or read_csv() first.")

if __name__ == "__main__":
    hdfs_reader = HDFSDataReader("/user/student/annotated_data")

    # Reading the saved CSV file
    df_saved = hdfs_reader.read_csv("/user/student/annotated_data")

    hdfs_reader.show_data(5)

24/09/10 22:07:54 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Preprocessed Data
+----+----+---+---+---+-------+---+---+----+----+---+-----+-----+-----+--------+---+
|0.73|VVS1| H+|  N|EX4|10500.0| RD|EX7|0.08|0.09|8.0|0.011|0.012|0.013|14383.56|  2|
+----+----+---+---+---+-------+---+---+----+----+---+-----+-----+-----+--------+---+
|0.58|VVS1| E+|  N| EX|10749.0| RD| EX| 0.0| 0.0|3.0|  0.0|  0.0|  0.0|18532.76|  2|
|0.58|VVS1| E+|  N| EX|10749.0| RD| EX| 0.0| 0.0|3.0|  0.0|  0.0|  0.0|18532.76|  2|
|0.58|VVS1| E+|  N| EX|10749.0| RD| EX| 0.0| 0.0|3.0|  0.0|  0.0|  0.0|18532.76|  2|
|0.57|VVS1| E+|  N| EX|10250.0| RD| EX| 0.0| 0.0|3.0|  0.0|  0.0|  0.0|17982.46|  2|
|0.73|VVS1| H+|  N| EX|10500.0| RD| EX| 0.0| 0.0|8.0|  0.0|  0.0|  0.0|14383.56|  2|
+----+----+---+---+---+-------+---+---+----+----+---+-----+-----+-----+--------+---+
only showing top 5 rows



24/09/10 22:07:55 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: 0.73, VVS1, H+, N, EX, 10500.0, RD, EX, 0.0, 0.0, 8.0, 0.0, 0.0, 0.0, 14383.56, 2
 Schema: 0.73, VVS1, H+, N, EX4, 10500.0, RD, EX7, 0.08, 0.09, 8.0, 0.011, 0.012, 0.013, 14383.56, 2
Expected: EX4 but found: EX
CSV file: hdfs://localhost:9000/user/student/annotated_data/part-00000-8c3357fc-ab3e-49f1-9c4f-0b301da0b087-c000.csv


In [30]:
df_saved.printSchema()

root
 |-- Carat: double (nullable = true)
 |-- Clarity: string (nullable = true)
 |-- Color: string (nullable = true)
 |-- Fluorescence: string (nullable = true)
 |-- Polish: string (nullable = true)
 |-- Price: double (nullable = true)
 |-- Shape: string (nullable = true)
 |-- Symmetry: string (nullable = true)
 |-- Shape_encoded: double (nullable = true)
 |-- Clarity_encoded: double (nullable = true)
 |-- Color_encoded: double (nullable = true)
 |-- Polish_encoded: double (nullable = true)
 |-- Symmetry_encoded: double (nullable = true)
 |-- Fluorescence_encoded: double (nullable = true)
 |-- Price per Carat: double (nullable = true)
 |-- price_label: integer (nullable = true)



## Spark Structured Streaming

## 2.4 Train Test Split

In [13]:
from pyspark.sql import SparkSession, DataFrame

class TrainTestSplitter:
    def __init__(self, dataframe: DataFrame, train_ratio: float = 0.8, seed: int = 42):
        self.df = dataframe
        self.train_ratio = train_ratio
        self.seed = seed
        self.spark = SparkSession.builder \
            .appName("TrainTestSplitter") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()

    def split(self):
        self.train_df, self.test_df = self.df.randomSplit([self.train_ratio, 1 - self.train_ratio], seed=self.seed)
        print(f"Training Data Count: {self.train_df.count()}")
        print(f"Test Data Count: {self.test_df.count()}")
        return self.train_df, self.test_df

    def save_as_csv(self, train_path: str, test_path: str):
        if hasattr(self, 'train_df') and hasattr(self, 'test_df'):
            self.train_df.write.mode('overwrite').csv(train_path)
            self.test_df.write.mode('overwrite').csv(test_path)
            print(f"Training data saved to {train_path}")
            print(f"Testing data saved to {test_path}")
        else:
            print("DataFrames not split yet. Call split() first.")

    def close(self):
        self.spark.stop()
        print("Spark session stopped.")

if __name__ == "__main__":
    hdfs_path = "/user/student/streamed_data"  # Set file path to read from
    hdfs_reader = HDFSDataReader(hdfs_path)
    df = hdfs_reader.read_csv(hdfs_path)

    if df.count() > 0:
        hdfs_reader.show_data(5)
        
        # Split the data into train and test datasets 
        splitter = TrainTestSplitter(df, train_ratio=0.8, seed=1234)
        train_df, test_df = splitter.split()  # No argument needed here
        
        # Save the train and test datasets
        train_saver = HDFSDataSaver(output_path="/user/student/train_data", file_format='csv', mode='overwrite')
        test_saver = HDFSDataSaver(output_path="/user/student/test_data", file_format='csv', mode='overwrite')

        # Save the datasets
        train_saver.save(train_df)
        test_saver.save(test_df)

24/09/10 21:00:42 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/09/10 21:00:42 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


Preprocessed Data
+-----+-------+-----+------------+------+-------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+-----------+----+
|Carat|Clarity|Color|Fluorescence|Polish|  Price|Shape|Symmetry|Shape_encoded|Clarity_encoded|Color_encoded|Polish_encoded|Symmetry_encoded|Fluorescence_encoded|Price per Carat|price_label| low|
+-----+-------+-----+------------+------+-------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+-----------+----+
| 0.73|   VVS1|   H+|           N|    EX|10500.0|   RD|      EX|          0.0|            0.0|          8.0|           0.0|             0.0|                 0.0|       14383.56|          2|high|
| 0.58|   VVS1|   E+|           N|    EX|10749.0|   RD|      EX|          0.0|            0.0|          3.0|           0.0|             0.0|                 0.0|       18532.76|          2|high|
| 0.58|

24/09/10 21:00:43 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Training Data Count: 1503
Test Data Count: 412
Data saved to /user/student/train_data as csv
Data saved to /user/student/test_data as csv


## Store: Redis

In [25]:
#store train_df and test_df into redis
import redis
from pyspark.sql import Row

class RedisDataSaver:
    def __init__(self, host='localhost', port=6379, db=0):
        # Initialize Redis connection
        self.r = redis.StrictRedis(host=host, port=port, db=db, decode_responses=True)

    def save_to_redis(self, dataframe: DataFrame, key_prefix: str):
        try:
            # Convert DataFrame rows to dictionary format for Redis storage
            for idx, row in enumerate(dataframe.collect()):
                # Convert the row to a dictionary, handling NoneType values
                row_dict = row.asDict()
                sanitized_row = {k: (str(v) if v is not None else "null") for k, v in row_dict.items()}

                # Create a Redis key for each row
                redis_key = f"{key_prefix}:{idx}"

                # Save the dictionary to Redis as a hash
                self.r.hmset(redis_key, sanitized_row)

            print(f"Data successfully saved to Redis with key prefix: {key_prefix}")

        except Exception as e:
            print(f"Error saving data to Redis: {e}")

if __name__ == "__main__":
    redis_saver = RedisDataSaver()
    
    # Save the processed DataFrame to Redis
    redis_saver.save_to_redis(train_df, key_prefix="train_df")
    redis_saver.save_to_redis(train_df, key_prefix="test_df")


  self.r.hmset(redis_key, sanitized_row)


Data successfully saved to Redis with key prefix: train_df
Data successfully saved to Redis with key prefix: test_df


### Read from Redis

In [26]:
#read train_df from redis
import redis

class RedisDataReader:
    def __init__(self, host='localhost', port=6379, db=0):
        self.redis_client = redis.StrictRedis(host=host, port=port, db=db)

    def read_from_redis(self, key_prefix: str, count: int):
        data = []
        
        for i in range(count):
            redis_key = f"{key_prefix}:{i}"
            row = self.redis_client.hgetall(redis_key)
            
            row_decoded = {k.decode('utf-8'): v.decode('utf-8') for k, v in row.items()}
            
            if row_decoded:  # If the row exists
                data.append(row_decoded)
            else:
                print(f"No data found for key: {redis_key}")
        
        return data

if __name__ == "__main__":
    redis_reader = RedisDataReader()
    redis_data = redis_reader.read_from_redis(key_prefix="train_df", count=10)  # Fetch the first 10 rows

    for row in redis_data:
        print(row)

{'Carat': '0.3', 'Clarity': 'VVS1', 'Color': 'F+', 'Fluorescence': 'N', 'Polish': 'EX', 'Price': '2938.0', 'Shape': 'RD', 'Symmetry': 'EX', 'Shape_encoded': '0.0', 'Clarity_encoded': '0.0', 'Color_encoded': '7.0', 'Polish_encoded': '0.0', 'Symmetry_encoded': '0.0', 'Fluorescence_encoded': '0.0', 'Price per Carat': '9793.33', 'price_label': '0', 'low': 'low'}
{'Carat': '0.3', 'Clarity': 'VVS2', 'Color': 'G+', 'Fluorescence': 'F', 'Polish': 'EX', 'Price': '2028.0', 'Shape': 'RD', 'Symmetry': 'EX', 'Shape_encoded': '0.0', 'Clarity_encoded': '1.0', 'Color_encoded': '6.0', 'Polish_encoded': '0.0', 'Symmetry_encoded': '0.0', 'Fluorescence_encoded': '1.0', 'Price per Carat': '6760.0', 'price_label': '0', 'low': 'low'}
{'Carat': '0.3', 'Clarity': 'VVS2', 'Color': 'H+', 'Fluorescence': 'N', 'Polish': 'EX', 'Price': '2103.0', 'Shape': 'RD', 'Symmetry': 'EX', 'Shape_encoded': '0.0', 'Clarity_encoded': '1.0', 'Color_encoded': '8.0', 'Polish_encoded': '0.0', 'Symmetry_encoded': '0.0', 'Fluorescence

### Convert Redis to Spark_df

In [27]:
#retrieve from redis
import redis
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType
from pyspark.sql import Row

r = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)
keys = r.keys('train_df:*')

data = []
for key in keys:
    record = r.hgetall(key)
    data.append(record)

spark = SparkSession.builder \
    .appName("Redis to Spark") \
    .getOrCreate()

#convert to pyspark dataframe
schema = StructType([
    StructField("Carat", DoubleType(), True),
    StructField("Clarity", StringType(), True),
    StructField("Color", StringType(), True),
    StructField("Fluorescence", StringType(), True),
    StructField("Polish", StringType(), True),
    StructField("Price", DoubleType(), True),
    StructField("Shape", StringType(), True),
    StructField("Symmetry", StringType(), True),
    StructField("Shape_encoded", DoubleType(), True),
    StructField("Clarity_encoded", DoubleType(), True),
    StructField("Color_encoded", DoubleType(), True),
    StructField("Polish_encoded", DoubleType(), True),
    StructField("Symmetry_encoded", DoubleType(), True),
    StructField("Fluorescence_encoded", DoubleType(), True),
    StructField("Price per Carat", DoubleType(), True),
    StructField("price_label", IntegerType(), True),
    StructField("low", StringType(), True)
])

# Convert data to rows and handle null values
def convert_value(k, v):
    if v is None or v == 'null':
        return None 
    if k.endswith('_encoded') or k in ['Carat', 'Price', 'Price per Carat']:
        return float(v) if v.replace('.', '', 1).isdigit() else None
    if k == 'price_label':
        return int(v) if v.isdigit() else None
    return v 

rows = [Row(**{k: convert_value(k, v) for k, v in record.items()}) for record in data]

train_df = spark.createDataFrame(rows, schema)
train_df.show(5)

24/09/10 21:15:01 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+-----+-------+-----+------------+------+-------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+-----------+----+
|Carat|Clarity|Color|Fluorescence|Polish|  Price|Shape|Symmetry|Shape_encoded|Clarity_encoded|Color_encoded|Polish_encoded|Symmetry_encoded|Fluorescence_encoded|Price per Carat|price_label| low|
+-----+-------+-----+------------+------+-------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+-----------+----+
| 0.62|   VVS1|    D|           M|    EX|13898.0|   RD|      EX|          0.0|            0.0|          0.0|           0.0|             0.0|                 2.0|       22416.13|          2|high|
| 0.32|   VVS2|    D|           M|    EX| 2428.0|   RD|      EX|          0.0|            1.0|          0.0|           0.0|             0.0|                 2.0|         7587.5|          0| low|
|  0.8|   VVS1|    E|    

In [28]:
#retrieve from redis
import redis
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, IntegerType
from pyspark.sql import Row

r = redis.StrictRedis(host='localhost', port=6379, db=0, decode_responses=True)
keys = r.keys('test_df:*')

data = []
for key in keys:
    record = r.hgetall(key)
    data.append(record)

spark = SparkSession.builder \
    .appName("Redis to Spark") \
    .getOrCreate()

#convert to pyspark dataframe
schema = StructType([
    StructField("Carat", DoubleType(), True),
    StructField("Clarity", StringType(), True),
    StructField("Color", StringType(), True),
    StructField("Fluorescence", StringType(), True),
    StructField("Polish", StringType(), True),
    StructField("Price", DoubleType(), True),
    StructField("Shape", StringType(), True),
    StructField("Symmetry", StringType(), True),
    StructField("Shape_encoded", DoubleType(), True),
    StructField("Clarity_encoded", DoubleType(), True),
    StructField("Color_encoded", DoubleType(), True),
    StructField("Polish_encoded", DoubleType(), True),
    StructField("Symmetry_encoded", DoubleType(), True),
    StructField("Fluorescence_encoded", DoubleType(), True),
    StructField("Price per Carat", DoubleType(), True),
    StructField("price_label", IntegerType(), True),
    StructField("low", StringType(), True)
])

# Convert data to rows and handle null values
def convert_value(k, v):
    if v is None or v == 'null':
        return None 
    if k.endswith('_encoded') or k in ['Carat', 'Price', 'Price per Carat']:
        return float(v) if v.replace('.', '', 1).isdigit() else None
    if k == 'price_label':
        return int(v) if v.isdigit() else None
    return v 

rows = [Row(**{k: convert_value(k, v) for k, v in record.items()}) for record in data]

test_df = spark.createDataFrame(rows, schema)
test_df.show(5)

+-----+-------+-----+------------+------+------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+-----------+---+
|Carat|Clarity|Color|Fluorescence|Polish| Price|Shape|Symmetry|Shape_encoded|Clarity_encoded|Color_encoded|Polish_encoded|Symmetry_encoded|Fluorescence_encoded|Price per Carat|price_label|low|
+-----+-------+-----+------------+------+------+-----+--------+-------------+---------------+-------------+--------------+----------------+--------------------+---------------+-----------+---+
| 0.31|     IF|    F|           N|    EX|3172.0|   RD|      EX|          0.0|            2.0|          2.0|           0.0|             0.0|                 0.0|       10232.26|          0|low|
|  0.3|   VVS2|   G+|           F|    EX|2028.0|   RD|      EX|          0.0|            1.0|          6.0|           0.0|             0.0|                 1.0|         6760.0|          0|low|
| 0.35|     IF|    E|           N| 

### Redis Query

In [34]:
# Query 1: Identify the number of diamonds for each price category
grouped_df = df.groupBy("price_label").count()
grouped_df.show()

+-----------+-----+
|price_label|count|
+-----------+-----+
|          1|  447|
|          2|  267|
|          0|  789|
+-----------+-----+



In [20]:
# Query 2: Average price of each carat
df.groupBy("Carat").agg({"Price": "avg"}).orderBy("Carat").show()

+-----+------------------+
|Carat|        avg(Price)|
+-----+------------------+
| NULL|              NULL|
|  0.3| 2858.780104712042|
| 0.31|       3280.390625|
| 0.32|          3153.675|
| 0.33|3682.8823529411766|
| 0.34|3608.2162162162163|
| 0.35|           3916.72|
| 0.36| 3929.032258064516|
| 0.37|            3759.0|
| 0.38|4040.6470588235293|
|  0.4| 4200.068627450981|
| 0.41|            4492.7|
| 0.42|            4816.0|
| 0.43| 4542.951219512195|
| 0.44| 4970.357142857143|
| 0.45| 4115.976744186047|
| 0.46|            5234.4|
| 0.47|            3840.0|
|  0.5|  6992.25974025974|
| 0.51|           7216.34|
+-----+------------------+
only showing top 20 rows



In [22]:
# Query 3: Correlation of feature columns with Price
numerical_columns = ["Carat", "Price per Carat", "Shape_encoded", "Clarity_encoded", "Color_encoded", 
                     "Polish_encoded", "Symmetry_encoded", "Fluorescence_encoded"]

for col in numerical_columns:
    correlation = df.stat.corr("Price", col)
    print(f"Correlation between Price and {col}: {correlation}")

Correlation between Price and Carat: 0.8856350156433647
Correlation between Price and Price per Carat: 0.9473245649727444
Correlation between Price and Shape_encoded: -0.08548364802734425
Correlation between Price and Clarity_encoded: -0.1364167124277153
Correlation between Price and Color_encoded: -0.15161489094051314
Correlation between Price and Polish_encoded: -0.13301420391677396
Correlation between Price and Symmetry_encoded: -0.17546699084461193
Correlation between Price and Fluorescence_encoded: 0.03088897919073895


## Read from HDFS

In [None]:
#check saved train_data dataset
class HDFSDataReader:
    def __init__(self, hdfs_path: str):
        self.spark = SparkSession.builder \
            .appName("ReadHDFS") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()
        self.hdfs_path = hdfs_path

    def read_data(self) -> DataFrame:
        self.df = self.spark.read.json(self.hdfs_path)
        return self.df
    
    def read_csv(self, path: str) -> DataFrame:
        self.df = self.spark.read.csv(path, header=True, inferSchema=True)
        return self.df

    def show_data(self, num_rows=5):
        if hasattr(self, 'df'):
            self.df.show(num_rows)
        else:
            print("DataFrame not loaded yet. Call read_data() or read_csv() first.")

if __name__ == "__main__":
    hdfs_reader = HDFSDataReader("/user/student/train_data")
    df_saved = hdfs_reader.read_csv("/user/student/train_data")
    hdfs_reader.show_data(5)
    
    print(f"Train data count: {df_saved.count()}")

In [None]:
df_saved.printSchema()

In [None]:
#check saved test_data dataset
class HDFSDataReader:
    def __init__(self, hdfs_path: str):
        self.spark = SparkSession.builder \
            .appName("ReadHDFS") \
            .config("spark.hadoop.fs.defaultFS", "hdfs://localhost:9000") \
            .getOrCreate()
        self.hdfs_path = hdfs_path

    def read_data(self) -> DataFrame:
        self.df = self.spark.read.json(self.hdfs_path)
        return self.df
    
    # New method to read CSV
    def read_csv(self, path: str) -> DataFrame:
        self.df = self.spark.read.csv(path, header=True, inferSchema=True)
        return self.df

    def show_data(self, num_rows=5):
        if hasattr(self, 'df'):
            print("Preprocessed Data")
            self.df.show(num_rows)
        else:
            print("DataFrame not loaded yet. Call read_data() or read_csv() first.")

if __name__ == "__main__":
    hdfs_reader = HDFSDataReader("/user/student/test_data")
    df_saved = hdfs_reader.read_csv("/user/student/test_data")
    hdfs_reader.show_data(5)
    
    print(f"Test data count: {df_saved.count()}")

In [None]:
df_saved.printSchema()

### 2.4(a) Random Forest with HDFS

In [32]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

class RandomForestModeler:
    def __init__(self, train_path: str, test_path: str):
        self.spark = SparkSession.builder.appName("RandomForestModeler").getOrCreate()
        self.train_path = train_path
        self.test_path = test_path
        self.train_df = self.load_data(self.train_path)
        self.test_df = self.load_data(self.test_path)

    def load_data(self, path: str):
        return self.spark.read.csv(path, header=True, inferSchema=True)

    def prepare_data(self, df, feature_cols, label_col):
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        df_prepared = assembler.transform(df).select("features", label_col)
        return df_prepared

    def train_random_forest(self, feature_cols, label_col, num_trees=10):
        # Prepare training data
        train_data = self.prepare_data(self.train_df, feature_cols, label_col)

        # Initialize the Random Forest Classifier
        rf = RandomForestClassifier(featuresCol="features", labelCol=label_col, numTrees=num_trees)
        
        # Train the model
        self.rf_model = rf.fit(train_data)
        print("Random Forest model trained.")

    def evaluate_model(self, feature_cols, label_col):
        # Prepare test data
        test_data = self.prepare_data(self.test_df, feature_cols, label_col)

        # Make predictions
        predictions = self.rf_model.transform(test_data)

        # Initialize evaluators for different metrics
        accuracy_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="accuracy")
        f1_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="f1")
        precision_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="weightedPrecision")
        recall_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="weightedRecall")

        # Compute metrics
        accuracy = accuracy_evaluator.evaluate(predictions)
        f1_score = f1_evaluator.evaluate(predictions)
        precision = precision_evaluator.evaluate(predictions)
        recall = recall_evaluator.evaluate(predictions)

        print(f"Accuracy: {accuracy * 100:.2f}%")
        print(f"F1 Score: {f1_score:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        
        return {
            "accuracy": accuracy,
            "f1_score": f1_score,
            "precision": precision,
            "recall": recall
        }

    def close(self):
        """
        Stops the Spark session.
        """
        self.spark.stop()
        print("Spark session stopped.")

if __name__ == "__main__":
    train_path = "/user/student/train_data"
    test_path = "/user/student/test_data"

    # Instantiate the RandomForestModeler
    rf_modeler = RandomForestModeler(train_path, test_path)

    # Specify features and label columns
    feature_columns = ['Carat','Shape_encoded', 'Clarity_encoded', 'Color_encoded', 'Polish_encoded','Symmetry_encoded','Fluorescence_encoded'] 
    label_column = 'price_label'  

    # Train the Random Forest model
    rf_modeler.train_random_forest(feature_cols=feature_columns, label_col=label_column, num_trees=20)
    
    #Metrics
    metrics = rf_modeler.evaluate_model(feature_cols=feature_columns, label_col=label_column)

NameError: name 'train_path' is not defined

### Random Forest with Redis

In [33]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

class RandomForestModeler:
    def __init__(self, train_df, test_df):
        self.spark = SparkSession.builder.appName("RandomForestModeler").getOrCreate()
        self.train_df = train_df
        self.test_df = test_df

    def prepare_data(self, df, feature_cols, label_col):
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        df_prepared = assembler.transform(df).select("features", label_col)
        return df_prepared

    def train_random_forest(self, feature_cols, label_col, num_trees=10):
        # Prepare training data
        train_data = self.prepare_data(self.train_df, feature_cols, label_col)

        # Initialize the Random Forest Classifier
        rf = RandomForestClassifier(featuresCol="features", labelCol=label_col, numTrees=num_trees)
        
        # Train the model
        self.rf_model = rf.fit(train_data)
        print("Random Forest model trained.")

    def evaluate_model(self, feature_cols, label_col):
        # Prepare test data
        test_data = self.prepare_data(self.test_df, feature_cols, label_col)

        # Make predictions
        predictions = self.rf_model.transform(test_data)

        # Initialize evaluators for different metrics
        accuracy_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="accuracy")
        f1_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="f1")
        precision_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="weightedPrecision")
        recall_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="weightedRecall")

        # Compute metrics
        accuracy = accuracy_evaluator.evaluate(predictions)
        f1_score = f1_evaluator.evaluate(predictions)
        precision = precision_evaluator.evaluate(predictions)
        recall = recall_evaluator.evaluate(predictions)

        print(f"Accuracy: {accuracy * 100:.2f}%")
        print(f"F1 Score: {f1_score:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        
        return {
            "accuracy": accuracy,
            "f1_score": f1_score,
            "precision": precision,
            "recall": recall
        }

    def close(self):
        self.spark.stop()
        print("Spark session stopped.")

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("MainExecution").getOrCreate()
    
    # Specify features and label columns
    feature_columns = ['Carat', 'Shape_encoded', 'Clarity_encoded', 'Color_encoded', 'Polish_encoded', 'Symmetry_encoded', 'Fluorescence_encoded'] 
    label_column = 'price_label'

    # Instantiate the RandomForestModeler with DataFrames
    rf_modeler = RandomForestModeler(train_df=train_df, test_df=test_df)

    # Train the Random Forest model
    rf_modeler.train_random_forest(feature_cols=feature_columns, label_col=label_column, num_trees=20)
    
    # Evaluate model metrics
    metrics = rf_modeler.evaluate_model(feature_cols=feature_columns, label_col=label_column)

24/09/10 21:31:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
24/09/10 21:31:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


Random Forest model trained.
Accuracy: 92.55%
F1 Score: 0.9248
Precision: 0.9252
Recall: 0.9255


### 2.4(b) Gradient Boosting

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

class GradientBoostingRegressor:
    def __init__(self, train_df: DataFrame, test_df: DataFrame, feature_cols: list, label_col: str):
        self.train_df = train_df
        self.test_df = test_df
        self.feature_cols = feature_cols
        self.label_col = label_col
        
        # Assemble features into a single vector column
        self.feature_assembler = VectorAssembler(inputCols=self.feature_cols, outputCol='features')
        self.train_df = self.feature_assembler.transform(self.train_df)
        self.test_df = self.feature_assembler.transform(self.test_df)

    

    
    def train(self, max_iter: int = 100):
        """
        Train a Gradient Boosted Trees Regressor model.
        """
        gbt = GBTRegressor(
            featuresCol='features',
            labelCol=self.label_col,
            maxIter=max_iter
        )
        
        # Fit the model
        self.model = gbt.fit(self.train_df)
        print("Gradient Boosting Regressor model trained.")
    
    def evaluate(self):
        """
        Evaluate the Gradient Boosting model using the test dataset.
        """
        if self.model is None:
            raise ValueError("Model has not been trained.")
        
        # Make predictions
        predictions = self.model.transform(self.test_df)
        
        # Initialize evaluators for different metrics
        evaluator_rmse = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='rmse')
        evaluator_mae = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='mae')
        evaluator_r2 = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='r2')
        
        # Compute metrics
        rmse = evaluator_rmse.evaluate(predictions)
        mae = evaluator_mae.evaluate(predictions)
        r2 = evaluator_r2.evaluate(predictions)
        
        # Print metrics
        print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
        print(f"Mean Absolute Error (MAE) on test data = {mae}")
        print(f"R-squared on test data = {r2}")

        # Show some predictions
        predictions.select('features', self.label_col, 'prediction').show(10)
        
        return {
            "rmse": rmse,
            "mae": mae,
            "r2": r2
        }

if __name__ == "__main__":
    train_path = "/user/student/train_data"
    test_path = "/user/student/test_data"

    train_df = hdfs_reader.read_csv(train_path)
    test_df = hdfs_reader.read_csv(test_path)
    
    # Specify features and label columns
    feature_columns = ['Carat', 'Shape_encoded', 'Clarity_encoded', 'Color_encoded', 'Polish_encoded', 'Symmetry_encoded', 'Fluorescence_encoded']
    label_column = 'Price'
    
    # Initialize GradientBoostingRegressor with train and test datasets
    gbt_regressor = GradientBoostingRegressor(train_df=train_df, test_df=test_df, feature_cols=feature_columns, label_col=label_column)
    
    # Train the Gradient Boosting model
    gbt_regressor.train(max_iter=100)
    
    # Evaluate the model
    metrics = gbt_regressor.evaluate()

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

class RandomForestModeler:
    def __init__(self, train_path: str, test_path: str):
        self.spark = SparkSession.builder.appName("RandomForestModeler").getOrCreate()
        self.train_path = train_path
        self.test_path = test_path
        self.train_df = self.load_data(self.train_path)
        self.test_df = self.load_data(self.test_path)

    def load_data(self, path: str):
        """
        Load dataset from the given HDFS path.
        """
        return self.spark.read.csv(path, header=True, inferSchema=True)

    def prepare_data(self, df, feature_cols, label_col):
        """
        Prepares the dataset by assembling features into a single vector column.
        
        :param df: DataFrame containing the dataset.
        :param feature_cols: List of column names to be used as features.
        :param label_col: Column name to be used as the label.
        """
        assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
        df_prepared = assembler.transform(df).select("features", label_col)
        return df_prepared

    def train_random_forest(self, feature_cols, label_col, num_trees=10):
        """
        Trains a Random Forest classifier on the training data.
        
        :param feature_cols: List of columns used as features.
        :param label_col: Column used as the label.
        :param num_trees: Number of trees in the Random Forest.
        """
        # Prepare training data
        train_data = self.prepare_data(self.train_df, feature_cols, label_col)

        # Initialize the Random Forest Classifier
        rf = RandomForestClassifier(featuresCol="features", labelCol=label_col, numTrees=num_trees)
        
        # Train the model
        self.rf_model = rf.fit(train_data)
        print("Random Forest model trained.")

    def evaluate_model(self, feature_cols, label_col):
        """
        Evaluates the trained Random Forest model on the test data using accuracy, F1 score, precision, and recall.
        
        :param feature_cols: List of columns used as features.
        :param label_col: Column used as the label.
        """
        # Prepare test data
        test_data = self.prepare_data(self.test_df, feature_cols, label_col)

        # Make predictions
        predictions = self.rf_model.transform(test_data)

        # Initialize evaluators for different metrics
        accuracy_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="accuracy")
        f1_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="f1")
        precision_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="weightedPrecision")
        recall_evaluator = MulticlassClassificationEvaluator(labelCol=label_col, predictionCol="prediction", metricName="weightedRecall")

        # Compute metrics
        accuracy = accuracy_evaluator.evaluate(predictions)
        f1_score = f1_evaluator.evaluate(predictions)
        precision = precision_evaluator.evaluate(predictions)
        recall = recall_evaluator.evaluate(predictions)

        print(f"Accuracy: {accuracy * 100:.2f}%")
        print(f"F1 Score: {f1_score:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        
        return {
            "accuracy": accuracy,
            "f1_score": f1_score,
            "precision": precision,
            "recall": recall
        }

    def close(self):
        """
        Stops the Spark session.
        """
        self.spark.stop()
        print("Spark session stopped.")

if __name__ == "__main__":
    train_path = "/user/student/train_data"
    test_path = "/user/student/test_data"

    # Instantiate the RandomForestModeler
    rf_modeler = RandomForestModeler(train_path, test_path)

    # Specify features and label columns
    feature_columns = ['Carat','Color_encoded'] 
    label_column = 'price_label'  

    # Train the Random Forest model
    rf_modeler.train_random_forest(feature_cols=feature_columns, label_col=label_column, num_trees=20)
    
    #Metrics
    metrics = rf_modeler.evaluate_model(feature_cols=feature_columns, label_col=label_column)

In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import col, log, sqrt
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator
    
class GradientBoostingRegressor:
    def __init__(self, train_df: DataFrame, test_df: DataFrame, feature_cols: list, label_col: str):
        self.train_df = self.create_interaction_features(train_df)
        self.test_df = self.create_interaction_features(test_df)
        self.feature_cols = feature_cols
        self.label_col = label_col
        
        # Assemble features into a single vector column
        self.feature_assembler = VectorAssembler(inputCols=self.feature_cols, outputCol='features')
        self.train_df = self.feature_assembler.transform(self.train_df)
        self.test_df = self.feature_assembler.transform(self.test_df)

    def create_interaction_features(self, df: DataFrame) -> DataFrame:
        df = df.withColumn('Carat_Color_Interaction', col('Carat') * col('Color_encoded'))
        df = df.withColumn('Carat_Clarity_Interaction', col('Carat') * col('Clarity_encoded'))
        df = df.withColumn('Carat_Polish_Interaction', col('Carat') * col('Polish_encoded'))
        df = df.withColumn('Carat_Symmetry_Interaction', col('Carat') * col('Symmetry_encoded'))
        df = df.withColumn('Carat_Fluorescence_Interaction', col('Carat') * col('Fluorescence_encoded'))
        
        return df
    
    def train(self, max_iter: int = 100):
        """
        Train a Gradient Boosted Trees Regressor model.
        """
        gbt = GBTRegressor(
            featuresCol='features',
            labelCol=self.label_col,
            maxIter=max_iter
        )
        
        # Fit the model
        self.model = gbt.fit(self.train_df)
        print("Gradient Boosting Regressor model trained.")
    
    def evaluate(self):
        """
        Evaluate the Gradient Boosting model using the test dataset.
        """
        if self.model is None:
            raise ValueError("Model has not been trained.")
        
        # Make predictions
        predictions = self.model.transform(self.test_df)
        
        # Initialize evaluators for different metrics
        evaluator_rmse = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='rmse')
        evaluator_mae = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='mae')
        evaluator_r2 = RegressionEvaluator(labelCol=self.label_col, predictionCol='prediction', metricName='r2')
        
        # Compute metrics
        rmse = evaluator_rmse.evaluate(predictions)
        mae = evaluator_mae.evaluate(predictions)
        r2 = evaluator_r2.evaluate(predictions)
        
        # Print metrics
        print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")
        print(f"Mean Absolute Error (MAE) on test data = {mae}")
        print(f"R-squared on test data = {r2}")

        # Show some predictions
        predictions.select('features', self.label_col, 'prediction').show(10)
        
        return {
            "rmse": rmse,
            "mae": mae,
            "r2": r2
        }

if __name__ == "__main__":
    train_path = "/user/student/train_data"
    test_path = "/user/student/test_data"

    train_df = hdfs_reader.read_csv(train_path)
    test_df = hdfs_reader.read_csv(test_path)
    
    # Specify features and label columns
    label_column = 'Price'  
    feature_columns = ['Carat', 'Color_encoded', 'Clarity_encoded', 'Polish_encoded', 'Symmetry_encoded', 'Fluorescence_encoded', 
                       'Carat_Color_Interaction', 'Carat_Clarity_Interaction', 'Carat_Polish_Interaction', 'Carat_Symmetry_Interaction', 
                       'Carat_Fluorescence_Interaction']
    
    # Initialize GradientBoostingRegressor with train and test datasets
    gbt_regressor = GradientBoostingRegressor(train_df=train_df, test_df=test_df, feature_cols=feature_columns, label_col=label_column)
    
    # Train the Gradient Boosting model
    gbt_regressor.train(max_iter=100)
    
    # Evaluate the model
    metrics = gbt_regressor.evaluate()