In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
import json
import numpy as np
import pandas as pd
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import RandomForestRegressor
from sklearn.model_selection import train_test_split


In [4]:
data = pd.read_csv("./docs/Clean_Dataset.csv")
data = data[data.price <= 100000]
train, test = train_test_split(data,test_size=0.30)
train['price'] = train['price'].astype(float)
test['price'] = test['price'].astype(float)

train.to_csv("./docs/train.csv",index=False)
test.to_csv("./docs/test.csv",index=False)


In [5]:

spark = SparkSession.builder.appName("demo2")\
.config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3") \
.config("spark.jars", "./jars/sqlite-jdbc-3.47.0.0.jar")\
.config("spark.driver.extraClassPath", "./jars/sqlite-jdbc-3.47.0.0.jar").getOrCreate()


In [6]:
def read_data(spark, filepath):    
    data = spark.read.csv(filepath, header=True, inferSchema=True)
    return data

In [7]:
# features_cols=['airline',  'source_city', 'departure_time','stops', 'arrival_time',
#             'destination_city', 'class', 'duration','days_left', ]
# target_col='price'


In [8]:


class pre_process:
    def __init__(self, train_data):
        self.train_data = train_data
        self.train_numerical_features = ['duration', 'days_left']
        self.train_categorical_features = self.get_categorical_features()

    def get_categorical_features(self):
        pandas_df = self.train_data.toPandas()
        categorical_features = pandas_df.select_dtypes(
            include='object').columns.tolist()
        return categorical_features

    # Convert categorical features into nominal features

    def get_indexers(self):
        indexers = [StringIndexer(inputCol=feature, outputCol=feature + "_index",
                                  handleInvalid='keep') for feature in self.train_categorical_features]
        return indexers

    # One hot encode categorical features
    def get_encoders(self):
        encoders = [OneHotEncoder(inputCols=[feature + "_index"], outputCols=[feature + "_encoded"],
                                  handleInvalid='keep') for feature in self.train_categorical_features]
        return encoders

    # Scale numerical features
    def get_numerical_f_assembler_scaler(self):
        numerical_assembler = VectorAssembler(
            inputCols=self.train_numerical_features, outputCol="numerical_features")
        scaler = MinMaxScaler(inputCol="numerical_features",
                              outputCol="numerical_features_scaled")
        return numerical_assembler, scaler

    def get_pipeline_model(self):

        indexers = self.get_indexers()
        encoders = self.get_encoders()
        numerical_assembler, scaler = self.get_numerical_f_assembler_scaler()
        encoded_feature_cols = [
            feature + "_encoded" for feature in self.train_categorical_features]
        
        final_assembler = VectorAssembler(
            inputCols=encoded_feature_cols + ["numerical_features_scaled"],
            outputCol="features"
        )


        pipeline_indexers_encoders = Pipeline(
            stages=indexers + encoders + [numerical_assembler, scaler, final_assembler])
        pipeline_model = pipeline_indexers_encoders.fit(self.train_data)

       

        return pipeline_model

    def save_fitted_pipeline_model(self, pipeline_model, path_string):
        try:
            pipeline_model.save(path_string)
        except:
            print("Error Saving Pipeline Model")


In [9]:
train_data = read_data(spark,"./docs/train.csv")
pre_processor = pre_process(train_data)
pipeline_model = pre_processor.get_pipeline_model()
prepared_data = pipeline_model.transform(train_data)
pre_processor.save_fitted_pipeline_model(pipeline_model, "./docs/pipeline_model_backup")
random_forest = RandomForestRegressor(
        featuresCol="features", labelCol="price")
random_forest_model = random_forest.fit(prepared_data)
random_forest_model.save("./docs/random_forest_model")


In [10]:
test_data = read_data(spark,"./docs/test.csv")


prepared_data = pipeline_model.transform(
    test_data).select("features", "price")

predictions = random_forest_model.transform(prepared_data)



r2_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="r2")
rmse_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="rmse")
mae_evaluator = RegressionEvaluator(labelCol="price", predictionCol="prediction", metricName="mae")


r2 = r2_evaluator.evaluate(predictions)
rmse = rmse_evaluator.evaluate(predictions)
mae = mae_evaluator.evaluate(predictions)


print("R²:", r2)
print("RMSE:", rmse)
print("MAE:", mae)
predictions.select("prediction", "price").show()


R²: 0.9429297855648334
RMSE: 5407.781430014901
MAE: 3279.9095001396186
+------------------+-------+
|        prediction|  price|
+------------------+-------+
| 6985.515678828156| 4977.0|
| 6197.119120766118| 4979.0|
| 3478.984800043037| 3999.0|
| 6197.119120766118| 8877.0|
| 57178.71626939456|52175.0|
|10543.504823219853|14094.0|
|7058.3513565095345| 3988.0|
| 4254.721948219717| 9736.0|
|47122.152250934465|39321.0|
| 4213.493524061561| 3393.0|
| 6244.419227667903| 6699.0|
| 4973.282581774701| 3788.0|
|  7078.68589514644| 4230.0|
| 5647.882162321157| 5289.0|
|56264.148308125696|60365.0|
| 5592.985464400039| 9136.0|
| 5911.097051480472| 4363.0|
| 7774.495685135902|16065.0|
| 6184.719542630145| 6568.0|
|3614.9918310625862| 2074.0|
+------------------+-------+
only showing top 20 rows

