In [1]:
import plotly.express as px
%matplotlib inline
import datetime
import numpy as np
import pandas as pd
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor, DecisionTreeRegressionModel
from pyspark.sql.functions import col, monotonically_increasing_id, lit, date_add, explode, sequence, to_date, sum
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import GBTRegressor, GBTRegressionModel

In [2]:
sc = SparkContext(master = 'local')
spark = SparkSession.builder \
          .appName("Python Spark SQL basic example") \
          .config("spark.some.config.option", "some-value") \
          .getOrCreate()

24/12/13 15:25:06 WARN Utils: Your hostname, Khim3PC resolves to a loopback address: 127.0.1.1; using 10.0.122.4 instead (on interface wlo1)
24/12/13 15:25:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/13 15:25:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.read.csv('./NFLX.csv', header=True, inferSchema=True)
df.show(5)

+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2018-02-05|     262.0|267.899994|250.029999|254.259995|254.259995|11896100|
|2018-02-06|247.699997|266.700012|     245.0|265.720001|265.720001|12595800|
|2018-02-07|266.579987|272.450012|264.329987|264.559998|264.559998| 8981500|
|2018-02-08|267.079987|267.619995|     250.0|250.100006|250.100006| 9306700|
|2018-02-09|253.850006|255.800003|236.110001|249.470001|249.470001|16906900|
+----------+----------+----------+----------+----------+----------+--------+
only showing top 5 rows



In [4]:
split_index = int(df.count() * 0.8)
train = df.limit(split_index)
test = df.subtract(train)
test_copy = test.select("*") 
print(f"Training set row count: {train.count()}")
print(f"Testing set row count: {test.count()}")

Training set row count: 807
Testing set row count: 202


In [5]:
feature_columns = ["Open", "High", "Low", "Volume"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
train = assembler.transform(train).select("features", col("Close").alias("label"))
test = assembler.transform(test).select("features", col("Close").alias("label"))
train.show(5)

+--------------------+----------+
|            features|     label|
+--------------------+----------+
|[262.0,267.899994...|254.259995|
|[247.699997,266.7...|265.720001|
|[266.579987,272.4...|264.559998|
|[267.079987,267.6...|250.100006|
|[253.850006,255.8...|249.470001|
+--------------------+----------+
only showing top 5 rows



In [None]:
# Initialize the Decision Tree Regressor
dt_regressor = DecisionTreeRegressor(labelCol="label", featuresCol="features", seed=42)

# Set up the parameter grid for hyperparameter tuning
paramGrid = ParamGridBuilder() \
    .addGrid(dt_regressor.maxDepth, [5, 10, 15]) \
    .addGrid(dt_regressor.maxBins, [32, 64, 128]) \
    .addGrid(dt_regressor.minInstancesPerNode, [1, 2]) \
    .addGrid(dt_regressor.minInfoGain, [0.0, 0.01]) \
    .build()

# Use CrossValidator for grid search
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse")
crossval = CrossValidator(
    estimator=dt_regressor,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=5  # Number of folds for cross-validation
)

# Train the model with cross-validation
cv_model = crossval.fit(train)

# Get the best model
best_model = cv_model.bestModel
# Make predictions on the test set
predictions = best_model.transform(test)

# Evaluate the best model on the test set
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print("Test Set Evaluation Metrics:")
print(f"MSE: {round(mse, 3)}")
print(f"RMSE: {round(rmse, 3)}")
print(f"MAE: {round(mae, 3)}")
print(f"R2: {round(r2, 3)}")

24/12/13 15:25:19 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


Best Model Params:
  Max Depth: 10
  Max Bins: 64
Test Set Evaluation Metrics:
MSE: 2409.527
RMSE: 49.087
MAE: 31.121
R2: 0.514


In [7]:
test_pred = test.join(
    predictions.select("features", "prediction"), 
    on="features", 
    how="inner"
)

# Rename the 'prediction' column to 'Close_Prediction'
test_pred = test_pred.withColumnRenamed("prediction", "Close_Prediction")

# Optionally drop the 'features' column if no longer needed
test_pred = test_pred.drop("features")

# Show the resulting DataFrame
test_pred.show(5)

+----------+-----------------+
|     label| Close_Prediction|
+----------+-----------------+
|569.190002|557.1300253333334|
|603.349976|        562.76001|
|488.940002|        491.07667|
|508.899994|515.5599974999999|
|505.549988|       502.770004|
+----------+-----------------+
only showing top 5 rows



In [8]:
# Merge the DataFrames on `Close` from test_copy and `label` from test_pred
merged_df = test_copy.join(test_pred, test_copy["Close"] == test_pred["label"], how="inner")

# Drop the duplicate column `label` after the join
merged_df = merged_df.drop("label")

# Show the merged DataFrame
merged_df.show(5)


+----------+----------+----------+----------+----------+----------+--------+-----------------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume| Close_Prediction|
+----------+----------+----------+----------+----------+----------+--------+-----------------+
|2021-08-31|566.119995| 569.47998|561.609985|569.190002|569.190002| 2431900|557.1300253333334|
|2021-10-04|613.390015|626.130005|594.679993|603.349976|603.349976| 4995900|        562.76001|
|2021-05-17|485.589996|492.709991|482.809998|488.940002|488.940002| 2705200|        491.07667|
|2021-04-21|     508.0|515.460022|503.600006|508.899994|508.899994|22897400|515.5599974999999|
|2021-04-23| 509.01001|509.700012|500.700012|505.549988|505.549988| 7307700|       502.770004|
+----------+----------+----------+----------+----------+----------+--------+-----------------+
only showing top 5 rows



In [9]:
merge_df = merged_df.toPandas()
# Sort the DataFrame by Date
merged_df_pandas = merge_df.sort_values(by="Date")

# Plot using Plotly
fig = px.line(
    merged_df_pandas,
    x="Date",
    y=["Close", "Close_Prediction"],
    title="Close Price vs Close Price Prediction"
)

# Adjust the layout for better visualization
fig.update_layout(width=900, height=600)

# Show the plot
fig.show()


In [10]:
# save the model 
best_model.save("./models/decision_tree_regressor")