In [None]:
import numpy as np
import matplotlib.pyplot as plt
import warnings
import plotly.express as px
warnings.filterwarnings('ignore')
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, monotonically_increasing_id, lit, date_add, explode, sequence

In [2]:
sc = SparkContext(master = 'local')
spark = SparkSession.builder \
          .appName("Python Spark SQL basic example") \
          .config("spark.some.config.option", "some-value") \
          .getOrCreate()

24/12/12 18:03:04 WARN Utils: Your hostname, Khim3PC resolves to a loopback address: 127.0.1.1; using 10.0.122.4 instead (on interface wlo1)
24/12/12 18:03:04 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/12 18:03:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.read.csv('./NFLX.csv', header=True, inferSchema=True)
df.show(5)

+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2018-02-05|     262.0|267.899994|250.029999|254.259995|254.259995|11896100|
|2018-02-06|247.699997|266.700012|     245.0|265.720001|265.720001|12595800|
|2018-02-07|266.579987|272.450012|264.329987|264.559998|264.559998| 8981500|
|2018-02-08|267.079987|267.619995|     250.0|250.100006|250.100006| 9306700|
|2018-02-09|253.850006|255.800003|236.110001|249.470001|249.470001|16906900|
+----------+----------+----------+----------+----------+----------+--------+
only showing top 5 rows



In [4]:
viz = df
type(viz)

pyspark.sql.dataframe.DataFrame

In [5]:
# 1. Check for null values
null_counts = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns])
null_counts.show()

# 2. Get schema and row/column counts
df.printSchema()
print(f"Number of rows: {df.count()}")
print(f"Number of columns: {len(df.columns)}")

# 3. Drop duplicates
df = df.dropDuplicates()

+----+----+----+---+-----+---------+------+
|Date|Open|High|Low|Close|Adj Close|Volume|
+----+----+----+---+-----+---------+------+
|   0|   0|   0|  0|    0|        0|     0|
+----+----+----+---+-----+---------+------+

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)

Number of rows: 1009
Number of columns: 7


In [6]:
# Calculate split index
split_index = int(df.count() * 0.8)

# Split the dataset into training and testing sets
train = df.limit(split_index)  # Take the first 80% of rows
test = df.subtract(train)      # Subtract the training set from the original DataFrame to get the test set

# Display row counts of the resulting DataFrames to verify the split
print(f"Training set row count: {train.count()}")
print(f"Testing set row count: {test.count()}")
test.show(5)

Training set row count: 807
Testing set row count: 202
+----------+----------+----------+----------+----------+----------+-------+
|      Date|      Open|      High|       Low|     Close| Adj Close| Volume|
+----------+----------+----------+----------+----------+----------+-------+
|2021-01-15|     500.0|506.320007|495.100006|497.980011|497.980011|5895800|
|2021-08-31|566.119995| 569.47998|561.609985|569.190002|569.190002|2431900|
|2019-08-13|309.769989|316.429993|308.160004|312.279999|312.279999|5289400|
|2021-05-17|485.589996|492.709991|482.809998|488.940002|488.940002|2705200|
|2021-04-27|512.619995| 512.98999|504.579987|505.549988|505.549988|3761300|
+----------+----------+----------+----------+----------+----------+-------+
only showing top 5 rows



In [7]:
test_copy = test.select("*")  # Select all columns to create a new DataFrame

# Test to ensure both are independent
print("Original test DataFrame:")
test.show(5)

print("Copied test_pred DataFrame:")
test_copy.show(5)

Original test DataFrame:
+----------+----------+----------+----------+----------+----------+-------+
|      Date|      Open|      High|       Low|     Close| Adj Close| Volume|
+----------+----------+----------+----------+----------+----------+-------+
|2021-01-15|     500.0|506.320007|495.100006|497.980011|497.980011|5895800|
|2021-08-31|566.119995| 569.47998|561.609985|569.190002|569.190002|2431900|
|2019-08-13|309.769989|316.429993|308.160004|312.279999|312.279999|5289400|
|2021-05-17|485.589996|492.709991|482.809998|488.940002|488.940002|2705200|
|2021-04-27|512.619995| 512.98999|504.579987|505.549988|505.549988|3761300|
+----------+----------+----------+----------+----------+----------+-------+
only showing top 5 rows

Copied test_pred DataFrame:
+----------+----------+----------+----------+----------+----------+-------+
|      Date|      Open|      High|       Low|     Close| Adj Close| Volume|
+----------+----------+----------+----------+----------+----------+-------+
|2021-01-1

In [8]:
import numpy as np

# Select features for training and testing
x_train = np.array(train.select("Open", "High", "Low", "Volume").collect())
x_test = np.array(test.select("Open", "High", "Low", "Volume").collect())

# Select labels for training and testing
y_train = np.array(train.select("Close").collect()).flatten()
y_test = np.array(test.select("Close").collect()).flatten()

# Verify the shapes
print(f"x_train shape: {x_train.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

x_train shape: (807, 4)
x_test shape: (202, 4)
y_train shape: (807,)
y_test shape: (202,)


In [9]:
# 1. Assemble feature columns into a single vector column
feature_columns = ["Open", "High", "Low", "Volume"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

train = assembler.transform(train).select("features", col("Close").alias("label"))
test = assembler.transform(test).select("features", col("Close").alias("label"))
test.show(5)

+--------------------+----------+
|            features|     label|
+--------------------+----------+
|[500.0,506.320007...|497.980011|
|[566.119995,569.4...|569.190002|
|[309.769989,316.4...|312.279999|
|[485.589996,492.7...|488.940002|
|[512.619995,512.9...|505.549988|
+--------------------+----------+
only showing top 5 rows



In [10]:
# 2. Initialize and fit the Linear Regression model
lr = LinearRegression(featuresCol="features", labelCol="label")
model_lnr = lr.fit(train)

# 3. Make predictions on the test set
predictions = model_lnr.transform(test)

# 4. Evaluate the model
evaluator = RegressionEvaluator(labelCol="label", predictionCol="prediction")

# Metrics
mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
mae = evaluator.evaluate(predictions, {evaluator.metricName: "mae"})
r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})

print("MSE:", round(mse, 3))
print("RMSE:", round(rmse, 3))
print("MAE:", round(mae, 3))
print("R2 Score:", round(r2, 3))

24/12/12 18:03:15 WARN Instrumentation: [2d77a6e9] regParam is zero, which might cause numerical instability and overfitting.
24/12/12 18:03:15 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


MSE: 11.653
RMSE: 3.414
MAE: 2.661
R2 Score: 0.999


In [11]:
model_lnr.save("./models/linear_regression_model")

In [12]:
def style():
    plt.figure(facecolor='black', figsize=(15,10))
    ax = plt.axes()

    ax.tick_params(axis='x', colors='white')    #setting up X-axis tick color to white
    ax.tick_params(axis='y', colors='white')    #setting up Y-axis tick color to white

    ax.spines['left'].set_color('white')        #setting up Y-axis spine color to white
    #ax.spines['right'].set_color('white')
    #ax.spines['top'].set_color('white')
    ax.spines['bottom'].set_color('white')      #setting up X-axis spine color to white

    ax.set_facecolor("black")

In [13]:
from pyspark.sql.functions import to_date

# Assuming `viz` is your PySpark DataFrame
viz = viz.withColumn("Date", to_date(col("Date"), format="yyyy-MM-dd"))

In [None]:

# Step 1: Select 'Date' and 'Close' columns
data = viz.select("Date", "Close")

# Step 2: Reset the index (create a new column with an index)
data = data.withColumn("index", monotonically_increasing_id())

# Step 3: Drop duplicates
data = data.dropDuplicates(["Date"])

# Step 4: Generate a complete daily date range (asfreq equivalent)
# Get the min and max dates
min_date, max_date = data.agg({"Date": "min"}).first()[0], data.agg({"Date": "max"}).first()[0]

# Create a DataFrame with a full date range
date_range = (
    spark.sql(f"SELECT sequence(to_date('{min_date}'), to_date('{max_date}'), interval 1 day) as Date")
    .withColumn("Date", explode(col("Date")))
)

# Join the original data with the full date range to fill missing dates
data = date_range.join(data, on="Date", how="left")

# Step 5: Display the resulting DataFrame
data.show()


+----------+----------+-----+
|      Date|     Close|index|
+----------+----------+-----+
|2018-02-05|254.259995|    0|
|2018-02-06|265.720001|    1|
|2018-02-07|264.559998|    2|
|2018-02-08|250.100006|    3|
|2018-02-09|249.470001|    4|
|2018-02-10|      NULL| NULL|
|2018-02-11|      NULL| NULL|
|2018-02-12|257.950012|    5|
|2018-02-13|258.269989|    6|
|2018-02-14|     266.0|    7|
|2018-02-15|280.269989|    8|
|2018-02-16|278.519989|    9|
|2018-02-17|      NULL| NULL|
|2018-02-18|      NULL| NULL|
|2018-02-19|      NULL| NULL|
|2018-02-20|278.549988|   10|
|2018-02-21|281.040009|   11|
|2018-02-22|278.140015|   12|
|2018-02-23|285.929993|   13|
|2018-02-24|      NULL| NULL|
+----------+----------+-----+
only showing top 20 rows



In [15]:
# Join the predictions with the test DataFrame to add a 'Close_Prediction' column
test_pred = test.join(
    predictions.select("features", "prediction"), 
    on="features", 
    how="inner"
)

# Rename the 'prediction' column to 'Close_Prediction'
test_pred = test_pred.withColumnRenamed("prediction", "Close_Prediction")

# Optionally drop the 'features' column if no longer needed
test_pred = test_pred.drop("features")

# Show the resulting DataFrame
test_pred.show(5)

+----------+------------------+
|     label|  Close_Prediction|
+----------+------------------+
|497.980011|  501.417086956984|
|569.190002| 565.3540836414134|
|312.279999|313.87374094636016|
|488.940002|489.08816444062063|
|505.549988| 506.7195459103711|
+----------+------------------+
only showing top 5 rows



In [16]:
# Merge the DataFrames on `Close` from test_copy and `label` from test_pred
merged_df = test_copy.join(test_pred, test_copy["Close"] == test_pred["label"], how="inner")

# Drop the duplicate column `label` after the join
merged_df = merged_df.drop("label")

# Show the merged DataFrame
merged_df.show(5)


+----------+----------+----------+----------+----------+----------+-------+------------------+
|      Date|      Open|      High|       Low|     Close| Adj Close| Volume|  Close_Prediction|
+----------+----------+----------+----------+----------+----------+-------+------------------+
|2021-01-15|     500.0|506.320007|495.100006|497.980011|497.980011|5895800|  501.417086956984|
|2021-08-31|566.119995| 569.47998|561.609985|569.190002|569.190002|2431900| 565.3540836414134|
|2019-08-13|309.769989|316.429993|308.160004|312.279999|312.279999|5289400|313.87374094636016|
|2021-05-17|485.589996|492.709991|482.809998|488.940002|488.940002|2705200|489.08816444062063|
|2021-04-27|512.619995| 512.98999|504.579987|505.549988|505.549988|3761300| 506.7195459103711|
+----------+----------+----------+----------+----------+----------+-------+------------------+
only showing top 5 rows



In [17]:
import pandas as pd
merge_df = merged_df.toPandas()
merge_df.shape

(202, 8)

In [18]:
merge_df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Close_Prediction
0,2021-01-15,500.000000,506.320007,495.100006,497.980011,497.980011,5895800,501.417087
1,2021-08-31,566.119995,569.479980,561.609985,569.190002,569.190002,2431900,565.354084
2,2019-08-13,309.769989,316.429993,308.160004,312.279999,312.279999,5289400,313.873741
3,2021-05-17,485.589996,492.709991,482.809998,488.940002,488.940002,2705200,489.088164
4,2021-04-27,512.619995,512.989990,504.579987,505.549988,505.549988,3761300,506.719546
...,...,...,...,...,...,...,...,...
197,2019-03-25,359.000000,367.040009,357.440002,366.230011,366.230011,8473800,364.508331
198,2018-09-21,366.589996,372.220001,360.739990,361.190002,361.190002,11930600,367.023040
199,2019-12-03,302.220001,307.359985,301.880005,306.160004,306.160004,4992800,306.137666
200,2020-09-22,489.010010,491.820007,479.260010,491.170013,491.170013,3482300,483.580870


In [None]:
# Sort the DataFrame by Date
merged_df_pandas = merge_df.sort_values(by="Date")

# Plot using Plotly
fig = px.line(
    merged_df_pandas,
    x="Date",
    y=["Close", "Close_Prediction"],
    title="Close Price vs Close Price Prediction"
)

# Adjust the layout for better visualization
fig.update_layout(width=900, height=600)

# Show the plot
fig.show()


24/12/12 18:03:21 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
24/12/12 18:03:21 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
