In [44]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, isnan, when, abs, avg
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Stock Data')
    .getOrCreate()
)

Loading the CSV file into a Spark DataFrame

In [2]:
# Read the price history CSV: the first row supplies the column names and
# Spark infers each column's data type from the values.
dataset = spark.read.csv(
    'AMZN_data.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# List the column names that were read from the CSV header.
dataset.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

In [4]:
# Print each column's inferred data type and nullability.
dataset.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)



In [5]:
# Preview the first rows of the raw data.
dataset.show()

+----------+--------+--------+--------+--------+---------+----------+
|      Date|    Open|    High|     Low|   Close|Adj Close|    Volume|
+----------+--------+--------+--------+--------+---------+----------+
|1997-05-15|0.121875|   0.125|0.096354|0.097917| 0.097917|1443120000|
|1997-05-16|0.098438|0.098958|0.085417|0.086458| 0.086458| 294000000|
|1997-05-19|0.088021|0.088542| 0.08125|0.085417| 0.085417| 122136000|
|1997-05-20|0.086458|  0.0875|0.081771|0.081771| 0.081771| 109344000|
|1997-05-21|0.081771|0.082292| 0.06875|0.071354| 0.071354| 377064000|
|1997-05-22|0.071875|0.072396|0.065625|0.069792| 0.069792| 235536000|
|1997-05-23|0.070313|0.076042|0.066667|   0.075|    0.075| 318744000|
|1997-05-27|0.075521|0.082292|0.072917|0.079167| 0.079167| 173952000|
|1997-05-28| 0.08125|0.081771|0.076563|0.076563| 0.076563|  91488000|
|1997-05-29|0.077083|0.077083|0.073958| 0.07526|  0.07526|  69456000|
|1997-05-30|   0.075|0.075521|0.073958|   0.075|    0.075|  51888000|
|1997-06-02|0.075521

In [6]:
# Count the NULL entries in every column. when() without otherwise() yields
# NULL for rows that do not match, and count() ignores NULLs, so each
# aggregate tallies only the rows where that column is missing.
null_checks = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in dataset.columns
]
dataset.select(*null_checks).show()

+----+----+----+---+-----+---------+------+
|Date|Open|High|Low|Close|Adj Close|Volume|
+----+----+----+---+-----+---------+------+
|   0|   0|   0|  0|    0|        0|     0|
+----+----+----+---+-----+---------+------+



In [7]:
from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

In [8]:
# Combine the Open, High and Low prices into a single vector column named
# 'Features', which the regressors below use as their input column.
featureassembler = VectorAssembler(
    inputCols=["Open", "High", "Low"],
    outputCol='Features',
)

In [9]:
# Add the assembled 'Features' vector column to every row of the dataset.
output=featureassembler.transform(dataset)

In [10]:
# Preview the data together with the new 'Features' column.
output.show()

+----------+--------+--------+--------+--------+---------+----------+--------------------+
|      Date|    Open|    High|     Low|   Close|Adj Close|    Volume|            Features|
+----------+--------+--------+--------+--------+---------+----------+--------------------+
|1997-05-15|0.121875|   0.125|0.096354|0.097917| 0.097917|1443120000|[0.121875,0.125,0...|
|1997-05-16|0.098438|0.098958|0.085417|0.086458| 0.086458| 294000000|[0.098438,0.09895...|
|1997-05-19|0.088021|0.088542| 0.08125|0.085417| 0.085417| 122136000|[0.088021,0.08854...|
|1997-05-20|0.086458|  0.0875|0.081771|0.081771| 0.081771| 109344000|[0.086458,0.0875,...|
|1997-05-21|0.081771|0.082292| 0.06875|0.071354| 0.071354| 377064000|[0.081771,0.08229...|
|1997-05-22|0.071875|0.072396|0.065625|0.069792| 0.069792| 235536000|[0.071875,0.07239...|
|1997-05-23|0.070313|0.076042|0.066667|   0.075|    0.075| 318744000|[0.070313,0.07604...|
|1997-05-27|0.075521|0.082292|0.072917|0.079167| 0.079167| 173952000|[0.075521,0.08229...|

In [11]:
# Show only the assembled feature vectors.
output.select(['Features']).show()

+--------------------+
|            Features|
+--------------------+
|[0.121875,0.125,0...|
|[0.098438,0.09895...|
|[0.088021,0.08854...|
|[0.086458,0.0875,...|
|[0.081771,0.08229...|
|[0.071875,0.07239...|
|[0.070313,0.07604...|
|[0.075521,0.08229...|
|[0.08125,0.081771...|
|[0.077083,0.07708...|
|[0.075,0.075521,0...|
|[0.075521,0.07656...|
|[0.076563,0.07656...|
|[0.073958,0.07447...|
|[0.070833,0.07708...|
|[0.075781,0.08541...|
|[0.082813,0.08541...|
|[0.085417,0.08541...|
|[0.079688,0.08020...|
|[0.079167,0.08229...|
+--------------------+
only showing top 20 rows



In [12]:
# Confirm that 'Features' was appended to the original columns.
output.columns

['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Features']

In [13]:
# Keep only the date, the feature vector and the target ('Close'),
# ordered chronologically.
final_data = (
    output
    .select("Date", "Features", "Close")
    .sort("Date", ascending=True)
)

In [14]:
# Preview the modelling table (Date, Features, Close).
final_data.show()

+----------+--------------------+--------+
|      Date|            Features|   Close|
+----------+--------------------+--------+
|1997-05-15|[0.121875,0.125,0...|0.097917|
|1997-05-16|[0.098438,0.09895...|0.086458|
|1997-05-19|[0.088021,0.08854...|0.085417|
|1997-05-20|[0.086458,0.0875,...|0.081771|
|1997-05-21|[0.081771,0.08229...|0.071354|
|1997-05-22|[0.071875,0.07239...|0.069792|
|1997-05-23|[0.070313,0.07604...|   0.075|
|1997-05-27|[0.075521,0.08229...|0.079167|
|1997-05-28|[0.08125,0.081771...|0.076563|
|1997-05-29|[0.077083,0.07708...| 0.07526|
|1997-05-30|[0.075,0.075521,0...|   0.075|
|1997-06-02|[0.075521,0.07656...|0.075521|
|1997-06-03|[0.076563,0.07656...|0.073958|
|1997-06-04|[0.073958,0.07447...|0.070833|
|1997-06-05|[0.070833,0.07708...|0.077083|
|1997-06-06|[0.075781,0.08541...|0.082813|
|1997-06-09|[0.082813,0.08541...|0.084375|
|1997-06-10|[0.085417,0.08541...|0.079167|
|1997-06-11|[0.079688,0.08020...|0.077083|
|1997-06-12|[0.079167,0.08229...|0.080208|
+----------

In [15]:
# Randomly split the rows 75% / 25% into a training and a test set.
# The fixed seed makes the split -- and therefore every metric computed
# below -- identical on each Restart & Run All; without it randomSplit
# draws a new random seed on every run.
# NOTE(review): a random split of daily prices mixes earlier and later dates
# in both sets; consider a chronological split if the goal is forecasting.
train_data, test_data=final_data.randomSplit([0.75,0.25], seed=42)
print(f"""There are {train_data.count()} rows in the trainning set,
      and {test_data.count()} in the test set""")

There are 5010 rows in the trainning set,
      and 1704 in the test set


Linear Regression model

In [16]:
from pyspark.ml.regression import LinearRegression

In [17]:
# Fit a linear regression (default hyper-parameters) that predicts 'Close'
# from the 'Features' vector, using the training rows only.
regressor = LinearRegression(
    labelCol='Close',
    featuresCol='Features',
)
lr_model = regressor.fit(train_data)

In [18]:
# Fitted weights, one per entry of the 'Features' vector (Open, High, Low).
lr_model.coefficients

DenseVector([-0.6502, 0.8325, 0.8177])

In [19]:
# Fitted intercept (constant term) of the linear model.
lr_model.intercept

0.010217138896736644

Test

In [33]:
# Score the held-out test rows with the linear model and display the
# predictions next to the actual closing price.
pred_test1 = lr_model.transform(test_data)
(
    pred_test1
    .select("Features", "Close", "Prediction")
    .show()
)

+--------------------+--------+-------------------+
|            Features|   Close|         Prediction|
+--------------------+--------+-------------------+
|[0.081771,0.08229...|0.071354|0.08177596426284292|
|[0.08125,0.081771...|0.076563|0.08806981207004791|
|[0.076563,0.07656...|0.073958| 0.0846514703759724|
|[0.077083,0.07708...|   0.075|0.08474626734924844|
|[0.07526,0.076302...|0.075521|0.08528140833520062|
|[0.076042,0.07604...|0.075521| 0.0856211593282947|
|[0.075521,0.07994...|0.077083| 0.0881470415746153|
|[0.077083,0.07708...|0.075781|0.08602435985273203|
|[0.079948,0.09583...|0.095573|0.10317853365362847|
|[0.108333,0.10937...|0.106771|0.11516104541132866|
|[0.115104,0.11562...|0.113021|0.12235049981784177|
|[0.104688,0.11041...|0.108333|0.11797250707208895|
|[0.109375,0.10937...|0.108333|0.11490955749808528|
|[0.105208,0.11302...|0.110938|0.12193159509637179|
|[0.157031,0.16927...|0.161979|0.17275642669016902|
|[0.159375,0.18489...|0.184375|0.18956449037909573|
|[0.2,0.2338

In [21]:
from pyspark.ml.evaluation import RegressionEvaluator

In [22]:
# Two evaluators over the same label/prediction columns that differ only in
# the metric; later cells reuse both objects for the tree-based models.
evaluator = RegressionEvaluator(
    labelCol="Close", predictionCol="prediction", metricName="mse"
)
evaluator_r2 = RegressionEvaluator(
    labelCol="Close", predictionCol="prediction", metricName="r2"
)

# Linear regression quality on the test set.
mse = evaluator.evaluate(pred_test1)
r2 = evaluator_r2.evaluate(pred_test1)
print('MSE: ', mse, '\n' + 'r2: ', r2, '\n')

MSE:  0.17757204446194236 
r2:  0.9999283259032998 



In [46]:
# Store the per-row absolute percentage error |Close - Prediction| / Close in
# a column named "MAPE"; its mean, times 100, is the MAPE in percent.
# `abs` and `avg` are the pyspark.sql.functions versions imported in the
# first cell, not the Python built-ins.
pred_test1 = pred_test1.withColumn(
    "MAPE",
    abs(col("Close") - col("Prediction")) / col("Close"),
)
mape = pred_test1.select(avg("MAPE")).first()[0] * 100
print('MAPE: ', mape)

MAPE:  1.047177146043894


Train

In [40]:
# Score the training rows with the linear model so train and test quality
# can be compared.
pred_train1 = lr_model.transform(train_data)
(
    pred_train1
    .select("Features", "Close", "Prediction")
    .show()
)

+--------------------+--------+-------------------+
|            Features|   Close|         Prediction|
+--------------------+--------+-------------------+
|[0.121875,0.125,0...|0.097917|0.11382713402858353|
|[0.098438,0.09895...|0.086458|0.09844245516257126|
|[0.088021,0.08854...|0.085417| 0.0931368201186105|
|[0.086458,0.0875,...|0.081771|0.09371165177667856|
|[0.071875,0.07239...|0.069792|0.07741653765121662|
|[0.070313,0.07604...|   0.075|0.08231956526319639|
|[0.075521,0.08229...|0.079167| 0.0892472052878328|
|[0.077083,0.07708...| 0.07526|0.08474626734924844|
|[0.075,0.075521,0...|   0.075|0.08480027119954592|
|[0.075521,0.07656...|0.075521|0.08618105079269939|
|[0.073958,0.07447...|0.070833|0.08120370140835946|
|[0.070833,0.07708...|0.077083|0.08455140650593462|
|[0.075781,0.08541...|0.082813|0.09380908654310147|
|[0.082813,0.08541...|0.084375|0.09519960655862464|
|[0.085417,0.08541...|0.079167|0.08839572532034135|
|[0.079688,0.08020...|0.077083|0.08778422403467409|
|[0.079167,0

In [36]:
# Recreate the MSE and R^2 evaluators and apply them to the training-set
# predictions of the linear model.
evaluator = RegressionEvaluator(
    labelCol="Close", predictionCol="prediction", metricName="mse"
)
evaluator_r2 = RegressionEvaluator(
    labelCol="Close", predictionCol="prediction", metricName="r2"
)

mse = evaluator.evaluate(pred_train1)
r2 = evaluator_r2.evaluate(pred_train1)
print('MSE: ', mse, '\n' + 'r2: ', r2, '\n')

MSE:  0.19707910888955235 
r2:  0.9999227290018733 



In [47]:
# Same percentage-error computation as for the test set, on the training
# predictions: per-row |Close - Prediction| / Close, averaged and scaled to %.
pred_train1 = pred_train1.withColumn(
    "MAPE",
    abs(col("Close") - col("Prediction")) / col("Close"),
)
mape = pred_train1.select(avg("MAPE")).first()[0] * 100
print('MAPE: ', mape)

MAPE:  1.0980761654210216


Random Forest Regressor model

In [23]:
from pyspark.ml.regression import RandomForestRegressor

In [24]:
# Train a random forest (default tree settings) to predict 'Close' from the
# 'Features' vector. The seed is pinned so that the forest's random sampling
# -- and the metrics reported below -- are reproducible across kernel restarts.
rf=RandomForestRegressor(featuresCol='Features', labelCol='Close', seed=42)
rf_model=rf.fit(train_data)

In [25]:
# Score the test rows with the random forest and display the predictions
# next to the actual closing price.
pred2 = rf_model.transform(test_data)
(
    pred2
    .select("Features", "Close", "Prediction")
    .show()
)

+--------------------+--------+------------------+
|            Features|   Close|        Prediction|
+--------------------+--------+------------------+
|[0.081771,0.08229...|0.071354|0.6835193410047067|
|[0.08125,0.081771...|0.076563|0.6835193410047067|
|[0.076563,0.07656...|0.073958|0.6835193410047067|
|[0.077083,0.07708...|   0.075|0.6835193410047067|
|[0.07526,0.076302...|0.075521|0.6835193410047067|
|[0.076042,0.07604...|0.075521|0.6835193410047067|
|[0.075521,0.07994...|0.077083|0.6835193410047067|
|[0.077083,0.07708...|0.075781|0.6835193410047067|
|[0.079948,0.09583...|0.095573|0.6835193410047067|
|[0.108333,0.10937...|0.106771|0.6835193410047067|
|[0.115104,0.11562...|0.113021|0.6835193410047067|
|[0.104688,0.11041...|0.108333|0.6835193410047067|
|[0.109375,0.10937...|0.108333|0.6835193410047067|
|[0.105208,0.11302...|0.110938|0.6835193410047067|
|[0.157031,0.16927...|0.161979|0.6835193410047067|
|[0.159375,0.18489...|0.184375|0.6835193410047067|
|[0.2,0.233854,0.1...|   0.225|

In [26]:
# Random forest quality on the test set, using the MSE and R^2 evaluators
# created in an earlier cell.
mse, r2 = evaluator.evaluate(pred2), evaluator_r2.evaluate(pred2)
print('MSE: ', mse, '\n' + 'r2: ', r2, '\n')

MSE:  8.369606842388059 
r2:  0.9966217429551978 



Gradient-Boosted Tree Regression

In [27]:
from pyspark.ml.regression import GBTRegressor

In [28]:
# Train gradient-boosted trees (10 boosting iterations) to predict 'Close'
# from the 'Features' vector. The seed is pinned, like the other stochastic
# steps in this notebook, so a re-run reproduces the same model.
gbt = GBTRegressor(featuresCol='Features', labelCol='Close', maxIter=10, seed=42)
gbt_model = gbt.fit(train_data)

In [29]:
# Score the test rows with the gradient-boosted model and display the
# untruncated feature vectors next to the actual and predicted close.
pred3 = gbt_model.transform(test_data)
(
    pred3
    .select("Features", "Close", "Prediction")
    .show(truncate=False)
)

+----------------------------+--------+-------------------+
|Features                    |Close   |Prediction         |
+----------------------------+--------+-------------------+
|[0.081771,0.082292,0.06875] |0.071354|0.33054568516178534|
|[0.08125,0.081771,0.076563] |0.076563|0.33054568516178534|
|[0.076563,0.076563,0.073958]|0.073958|0.33054568516178534|
|[0.077083,0.077083,0.073958]|0.075   |0.33054568516178534|
|[0.07526,0.076302,0.073958] |0.075521|0.33054568516178534|
|[0.076042,0.076042,0.07526] |0.075521|0.33054568516178534|
|[0.075521,0.079948,0.073958]|0.077083|0.33054568516178534|
|[0.077083,0.077083,0.075521]|0.075781|0.33054568516178534|
|[0.079948,0.095833,0.079688]|0.095573|0.33054568516178534|
|[0.108333,0.109375,0.103125]|0.106771|0.33054568516178534|
|[0.115104,0.115625,0.110938]|0.113021|0.33054568516178534|
|[0.104688,0.110417,0.102604]|0.108333|0.33054568516178534|
|[0.109375,0.109375,0.103646]|0.108333|0.33054568516178534|
|[0.105208,0.113021,0.105208]|0.110938|0

In [30]:
# Gradient-boosted trees quality on the test set, using the MSE and R^2
# evaluators created in an earlier cell.
mse, r2 = evaluator.evaluate(pred3), evaluator_r2.evaluate(pred3)
print('MSE: ', mse, '\n' + 'r2: ', r2, '\n')

MSE:  8.724402756191395 
r2:  0.9964785352970791 



Linear Regression model: actual vs. predicted plot

In [31]:
# Bring the linear-regression test predictions to pandas for plotting.
# The original referenced `pred`, which is never defined in this notebook
# (NameError on a fresh kernel); the linear model's test predictions live in
# `pred_test1`. Rows are ordered by Date so the plot reads chronologically.
pandas_df = pred_test1.orderBy("Date").select("Close", "Prediction").toPandas()

In [32]:
import matplotlib.pyplot as plt

# Overlay the actual closing price and the model's prediction, both plotted
# against the row position in pandas_df.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(pandas_df['Close'], label='Actual')
ax.plot(pandas_df['Prediction'], label='Prediction')
ax.legend()
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Persist the fitted linear-regression model under the path "Model",
# overwriting anything already saved there.
lr_model.write().overwrite().save("Model")

In [None]:
from pyspark.ml.regression import LinearRegressionModel
# Reload the model saved under "Model" and score the test set with it to
# check that the saved copy works. The closing quote of the path was missing
# in the original, which made the whole cell a SyntaxError.
lrModel=LinearRegressionModel.load("Model")
final_output=lrModel.transform(test_data)
final_output.show()

+----------+--------------------+--------+-------------------+
|      Date|            Features|   Close|         prediction|
+----------+--------------------+--------+-------------------+
|1997-05-30|[0.075,0.075521,0...|   0.075|0.08273744019385013|
|1997-06-11|[0.079688,0.08020...|0.077083|0.08579686771765538|
|1997-06-19|[0.075521,0.07656...|0.075521|0.08410304526787288|
|1997-06-27|[0.075781,0.07578...|0.074479|0.08247027247162975|
|1997-06-30|[0.075521,0.07994...|0.077083|0.08613837908866005|
|1997-07-11|[0.126042,0.12604...|0.114583|0.12193546758405585|
|1997-07-17|[0.117188,0.11718...|0.110938|0.11796261138537718|
|1997-07-31|[0.121875,0.12187...|0.119792|0.12589821621306624|
|1997-08-01|[0.117188,0.12083...|0.120833|0.12468656683256488|
|1997-08-04|[0.11875,0.120573...|0.115625|0.12552757484301436|
|1997-08-14|[0.108854,0.1125,...|0.107813|0.11778104406904903|
|1997-08-18|[0.102604,0.10260...|0.102083| 0.1074506646666642|
|1997-08-22|[0.105208,0.10625...| 0.10625|0.11377470285