In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder
import pyspark.sql.functions as F
from wrapper import RegressorWrapper, LSTMWrapper
import warnings
warnings.filterwarnings('ignore')

# Chapter 1: Exploratory Data Analysis (EDA)

## 1.1. Load data into Spark

In [2]:
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Netflix Stock Prediction")
    .getOrCreate()
)
# Bare expression so the notebook displays the session object.
spark

25/01/12 21:35:46 WARN Utils: Your hostname, WindowEnv resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/01/12 21:35:46 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/12 21:35:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read the NFLX price history; the first row is the header and column
# types are inferred from the data.
df = spark.read.csv('./data/NFLX.csv', header=True, inferSchema=True)

# Sanity-check the load: row/column counts, inferred schema, first ten rows.
# (The "n_cols" label previously lacked its ": " separator and printed "n_cols7".)
print(f'n_rows: {df.count()}; n_cols: {len(df.columns)}\n')
df.printSchema()
df.show(10)

n_rows: 1009; n_cols7

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Adj Close: double (nullable = true)
 |-- Volume: integer (nullable = true)

+----------+----------+----------+----------+----------+----------+--------+
|      Date|      Open|      High|       Low|     Close| Adj Close|  Volume|
+----------+----------+----------+----------+----------+----------+--------+
|2018-02-05|     262.0|267.899994|250.029999|254.259995|254.259995|11896100|
|2018-02-06|247.699997|266.700012|     245.0|265.720001|265.720001|12595800|
|2018-02-07|266.579987|272.450012|264.329987|264.559998|264.559998| 8981500|
|2018-02-08|267.079987|267.619995|     250.0|250.100006|250.100006| 9306700|
|2018-02-09|253.850006|255.800003|236.110001|249.470001|249.470001|16906900|
|2018-02-12|252.139999|259.149994|     249.0|257.950012|257.950012| 8534900|
|2018-02-13|257.2900

## 1.2. Data preprocessing

In [4]:
# Null check: count missing values per column (the recorded run shows none).
null_counts = [
    F.sum(F.col(name).isNull().cast('int')).alias(name)
    for name in df.columns
]
df.select(null_counts).show()

# Duplicate check: row count after de-duplication (the recorded run matches
# the raw row count, i.e. no duplicates).
n_distinct_rows = df.drop_duplicates().count()
print(n_distinct_rows, "\n")

# Normalise column names: lower-case everything, then give 'adj close'
# an underscore so it is a plain identifier.
lowered_names = [name.lower() for name in df.columns]
df = df.toDF(*lowered_names).withColumnRenamed('adj close', 'adj_close')
print(df.columns)

# Sort chronologically, oldest first.
df = df.orderBy('date')

+----+----+----+---+-----+---------+------+
|Date|Open|High|Low|Close|Adj Close|Volume|
+----+----+----+---+-----+---------+------+
|   0|   0|   0|  0|    0|        0|     0|
+----+----+----+---+-----+---------+------+

1009 

['date', 'open', 'high', 'low', 'close', 'adj_close', 'volume']


## 1.3. Feature importance (correlation with the target)

In [5]:
# Target column to predict, and the candidate input features.
# 'adj_close' and 'date' are excluded alongside the target itself.
target = 'close'
features = [c for c in df.columns if c not in [target, 'adj_close', 'date']]

# Pearson correlation of every feature with the target, computed in a
# single aggregation instead of one Spark job per feature.
corr_row = df.select([F.corr(target, c).alias(c) for c in features]).first()

for c in features:
    corr = corr_row[c]
    print(f"{c}: {corr}")

open: 0.9968121105128477
high: 0.9985508166555668
low: 0.9985438753668097
volume: -0.41336187467202007


25/01/12 21:35:58 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


# Chapter 2: Machine learning models

## 2.1. Linear regression

In [9]:
# Linear regression baseline, tuned through the project's RegressorWrapper.
lr = LinearRegression()
ev = RegressionEvaluator()
# NOTE(review): the meaning of the trailing None argument is defined in
# wrapper.py and is not visible here -- confirm.
lr_wrapper = RegressorWrapper(lr, ev, target, features, None)

# 3 x 3 grid over regularisation strength and elastic-net mixing.
paramGrid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1, 1.0])
    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
    .build()
)

# Fit/evaluate via the wrapper, requesting the three listed metrics.
result = lr_wrapper.create(df, paramGrid, ['rmse', 'mse', 'r2'])

# The recorded run prints 'train', 'test' and 'model' entries.
for key, val in result.items():
    print(f'{key}: {val}')

train: {'rmse': 3.757339216949421, 'mse': 14.11759799122609, 'r2': 0.99881493996627}
test: {'rmse': 4.493396485242403, 'mse': 20.190611973588783, 'r2': 0.99811429537498}
model: PipelineModel_2b51f526aeba


## 2.2. Decision tree

In [8]:
# Decision-tree regressor, tuned through the same RegressorWrapper.
# Reuses `ev`, `target` and `features` defined in earlier cells.
dt = DecisionTreeRegressor()
# NOTE(review): the meaning of the trailing None argument is defined in
# wrapper.py and is not visible here -- confirm.
dt_wrapper = RegressorWrapper(dt, ev, target, features, None)

# 3 x 3 x 3 x 3 = 81 candidate settings.
paramGrid = (
    ParamGridBuilder()
    .addGrid(dt.maxDepth, [5, 10, 15])
    .addGrid(dt.maxBins, [32, 64, 128])
    .addGrid(dt.minInstancesPerNode, [1, 2, 3])
    .addGrid(dt.minInfoGain, [0.0, 0.01, 0.02])
    .build()
)

# Fit/evaluate via the wrapper, requesting the three listed metrics.
result = dt_wrapper.create(df, paramGrid, ['rmse', 'mse', 'r2'])

# The recorded run prints 'train', 'test' and 'model' entries.
for key, val in result.items():
    print(f'{key}: {val}')

                                                                                

train: {'rmse': 3.0031784769986167, 'mse': 9.019080964707731, 'r2': 0.9992429199075584}
test: {'rmse': 6.092744705207209, 'mse': 37.12153804283048, 'r2': 0.9965330295056539}
model: PipelineModel_afbbc7e1e52d


## 2.3. LSTM

In [6]:
# Pass every column except the date to the LSTM wrapper.
lstm_df = df.drop('date')

lstm = LSTMWrapper()
# NOTE(review): the two positional 1s are interpreted by LSTMWrapper.create
# in wrapper.py; their meaning is not visible here -- confirm, and consider
# passing them by keyword.
lstm.create(lstm_df, 1, 1)

                                                                                

Epoch 10/200, Train Loss: 0.0004, Val Loss: 0.0004
Epoch 20/200, Train Loss: 0.0001, Val Loss: 0.0002
Epoch 30/200, Train Loss: 0.0001, Val Loss: 0.0002
Epoch 40/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 50/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 60/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 70/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 80/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 90/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 100/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 110/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 120/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 130/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 140/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 150/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 160/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 170/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 180/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 190/200, Train Loss: 0.0000, Val Loss: 0.0001
Epoch 200/200, Train 