## Import required packages

In [0]:
# Download required packages
!pip -q install gdown missingno torch

%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark import SparkContext, SparkConf
from pyspark.sql.window import Window


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import missingno as msno
import torch
import torch.nn as nn
from typing import *
import datetime
import gdown

import tqdm as tq
def tqdm(*args, **kwargs):
  ''' Small trick to prevent tqdm printing newlines at each step.

  Thin wrapper that forwards every argument to `tq.tqdm`, using `leave=True` and
  `position=0` unless the caller supplies its own value for either of them
  (passing them explicitly used to raise a duplicate-keyword TypeError).
  '''
  # setdefault keeps the caller's value when one is given, instead of clashing with it
  kwargs.setdefault('leave', True)
  kwargs.setdefault('position', 0)
  return tq.tqdm(*args, **kwargs)

## Data acquisition
We retrieve our datasets and download them to a temporary directory in the driver node.

In [0]:
# Start from a clean slate so that re-running the cell does not trip over a previous extraction
!rm -rf /tmp/data /tmp/__MACOSX
# Fetch the zipped datasets from Google Drive into the local /tmp directory
gdown.download('https://drive.google.com/uc?id=1ggmDp-AWFzbQReLG0pLpQE_3fO0C0RnM', '/tmp/data.zip', quiet=False)
# Extract next to the archive, then delete the archive itself
!unzip -q /tmp/data.zip -d /tmp/
!rm /tmp/data.zip

Then we load the datasets to the DBFS.

In [0]:
# Move the extracted local directory (file:/tmp/data) to DBFS, recursing into sub-directories.
# NOTE(review): the destination dbfs:/data is not cleared first — confirm this cell can be re-run safely.
dbutils.fs.mv("file:/tmp/data", "dbfs:/data", recurse=True)

In [0]:
%fs ls /data/

path,name,size
dbfs:/data/.DS_Store,.DS_Store,6148
dbfs:/data/key_stats_yahoo.csv,key_stats_yahoo.csv,2047081
dbfs:/data/prices/,prices/,0


In [0]:
%fs ls /data/

path,name,size
dbfs:/data/.DS_Store,.DS_Store,6148
dbfs:/data/key_stats_yahoo.csv,key_stats_yahoo.csv,2047081
dbfs:/data/prices/,prices/,0


## Dataset loading

In [0]:
# Read the key statistics CSV (header row, comma separated), letting Spark infer the column types
key_stats_df = spark.read.load("dbfs:/data/key_stats_yahoo.csv", 
                           format="csv",
                           sep=",",
                           inferSchema="true",
                           header="true"
                          )

# Drop the first ID column
# The first 1005 rows are collected and turned back into a DataFrame: a development-only limit (see TODO)
key_stats_df = sc.parallelize(key_stats_df.drop(key_stats_df.columns[0]).head(1005)).toDF()#TODO: remove head(n) (only meant for development)
# NOTE(review): this mutates the StructField object returned by `.schema`; confirm it really changes
# the nullability of the DataFrame (which is re-created by the `withColumn` below anyway)
key_stats_df.schema['Date'].nullable = False

# Use legacy format to parse dates
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
key_stats_df = key_stats_df.withColumn("Date", F.to_date(key_stats_df["Date"], 'MM/dd/yyyy HH:mm'))

# Cast numerical columns to double
# (every column from the third one onwards is cast — assumes the first two are non-numeric keys; TODO confirm)
for column in key_stats_df.columns[2:]:
  key_stats_df = key_stats_df.withColumn(column, key_stats_df[column].cast("double"))

# Prices dataframes for every stock #TODO: remove :N (only meant for development)
# Only the first 10 directory entries are considered, and among them only the .csv files
prices_files = [f.path for f in dbutils.fs.ls('/data/prices/')[:10] if f.path.endswith('.csv')]
# Name of each DataFrame = file name without its directory and without the '.csv' suffix
dfs_names = [f.rsplit('/', 1)[1][:-len('.csv')] for f in prices_files]
prices_dfs = []
for f in tqdm(prices_files, desc='Reading stock price data', total=len(prices_files)):
  df = spark.read.load(f,
                       format="csv",
                       sep=",",
                       inferSchema="true",
                       header="true"
                      )
  # Dates in the prices files are parsed with the day-first 'dd-MM-yyyy' pattern
  df = df.withColumn("Date", F.to_date(df["Date"], 'dd-MM-yyyy'))
  # NOTE(review): same schema-object mutation as above — confirm it has any effect on the DataFrame
  df.schema['Date'].nullable = False
  prices_dfs.append(df)

## Dataset analysis

In [0]:
# Show the schema of the first prices DataFrame as a sample of the per-stock format
print("Prices dataframe format:")
prices_dfs[0].printSchema()

In [0]:
# Show the schema of the key statistics DataFrame
print("Key stats dataframe format:")
key_stats_df.printSchema()

### Utility functions

In [0]:
# TODO: add remaining utility functions

def prices_df_nan_summary(prices_dfs: List[pyspark.sql.DataFrame], names: List[str]) -> pd.DataFrame:
  '''
    Build a report of the stocks whose prices DataFrame contains null values.

    Args:
      prices_dfs: one prices DataFrame per stock.
      names: the stock names, in the same order as `prices_dfs`.

    Returns:
      A pandas DataFrame with one row per stock that has at least one null value and the
      columns 'Stock name', 'Missing data (%)' (rounded to 3 decimals) and 'Count'.
      Stocks without null values are left out.

    Raises:
      AssertionError: if the columns of a DataFrame do not all share the same non-zero null count.
  '''
  summary_rows = []
  progress = tqdm(zip(prices_dfs, names), total=len(prices_dfs), desc='Generating prices summary ...')
  for prices_df, name in progress:
    # One aggregate per column: the number of rows where that column is null
    null_counters = [F.count(F.when(F.isnull(c), c)).alias(c) for c in prices_df.columns]
    nan_absolute = prices_df.agg(*null_counters).first()

    # Nothing is missing for this stock: skip it
    if not any(nan_absolute):
      continue

    # Distinct per-column null counts, ignoring the columns that have no null at all.
    # A row is expected to be either entirely present or entirely missing (apart from the
    # columns that are never null), hence exactly one distinct count must remain.
    distinct_counts = set(nan_absolute.asDict().values()) - {0}
    assert len(distinct_counts) == 1
    nan_count = distinct_counts.pop()

    nan_percentage = round(100*nan_count/prices_df.count(), 3)
    summary_rows.append((name, nan_percentage, nan_count))

  return pd.DataFrame(summary_rows, columns=['Stock name', 'Missing data (%)', 'Count'])

def remove_trailing_nan(df: pyspark.sql.DataFrame, ticker: str, col: str = 'Low') -> pyspark.sql.DataFrame:
  '''
    Drop the contiguous run of rows at the end of the DataFrame (in 'Date' order) whose `col` value is null.

    Args:
      df: DataFrame with a 'Date' column and a `col` column.
      ticker: name of the stock; not used by the computation, kept for call-site compatibility.
      col: name of the column inspected to decide whether a row is missing (default 'Low').

    Returns:
      `df` sorted by 'Date' with an extra 1-based 'id' column holding the position of each row,
      cut right after the last row whose `col` is not null. Null rows that are followed by a
      valid row are kept. If `col` is null on every row (or `df` is empty) the result has no rows.
  '''
  # Sort the input dataframe and add a new column to keep track of the relative position of each row
  df_sorted_id = df.sort('Date').withColumn('id', F.row_number().over(Window.orderBy('Date')))

  # Position of the last row that carries a value: every row after it belongs to the trailing
  # run of nulls. Looking for the last valid row (rather than counting null rows) keeps the
  # result correct when the data also contains nulls in the middle.
  last_valid_id = df_sorted_id.where(df_sorted_id[col].isNotNull()).agg(F.max('id')).first()[0]

  # No valid row at all: the whole DataFrame is one trailing run of nulls
  if last_valid_id is None:
    return df_sorted_id.limit(0)

  # Retain the rows up to (and including) the last valid one
  return df_sorted_id.where(df_sorted_id['id'] <= last_valid_id)


def merge_prices_fundamentals(
    prices_dfs: List[pyspark.sql.DataFrame],
    key_stats_df: pyspark.sql.DataFrame,
    dfs_names: List[str],
    drop_cols: List[str] = ['Date', 'Ticker', 'Price']
    ) -> List[pyspark.sql.DataFrame]:
  '''
    Work in progress: meant to merge each prices DataFrame with the fundamentals of the same ticker.

    As written, the body only prepares the per-ticker inputs: nothing is appended to the result
    list and there is no return statement, so the function returns None.

    Args:
      prices_dfs: one prices DataFrame per stock.
      key_stats_df: fundamentals DataFrame; must have a 'Ticker' column.
      dfs_names: names of the stocks, in the same order as `prices_dfs` (compared with the upper-cased ticker).
      drop_cols: fundamentals columns removed from each per-ticker DataFrame (the default list is never mutated here).
  '''
  # Define the target list of dataframes
  prices_dfs_new = []
  for ticker in tqdm(key_stats_df.select('Ticker').distinct().collect(), desc='Merging the datasets ...'):
    # Each collected element is a Row with a single field: unwrap the ticker value
    ticker = ticker[0]
    # Rows of this ticker are collected and turned back into a DataFrame
    # NOTE(review): this work is done before the membership check below, also for tickers that are skipped
    ticker_df = key_stats_df.filter(F.col('Ticker') == ticker).collect()
    ticker_df = sc.parallelize(ticker_df).toDF()


    # Consider only stocks for which we have fundamental data
    if ticker.upper() not in dfs_names: continue

    # Dropping fundamentals useless columns
    for col in drop_cols:
      ticker_df = ticker_df.drop(col)


    # Consider the relative stock ticker dataframe
    prices_df_idx = dfs_names.index(ticker.upper())
    prices_df = prices_dfs[prices_df_idx]
    # Dropping unused price value columns (we are using adjusted close)
    for col in ['High', 'Low', 'Open', 'Close']:
      prices_df = prices_df.drop(col)
    # Not used yet by the code below
    fundamental_rows = []


    # The current financial report
    # NOTE(review): `iterrows` is a pandas API while `ticker_df` comes from `.toDF()` — confirm this call works
    ticker_iterator = list(ticker_df.iterrows())
    ticker_iter_idx = 0
    
                
def fill_missing_days(aggregate_dfs: List[pyspark.sql.DataFrame], remove_weekends: bool = True, end_year: int = 2013) -> List[pyspark.sql.DataFrame]:
  ''' Placeholder: not implemented yet, the body does nothing and the function returns None. '''
  # NOTE(review): the meaning of `remove_weekends` and `end_year` cannot be told from this stub — document them once implemented
  pass

def missing_values_summary(df):
  '''
    Compute, for every column of `df`, the share of null entries as a percentage of the row count.

    Returns the first (and only) row of the aggregated DataFrame: one field per column of `df`,
    holding the percentage rounded to 3 decimals.
  '''
  total_rows = df.count()

  # One aggregate expression per column: count the nulls, then scale to a percentage of all rows
  percentages = []
  for column_name in df.columns:
    null_count = F.count(F.when(F.isnull(column_name), column_name))
    percentages.append(F.round(100 * null_count / total_rows, 3).alias(column_name))

  # A global aggregation yields a single-row DataFrame
  return df.agg(*percentages).first()

In [0]:
# Per-column percentage of null values in key_stats_df; the resulting Row is shown as the cell output
print("Overview of the missing values in the key_stats dataframe\n")
key_stats_summary = missing_values_summary(key_stats_df)
key_stats_summary

### Missing values imputation

In [0]:
# Bar chart of the missing-data percentage per stock, computed on the raw prices DataFrames
summary = prices_df_nan_summary(prices_dfs, dfs_names)
px.bar(summary, x='Stock name', y='Missing data (%)', hover_data=['Count'], title="Stock price dataset before preprocessing (only columns with missing values are displayed)")

For most of the above stocks with missing values, we noticed that data exists only up to a given date, after which no more data is available. This may be due to a business failure, after which the stock is no longer traded.

In [0]:
# Clear our input data from trailing NaN values
prices_dfs_new = [remove_trailing_nan(df,name) for df,name in tqdm(zip(prices_dfs, dfs_names), total=len(prices_dfs), desc='Removing trailing NaN values ...')]

# Remove INTH stock from our dataset since it contains many inactivity periods
# (currently disabled, see the TODO markers)
#inth_idx = dfs_names.index('INTH') #TODO: uncomment
#del dfs_names[inth_idx] #TODO: uncomment
#del prices_dfs_new[inth_idx] #TODO: uncomment

# Same summary as before, this time on the DataFrames without their trailing NaN rows
summary = prices_df_nan_summary(prices_dfs_new, dfs_names)
px.bar(summary, x='Stock name', y='Missing data (%)', hover_data=['Count'], title="Stock price dataset after preprocessing (only columns with missing values are displayed)")

At this point we use the forward-fill imputation technique to fill in the remaining missing values (each missing value is replaced by the last valid one). Please note that in this case missing values are mostly due to holidays or periods when stocks are not exchanged.

### Building our new dataset

In [0]:
# Impute missing values in the prices dataset (i.e. forward-fill the last valid values)

# define the window: every row from the first one up to the current one, in date order
window = Window.orderBy('Date')\
               .rowsBetween(Window.unboundedPreceding, 0)

# Forward filling values 
# (ref. https://stackoverflow.com/questions/38131982/forward-fill-missing-values-in-spark-python/50422240#50422240)
for i in range(len(prices_dfs_new)):
  for col_name in prices_dfs_new[i].schema.names:
    # Last non-null value seen so far (the current row included)
    col = F.last(prices_dfs_new[i][col_name], ignorenulls=True).over(window)
    prices_dfs_new[i] = prices_dfs_new[i].withColumn(col_name, col)

# In this case this dataframe contains financial reports that may contain NaN values either because that
# metric was not available at that time OR because it was monitoring an initial stage of a company growth.
# What we do is to apply the classic forward-fill, and fill initial missing values with zeroes.
# Please note: we also discard the 'Forward P/E' column since the imputation here would introduce too much noise.
key_stats_df_new = key_stats_df.drop('Forward P/E')

# This dataframe holds the reports of many companies: the window is partitioned by 'Ticker' so that
# a missing value is only ever filled with an earlier report of the SAME company. With a window
# ordered by date alone, values would leak from one company to another.
ticker_window = Window.partitionBy('Ticker')\
                      .orderBy('Date')\
                      .rowsBetween(Window.unboundedPreceding, 0)
for col_name in key_stats_df_new.schema.names:
    # The partitioning and ordering keys themselves are left untouched
    if col_name in ('Date', 'Ticker'):
        continue
    col = F.last(key_stats_df_new[col_name], ignorenulls=True).over(ticker_window)
    key_stats_df_new = key_stats_df_new.withColumn(col_name, col)
# Whatever is still null has no earlier report to copy from: use zero
key_stats_df_new = key_stats_df_new.fillna(0.)


# Before merging the two datasets we need to sort them
#prices_dfs_new = [df.orderBy('Date').reset_index(drop=True) for df in prices_dfs_new]
#key_stats_df_new = key_stats_df_new.sort_values(by=['Date']).reset_index(drop=True)


# Merge the stock price dataset with fundamental data of the relative company
#aggregate_dfs = merge_prices_fundamentals(prices_dfs_new, key_stats_df_new, dfs_names)
                  
# TODO: add more functions

In [0]:
# Show the imputed key statistics DataFrame as the cell output
key_stats_df_new

### Technical indicators

In [0]:
def add_col_sma(pair):
    '''
    Merge a (row, sma_value) pair into a single plain dict.

    Args:
      pair: a 2-item sequence; the first item must expose `asDict()`, the second is the
            value stored under the "SMA" key.

    Returns:
      A new dict with every field of the row plus the "SMA" entry (an existing "SMA"
      field of the row is overwritten).
    '''
    # Dict unpacking instead of `items() + [...]`: on Python 3 a dict view cannot be added to a list
    return {**pair[0].asDict(), "SMA": pair[1]}

def add_sma(dfs: List[pyspark.sql.DataFrame], period: int = 10) -> None:
    '''
    Adds an "SMA" (Simple Moving Average) column to every dataframe of the list, in place.

    The average of "Adjusted Close" is taken over `period` consecutive rows (in 'Date' order)
    and is centred on the current row: (period - 1) // 2 rows before it and the remaining rows
    after it. Rows too close to either end of the dataframe to have a full window keep their
    own "Adjusted Close" value, so the new column has no gaps.

    The whole computation is expressed as a Spark window, so nothing is collected on the driver
    and the result does not depend on how the rows are partitioned.

    Args:
      dfs: dataframes with a 'Date' and an "Adjusted Close" column; each element is replaced
           by the same dataframe with the extra "SMA" column.
      period: number of rows averaged, at least 1.

    Raises:
      ValueError: if `period` is lower than 1.
    '''
    if period < 1:
        raise ValueError('period must be a positive integer, got {}'.format(period))

    # Split the period - 1 neighbouring rows between the two sides of the current row
    rows_before = (period - 1) // 2
    rows_after = period - 1 - rows_before
    sma_window = Window.orderBy('Date').rowsBetween(-rows_before, rows_after)

    for i in tqdm(range(len(dfs)), desc='Adding SMA ...'):
        price = dfs[i]["Adjusted Close"]

        # Near the edges the frame is truncated: fewer than `period` rows fall inside it
        rows_in_frame = F.count(F.lit(1)).over(sma_window)

        sma = F.when(rows_in_frame == period, F.avg(price).over(sma_window))\
               .otherwise(price.cast('double'))

        dfs[i] = dfs[i].withColumn('SMA', sma)
        
        

In [0]:
# Small synthetic example to try the SMA indicator: 18 rows, integer 'Date' values 1..18
# NOTE(review): this first assignment is overwritten by `dfs = [df1]` below
dfs = []

df1 = spark.createDataFrame(
    [
        (1, 0.5), 
        (2, 0.6),
        (3, 0.7),
        (4, 0.9),
        (5, 1.2),
        (6, 0.8),
        (7, 0.4),
        (8, 1.6),
        (9, 1.7),
        (10, 0.5), 
        (11, 0.6),
        (12, 0.7),
        (13, 0.9),
        (14, 1.2),
        (15, 0.8),
        (16, 0.4),
        (17, 1.6),
        (18, 1.7),
    ],
    ["Date", "Adjusted Close"]  # column names, in the order of the tuple fields
)

dfs = [df1]

# Add SMA indicator to each dataframe
# (the function returns None: it is expected to replace the elements of `dfs` in place)
add_sma(dfs)