In [3]:
import psycopg2

from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, LongType, TimestampType, ShortType, DateType
from pyspark.sql.functions import isnan, when, count, col

In [1]:
import pandas as pd

In [8]:
# Load only the first 10 rows of the processed BTC price file as a small sample frame.
df_test = pd.read_csv(
    "data/processed_btc_prices_ohlcvvw_2014-12-01_to_2018-11-11.csv",
    nrows=10,
)

In [11]:
df_test.columns

Index(['Unnamed: 0', 'date', 'Open', 'High', 'Low', 'Close', 'Volume_(BTC)',
       'Volume_(Currency)', 'Weighted_Price'],
      dtype='object')

In [10]:
# Persist the 10-row sample as a test fixture.
# index=False keeps the pandas RangeIndex out of the file: the sample already
# carries an 'Unnamed: 0' column from the source CSV, and writing the index
# again would add a second unnamed column that the source file does not have.
df_test.to_csv("test_data/test_btc_price.csv", index=False)

In [3]:
# Print the version of the Python interpreter this kernel is running on.
from platform import python_version

print(python_version())

3.6.10


In [2]:
def initialize_spark():
    """Build (or fetch the already-existing) local SparkSession.

    The session is configured with the ``local[*]`` master and the
    application name ``"Simple etl job"``. A confirmation line is printed
    before the session is handed back.

    Returns:
        The SparkSession produced by ``getOrCreate()``.
    """
    builder = SparkSession.builder.master("local[*]").appName("Simple etl job")
    session = builder.getOrCreate()

    print("Spark Initialized", "\n")

    return session

spark = initialize_spark()

Spark Initialized 



In [6]:
def load_df_without_schema(spark, path="data/processed_btc_prices_ohlcvvw_2014-12-01_to_2018-11-11.csv"):
    """Read a CSV file with a header row through Spark, without passing a schema.

    Args:
        spark: Object exposing a Spark-style ``read`` entry point (normally a
            SparkSession).
        path: Location of the CSV file. Defaults to the processed BTC price
            file used throughout this notebook, so existing calls that pass
            only ``spark`` behave exactly as before.

    Returns:
        Whatever the reader's ``load`` call returns (a Spark DataFrame when a
        real SparkSession is passed). No schema or ``inferSchema`` option is
        set here, so column types are left to the CSV reader's defaults.
    """
    # The first line of the file is treated as column names, not data.
    reader = spark.read.format("csv").option("header", "true")
    return reader.load(path)

df = load_df_without_schema(spark)


In [9]:
# Per-column count of NaN entries, displayed as a one-row table.
# NOTE(review): isnan() matches NaN only; null/empty cells are presumably not
# counted by this expression — confirm before reading the zeros as "no missing data".
df.select([count(when(isnan(c), c)).alias(c) for c in df.columns]).show()

+---+----+----+----+---+-----+------------+-----------------+--------------+
|_c0|date|Open|High|Low|Close|Volume_(BTC)|Volume_(Currency)|Weighted_Price|
+---+----+----+----+---+-----+------------+-----------------+--------------+
|  0|   0|   0|   0|  0|    0|           0|                0|             0|
+---+----+----+----+---+-----+------------+-----------------+--------------+



In [11]:
# Column names, in order, that the header check compares the frame's schema against.
# Kept as a list because it is compared for equality with ``df.schema.names``.
CORRECT_HEADERS = [
    "date",
    "Open", "High", "Low", "Close",
    "Volume_(BTC)", "Volume_(Currency)",
    "Weighted_Price",
]


def test_column_headers():
    """Assert that the notebook-level ``df`` has exactly the expected headers.

    Compares ``df.schema.names`` with ``CORRECT_HEADERS`` (same names, same
    order) and fails with "Column headers error" otherwise.

    NOTE(review): the frame displayed earlier in this notebook shows a leading
    ``_c0`` column, which is not in ``CORRECT_HEADERS`` — confirm which frame
    this check is meant to run against.
    """
    actual_headers = df.schema.names
    assert actual_headers == CORRECT_HEADERS, "Column headers error"

In [8]:
def count_missing(df, sort=True):
    """Count missing (null or NaN) values in each column of a Spark DataFrame.

    The per-column counts are computed by Spark in a single aggregate and the
    resulting one-row table is collected into pandas, so the length check,
    ``rename`` and ``sort_values`` below operate on a pandas frame (a Spark
    DataFrame supports none of them).

    Args:
        df: Spark DataFrame to inspect.
        sort: When True (default), return the counts transposed — one row per
            source column, a single ``count`` column — sorted in descending
            order. When False, return the raw one-row frame of counts.

    Returns:
        A pandas DataFrame with the missing-value counts, or None (after
        printing "No missing values") when no column has any missing value
        or the frame has no columns.
    """
    # isnan() is applied only where it is meaningful: float/double can hold
    # NaN, and string columns (what a schema-less CSV load yields) are accepted
    # via implicit cast. Other types, e.g. date/timestamp, are null-checked only.
    nan_capable_types = ("float", "double", "string")

    missing_exprs = []
    for name, dtype in df.dtypes:
        is_missing = col(name).isNull()
        if dtype in nan_capable_types:
            is_missing = is_missing | isnan(col(name))
        # Count a literal 1 per matching row: counting the column value itself
        # would skip exactly the null rows we are trying to count.
        missing_exprs.append(count(when(is_missing, 1)).alias(name))

    if not missing_exprs:
        print("No missing values")
        return None

    # The aggregate has a single row, so collecting it to the driver is cheap.
    counts = df.select(missing_exprs).toPandas()

    if not (counts.iloc[0] > 0).any():
        print("No missing values")
        return None
    if sort:
        return counts.rename(index={0: 'count'}).T.sort_values("count", ascending=False)

    return counts
    