In [1]:
import pandas as pd
import numpy as np
import json
import os
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import pyarrow.parquet as pa
import tensorflow as tf
import pyspark as ps
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import arrays_zip, explode, col

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
os.environ["PYARROW_IGNORE_TIMEZONE"] = "1"

In [4]:
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [5]:
spark

In [6]:
df = spark.read.parquet('train.parquet')

In [7]:
df.show()

+-----+--------------------+--------------------+-----+
|   id|               dates|              values|label|
+-----+--------------------+--------------------+-----+
|19114|[2016-01-01, 2016...|[-1.86, 0.79, 1.4...|  0.0|
|22769|[2016-05-01, 2016...|[-1.04, -3.48, 0....|  1.0|
|76935|[2017-03-01, 2017...|[0.28, 0.63, 0.06...|  0.0|
|66297|[2016-01-01, 2016...|[-0.33, 0.58, 1.1...|  0.0|
| 2191|[2016-01-01, 2016...|[1.31, 0.5, -0.54...|  0.0|
|59504|[2016-03-01, 2016...|[0.08, 0.88, 1.46...|  0.0|
|49554|[2016-04-01, 2016...|[1.05, -0.28, 1.0...|  0.0|
|58344|[2016-12-01, 2017...|[-0.36, -0.45, -0...|  0.0|
|87449|[2016-01-01, 2016...|[2.92, 4.11, 3.39...|  1.0|
|43415|[2016-02-01, 2016...|[-0.86, -0.05, -0...|  1.0|
|76676|[2016-01-01, 2016...|[2.24, 0.46, 1.41...|  0.0|
| 4321|[2016-01-01, 2016...|[1.1, 1.73, 2.04,...|  0.0|
|17921|[2016-01-01, 2016...|[1.06, 1.04, 0.62...|  0.0|
|60176|[2016-01-01, 2016...|[1.41, -0.62, -1....|  0.0|
|61864|[2016-01-01, 2016...|[0.24, 0.37, 0.97...

In [8]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[id: bigint, dates: array<date>, values: array<double>, label: double]>

In [9]:
df.pandas_api().isna().mean()

id        0.0
dates     0.0
values    0.0
label     0.0
dtype: float64

In [10]:
df_zip = df.withColumn("zip_date_val", arrays_zip("dates", "values"))

In [11]:
df_exp = df_zip.withColumn("explod_date_val", explode("zip_date_val"))

In [25]:
df_new = df_exp.select(
    col("id"),
    col("explod_date_val.dates").alias("date"),
    col("explod_date_val.values").alias("value"),
    col("label")
)

In [26]:
df.printSchema

<bound method DataFrame.printSchema of DataFrame[id: bigint, dates: array<date>, values: array<double>, label: double]>

In [27]:
df_new = df_new.toPandas()

In [28]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5155412 entries, 0 to 5155411
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   date    object 
 2   value   float64
 3   label   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 157.3+ MB


In [29]:
df_new.isna().sum()

id          0
date        0
value    4582
label       0
dtype: int64

In [30]:
df_new = df_new.dropna()

In [31]:
df_new.isna().sum()

id       0
date     0
value    0
label    0
dtype: int64

In [32]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5150830 entries, 0 to 5155411
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   date    object 
 2   value   float64
 3   label   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 196.5+ MB


In [33]:
df_new.isna().sum()

id       0
date     0
value    0
label    0
dtype: int64

In [34]:
df_new['date'] = df_new['date'].values.astype("float64")
df_new['value'] = df_new['value'].values.astype("float64")

TypeError: float() argument must be a string or a real number, not 'datetime.date'

In [35]:
12


12