In [1]:
from pyspark.sql import SparkSession

import numpy as np
import pandas as pd


import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.max_columns = 50

import warnings
warnings.filterwarnings("ignore")

In [2]:
spk_sess = SparkSession \
    .builder \
    .appName("_Project_Spark_App") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

df = spk_sess.read.csv("./solar_generation_by_station.csv", header=True, sep=",");

df.select('time_step', 'AT11', 'AT12').show(10)

+---------+-------+-------------------+
|time_step|   AT11|               AT12|
+---------+-------+-------------------+
|        1|    0.0|                0.0|
|        2|    0.0|                0.0|
|        3|    0.0|                0.0|
|        4|    0.0|                0.0|
|        5|    0.0|                0.0|
|        6|    0.0|                0.0|
|        7|    0.0|                0.0|
|        8|    0.0|                0.0|
|        9|0.13127|0.08148999999999999|
|       10| 0.1259|             0.1032|
+---------+-------+-------------------+
only showing top 10 rows



In [3]:
# keep only columns relatives to france
col_fr = [c for c in df.columns if 'FR' in c]
col_fr.append('time_step')
df = df.select(col_fr)

# only keep 8 cols
df = df.select(df.columns[-8:])
df.show(10)

+-------+------+-------+-------+-------+-------------------+-------+---------+
|   FR62|  FR30|   FR51|   FR22|   FR53|               FR82|   FR71|time_step|
+-------+------+-------+-------+-------+-------------------+-------+---------+
|    0.0|   0.0|    0.0|    0.0|    0.0|                0.0|    0.0|        1|
|    0.0|   0.0|    0.0|    0.0|    0.0|                0.0|    0.0|        2|
|    0.0|   0.0|    0.0|    0.0|    0.0|                0.0|    0.0|        3|
|    0.0|   0.0|    0.0|    0.0|    0.0|                0.0|    0.0|        4|
|    0.0|   0.0|    0.0|    0.0|    0.0|                0.0|    0.0|        5|
|    0.0|   0.0|    0.0|    0.0|    0.0|                0.0|    0.0|        6|
|    0.0|   0.0|    0.0|    0.0|    0.0|                0.0|    0.0|        7|
|    0.0|   0.0|    0.0|    0.0|    0.0|                0.0|    0.0|        8|
|0.02609|   0.0|    0.0|    0.0|    0.0|0.11204000000000001|0.05039|        9|
|0.12628|0.0708|0.06277|0.07653|0.07488|            

In [4]:
from pyspark.sql.types import DoubleType

for c in df.columns[:-1]:
        df = df.withColumn(c, df[c].cast(DoubleType()))

In [5]:
df.dtypes

[('FR62', 'double'),
 ('FR30', 'double'),
 ('FR51', 'double'),
 ('FR22', 'double'),
 ('FR53', 'double'),
 ('FR82', 'double'),
 ('FR71', 'double'),
 ('time_step', 'string')]

In [9]:
df.select('FR30', 'FR22', 'FR53', 'FR71').describe().show()

+-------+-------------------+-------------------+------------------+-------------------+
|summary|               FR30|               FR22|              FR53|               FR71|
+-------+-------------------+-------------------+------------------+-------------------+
|  count|             262968|             262968|            262968|             262968|
|   mean|0.12286700176447361|  0.125993219212984|0.1446027320434425|0.14714927070974418|
| stddev|0.19786925029342559|0.20127526900909218|0.2187639356203176|0.21948052286799657|
|    min|                0.0|                0.0|               0.0|                0.0|
|    max|            0.93215| 0.9161299999999999|           0.91776|            0.93804|
+-------+-------------------+-------------------+------------------+-------------------+



In [7]:
# df.show()
# df.printSchema()

In [19]:
from pyspark.sql.functions import col


def generate_series(start, stop, interval):
    """
    :param start  - lower bound, inclusive
    :param stop   - upper bound, exclusive
    :interval int - increment interval in seconds
    """

    # Determine start and stops in epoch seconds
    start, stop = spk_sess.createDataFrame([(start, stop)], ("start", "stop")) \
                        .select([col(c).cast("timestamp") \
                        .cast("long") for c in ("start", "stop")]) \
                        .first()
    # Create range with increments and cast to timestamp
    return spk_sess.range(start, stop, interval) \
                .select(col("id").cast("timestamp").alias("value"))


# credits : https://stackoverflow.com/questions/43141671/sparksql-on-pyspark-how-to-generate-time-series
test_gen = generate_series("1985-01-01", "2016-01-01", 60 * 60) # By hour, by day use 60 * 60 * 24
test_gen.show(5)

+-------------------+
|              value|
+-------------------+
|1985-01-01 00:00:00|
|1985-01-01 01:00:00|
|1985-01-01 02:00:00|
|1985-01-01 03:00:00|
|1985-01-01 04:00:00|
+-------------------+
only showing top 5 rows



In [37]:
test_gen.count(), df.count()

(271728, 262968)

In [35]:
test_gen.orderBy('value', ascending=False).show(5)

+-------------------+
|              value|
+-------------------+
|2015-12-31 23:00:00|
|2015-12-31 22:00:00|
|2015-12-31 21:00:00|
|2015-12-31 20:00:00|
|2015-12-31 19:00:00|
+-------------------+
only showing top 5 rows



In [38]:
test_gen.to_Pandas()

AttributeError: 'DataFrame' object has no attribute 'to_Pandas'

In [14]:
type(generate_series("2000-01-01", "2000-01-05", 60 * 60))

pyspark.sql.dataframe.DataFrame

In [31]:
from pyspark.sql.functions import sequence, to_date, explode, col

spk_sess.sql("SELECT sequence(to_date('2018-01-01'), to_date('2018-03-01'), interval 1 month) as date").withColumn("date", explode(col("date"))).show()

+----------+
|      date|
+----------+
|2018-01-01|
|2018-02-01|
|2018-03-01|
+----------+



In [None]:
t = pd.date_range(start='1/1/1986', periods=df.count(), freq = 'H')
t = pd.DataFrame(t)


In [None]:
from pyspark.sql import functions as F

format = "yyyy-MM-dd'T'HH:mm:ss.SSSZ"
df1 = df.withColumn('Timestamp2', F.unix_timestamp('time_step', format).cast('timestamp'))

In [None]:
df = df.withColumn("test1",F.to_date(F.col("value"),"yyyy-MM-dd")).

In [None]:
df1.show()

In [None]:
from pyspark.sql.functions import lit

df.withColumn('new_column', lit(t))


In [None]:
df.show()

In [None]:
from pyspark.sql import functions as F

format = "yyyy-MM-dd'T'HH:mm:ss.SSSZ"
df2 = df1.withColumn('Timestamp2', F.unix_timestamp('Timestamp', format).cast('timestamp'))

In [None]:
from pyspark.sql.types import DateType

df1 = df.withColumn("time_step", df["time_step"].cast(DateType()))
df1.select('time_step','FR10').show(10, False)