# Spark dataframe

## Environment initialization

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as sparkF
from pyspark.sql.types import *
# from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.functions import *
# from pyspark.sql.types import DoubleType

In [4]:
import pyarrow

In [7]:
##- Session builder pointed at the cluster master
session_builder = (
    SparkSession.builder
    .master('spark://spark-master:7077')
    .appName('~ TUTORIAL: Handling spark dataframe ~')
)

##- Resource settings, then the ARROW_PRE_0_15_IPC_FORMAT variable for the
##- YARN application master and for the executors (applied in this order)
for conf_key, conf_value in [
    ('spark.driver.cores', '1'),
    ('spark.executor.instances', '2'),
    ('spark.executor.memory', '4g'),
    ('spark.executor.cores', '2'),
    ('spark.cores.max', '4'),
    ('spark.yarn.appMasterEnv.ARROW_PRE_0_15_IPC_FORMAT', 1),
    ('spark.executorEnv.ARROW_PRE_0_15_IPC_FORMAT', 1),
]:
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

##- Enabling Apache Arrow
spark.conf.set('spark.sql.execution.arrow.enabled', 'true')

##- SQL context built on the session's SparkContext
sqlContext = SQLContext(spark.sparkContext)

print(spark)

<pyspark.sql.session.SparkSession object at 0x7fdbc530a978>


## Table of contents

[<b>1.  Initialize and handle your spark dataframe</b>](#Initialize-and-handle-your-spark-dataframe)
- [Select column containing point](#Select-column-containing-point)
<br>

[<b>2.  Transform your spark dataframe</b>](#Transform-your-spark-dataframe)
- [Standardize a spark dataframe](#Standardize-a-spark-dataframe)
- [Compute the difference between two columns](#Compute-the-difference-between-two-columns)
<br>

[<b>3.  Operations on your spark dataframe</b>](#Operations-on-your-spark-dataframe)
- [Count the number of missing values by column](#Count-the-number-of-missing-values-by-column)
- [Aggregate with known function using groupby](#Aggregate-with-known-function-using-groupby)
- [Aggregate with your own function using groupby](#Aggregate-with-your-own-function-using-groupby)
- [Use multiple arguments with pandas UDF](#Use-multiple-arguments-with-pandas-UDF)
- [Compute mean and standard deviation on a column](#Compute-mean-and-standard-deviation-on-a-column)
<br>

[<b>4.  Spark ML on your spark dataframe</b>](#Spark-ML-on-your-spark-dataframe)
- [Correlation for spark DF](#Correlation-for-spark-DF)
<br>

## 1.  Initialize and handle your spark dataframe

### Select column containing point

Spark backtick issue : https://stackoverflow.com/questions/51253271/dropping-a-column-name-that-has-a-period-in-spark-dataframe

In [7]:
##- Demo frame: one string key and two integer value columns
df = spark.createDataFrame(
    data=[
        ("a", 1, 0),
        ("a", -1, 42),
        ("b", 3, -1),
        ("b", 10, -2),
    ],
    schema=("key", "value1", "value2"),
)

df.show()

+---+------+------+
|key|value1|value2|
+---+------+------+
|  a|     1|     0|
|  a|    -1|    42|
|  b|     3|    -1|
|  b|    10|    -2|
+---+------+------+



In [4]:
##- Put a '.' in both value column names, one rename per statement
df = df.withColumnRenamed('value1', 'value.1')
df = df.withColumnRenamed('value2', 'value.2')
df.show()

+---+-------+-------+
|key|value.1|value.2|
+---+-------+-------+
|  a|      1|      0|
|  a|     -1|     42|
|  b|      3|     -1|
|  b|     10|     -2|
+---+-------+-------+



In [5]:
##- The select below is left commented out because it fails: this is expected,
##- the column name contains a '.' and is not quoted with backticks.
# df.select('value.1').show()

In [6]:
##- A column whose name contains the character '.' must be wrapped in backticks to be selected
df.select(col('`value.1`')).show()

+-------+
|value.1|
+-------+
|      1|
|     -1|
|      3|
|     10|
+-------+



## 2.  Transform your spark dataframe

### Standardize a spark dataframe

In [3]:
##- Frame to be scaled: Revenue and No_of_Days are the numeric columns
df = spark.createDataFrame(
    data=[
        (1, 'A', 12560, 45),
        (1, 'B', 42560, 90),
        (1, 'C', 31285, 120),
        (1, 'D', 10345, 150),
    ],
    schema=["userID", "Name", "Revenue", "No_of_Days"],
)

print("Before Scaling :")
df.show(n=5)

Before Scaling :
+------+----+-------+----------+
|userID|Name|Revenue|No_of_Days|
+------+----+-------+----------+
|     1|   A|  12560|        45|
|     1|   B|  42560|        90|
|     1|   C|  31285|       120|
|     1|   D|  10345|       150|
+------+----+-------+----------+



In [9]:
# # The whole recipe below is commented out, so running this cell does nothing.
# # UDF converting a one-element vector column back to a double rounded to 3 decimals
# unlist = udf(lambda x: round(float(list(x)[0]),3), DoubleType())

# # Iterating over columns to be scaled
# for i in ["Revenue","No_of_Days"]:
#     # VectorAssembler Transformation - Converting column to vector type
#     assembler = VectorAssembler(inputCols=[i],outputCol=i+"_Vect")

#     # StandardScaler Transformation (centering with the mean, scaling with the std)
#     scaler = StandardScaler(inputCol=i+"_Vect", outputCol=i+"_Scaled", withStd=True, withMean=True)

#     # Pipeline of VectorAssembler and StandardScaler
#     pipeline = Pipeline(stages=[assembler, scaler])

#     # Fitting pipeline on dataframe, unwrapping the scaled vector and dropping the temporary vector column
#     df = pipeline.fit(df).transform(df).withColumn(i+"_Scaled", unlist(i+"_Scaled")).drop(i+"_Vect")

# print("After Scaling :")
# df.show(5)

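<b>Why are the results of StandardScaler different from what I get when I compute (value - mean) / std myself?</b>

Short answer: `np.std` computes the population standard deviation by default (`ddof=0`), whereas Spark's `StandardScaler` scales by the corrected sample standard deviation (`ddof=1`).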

In [9]:
##- Manual standardisation of the Revenue column with numpy.
##- The mean and standard deviation are computed once from `revenue` and reused,
##- instead of being pasted in as literals, so the scaled values stay correct
##- if the data changes.
revenue = [12560, 42560, 31285, 10345]

revenue_mean = np.mean(revenue)
##- np.std uses ddof=0 by default, i.e. the population standard deviation
revenue_std = np.std(revenue)

print("Mean of revenue is: " + str(revenue_mean))
print("Standard deviation of revenue is: " + str(revenue_std))

print("\n Scaled values are: ")
##- Scaled value of the first entry (12560) and of the last entry (10345)
print((revenue[0] - revenue_mean) / revenue_std)
print((revenue[3] - revenue_mean) / revenue_std)

Mean of revenue is: 24187.5
Standard deviation of revenue is: 13367.281183920686

 Scaled values are: 
-0.8698477902886157
-1.0355508954693755


Source: https://stackoverflow.com/questions/51753088/standardscaler-in-spark-not-working-as-expected

### Compute the difference between two columns

In [10]:
from pyspark.sql.functions import col

df = spark.createDataFrame(
    data=[("a", 1, 0), ("a", -1, 42), ("b", 3, -1), ("b", 10, -2)],
    schema=("key", "value1", "value2"),
)

##- Append a 'diff' column holding value1 minus value2 for every row
df.withColumn('diff', df['value1'] - df['value2']).show()

+---+------+------+----+
|key|value1|value2|diff|
+---+------+------+----+
|  a|     1|     0|   1|
|  a|    -1|    42| -43|
|  b|     3|    -1|   4|
|  b|    10|    -2|  12|
+---+------+------+----+



## 3. Operations on your spark dataframe

### Count the number of missing values by column

In [11]:
from pyspark.sql.functions import isnan, when, count, col

df = spark.createDataFrame(
    data=[
        (1, 'A', 12560, 45),
        (1, 'B', 42560, 90),
        (1, 'C', 31285, 120),
        (1, 'D', 10345, 150),
    ],
    schema=["userID", "Name", "Revenue", "No_of_Days"],
)

df.show(n=5)

##- One output column per input column, holding the number of NaN entries in it
df.select(*(count(when(isnan(c), c)).alias(c) for c in df.columns)).show()

+------+----+-------+----------+
|userID|Name|Revenue|No_of_Days|
+------+----+-------+----------+
|     1|   A|  12560|        45|
|     1|   B|  42560|        90|
|     1|   C|  31285|       120|
|     1|   D|  10345|       150|
+------+----+-------+----------+

+------+----+-------+----------+
|userID|Name|Revenue|No_of_Days|
+------+----+-------+----------+
|     0|   0|      0|         0|
+------+----+-------+----------+



In [12]:
from pyspark.sql.functions import isnan, when, count, col

df = spark.createDataFrame(
    data=[
        (1, 'A', 12560, 45),
        (1, 'B', 42560, 90),
        (1, 'C', 31285, 120),
        (1, 'D', 10345, 150),
    ],
    schema=["userID", "Name", "Revenue", "No_of_Days"],
)

df.show(n=5)

##- One output column per input column, holding the number of null entries in it
df.select(*(count(when(col(c).isNull(), c)).alias(c) for c in df.columns)).show()

+------+----+-------+----------+
|userID|Name|Revenue|No_of_Days|
+------+----+-------+----------+
|     1|   A|  12560|        45|
|     1|   B|  42560|        90|
|     1|   C|  31285|       120|
|     1|   D|  10345|       150|
+------+----+-------+----------+

+------+----+-------+----------+
|userID|Name|Revenue|No_of_Days|
+------+----+-------+----------+
|     0|   0|      0|         0|
+------+----+-------+----------+



Source : https://stackoverflow.com/questions/44627386/how-to-find-count-of-null-and-nan-values-for-each-column-in-a-pyspark-dataframe

### Aggregate with known function using groupby

In [13]:
from pyspark.sql.functions import *

df = spark.createDataFrame(
    data=[("a", 1, 0), ("a", -1, 42), ("b", 3, -1), ("b", 10, -2)],
    schema=("key", "value1", "value2"),
)

##- Number of value1 entries per key, using the built-in count aggregate
df.groupBy('key').agg(count('value1')).show()

+---+-------------+
|key|count(value1)|
+---+-------------+
|  b|            2|
|  a|            2|
+---+-------------+



### Aggregate with your own function using groupby

In [23]:
##- A list holding one inner list yields a single row with one column per element
pd.DataFrame([[1, 2]])

Unnamed: 0,0,1
0,1,2


In [8]:
df = spark.createDataFrame(
    data=[("a", 1, 0), ("a", -1, 42), ("b", 3, -1), ("b", 10, -2)],
    schema=("key", "value1", "value2"),
)

from pyspark.sql.types import *
from pyspark.sql.functions import pandas_udf,PandasUDFType

##- Output schema of the grouped-map UDF: the group key followed by four doubles
schema = StructType([
    StructField(name, dtype)
    for name, dtype in [
        ("key", StringType()),
        ("avg_value1", DoubleType()),
        ("avg_value2", DoubleType()),
        ("sum_avg", DoubleType()),
        ("sub_avg", DoubleType()),
    ]
])

@pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP)
def g(df):
    """Summarise one group as a single row.

    `df` is the pandas DataFrame of one group. The returned one-row frame holds,
    in schema order: the group's key, mean of value1, mean of value2, the sum of
    both means and their difference (value1 mean minus value2 mean).
    """
    group_key = df['key'].iloc[0]
    mean_value1 = df['value1'].mean()
    mean_value2 = df['value2'].mean()
    # A nested list gives one row with five columns; the columns carry integer
    # labels, so they are matched to the schema by position.
    return pd.DataFrame([[
        group_key,
        mean_value1,
        mean_value2,
        mean_value1 + mean_value2,
        mean_value1 - mean_value2,
    ]])

df.groupby("key").apply(g).show()

# df.show()

+---+----------+----------+-------+-------+
|key|avg_value1|avg_value2|sum_avg|sub_avg|
+---+----------+----------+-------+-------+
|  b|       6.5|      -1.5|    5.0|    8.0|
|  a|       0.0|      21.0|   21.0|  -21.0|
+---+----------+----------+-------+-------+



In [15]:
##- Second demo frame with the same layout as the previous one
gf = spark.createDataFrame(
    data=[
        ("a", 2, 1),
        ("a", -2, 43),
        ("b", 4, -2),
        ("b", 11, -3),
    ],
    schema=("key", "value1", "value2"),
)

# gf.groupby("key").apply(g).show()
gf.show()

+---+------+------+
|key|value1|value2|
+---+------+------+
|  a|     2|     1|
|  a|    -2|    43|
|  b|     4|    -2|
|  b|    11|    -3|
+---+------+------+



### Use multiple arguments with pandas UDF

<b>Source</b>: https://stackoverflow.com/questions/59384870/pyspark-pandas-udf-using-partial-functions-went-wrong

In [16]:
##- Initialize your dataframe
df = spark.createDataFrame(
[("a", 1, 0), ("a", -1, 42), ("b", 3, -1), ("b", 10, -2)],
("key", "value1", "value2")
)

df.show()

+---+------+------+
|key|value1|value2|
+---+------+------+
|  a|     1|     0|
|  a|    -1|    42|
|  b|     3|    -1|
|  b|    10|    -2|
+---+------+------+



In [9]:
from pyspark.sql.types import *
from pyspark.sql.functions import pandas_udf,PandasUDFType

##- Output schema of the grouped-map UDF: the group key followed by four doubles
schema = StructType([
    StructField(name, dtype)
    for name, dtype in [
        ("key", StringType()),
        ("avg_value1", DoubleType()),
        ("avg_value2", DoubleType()),
        ("sum_avg", DoubleType()),
        ("sub_avg", DoubleType()),
    ]
])

def h(threshold):
    """Build a grouped-map pandas UDF parameterised by `threshold`.

    The extra argument is captured by the closure. The returned UDF summarises
    one group as a single row: key, mean of value1 (replaced by 99 unless it is
    strictly greater than `threshold`), mean of value2, sum of both means and
    their difference.
    """
    @pandas_udf(schema, functionType=PandasUDFType.GROUPED_MAP)
    def g(df):
        group_key = df['key'].iloc[0]
        mean_value1 = df['value1'].mean()
        mean_value2 = df['value2'].mean()
        # Only the reported value1 mean is replaced; the sum and the difference
        # are still computed from the real mean.
        reported_value1 = mean_value1 if mean_value1 > threshold else 99
        return pd.DataFrame([[
            group_key,
            reported_value1,
            mean_value2,
            mean_value1 + mean_value2,
            mean_value1 - mean_value2,
        ]])
    return g

df.groupby("key").apply(h(2)).show()

+---+----------+----------+-------+-------+
|key|avg_value1|avg_value2|sum_avg|sub_avg|
+---+----------+----------+-------+-------+
|  b|       6.5|      -1.5|    5.0|    8.0|
|  a|      99.0|      21.0|   21.0|  -21.0|
+---+----------+----------+-------+-------+



### Compute mean and standard deviation on a column

In [18]:
from pyspark.sql.functions import mean as pyspark_mean, stddev as pyspark_stddev

df = spark.createDataFrame(
    data=[("a", 1, 0), ("a", -1, 42), ("b", 3, -1), ("b", 10, -2)],
    schema=("key", "value1", "value2"),
)

##- Mean and standard deviation of value1 over the whole frame, as a one-row result
df.select(
    pyspark_mean(col('value1')).alias('value1_mean'),
    pyspark_stddev(col('value1')).alias('value1_std'),
).show()

+-----------+------------------+
|value1_mean|        value1_std|
+-----------+------------------+
|       3.25|4.7871355387816905|
+-----------+------------------+



## 4.  Spark ML on your spark dataframe

### Correlation for spark DF

In [19]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.stat import Correlation

df = spark.createDataFrame(
[("a", 1, 0), ("a", -1, 42), ("b", 3, -1), ("b", 10, -2)],
("key", "value1", "value2")
)

##- Correlation.corr works on a single vector column, so pack both value columns into "features"
assembler = VectorAssembler(inputCols=['value1', 'value2'], outputCol="features")
outdf = assembler.transform(df.select('value1', 'value2'))

##- Compute and collect the Pearson correlation once, then reuse the result.
##- The original recomputed and collected it for every print.
corrdf = Correlation.corr(outdf, "features", "pearson")
corr_rows = corrdf.collect()
corr_matrix = corr_rows[0]["pearson({})".format("features")]

##- Collected rows, the matrix itself, its flat values and the type of those values
print(corr_rows)
print(str(corr_matrix))

print(corr_matrix.values)
print(type(corr_matrix.values))

[Row(pearson(features)=DenseMatrix(2, 2, [1.0, -0.6206, -0.6206, 1.0], False))]
DenseMatrix([[ 1.        , -0.62056458],
             [-0.62056458,  1.        ]])
[ 1.         -0.62056458 -0.62056458  1.        ]
<class 'numpy.ndarray'>


In [None]:
##- Stop the Spark session once the notebook is done
spark.stop()