In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("MapsaDataEng")
    .getOrCreate()
)

In [3]:
# Bare expression: show the session object as the cell's result.
spark

In [4]:
from datetime import datetime, date
import pandas as pd
from pyspark.sql import Row

# Build a three-row demo DataFrame from Row objects; column types (long, double,
# string, date, timestamp) are inferred from the Python values.
df = spark.createDataFrame(
    data=[
        Row(a=1, b=2.0, c="string1", d=date(2000, 1, 1), e=datetime(2000, 1, 1, 12, 0)),
        Row(a=2, b=3.0, c="string2", d=date(2000, 2, 1), e=datetime(2000, 1, 2, 12, 0)),
        Row(a=4, b=5.0, c="string3", d=date(2000, 3, 1), e=datetime(2000, 1, 3, 12, 0)),
    ]
)

In [5]:
# Print only the first two rows.
df.show(n=2)

+---+---+-------+----------+-------------------+
|  a|  b|      c|         d|                  e|
+---+---+-------+----------+-------------------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|
+---+---+-------+----------+-------------------+
only showing top 2 rows



In [8]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- a: long (nullable = true)
 |-- b: double (nullable = true)
 |-- c: string (nullable = true)
 |-- d: date (nullable = true)
 |-- e: timestamp (nullable = true)



In [14]:
# Turn on eager evaluation so that a bare DataFrame expression at the end of a
# cell is rendered as a table. The saved outputs of the cells below (`df` and
# `describe()`) are in that tabular form, so the flag has to be True for a fresh
# Restart & Run All to reproduce them.
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

In [10]:
# Bare expression: the DataFrame itself is the cell's displayed result.
df

a,b,c,d,e
1,2.0,string1,2000-01-01,2000-01-01 12:00:00
2,3.0,string2,2000-02-01,2000-01-02 12:00:00
4,5.0,string3,2000-03-01,2000-01-03 12:00:00


In [11]:
# Summary statistics (count, mean, stddev, min, max) for the two numeric columns.
df.select(df["a"], df["b"]).describe()

summary,a,b
count,3.0,3.0
mean,2.333333333333333,3.333333333333333
stddev,1.5275252316519468,1.5275252316519468
min,1.0,2.0
max,4.0,5.0


In [12]:
# Fetch the first two rows as a list of Row objects.
df.take(num=2)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0)),
 Row(a=2, b=3.0, c='string2', d=datetime.date(2000, 2, 1), e=datetime.datetime(2000, 1, 2, 12, 0))]

In [18]:
# Keep only the rows whose `b` value is below 5.
x = df.filter(df["b"] < 5)

In [21]:
# Narrow the already-filtered frame further to `a` < 2 and fetch one row.
x.filter(df["a"] < 2).take(num=1)

[Row(a=1, b=2.0, c='string1', d=datetime.date(2000, 1, 1), e=datetime.datetime(2000, 1, 1, 12, 0))]

In [37]:
from pyspark.sql.functions import upper, pandas_udf, udf

In [31]:
# Apply the built-in `upper` function to column `c` and print the result.
df.select(upper(df["c"])).show()

+--------+
|upper(c)|
+--------+
| STRING1|
| STRING2|
| STRING3|
+--------+



In [33]:
def increment(x):
    """Return ``x + 1``.

    Works for anything that supports ``+ 1``; the notebook passes a Spark
    column, which yields a new column expression rather than a number.
    """
    incremented = x + 1
    return incremented


# `increment` is plain Python applied to a column object, so this builds the
# column expression `(a + 1)` without any UDF.
df.select(increment(df["a"])).show()

+-------+
|(a + 1)|
+-------+
|      2|
|      3|
|      5|
+-------+



In [36]:
@pandas_udf("long")
def pandas_plus_one(series: pd.Series) -> pd.Series:
    """Add one to every element of the incoming pandas Series."""
    # Operates on the whole Series at once rather than value by value.
    return series.add(1)

# Run the pandas UDF over column `a` and print the result.
df.select(pandas_plus_one(df["a"])).show()

+------------------+
|pandas_plus_one(a)|
+------------------+
|                 2|
|                 3|
|                 5|
+------------------+



In [43]:
def summ(x, y):
    """Return the sum ``x + y`` of the two arguments."""
    total = x + y
    return total


# Wrap `summ` as a Spark UDF. Without an explicit return type `udf` defaults to
# a string result, which would make the new column a string. Declare 'double'
# so the sum stays numeric (long `a` + double `b` gives a Python float).
udf_summ = udf(summ, 'double')
# Append the row-wise sum of `a` and `b` as a new column and print the frame.
df.withColumn("asghar", udf_summ(df.a, df.b)).show()

+---+---+-------+----------+-------------------+------+
|  a|  b|      c|         d|                  e|asghar|
+---+---+-------+----------+-------------------+------+
|  1|2.0|string1|2000-01-01|2000-01-01 12:00:00|   3.0|
|  2|3.0|string2|2000-02-01|2000-01-02 12:00:00|   5.0|
|  4|5.0|string3|2000-03-01|2000-01-03 12:00:00|   9.0|
+---+---+-------+----------+-------------------+------+



In [49]:
# Replace `df` with a small colour/fruit table used for the grouping example.
# Column names are given explicitly; types are inferred from the values.
df = spark.createDataFrame(
    data=[
        ["red", "banana", 1, 10],
        ["blue", "banana", 2, 20],
        ["red", "carrot", 3, 30],
        ["blue", "grape", 4, 40],
        ["red", "carrot", 5, 50],
        ["black", "carrot", 6, 60],
        ["red", "banana", 7, 70],
        ["red", "grape", 8, 80],
    ],
    schema=["color", "fruit", "v1", "v2"],
)
df.show()

+-----+------+---+---+
|color| fruit| v1| v2|
+-----+------+---+---+
|  red|banana|  1| 10|
| blue|banana|  2| 20|
|  red|carrot|  3| 30|
| blue| grape|  4| 40|
|  red|carrot|  5| 50|
|black|carrot|  6| 60|
|  red|banana|  7| 70|
|  red| grape|  8| 80|
+-----+------+---+---+



In [52]:
# Average every numeric column within each colour group and print the result.
df.groupBy("color").avg().show()

+-----+-------+-------+
|color|avg(v1)|avg(v2)|
+-----+-------+-------+
|  red|    4.8|   48.0|
| blue|    3.0|   30.0|
|black|    6.0|   60.0|
+-----+-------+-------+

