In [1]:
import os

import findspark
findspark.init()
findspark.find()
import pyspark
findspark.find()

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark.sql.window import Window

In [2]:
#### Setting up the SparkContext
# Build the session through the builder so that re-running this cell in a live
# kernel reuses the existing session/context instead of raising
# "Cannot run multiple SparkContexts at once".
conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep `sc` available for any cell that uses the low-level context directly.
sc = spark.sparkContext

In [8]:
#### Load a random csv input dataset just for illustration
# The input file is resolved relative to the directory the kernel was started in.
ROOT_PATH = os.getcwd()
INPUT_PATH = os.path.join(ROOT_PATH, "inputs", "BEV325OD3250.csv")

# Read the CSV with a header row, letting Spark infer the column types.
csv_reader = spark.read.options(header=True, inferSchema=True, delimiter=",")
df = csv_reader.csv(INPUT_PATH)

In [4]:
#### General information about our dataframe
# Row count triggers a Spark job; the column count is read from the schema only.
n_rows, n_cols = df.count(), len(df.columns)
print("Shape: ", (n_rows, n_cols), "\n")

n_partitions = df.rdd.getNumPartitions()
print("Nbr of partitions:", n_partitions, "\n")

print("---- Schema ----")
df.printSchema()

Shape:  (190680, 17) 

Nbr of partitions: 1 

---- Schema ----
root
 |-- StichtagDatJahr: integer (nullable = true)
 |-- StichtagDatMM: integer (nullable = true)
 |-- StichtagDatMonat: string (nullable = true)
 |-- StichtagDat: integer (nullable = true)
 |-- SexCd: integer (nullable = true)
 |-- SexLang: string (nullable = true)
 |-- AlterV20Sort: integer (nullable = true)
 |-- AlterV20Kurz: string (nullable = true)
 |-- HerkunftCd: integer (nullable = true)
 |-- HerkunftLang: string (nullable = true)
 |-- KreisCd: integer (nullable = true)
 |-- KreisLang: string (nullable = true)
 |-- QuarCd: integer (nullable = true)
 |-- QuarLang: string (nullable = true)
 |-- DatenstandCd: string (nullable = true)
 |-- DatenstandLang: string (nullable = true)
 |-- AnzBestWir: integer (nullable = true)



In [4]:
#### Deduplication by filtering

# Depending on the use case, one may need F.rank or F.row_number instead of F.dense_rank
def deduper(table):
    """Keep, for each group, only the rows carrying the latest month/date.

    Rows are grouped by year, district, quarter, age band, origin and sex.
    Inside each group they are ranked by ``StichtagDatMM`` then ``StichtagDat``,
    both descending, and only rank 1 is kept. Because ``dense_rank`` is used,
    rows that tie on both ordering columns are all kept.

    :param table: DataFrame containing the grouping and ordering columns.
    :return: DataFrame with the same columns as ``table``, filtered to rank 1.
    """
    group_keys = [
        "StichtagDatJahr",
        "KreisLang",
        "QuarLang",
        "AlterV20Kurz",
        "HerkunftLang",
        "SexLang"
    ]
    latest_first = [F.desc("StichtagDatMM"), F.desc("StichtagDat")]
    ranking_window = Window.partitionBy(*group_keys).orderBy(*latest_first)

    # "rn" is a temporary helper column and is dropped before returning.
    ranked = table.select("*", F.dense_rank().over(ranking_window).alias("rn"))
    top_ranked = ranked.where(F.col("rn") == 1)
    return top_ranked.drop("rn")

In [11]:
#### Multi aggregation

# It is more efficient to specify the values in the pivot function, so that Spark doesn't need
# to first compute the list of distinct values internally.

def multi_aggr(table):
    """Sum two columns per year, pivoted on the ``SexCd`` values 1 and 2.

    :param table: DataFrame with ``StichtagDatJahr``, ``SexCd``,
        ``StichtagDatMM`` and ``AlterV20Sort`` columns.
    :return: one row per ``StichtagDatJahr`` with a ``new_<column>`` sum for
        each pivot value.
    """
    summed_columns = []
    for column_name in ("StichtagDatMM", "AlterV20Sort"):
        summed_columns.append(F.sum(F.col(column_name)).alias("new_" + column_name))

    # Pivot values are listed explicitly so Spark does not have to discover them.
    per_year_and_sex = table.groupBy("StichtagDatJahr").pivot("SexCd", [1, 2])
    return per_year_and_sex.agg(*summed_columns)


df_aggr = multi_aggr(df)
df_aggr.show()

+---------------+-------------------+------------------+-------------------+------------------+
|StichtagDatJahr|1_new_StichtagDatMM|1_new_AlterV20Sort|2_new_StichtagDatMM|2_new_AlterV20Sort|
+---------------+-------------------+------------------+-------------------+------------------+
|           2003|              26316|             12107|              26389|             12150|
|           2007|              26454|             12185|              26325|             12071|
|           2018|              26520|             12240|              26519|             12235|
|           2015|              26454|             12185|              26520|             12240|
|           2006|              26439|             12160|              26232|             12010|
|           2013|              26520|             12240|              26520|             12240|
|           2014|              26454|             12185|              26520|             12240|
|           2019|              26520|   

In [None]:
#### Broadcast small table in a join to speed up the process
def fast_join(big_table, small_table, on="PK", how="left"):
    """Join ``big_table`` to ``small_table`` with a broadcast hint on the small side.

    :param big_table: DataFrame on the left side of the join.
    :param small_table: DataFrame marked for broadcast; it should be small
        enough to be shipped to every executor.
    :param on: join key(s) or join expression passed to ``DataFrame.join``.
        Defaults to ``"PK"``, the key previously hard-coded here.
    :param how: join type passed to ``DataFrame.join``. Defaults to ``"left"``.
    :return: the joined DataFrame.
    """
    # NOTE(review): coalesce(1) is kept from the original implementation; it is
    # probably unnecessary on a broadcast table -- confirm before removing.
    return big_table.join(
        F.broadcast(small_table).coalesce(1),
        on,
        how
    )

In [3]:
#### Create a table from scratch
def create_df():
    """Build a three-row demo DataFrame with an explicit schema.

    The rows deliberately contain nulls: a missing ``SWIFT_msg``, a missing
    ``Age`` and a null element inside ``array_of_strings``.

    :return: DataFrame with columns Firstname, Lastname, Age,
        array_of_strings and SWIFT_msg.
    """
    fields = [
        T.StructField("Firstname", T.StringType()),
        T.StructField("Lastname", T.StringType()),
        T.StructField("Age", T.IntegerType()),
        T.StructField("array_of_strings", T.ArrayType(T.StringType())),
        T.StructField("SWIFT_msg", T.StringType()),
    ]
    demo_schema = T.StructType(fields)

    column_names = ["Firstname", "Lastname", "Age", "array_of_strings", "SWIFT_msg"]
    raw_rows = [
        ("James", "Bond", 43, ["chat", "chien"], None),
        ("Pierre", "Bin", 54, ["pizza", "milk", None], "32A: 1700.45, 52E: CHF"),
        ("Lara", "Tempo", None, ["US", "FR"], "32A: 1700.45, 52E: CHF, 70: London"),
    ]
    # Each row is turned into a dict keyed by column name before being handed to Spark.
    records = [dict(zip(column_names, row)) for row in raw_rows]

    return spark.createDataFrame(records, demo_schema)


my_df = create_df()
my_df.show()

+---------+--------+----+-------------------+--------------------+
|Firstname|Lastname| Age|   array_of_strings|           SWIFT_msg|
+---------+--------+----+-------------------+--------------------+
|    James|    Bond|  43|      [chat, chien]|                null|
|   Pierre|     Bin|  54|[pizza, milk, null]|32A: 1700.45, 52E...|
|     Lara|   Tempo|null|           [US, FR]|32A: 1700.45, 52E...|
+---------+--------+----+-------------------+--------------------+



In [9]:
#### Parsing

# Let's try to get the amount in two different ways
# The first approach uses "split" and getItem functions
# The second approach uses regex extraction
def parser(table):
    """Extract the amount that follows the ``32A:`` tag of ``SWIFT_msg`` in two ways.

    ``Amount_method1`` takes the text between ``"32A:"`` and ``", 52E:"`` using
    ``split``/``getItem``. ``Amount_method2`` captures the decimal number after
    ``"32A: "`` with a regular expression.

    :param table: DataFrame with a string column ``SWIFT_msg``.
    :return: ``table`` with the string columns ``Amount_method1`` and
        ``Amount_method2`` added.
    """
    after_tag = F.split(F.col("SWIFT_msg"), "32A:").getItem(1)
    # The text after "32A:" starts with a space (e.g. " 1700.45"); trim it so
    # both methods produce the same string.
    amount_by_split = F.trim(F.split(after_tag, ", 52E:").getItem(0))
    amount_by_regex = F.regexp_extract('SWIFT_msg', r'(32A:\s)([0-9]*\.[0-9]{1,})', 2)

    return table.withColumn("Amount_method1", amount_by_split)\
                .withColumn("Amount_method2", amount_by_regex)


parse_df = parser(my_df)
parse_df.show()

+---------+--------+----+-------------------+--------------------+--------------+--------------+
|Firstname|Lastname| Age|   array_of_strings|           SWIFT_msg|Amount_method1|Amount_method2|
+---------+--------+----+-------------------+--------------------+--------------+--------------+
|    James|    Bond|  43|      [chat, chien]|                null|          null|          null|
|   Pierre|     Bin|  54|[pizza, milk, null]|32A: 1700.45, 52E...|       1700.45|       1700.45|
|     Lara|   Tempo|null|           [US, FR]|32A: 1700.45, 52E...|       1700.45|       1700.45|
+---------+--------+----+-------------------+--------------------+--------------+--------------+



In [None]:
#### Join on Levenshtein distance condition
def special_cond_join(table1, table2):
    """Inner-join two tables on a fuzzy first-name match plus an exact age match.

    A pair of rows matches when the Levenshtein distance between
    ``table1.Firstname`` and ``table2.prenom`` is below 5 and
    ``table1.years_old`` equals ``table2.age``.

    :param table1: DataFrame with ``Firstname`` and ``years_old`` columns.
    :param table2: DataFrame with ``prenom`` and ``age`` columns.
    :return: the inner join of both tables on the combined condition.
    """
    names_are_close = F.levenshtein(table1["Firstname"], table2["prenom"]) < 5
    ages_are_equal = table1["years_old"] == table2["age"]
    return table1.join(table2, names_are_close & ages_are_equal, "inner")

In [None]:
#### String cleaner function using regex
def cleaner(table, list_cols, replacement=''):
    """Add a ``cleaned_<col>`` column for every column named in ``list_cols``.

    In each cleaned column, every run of two or more whitespace characters is
    replaced by ``replacement`` and the result is trimmed at both ends.

    :param table: DataFrame holding the columns to clean.
    :param list_cols: names of the string columns to clean; an empty list
        returns ``table`` unchanged.
    :param replacement: text substituted for each whitespace run. The default
        ``''`` deletes the run, which joins the words on either side; pass
        ``' '`` to collapse the run to a single space instead.
    :return: ``table`` with the extra ``cleaned_<col>`` columns.
    """
    for c in list_cols:
        collapsed = F.regexp_replace(c, r'(\s{2,})', replacement)
        table = table.withColumn("cleaned_" + c, F.trim(collapsed))
    return table

In [31]:
#### Mapping function using SQL like statements
def mapper(table):
    """Project and rename columns using SQL-style expressions.

    :param table: DataFrame with ``Firstname``, ``Lastname`` and ``Age`` columns.
    :return: DataFrame with columns ``prenom``, ``nom_de_famille`` and
        ``new_age`` (``Age + 2``).
    """
    sql_expressions = (
        "Firstname as prenom",
        "Lastname as nom_de_famille",
        "Age + 2 as new_age",
    )
    return table.selectExpr(*sql_expressions)


map_df = mapper(my_df)
map_df.show()

+------+--------------+-------+
|prenom|nom_de_famille|new_age|
+------+--------------+-------+
| James|          Bond|     45|
|Pierre|           Bin|     56|
|  Lara|         Tempo|   null|
+------+--------------+-------+

