# ceja demo

In [1]:
import ceja
import delta
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [2]:
# Start from a named session builder, then register Delta Lake's SQL
# extension and catalog implementation one setting at a time.
builder = pyspark.sql.SparkSession.builder.appName("MyApp")
builder = builder.config(
    "spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

In [3]:
# Pass the builder through delta's pip helper, then create (or reuse) the session.
delta_builder = delta.configure_spark_with_delta_pip(builder)
spark = delta_builder.getOrCreate()

:: loading settings :: url = jar:file:/Users/matthew.powers/opt/miniconda3/envs/pyspark-everything/lib/python3.9/site-packages/pyspark/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/matthew.powers/.ivy2/cache
The jars for the packages stored in: /Users/matthew.powers/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-13e03790-554c-42b9-b0b4-553503e84686;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.2.0 in central
	found io.delta#delta-storage;2.2.0 in central
	found org.antlr#antlr4-runtime;4.8 in central
:: resolution report :: resolve 309ms :: artifacts dl 15ms
	:: modules in use:
	io.delta#delta-core_2.12;2.2.0 from central in [default]
	io.delta#delta-storage;2.2.0 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     | 

23/02/12 13:08:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Phonetic algorithms

## NYSIIS

In [77]:
# One-column rows of sample words; the final None row shows how nulls are handled.
nysiis_rows = [("jellyfish",), ("there",), ("their",), (None,)]
df = spark.createDataFrame(nysiis_rows, schema=["word"])

In [78]:
# Preview the input words, including the null row.
df.show()

+---------+
|     word|
+---------+
|jellyfish|
|    there|
|    their|
|     null|
+---------+



In [81]:
# Build the NYSIIS column expression first, then display it next to the word.
# The result is only shown; df itself is not reassigned here.
nysiis_col = ceja.nysiis(F.col("word"))
df.withColumn("word_nysiis", nysiis_col).show()

+---------+-----------+
|     word|word_nysiis|
+---------+-----------+
|jellyfish|      JALYF|
|    there|        TAR|
|    their|        TAR|
|     null|       null|
+---------+-----------+



## Metaphone

In [82]:
# Sample words for the Metaphone example, plus a null row.
metaphone_rows = [
    ("jellyfish",),
    ("their",),
    ("there",),
    ("Klumpz",),
    ("Clumps",),
    (None,),
]
df = spark.createDataFrame(metaphone_rows, schema=["word"])

In [83]:
# Preview the input words for the Metaphone example.
df.show()

                                                                                

+---------+
|     word|
+---------+
|jellyfish|
|    their|
|    there|
|   Klumpz|
|   Clumps|
|     null|
+---------+



In [84]:
# Add a word_metaphone column holding ceja.metaphone of each word.
metaphone_col = ceja.metaphone(F.col("word"))
df = df.withColumn("word_metaphone", metaphone_col)

In [85]:
# Display each word alongside its Metaphone code.
df.show()

+---------+--------------+
|     word|word_metaphone|
+---------+--------------+
|jellyfish|          JLFX|
|    their|            0R|
|    there|            0R|
|   Klumpz|         KLMPS|
|   Clumps|         KLMPS|
|     null|          null|
+---------+--------------+



## Match rating codex

In [26]:
# Wrap each sample word in a 1-tuple (one row, one column); None becomes a null row.
data = [(word,) for word in ("jellyfish", "li", "l", "luisa", None)]

In [27]:
# Load the sample rows into a single-column DataFrame named "word".
df = spark.createDataFrame(data, schema=["word"])

In [None]:
# Preview the input words for the match rating codex example.
df.show()

In [28]:
# Add a word_match_rating_codex column computed by ceja.match_rating_codex.
codex_col = ceja.match_rating_codex(F.col("word"))
df = df.withColumn("word_match_rating_codex", codex_col)

In [29]:
# Display each word alongside its match rating codex.
df.show()

+---------+-----------------------+
|     word|word_match_rating_codex|
+---------+-----------------------+
|jellyfish|                 JLYFSH|
|       li|                      L|
|        l|                      L|
|    luisa|                     LS|
|     null|                   null|
+---------+-----------------------+



# Stemming algorithms

## Porter stem

In [96]:
# Three forms of the same verb plus a null row, each wrapped as a one-column row.
data = [(word,) for word in ("washing", "washed", "wash", None)]

In [97]:
# Load the sample rows into a single-column DataFrame named "word".
df = spark.createDataFrame(data, schema=["word"])

In [98]:
# Preview the input words for the Porter stem example.
df.show()

+-------+
|   word|
+-------+
|washing|
| washed|
|   wash|
|   null|
+-------+



In [99]:
# Add a word_porter_stem column computed by ceja.porter_stem.
stem_col = ceja.porter_stem(F.col("word"))
df = df.withColumn("word_porter_stem", stem_col)

In [100]:
# Display each word alongside its Porter stem.
df.show()

+-------+----------------+
|   word|word_porter_stem|
+-------+----------------+
|washing|            wash|
| washed|            wash|
|   wash|            wash|
|   null|            null|
+-------+----------------+



# Similarity algorithms

## Damerau-Levenshtein distance

In [101]:
# Word pairs to compare, one pair per row.
data = [
    ("jellyfish", "smellyfish"),
    ("li", "lee"),
    ("luisa", "luisa"),  # identical strings
    (None, None),  # null row
]

In [102]:
# Load the word pairs into a two-column DataFrame (word1, word2).
df = spark.createDataFrame(data, schema=["word1", "word2"])

In [103]:
# Preview the input word pairs for the Damerau-Levenshtein example.
df.show()

+---------+----------+
|    word1|     word2|
+---------+----------+
|jellyfish|smellyfish|
|       li|       lee|
|    luisa|     luisa|
|     null|      null|
+---------+----------+



In [104]:
# Add a damerau_levenshtein_distance column computed from word1 and word2.
dl_distance_col = ceja.damerau_levenshtein_distance(F.col("word1"), F.col("word2"))
df = df.withColumn("damerau_levenshtein_distance", dl_distance_col)

In [105]:
# Display each word pair alongside its Damerau-Levenshtein distance.
df.show()

+---------+----------+----------------------------+
|    word1|     word2|damerau_levenshtein_distance|
+---------+----------+----------------------------+
|jellyfish|smellyfish|                           2|
|       li|       lee|                           2|
|    luisa|     luisa|                           0|
|     null|      null|                        null|
+---------+----------+----------------------------+



## Hamming distance

In [44]:
# Column-wise word lists; zip pairs them row by row, ending with an all-null row.
first_words = ["jellyfish", "li", "luisa", None]
second_words = ["smellyfish", "lee", "bruna", None]
data = list(zip(first_words, second_words))

In [45]:
# Load the word pairs into a two-column DataFrame (word1, word2).
df = spark.createDataFrame(data, schema=["word1", "word2"])

In [None]:
# Preview the input word pairs for the Hamming distance example.
df.show()

In [47]:
# Add a hamming_distance column computed from word1 and word2.
hamming_col = ceja.hamming_distance(F.col("word1"), F.col("word2"))
df = df.withColumn("hamming_distance", hamming_col)

In [48]:
# Display each word pair alongside its Hamming distance.
df.show()

+---------+----------+----------------+
|    word1|     word2|hamming_distance|
+---------+----------+----------------+
|jellyfish|smellyfish|               9|
|       li|       lee|               2|
|    luisa|     bruna|               4|
|     null|      null|            null|
+---------+----------+----------------+



## Jaro similarity

In [49]:
# Non-null word pairs to compare.
word_pairs = [
    ("jellyfish", "smellyfish"),
    ("li", "lee"),
    ("luisa", "bruna"),
    ("hi", "colombia"),
]
# Append an all-null row so the example also shows null handling.
data = word_pairs + [(None, None)]

In [50]:
# Load the word pairs into a two-column DataFrame (word1, word2).
df = spark.createDataFrame(data, schema=["word1", "word2"])

In [None]:
# Preview the input word pairs for the Jaro similarity example.
df.show()

In [54]:
# Add a jaro_similarity column computed from word1 and word2.
jaro_col = ceja.jaro_similarity(F.col("word1"), F.col("word2"))
df = df.withColumn("jaro_similarity", jaro_col)

In [55]:
# Display each word pair alongside its Jaro similarity.
df.show()

+---------+----------+---------------+
|    word1|     word2|jaro_similarity|
+---------+----------+---------------+
|jellyfish|smellyfish|      0.8962963|
|       li|       lee|      0.6111111|
|    luisa|     bruna|            0.6|
|       hi|  colombia|            0.0|
|     null|      null|           null|
+---------+----------+---------------+



## Jaro-Winkler similarity

In [56]:
# Word pairs to compare, one pair per row, ending with an all-null row.
data = [
    ("jellyfish", "smellyfish"),
    ("li", "lee"),
    ("luisa", "bruna"),
    (None, None),
]

In [57]:
# Load the word pairs into a two-column DataFrame (word1, word2).
df = spark.createDataFrame(data, schema=["word1", "word2"])

In [None]:
# Preview the input word pairs for the Jaro-Winkler example.
df.show()

In [60]:
# Add a jaro_winkler_similarity column computed from word1 and word2.
jaro_winkler_col = ceja.jaro_winkler_similarity(F.col("word1"), F.col("word2"))
df = df.withColumn("jaro_winkler_similarity", jaro_winkler_col)

In [61]:
# Display each word pair alongside its Jaro-Winkler similarity.
df.show()

+---------+----------+-----------------------+
|    word1|     word2|jaro_winkler_similarity|
+---------+----------+-----------------------+
|jellyfish|smellyfish|              0.8962963|
|       li|       lee|              0.6111111|
|    luisa|     bruna|                    0.6|
|     null|      null|                   null|
+---------+----------+-----------------------+



## Match rating comparison

In [62]:
# Word pairs to compare, one pair per row, ending with an all-null row.
data = [
    ("mat", "matt"),
    ("there", "their"),
    ("luisa", "bruna"),
    (None, None),
]

In [63]:
# Load the word pairs into a two-column DataFrame (word1, word2).
df = spark.createDataFrame(data, schema=["word1", "word2"])

In [None]:
# Preview the input word pairs for the match rating comparison example.
df.show()

In [67]:
# Add a match_rating_comparison column computed from word1 and word2.
comparison_col = ceja.match_rating_comparison(F.col("word1"), F.col("word2"))
df = df.withColumn("match_rating_comparison", comparison_col)

In [68]:
# Display each word pair alongside its match rating comparison result.
df.show()

+-----+-----+-----------------------+
|word1|word2|match_rating_comparison|
+-----+-----+-----------------------+
|  mat| matt|                   true|
|there|their|                   true|
|luisa|bruna|                  false|
| null| null|                   null|
+-----+-----+-----------------------+

