# Quinn Functions

In [1]:
import os
import sys
# Point both the PySpark driver and worker Python settings at the interpreter
# running this kernel.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Column
from pyspark.sql.functions import col, trim, regexp_replace

In [3]:
# Create (or reuse) a Spark session with master "local" and app name "demo",
# then raise the log level to ERROR.
spark = (
    SparkSession.builder
    .master("local")
    .appName("demo")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

24/10/02 17:17:42 WARN Utils: Your hostname, Matthews-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.15.118 instead (on interface en0)
24/10/02 17:17:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/02 17:17:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 59235)
Traceback (most recent call last):
  File "/opt/miniconda3/envs/pyspark-351/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/miniconda3/envs/pyspark-351/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/

## single_space

In [14]:
def single_space(col: Column) -> Column:
    """Collapse runs of spaces into one space and trim the result.

    Args:
        col: Column holding the text to clean.

    Returns:
        A Column where every run of one or more literal space characters has
        been replaced by a single space, with ``trim`` applied afterwards.
        The pattern matches only the space character itself.
    """
    collapsed = regexp_replace(col, " +", " ")
    return trim(collapsed)

In [15]:
# Sample rows for single_space: padding on both sides, leading spaces only,
# repeated spaces between words, and a missing value.
df = spark.createDataFrame(
    data=[
        ("  I like     fish  ",),
        ("    zombies",),
        ("simpsons   cat lady",),
        (None,),
    ],
    schema=["words"],
)

In [16]:
# Display the sample rows as created, before any cleaning is applied.
df.show()

+-------------------+
|              words|
+-------------------+
|  I like     fish  |
|            zombies|
|simpsons   cat lady|
|               NULL|
+-------------------+



In [17]:
# Add the cleaned text as "clean_words" next to the original "words" column
# so both can be compared side by side.
(
    df
    .withColumn("clean_words", single_space(col("words")))
    .show()
)

+-------------------+-----------------+
|              words|      clean_words|
+-------------------+-----------------+
|  I like     fish  |      I like fish|
|            zombies|          zombies|
|simpsons   cat lady|simpsons cat lady|
|               NULL|             NULL|
+-------------------+-----------------+



## Snake case column names

In [18]:
def with_columns_renamed(fun):
    """Build a DataFrame transformation that renames every column with ``fun``.

    Args:
        fun: Callable that takes an existing column name (``str``) and
            returns the new name for that column.

    Returns:
        A function that accepts a DataFrame and returns a new DataFrame with
        the same columns in the same order, each named ``fun(old_name)``.
    """
    def _(df):
        # Rename positionally with toDF rather than selecting each column
        # through a hand-built "`name`" identifier: wrapping the raw name in
        # backticks breaks when the name itself contains a backtick, and
        # selecting by name is ambiguous when two columns share a name.
        new_names = [fun(col_name) for col_name in df.columns]
        return df.toDF(*new_names)
    return _

In [19]:
def snake_case_col_names(df):
    """Return ``df`` with every column name lower-cased and spaces replaced by underscores.

    The renaming itself is delegated to ``with_columns_renamed``.
    """
    def to_snake_case(name):
        # Lower-case first, then swap each space for an underscore.
        return name.lower().replace(" ", "_")

    rename_all = with_columns_renamed(to_snake_case)
    return rename_all(df)

In [20]:
# Sample frame whose column headers mix upper/lower case and contain spaces,
# used as input for the column-renaming demo.
df = spark.createDataFrame(
    data=[
        ("jose", "a"),
        ("li", "b"),
        ("sam", "c"),
    ],
    schema=["I like CHEESE", "YUMMMMY stuff"],
)

In [21]:
# Display the frame with its original column headers.
df.show()

+-------------+-------------+
|I like CHEESE|YUMMMMY stuff|
+-------------+-------------+
|         jose|            a|
|           li|            b|
|          sam|            c|
+-------------+-------------+



In [22]:
# Display the same rows after the column headers are converted to snake case.
snake_case_col_names(df).show()

+-------------+-------------+
|i_like_cheese|yummmmy_stuff|
+-------------+-------------+
|         jose|            a|
|           li|            b|
|          sam|            c|
+-------------+-------------+

