## Import

In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

## Config

### Analysis Condition

In [9]:
# Gate for the final "Write Data" cell: the normalized parquet output is
# written only when this flag is truthy.
WRITE_RESULTS = True

## Spark Session

In [2]:
# Spark master URL; used as the default `master` argument of get_spark_session.
MASTER = 'local[10]'
# Spark configuration key/value pairs; used as the default `spark_config`
# argument of get_spark_session, which applies every entry to the session builder.
CONFIG_DICT = {
    'spark.driver.memory': "8g",
    'spark.executor.memory': "8g",
    'spark.checkpoint.compress': True,
}

### Data Path

In [3]:
# NOTE(review): both values look like placeholders — presumably they must be
# replaced with real locations before the notebook is run; confirm.
# Input location; default `data_path` of load_data, which reads it as parquet.
DATA_PATH = 'PATH/TO/INITIAL/DATA'
# Output location; the "Write Data" cell writes the normalized parquet here.
NORMALIZED_DATA_OUTPUT_PATH = 'PATH/TO/NORMALIZED/DATA/DIRECTORY'

In [4]:
DEBUGGING_MODE = False
TRUNCATE = True


def df_summary(df, df_name=None, debugging_mode=DEBUGGING_MODE, truncate=TRUNCATE):
    """Print a short description of a DataFrame.

    The schema is always printed (preceded by a "DataFrame <name>" line when
    `df_name` is given). The row count and a five-row sample are printed only
    when `debugging_mode` is truthy, since both require calling `count()` and
    `show()` on the DataFrame.

    Args:
        df: Object exposing `printSchema()`, `count()` and `show()`.
        df_name: Optional label printed before the schema.
        debugging_mode: When truthy, also print the row count and sample rows.
        truncate: Forwarded to `df.show(..., truncate=...)`.
    """
    if df_name is not None:
        print(f"DataFrame {df_name}")
    df.printSchema()

    # Outside debugging mode the summary stops at the schema.
    if not debugging_mode:
        return

    row_count = df.count()
    print("Number of rows in df:", row_count)
    print("Sample rows:")
    df.show(5, truncate=truncate)

## Spark

In [5]:
def get_spark_session(app_name,
                      master=MASTER,
                      spark_config=CONFIG_DICT,):
    """Build (or reuse) a SparkSession for this notebook.

    Args:
        app_name: Application name given to the session builder.
        master: Spark master URL; defaults to the module-level MASTER.
        spark_config: Mapping of Spark configuration keys to values; every
            entry is applied to the builder. Defaults to CONFIG_DICT.

    Returns:
        The SparkSession returned by ``getOrCreate()``.
    """
    spark_builder = SparkSession.builder.master(master).appName(app_name)

    for key, val in spark_config.items():
        # Keep the builder returned by config() instead of relying on the
        # call mutating the builder in place.
        spark_builder = spark_builder.config(key, val)

    return spark_builder.getOrCreate()

In [6]:
# Create (or reuse) the notebook-wide Spark session; later cells read data through `spark`.
spark = get_spark_session("Text Normalization")
# Print the address of the Spark web UI for this session.
print("Spark UI address:", spark.sparkContext.uiWebUrl)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/02/02 20:43:13 WARN Utils: Your hostname, nikast, resolves to a loopback address: 127.0.1.1; using 192.168.1.21 instead (on interface wlp0s20f3)
26/02/02 20:43:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/02 20:43:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark UI address: http://192.168.1.21:4040


# Load Datasets

In [7]:
def load_data(data_path=DATA_PATH,):
    """Read the parquet dataset at `data_path` into a Spark DataFrame.

    Uses the notebook-level `spark` session, so that session must exist
    before this function is called.

    Args:
        data_path: Location of the parquet data; defaults to DATA_PATH.

    Returns:
        The DataFrame produced by `spark.read.parquet(data_path)`.
    """
    reader = spark.read
    return reader.parquet(data_path)

# Load the input dataset from the default DATA_PATH.
texts_df = load_data()
# Print its schema (row count and sample rows only appear in debugging mode).
df_summary(texts_df)

root
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- reddit_scores: array (nullable = true)
 |    |-- element: integer (containsNull = true)
 |-- score: float (nullable = true)



In [8]:
# Preview the first five rows as a pandas DataFrame (shown as the cell output).
texts_df.limit(5).toPandas()

                                                                                

Unnamed: 0,title,text,reddit_scores,score
0,FBI sought records related to 'Access Hollywoo...,The federal agents who raided the office of Pr...,"[25, 31, 2]",4.0
1,"IQ, Skepticism, and the Failure of Debate","IQ, Skepticism, and the Failure of Debate\n\nO...","[24, 2, 1]",5.0
2,Trump can't tell the difference between his pe...,The king sits in Dunfermline town drinking the...,[3],4.0
3,At least 19 wounded in cross-LoC shelling by I...,Heavy Indian shelling from across the Line of ...,[10],5.0
4,"ABC changes 'unnecessary and unjustified', Mic...",Three bills before parliament to change the AB...,[116],5.0


# Normalization Modules

# Write Data

In [None]:
# Persist the normalized DataFrame as parquet, but only when WRITE_RESULTS is enabled.
# NOTE(review): `normalized_df` is not defined in any cell shown above; it is
# presumably produced by the "Normalization Modules" section — confirm before running.
if WRITE_RESULTS:
    # mode='overwrite' replaces any existing output at the target path.
    normalized_df.write.parquet(NORMALIZED_DATA_OUTPUT_PATH, mode='overwrite')
    # Report success only after the write call has returned.
    print('Normalized result saved!')