# **Data Processing Operations Using PySpark**

In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [None]:
# Run this before any `pyspark` import: the notebook never pip-installs
# pyspark, so it relies on findspark to make the unpacked Spark distribution
# importable (presumably via the SPARK_HOME set in the setup cell — confirm).
import findspark
findspark.init()

Loading Dataset


In [None]:
from pyspark.sql import SparkSession

# Reuse the active session if one exists; otherwise start one under this app name.
spark = (
    SparkSession.builder
    .appName("NumericalProcessing")
    .getOrCreate()
)

# Read the housing CSV, taking column names from the first line and letting
# Spark infer the column types instead of loading everything as strings.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/content/sample_data/california_housing_train.csv")
)

# Preview the first five rows.
data.show(5)

Getting Column Names

In [None]:
# Column names of the loaded DataFrame, as reported by `data.columns`.
colNames = data.columns
# Bare last expression so the notebook renders the value as the cell output.
colNames

Basic Operations

In [None]:
from pyspark.sql.functions import col
# Derive a new column holding longitude + latitude for every row; `data`
# itself is not modified, the result is a new DataFrame.
res_data = data.withColumn(
    "longitude_latitude_sum",
    data["longitude"] + data["latitude"],
)
res_data.show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+----------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|longitude_latitude_sum|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+----------------------+
|  -114.31|   34.19|              15.0|     5612.0|        1283.0|    1015.0|     472.0|       1.4936|           66900.0|                -80.12|
|  -114.47|    34.4|              19.0|     7650.0|        1901.0|    1129.0|     463.0|         1.82|           80100.0|                -80.07|
|  -114.56|   33.69|              17.0|      720.0|         174.0|     333.0|     117.0|       1.6509|           85700.0|                -80.87|
|  -114.57|   33.64|              14.0|     1501.0|         337.0|     515.0|     226.0|       3.1917|           73400.0|    -80.9

Aggregation Operations

Sorting By a Column

In [None]:
# Import the functions module under an alias rather than pulling `sum` and
# `max` into the notebook namespace, where they would shadow the Python
# builtins of the same name in every later cell.
from pyspark.sql import functions as F

# Compute all three latitude aggregates in a single select, so Spark scans
# the data once instead of running one job per statistic. The select yields
# exactly one Row, which unpacks like a tuple in column order.
avg_res, sum_res, max_res = data.select(
    F.avg("latitude"),
    F.sum("latitude"),
    F.max("latitude"),
).first()
print(avg_res, sum_res, max_res)

35.6252247058827 605628.8200000059 41.95


Filtering

In [None]:
# Keep only the rows whose population is strictly greater than 1000.
filter_res = data.where(data["population"] > 1000)
filter_res.show()

In [None]:
# Sorting by a column: order the rows by total_rooms (ascending, the default).
sorted_by_rooms = data.sort("total_rooms")
sorted_by_rooms.show()

Statistical Operations

In [None]:
from pyspark.sql.functions import mean, stddev
# Each aggregate produces a one-row, one-column result; [0][0] pulls out the scalar.
mean_res = data.agg(mean("total_bedrooms")).collect()[0][0]
std_res = data.agg(stddev("total_bedrooms")).collect()[0][0]
print(mean_res, std_res)

539.4108235294118 421.4994515798648


Feature Engineering

In [None]:
from pyspark.sql.functions import sqrt
# Add a square-root transform of `households` as a new feature column.
sqrt_data = data.withColumn(
    "households_sqrt",
    sqrt(data["households"]),
)
sqrt_data.show(5)

Missing Data Handling

In [None]:
from pyspark.sql.functions import when

# Build `filled_rooms`: 0 where total_rooms is null, the original value
# otherwise. The source column itself is left untouched.
filled_df = data.withColumn(
    "filled_rooms",
    when(data["total_rooms"].isNull(), 0).otherwise(data["total_rooms"]),
)
filled_df.show()

User Defined Functions

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def custom_function(value):
    """Double a single value; used as the Python body of a Spark UDF.

    Args:
        value: A numeric cell value, or None for a missing (NULL) cell.

    Returns:
        ``value * 2``, or None when ``value`` is None.
    """
    if value is None:
        # A NULL cell reaches a Python UDF as None; `None * 2` would raise
        # TypeError and fail the whole job, so propagate the null instead.
        return None
    return value * 2

# Wrap the plain Python function as a Spark UDF that returns a double.
custom_udf = udf(custom_function, returnType=DoubleType())

# Apply the UDF row by row to median_income, storing the result in a new column.
result_df = data.withColumn(
    "custom_col",
    custom_udf(data["median_income"]),
)
result_df.show()

Text Processing

In [None]:
# Load the text file through the plain-text reader; the result has a single
# string column named "value" (the column the tokenizer below reads from).
# NOTE(review): "lorem.txt" is a relative path that nothing in this notebook
# creates — it must already exist in the working directory.
txt_data = spark.read.format("text").load("lorem.txt")
txt_data.show()

Tokenization

In [None]:
from pyspark.ml.feature import Tokenizer

# Configure a tokenizer that reads the raw text in "value" and writes the
# resulting token list to "words".
tokenizer = (
    Tokenizer()
    .setInputCol("value")
    .setOutputCol("words")
)
tokenized_data = tokenizer.transform(txt_data)
tokenized_data.show()

Stop word removal

In [None]:
from pyspark.ml.feature import StopWordsRemover

# Drop stop words from the token lists in "words"; the surviving tokens go
# to "filtered_words".
remover = (
    StopWordsRemover()
    .setInputCol("words")
    .setOutputCol("filtered_words")
)
filtered_data = remover.transform(tokenized_data)
filtered_data.show()

Text Cleaning

In [None]:
from pyspark.sql.functions import regexp_replace, lower

# Clean the raw text (the "value" column, not the token lists): strip every
# character that is neither an ASCII letter nor whitespace, then lower-case.
cleaned_data = filtered_data.withColumn(
    "cleaned_text",
    lower(
        regexp_replace(filtered_data["value"], "[^a-zA-Z\\s]", "")
    ),
)
cleaned_data.show()

Text Analysis

In [None]:
from pyspark.sql.functions import length

# Character count of each cleaned line of text.
data_with_length = cleaned_data.withColumn(
    "text_length",
    length(cleaned_data["cleaned_text"]),
)

# Average text length over all rows; the aggregate is a single Row whose
# first field is the scalar we want.
avg_length = data_with_length.agg({"text_length": "avg"}).first()[0]
avg_length

78.44444444444444