In [1]:
!pip -q install pyspark

Collecting pyspark
  Downloading pyspark-3.3.2.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 kB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.3.2-py2.py3-none-any.whl size=281824025 sha256=87c0f744609fb94bfe3a8c9ba9637ef439ef15d399689d033a8d2439dd64b9b0
  Stored in directory: /root/.cache/pip/wheels/5a/54/9b/a89cac960efb57c4c35d41cc7c9f7b80daa21108bc376339b7
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.7
  

## Background of the ETL

This notebook is meant to run inside a Spark cluster with a PySpark 3 instance. Here the ETL steps are created and tested before being executed in the cluster.

This notebook deals with customer data, which is available in CSV format. The dataset is attached to this notebook, so there is no need to download the data when testing in the Kaggle environment.

When executing inside the cluster, the dataset needs to be pulled using the Kaggle API command, which is explained in the video below:

https://youtu.be/m_4ZDaX24co

In [None]:
### Example only: how the session would be configured on the YARN cluster.
# The builder is guarded by a flag so that "Restart & Run All" skips it instead
# of failing with a NameError; flip the flag only when running on the cluster.
RUN_ON_CLUSTER = False

if RUN_ON_CLUSTER:
    import getpass

    from pyspark.sql import SparkSession

    # NOTE(review): assumes the warehouse directory is keyed by the OS login
    # name -- confirm against the cluster's conventions before enabling.
    username = getpass.getuser()

    spark = (
        SparkSession.builder
        .config('spark.ui.port', '0')
        .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
        .enableHiveSupport()
        .appName(f'{username} | ETL - Overview')
        .master('yarn')
        .getOrCreate()
    )

In [4]:
from pyspark.sql import SparkSession

# Local-mode session used for developing and testing the ETL steps.
# Hive support is enabled so the tables created below are registered in the metastore.
spark = (
    SparkSession.builder
    .appName('Data Loader')
    .enableHiveSupport()
    .master('local')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/30 09:07:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Load the raw customers CSV into a DataFrame: the first line supplies the
# column names and the column types are inferred from the data.
customer_raw = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/kaggle/input/customers-dataset/Customers.csv")
)

# What transformations are executed in this notebook

1) The column names are modified in the Spark DataFrame

2) A new table named customer_spark_table is created in the Spark metastore

3) Execute a simple filter transformation: select the rows that have an annual income above 15000 and a spending score above 50

4) Write the filtered rows as a new table inside the Spark metastore

5) Write the new table as a CSV file

6) Convert the Jupyter notebook cells into a PySpark script that can run on the given CSV file (it will work with the Customers.csv file only)



In [7]:
# Peek at the first two records; head(n) returns them as a list of Row objects.
customer_raw.head(2)

[Row(CustomerID=1, Gender='Male', Age=19, Annual Income ($)=15000, Spending Score (1-100)=39, Profession='Healthcare', Work Experience=1, Family Size=4),
 Row(CustomerID=2, Gender='Male', Age=21, Annual Income ($)=35000, Spending Score (1-100)=81, Profession='Engineer', Work Experience=3, Family Size=3)]

In [9]:
# Tabular preview of the first two rows, without shortening long cell values.
customer_raw.show(n=2, truncate=False)

+----------+------+---+-----------------+----------------------+----------+---------------+-----------+
|CustomerID|Gender|Age|Annual Income ($)|Spending Score (1-100)|Profession|Work Experience|Family Size|
+----------+------+---+-----------------+----------------------+----------+---------------+-----------+
|1         |Male  |19 |15000            |39                    |Healthcare|1              |4          |
|2         |Male  |21 |35000            |81                    |Engineer  |3              |3          |
+----------+------+---+-----------------+----------------------+----------+---------------+-----------+
only showing top 2 rows



In [10]:
# List the raw column names; several contain spaces, "$" or parentheses,
# which is why they are renamed further down.
customer_raw.columns

['CustomerID',
 'Gender',
 'Age',
 'Annual Income ($)',
 'Spending Score (1-100)',
 'Profession',
 'Work Experience',
 'Family Size']

In [13]:
# Inspect the inferred schema (names, types, nullability); it serves as the
# template for the renamed schema defined next.
customer_raw.schema

StructType([StructField('CustomerID', IntegerType(), True), StructField('Gender', StringType(), True), StructField('Age', IntegerType(), True), StructField('Annual Income ($)', IntegerType(), True), StructField('Spending Score (1-100)', IntegerType(), True), StructField('Profession', StringType(), True), StructField('Work Experience', IntegerType(), True), StructField('Family Size', IntegerType(), True)])

In [15]:
from pyspark.sql.types import StructField, StructType, IntegerType,StringType

# Target schema: same column order and types as the inferred one, but with
# names free of spaces and special characters. Every column stays nullable.
_renamed_columns = [
    ('CustomerID', IntegerType),
    ('Gender', StringType),
    ('Age', IntegerType),
    ('AnnualIncome', IntegerType),
    ('SpendingScore', IntegerType),
    ('Profession', StringType),
    ('WorkExperience', IntegerType),
    ('FamilySize', IntegerType),
]

updated_schema = StructType(
    [StructField(col_name, col_type(), True)
     for col_name, col_type in _renamed_columns]
)

In [16]:
# Read the same CSV again, this time imposing updated_schema instead of
# inferring one. The header line is still skipped, but its names no longer
# match the schema, so Spark logs a "CSV header does not conform to the
# schema" warning whenever this DataFrame is evaluated.
customer_updated = (
    spark.read
    .schema(updated_schema)
    .option("header", True)
    .csv("/kaggle/input/customers-dataset/Customers.csv")
)

In [17]:
# Confirm the renamed columns by displaying the first two rows.
customer_updated.show(2)

23/03/30 09:21:41 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: CustomerID, Gender, Age, Annual Income ($), Spending Score (1-100), Profession, Work Experience, Family Size
 Schema: CustomerID, Gender, Age, AnnualIncome, SpendingScore, Profession, WorkExperience, FamilySize
Expected: AnnualIncome but found: Annual Income ($)
CSV file: file:///kaggle/input/customers-dataset/Customers.csv
+----------+------+---+------------+-------------+----------+--------------+----------+
|CustomerID|Gender|Age|AnnualIncome|SpendingScore|Profession|WorkExperience|FamilySize|
+----------+------+---+------------+-------------+----------+--------------+----------+
|         1|  Male| 19|       15000|           39|Healthcare|             1|         4|
|         2|  Male| 21|       35000|           81|  Engineer|             3|         3|
+----------+------+---+------------+-------------+----------+--------------+----------+
only showing top 2 rows



In [18]:
# Print the schema as a tree to verify the new column names and their types.
customer_updated.printSchema()

root
 |-- CustomerID: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- AnnualIncome: integer (nullable = true)
 |-- SpendingScore: integer (nullable = true)
 |-- Profession: string (nullable = true)
 |-- WorkExperience: integer (nullable = true)
 |-- FamilySize: integer (nullable = true)



In [20]:
# List the databases currently registered in the metastore.
spark.sql("Show Databases").show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [21]:
# Create the working database; IF NOT EXISTS keeps this cell safe to re-run.
spark.sql("CREATE DATABASE IF NOT EXISTS customer_spark_db")

23/03/30 09:25:40 WARN ObjectStore: Failed to get database customer_spark_db, returning NoSuchObjectException
23/03/30 09:25:40 WARN ObjectStore: Failed to get database customer_spark_db, returning NoSuchObjectException
23/03/30 09:25:40 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
23/03/30 09:25:40 WARN ObjectStore: Failed to get database customer_spark_db, returning NoSuchObjectException


DataFrame[]

In [22]:
# Make customer_spark_db the current database so the unqualified table names
# used below are resolved inside it.
spark.sql("USE customer_spark_db")

DataFrame[]

In [25]:
# Persist the renamed DataFrame as a CSV-backed table in the current database,
# replacing any earlier copy of the table.
(
    customer_updated.write
    .format("csv")
    .mode("overwrite")
    .saveAsTable("customer_spark_table")
)

23/03/30 09:29:31 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: CustomerID, Gender, Age, Annual Income ($), Spending Score (1-100), Profession, Work Experience, Family Size
 Schema: CustomerID, Gender, Age, AnnualIncome, SpendingScore, Profession, WorkExperience, FamilySize
Expected: AnnualIncome but found: Annual Income ($)
CSV file: file:///kaggle/input/customers-dataset/Customers.csv
23/03/30 09:29:31 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider csv. Persisting data source table `customer_spark_db`.`customer_spark_table` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
23/03/30 09:29:31 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
23/03/30 09:29:31 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
23/03/30 09:29:31 WARN HiveConf: Hiv

In [26]:
# List the tables in the current database to confirm the table was registered.
spark.sql("SHOW TABLES").show()

+-----------------+--------------------+-----------+
|        namespace|           tableName|isTemporary|
+-----------------+--------------------+-----------+
|customer_spark_db|customer_spark_table|      false|
+-----------------+--------------------+-----------+



In [27]:
# Read back five rows from the saved table as a sanity check.
spark.sql("""SELECT * FROM customer_spark_table LIMIT 5""").show()

+----------+------+---+------------+-------------+-------------+--------------+----------+
|CustomerID|Gender|Age|AnnualIncome|SpendingScore|   Profession|WorkExperience|FamilySize|
+----------+------+---+------------+-------------+-------------+--------------+----------+
|         1|  Male| 19|       15000|           39|   Healthcare|             1|         4|
|         2|  Male| 21|       35000|           81|     Engineer|             3|         3|
|         3|Female| 20|       86000|            6|     Engineer|             1|         1|
|         4|Female| 23|       59000|           77|       Lawyer|             0|         2|
|         5|Female| 31|       38000|           40|Entertainment|             2|         6|
+----------+------+---+------------+-------------+-------------+--------------+----------+



In [28]:
# Preview the filter transformation: customers with an annual income above
# 15000 and a spending score above 50 (first five matches only).
filter_preview_sql = """SELECT * FROM customer_spark_table
            WHERE AnnualIncome > 15000 AND
            SpendingScore > 50"""
spark.sql(filter_preview_sql).show(5)

+----------+------+---+------------+-------------+----------+--------------+----------+
|CustomerID|Gender|Age|AnnualIncome|SpendingScore|Profession|WorkExperience|FamilySize|
+----------+------+---+------------+-------------+----------+--------------+----------+
|         2|  Male| 21|       35000|           81|  Engineer|             3|         3|
|         4|Female| 23|       59000|           77|    Lawyer|             0|         2|
|         6|Female| 22|       58000|           76|    Artist|             0|         2|
|         8|Female| 23|       84000|           94|Healthcare|             1|         3|
|        10|Female| 30|       98000|           72|    Artist|             1|         4|
+----------+------+---+------------+-------------+----------+--------------+----------+
only showing top 5 rows



In [29]:
# Count how many customers satisfy the income / spending-score filter.
filter_count_sql = """SELECT * FROM customer_spark_table
            WHERE AnnualIncome > 15000 AND
            SpendingScore > 50"""
spark.sql(filter_count_sql).count()

972

In [30]:
# Materialise the filtered customers (annual income > 15000 and spending
# score > 50) as a new metastore table.
#
# A plain CREATE TABLE fails once the table exists, which would break re-running
# the notebook, so any copy left over from a previous run is dropped first.
spark.sql("DROP TABLE IF EXISTS transformed_customer_table")

# Explicit AS keyword: the documented CREATE TABLE ... AS SELECT form.
spark.sql("""CREATE TABLE transformed_customer_table AS
            SELECT * FROM customer_spark_table
            WHERE AnnualIncome > 15000 AND
            SpendingScore > 50""")

23/03/30 09:35:57 WARN ResolveSessionCatalog: A Hive serde table will be created as there is no table provider specified. You can set spark.sql.legacy.createHiveTableByDefault to false so that native data source table will be created instead.
23/03/30 09:35:57 WARN HiveMetaStore: Location: file:/kaggle/working/spark-warehouse/customer_spark_db.db/transformed_customer_table specified for non-external table:transformed_customer_table


DataFrame[]

In [31]:
# Row count of the new table; it should equal the count returned by the
# filter query on customer_spark_table.
spark.sql("SELECT * FROM transformed_customer_table").count()

972

In [32]:
# Preview five rows of the transformed table.
spark.sql("""SELECT * FROM transformed_customer_table
          LIMIT 5""").show()

+----------+------+---+------------+-------------+----------+--------------+----------+
|CustomerID|Gender|Age|AnnualIncome|SpendingScore|Profession|WorkExperience|FamilySize|
+----------+------+---+------------+-------------+----------+--------------+----------+
|         2|  Male| 21|       35000|           81|  Engineer|             3|         3|
|         4|Female| 23|       59000|           77|    Lawyer|             0|         2|
|         6|Female| 22|       58000|           76|    Artist|             0|         2|
|         8|Female| 23|       84000|           94|Healthcare|             1|         3|
|        10|Female| 30|       98000|           72|    Artist|             1|         4|
+----------+------+---+------------+-------------+----------+--------------+----------+



In [34]:
# Export the *whole* transformed table as CSV. The earlier version reused the
# LIMIT 5 preview query, so only five rows were written.
# header=True keeps the column names in the exported file(s).
# Note: Spark writes a directory of part files at this path, not a single file.
spark.sql("SELECT * FROM transformed_customer_table").write.csv(
    "/kaggle/working/transformed.csv",
    mode="overwrite",
    header=True,
)