### Reading in Kaggles Used Cars Date set for step 1 of our ETL

In [1]:
from pyspark.sql import SparkSession


In [2]:
spark = SparkSession.builder \
    .appName("ETL") \
    .getOrCreate()
    

In [3]:
df = spark.read.csv("AdvWorksData.csv", header=True, inferSchema=True)

In [4]:
df.show(5)

+---------------+------------------+--------------------+--------------+--------------+------+-------+------------+-----------+----------+---------+------------+---------+-----------------+--------+---------+-----------------+----------+-------------+---------+
|productcategory|productsubcategory|             product| saleterritory|       Country|  City|   Sate|    Customer|   Employee|OrderCount|OrderDate|StandardCost|UnitPrice|UnitPriceDiscount|Discount|ListPrice|SaleswithStandard|  NetSales|OrderQuantity|    Sales|
+---------------+------------------+--------------------+--------------+--------------+------+-------+------------+-----------+----------+---------+------------+---------+-----------------+--------+---------+-----------------+----------+-------------+---------+
|       Clothing|              Caps|        AWC Logo Cap|United Kingdom|United Kingdom| Berks|England|  Gary Suess|Amy Alberts|         1|  00:00.0|      6.9223|   5.0136|             0.02|  1.4038|   8.6442|      

### Our datas' Schema

In [5]:
df.printSchema()

root
 |-- productcategory: string (nullable = true)
 |-- productsubcategory: string (nullable = true)
 |-- product: string (nullable = true)
 |-- saleterritory: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Sate: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- Employee: string (nullable = true)
 |-- OrderCount: integer (nullable = true)
 |-- OrderDate: string (nullable = true)
 |-- StandardCost: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- UnitPriceDiscount: double (nullable = true)
 |-- Discount: double (nullable = true)
 |-- ListPrice: string (nullable = true)
 |-- SaleswithStandard: string (nullable = true)
 |-- NetSales: string (nullable = true)
 |-- OrderQuantity: integer (nullable = true)
 |-- Sales: double (nullable = true)



### We are going to make a new set of data for simpler exploration

In [6]:
new_columns = ["NetSales", "OrderQuantity", "UnitPrice", "productcategory","productsubcategory", "City", "Sate"]

In [7]:
new_columns = df.select(new_columns)
new_columns.show(5,truncate=False)

+----------+-------------+---------+---------------+------------------+------+-------+
|NetSales  |OrderQuantity|UnitPrice|productcategory|productsubcategory|City  |Sate   |
+----------+-------------+---------+---------------+------------------+------+-------+
|-28.125608|14           |5.0136   |Clothing       |Caps              |Berks |England|
|28.125    |6            |15.0     |Accessories    |Locks             |Berks |England|
|56.9848   |8            |22.794   |Clothing       |Gloves            |Berks |England|
|66.862    |16           |20.8945  |Clothing       |Gloves            |London|England|
|49.8617   |7            |22.794   |Clothing       |Gloves            |Berks |England|
+----------+-------------+---------+---------------+------------------+------+-------+
only showing top 5 rows



In [8]:
row_count = new_columns.count()
print(f"The number of rows in the new Dataframe is: {row_count}")

The number of rows in the new Dataframe is: 60880


### Going to perform a statistical query on this new DataSet

In [9]:
from pyspark.sql import functions as F

In [10]:
columns_of_interest = [
    F.col("NetSales"), F.col("OrderQuantity"), F.col("UnitPrice"),
    F.col("ProductCategory"), F.col("ProductSubcategory"),
    F.col("City"), F.col("Sate")
]

In [11]:
eda_df = df.select(columns_of_interest)

In [12]:
eda_df.describe().show(truncate=False)

+-------+------------------+------------------+------------------+---------------+------------------+--------+-------+
|summary|NetSales          |OrderQuantity     |UnitPrice         |ProductCategory|ProductSubcategory|City    |Sate   |
+-------+------------------+------------------+------------------+---------------+------------------+--------+-------+
|count  |60880             |60880             |60880             |60880          |60880             |60880   |60880  |
|mean   |-38.17250914731913|3.523587385019711 |444.40329774967597|NULL           |NULL              |NULL    |NULL   |
|stddev |405.5368767701377 |3.0339497480493964|519.9546238943453 |NULL           |NULL              |NULL    |NULL   |
|min    |-1.5283           |1                 |1.3282            |Accessories    |Bib-Shorts        |Abingdon|Alabama|
|max    |NULL              |44                |2146.962          |NULL           |Wheels            |Zeeland |Wyoming|
+-------+------------------+------------------+-

In [13]:
distinct_counts = {col: eda_df.select(col).distinct().count() for col in [
    "ProductCategory", "ProductSubcategory", "City", "Sate"]}
print("\nDistinct Counts:")
for col, count in distinct_counts.items():
    print(f"{col}: {count}")


Distinct Counts:
ProductCategory: 5
ProductSubcategory: 34
City: 416
Sate: 64


In [14]:
eda_df.show(5,truncate=False)

+----------+-------------+---------+---------------+------------------+------+-------+
|NetSales  |OrderQuantity|UnitPrice|ProductCategory|ProductSubcategory|City  |Sate   |
+----------+-------------+---------+---------------+------------------+------+-------+
|-28.125608|14           |5.0136   |Clothing       |Caps              |Berks |England|
|28.125    |6            |15.0     |Accessories    |Locks             |Berks |England|
|56.9848   |8            |22.794   |Clothing       |Gloves            |Berks |England|
|66.862    |16           |20.8945  |Clothing       |Gloves            |London|England|
|49.8617   |7            |22.794   |Clothing       |Gloves            |Berks |England|
+----------+-------------+---------+---------------+------------------+------+-------+
only showing top 5 rows



### Due to system security constraints and file issues I will convert this to Pandas and store in a temp directory

In [38]:
import pandas as pd
import tempfile
import os

In [39]:
temp_dir = tempfile.mkdtemp()

In [40]:
csv_file_path = os.path.join(temp_dir, "data.csv")

In [41]:
pandas_df.to_csv(csv_file_path, header=True,index=False)