In [1]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F, types as T

In [2]:
# The application name and the warehouse directory are both derived from the
# OS user running the notebook.
user = gp.getuser()
spark = (
    SparkSession.builder
    .master('yarn')
    .appName(f'{user}-Week-5-Assignment-3')
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse')
    .config('spark.sql.catalogImplementation', 'hive')
    .enableHiveSupport()
    .getOrCreate()
)

In [3]:
# Bare expression: the session object is rendered as this cell's output.
spark

In [4]:
# List the HDFS directory that holds the customers dataset.
!hadoop fs -ls /public/trendytech/retail_db/customers

Found 1 items
-rw-r--r--   3 itv005857 supergroup     953719 2023-04-26 16:47 /public/trendytech/retail_db/customers/part-00000


In [5]:
# Print the start of the customers file to inspect its raw, comma-separated layout.
!hadoop fs -head /public/trendytech/retail_db/customers/part-00000

1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521
2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126
3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,00725
4,Mary,Jones,XXXXXXXXX,XXXXXXXXX,8324 Little Common,San Marcos,CA,92069
5,Robert,Hudson,XXXXXXXXX,XXXXXXXXX,"10 Crystal River Mall ",Caguas,PR,00725
6,Mary,Smith,XXXXXXXXX,XXXXXXXXX,3151 Sleepy Quail Promenade,Passaic,NJ,07055
7,Melissa,Wilcox,XXXXXXXXX,XXXXXXXXX,9453 High Concession,Caguas,PR,00725
8,Megan,Smith,XXXXXXXXX,XXXXXXXXX,3047 Foggy Forest Plaza,Lawrence,MA,01841
9,Mary,Perez,XXXXXXXXX,XXXXXXXXX,3616 Quaking Street,Caguas,PR,00725
10,Melissa,Smith,XXXXXXXXX,XXXXXXXXX,8598 Harvest Beacon Plaza,Stafford,VA,22554
11,Mary,Huffman,XXXXXXXXX,XXXXXXXXX,3169 Stony Woods,Caguas,PR,00725
12,Christopher,Smith,XXXXXXXXX,XXXXXXXXX,5594 Jagged Embers By-pass,San Antonio,TX,78227
13,Mary,Baldwin,XXXXXXXXX,XXXXXXXXX,7922 Iron Oak Gardens,Caguas,PR,00725
14,Katherine

In [6]:
# Explicit schema for the customers file: an integer id followed by eight
# string columns. The zip code is kept as a string so that leading zeros
# (e.g. 00725) are preserved.
schema = T.StructType(
    [T.StructField('cust_id', T.IntegerType())]
    + [
        T.StructField(column_name, T.StringType())
        for column_name in (
            'cust_fname',
            'cust_lname',
            'cust_email',
            'cust_password',
            'cust_street',
            'cust_city',
            'cust_state',
            'cust_zipcode',
        )
    ]
)

In [7]:
# Load the customers file as CSV, applying the explicit schema rather than
# letting Spark infer column names and types.
df_cust = spark.read.csv(
    '/public/trendytech/retail_db/customers/part-00000',
    schema=schema,
)

In [8]:
# Print each column's name, type and nullability to confirm the schema was applied.
df_cust.printSchema()

root
 |-- cust_id: integer (nullable = true)
 |-- cust_fname: string (nullable = true)
 |-- cust_lname: string (nullable = true)
 |-- cust_email: string (nullable = true)
 |-- cust_password: string (nullable = true)
 |-- cust_street: string (nullable = true)
 |-- cust_city: string (nullable = true)
 |-- cust_state: string (nullable = true)
 |-- cust_zipcode: string (nullable = true)



In [9]:
# Preview the first five rows without truncating long column values.
df_cust.show(n=5, truncate=False)

+-------+----------+----------+----------+-------------+-----------------------+-----------+----------+------------+
|cust_id|cust_fname|cust_lname|cust_email|cust_password|cust_street            |cust_city  |cust_state|cust_zipcode|
+-------+----------+----------+----------+-------------+-----------------------+-----------+----------+------------+
|1      |Richard   |Hernandez |XXXXXXXXX |XXXXXXXXX    |6303 Heather Plaza     |Brownsville|TX        |78521       |
|2      |Mary      |Barrett   |XXXXXXXXX |XXXXXXXXX    |9526 Noble Embers Ridge|Littleton  |CO        |80126       |
|3      |Ann       |Smith     |XXXXXXXXX |XXXXXXXXX    |3422 Blue Pioneer Bend |Caguas     |PR        |00725       |
|4      |Mary      |Jones     |XXXXXXXXX |XXXXXXXXX    |8324 Little Common     |San Marcos |CA        |92069       |
|5      |Robert    |Hudson    |XXXXXXXXX |XXXXXXXXX    |10 Crystal River Mall  |Caguas     |PR        |00725       |
+-------+----------+----------+----------+-------------+--------

In [10]:
# Register the DataFrame as a temporary view named 'customers' so that it can
# be queried through spark.sql.
df_cust.createOrReplaceTempView('customers')

In [11]:
# Confirm that a temporary view called 'customers' is registered in the session.
(
    spark.sql('show tables')
    .filter('isTemporary == "true" and tableName == "customers"')
    .show()
)

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
|        |customers|       true|
+--------+---------+-----------+



In [12]:
# Preview five rows through the SQL interface to check the view is queryable.
(
    spark.sql('select * from customers limit 5')
    .show()
)

+-------+----------+----------+----------+-------------+--------------------+-----------+----------+------------+
|cust_id|cust_fname|cust_lname|cust_email|cust_password|         cust_street|  cust_city|cust_state|cust_zipcode|
+-------+----------+----------+----------+-------------+--------------------+-----------+----------+------------+
|      1|   Richard| Hernandez| XXXXXXXXX|    XXXXXXXXX|  6303 Heather Plaza|Brownsville|        TX|       78521|
|      2|      Mary|   Barrett| XXXXXXXXX|    XXXXXXXXX|9526 Noble Embers...|  Littleton|        CO|       80126|
|      3|       Ann|     Smith| XXXXXXXXX|    XXXXXXXXX|3422 Blue Pioneer...|     Caguas|        PR|       00725|
|      4|      Mary|     Jones| XXXXXXXXX|    XXXXXXXXX|  8324 Little Common| San Marcos|        CA|       92069|
|      5|    Robert|    Hudson| XXXXXXXXX|    XXXXXXXXX|10 Crystal River ...|     Caguas|        PR|       00725|
+-------+----------+----------+----------+-------------+--------------------+-----------

### Find the total number of customers in each state.

In [13]:
# DataFrame API: number of customer rows per state.
(
    df_cust
    .groupBy('cust_state')
    .count()
    .show()
)

+----------+-----+
|cust_state|count|
+----------+-----+
|        AZ|  213|
|        SC|   41|
|        LA|   63|
|        MN|   39|
|        NJ|  219|
|        DC|   42|
|        OR|  119|
|        VA|  136|
|        RI|   15|
|        KY|   35|
|        MI|  254|
|        NV|  103|
|        WI|   64|
|        ID|    9|
|        CA| 2012|
|        CT|   73|
|        MT|    7|
|        NC|  150|
|        MD|  164|
|        DE|   23|
+----------+-----+
only showing top 20 rows



In [14]:
# Spark SQL: number of customers (non-null cust_id values) per state.
spark.sql(
    'select cust_state, count(cust_id) '
    'from customers '
    'group by cust_state'
).show()

+----------+--------------+
|cust_state|count(cust_id)|
+----------+--------------+
|        AZ|           213|
|        SC|            41|
|        LA|            63|
|        MN|            39|
|        NJ|           219|
|        DC|            42|
|        OR|           119|
|        VA|           136|
|        RI|            15|
|        KY|            35|
|        MI|           254|
|        NV|           103|
|        WI|            64|
|        ID|             9|
|        CA|          2012|
|        CT|            73|
|        MT|             7|
|        NC|           150|
|        MD|           164|
|        DE|            23|
+----------+--------------+
only showing top 20 rows



### Find the top 5 most common last names among the customers.

In [15]:
# DataFrame API: the five most frequent last names, most common first.
(
    df_cust
    .groupBy('cust_lname')
    .count()
    .orderBy(F.desc('count'))
    .limit(5)
    .show()
)

+----------+-----+
|cust_lname|count|
+----------+-----+
|     Smith| 4626|
|   Johnson|   76|
|  Williams|   69|
|     Jones|   65|
|     Brown|   62|
+----------+-----+



In [16]:
# Spark SQL: the five most frequent last names, most common first.
spark.sql(
    'select cust_lname, count(cust_id) as count '
    'from customers '
    'group by cust_lname '
    'order by count desc '
    'limit 5'
).show()

+----------+-----+
|cust_lname|count|
+----------+-----+
|     Smith| 4626|
|   Johnson|   76|
|  Williams|   69|
|     Jones|   65|
|     Brown|   62|
+----------+-----+



### Check whether there are any customers whose zip codes are not valid (i.e., not exactly 5 digits).

In [17]:
# A valid zip code is exactly five digits. A length-only check would accept
# values such as 'ABCDE', and would silently drop rows whose zip code is NULL
# (NULL != 5 evaluates to NULL, which a filter treats as false). Both cases
# are therefore flagged explicitly here.
df_cust.filter(
    F.col('cust_zipcode').isNull()
    | ~F.col('cust_zipcode').rlike(r'^[0-9]{5}$')
).show()

+-------+----------+----------+----------+-------------+-----------+---------+----------+------------+
|cust_id|cust_fname|cust_lname|cust_email|cust_password|cust_street|cust_city|cust_state|cust_zipcode|
+-------+----------+----------+----------+-------------+-----------+---------+----------+------------+
+-------+----------+----------+----------+-------------+-----------+---------+----------+------------+



In [18]:
# A valid zip code is exactly five digits. A length-only check would accept
# values such as 'ABCDE', and would silently drop rows whose zip code is NULL
# (NULL != 5 evaluates to NULL, which WHERE treats as false). Both cases are
# therefore flagged explicitly here.
spark.sql(
    'select * from customers '
    'where cust_zipcode is null '
    "or cust_zipcode not rlike '^[0-9]{5}$'"
).show()

+-------+----------+----------+----------+-------------+-----------+---------+----------+------------+
|cust_id|cust_fname|cust_lname|cust_email|cust_password|cust_street|cust_city|cust_state|cust_zipcode|
+-------+----------+----------+----------+-------------+-----------+---------+----------+------------+
+-------+----------+----------+----------+-------------+-----------+---------+----------+------------+



### Find the number of customers from each city in the state of California (CA).

In [19]:
# DataFrame API: customers per Californian city, largest cities first.
(
    df_cust
    .where('cust_state == "CA"')
    .groupBy('cust_city')
    .count()
    .orderBy(F.desc('count'))
    .show()
)

+--------------+-----+
|     cust_city|count|
+--------------+-----+
|   Los Angeles|  224|
|     San Diego|  104|
|      San Jose|   71|
|   Bakersfield|   41|
|     Santa Ana|   36|
|    Long Beach|   36|
|        Fresno|   29|
|     Escondido|   29|
|       Ontario|   29|
| San Francisco|   28|
|     Riverside|   27|
| Mission Viejo|   26|
|     Oceanside|   24|
|    Sacramento|   23|
|       Modesto|   23|
|       Fremont|   22|
|       Hayward|   21|
| Moreno Valley|   21|
|   Simi Valley|   20|
|San Bernardino|   20|
+--------------+-----+
only showing top 20 rows



In [20]:
# Spark SQL: customers per Californian city, largest cities first.
spark.sql(
    'select cust_city, count(cust_id) as count '
    'from customers '
    'where cust_state = "CA" '
    'group by cust_city '
    'order by count desc'
).show()

+--------------+-----+
|     cust_city|count|
+--------------+-----+
|   Los Angeles|  224|
|     San Diego|  104|
|      San Jose|   71|
|   Bakersfield|   41|
|     Santa Ana|   36|
|    Long Beach|   36|
|        Fresno|   29|
|     Escondido|   29|
|       Ontario|   29|
| San Francisco|   28|
|     Riverside|   27|
| Mission Viejo|   26|
|     Oceanside|   24|
|    Sacramento|   23|
|       Modesto|   23|
|       Fremont|   22|
|       Hayward|   21|
| Moreno Valley|   21|
|   Simi Valley|   20|
|San Bernardino|   20|
+--------------+-----+
only showing top 20 rows



In [21]:
# Stop the Spark session now that the analysis is complete.
spark.stop()