In [1]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F, types as T

In [2]:
user=gp.getuser()

In [3]:
user

'itv005077'

In [4]:
spark = SparkSession.builder \
    .appName(f'{user}-Week-6-Assignment') \
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse') \
    .config('spark.sql.catalogImplementation', 'hive') \
    .enableHiveSupport() \
    .getOrCreate()

In [5]:
spark

### Question-1

In [6]:
data = [
    ("Spring", 12.3),
    ("Summer", 10.5), 
    ("Autumn", 8.2), 
    ("Winter", 15.1)
]

In [7]:
columns = ['season', 'windspeed']

In [8]:
df = spark.createDataFrame(data, columns)

In [9]:
df.printSchema()

root
 |-- season: string (nullable = true)
 |-- windspeed: double (nullable = true)



In [10]:
df.show()

+------+---------+
|season|windspeed|
+------+---------+
|Spring|     12.3|
|Summer|     10.5|
|Autumn|      8.2|
|Winter|     15.1|
+------+---------+



In [11]:
!hadoop fs -head /public/trendytech/datasets/library_data.json

{"library_name": "Central Library","location": "City Center","books": [{"book_id": "B001","book_name": "The Great Gatsby","author": "F. Scott Fitzgerald","copies_available": 5},{"book_id": "B002","book_name": "To Kill a Mockingbird","author": "Harper Lee","copies_available": 3}],"members": [{"member_id": "M001","member_name": "John Smith","age": 28,"books_borrowed": ["B001"]},{"member_id": "M002","member_name": "Emma Johnson","age": 35,"books_borrowed": []}]},
{"library_name": "Community Library","location": "Suburb","books": [{"book_id": "B003","book_name": "1984","author": "George Orwell","copies_available": 2},{"book_id": "B004","book_name": "Pride and Prejudice","author": "Jane Austen","copies_available": 4}],"members": [{"member_id": "M003","member_name": "Michael Brown","age": 42,"books_borrowed": ["B003","B004"]},{"member_id": "M004","member_name": "Sophia Davis","age": 31,"books_borrowed": ["B004"]}]}


In [12]:
book_schema = T.StructType([
    T.StructField('book_id', T.StringType()),
    T.StructField('book_name', T.StringType()),
    T.StructField('author', T.StringType()),
    T.StructField('copies_available', T.IntegerType()),
])

member_schema = T.StructType([
    T.StructField('member_id', T.StringType()),
    T.StructField('member_name', T.StringType()),
    T.StructField('age', T.IntegerType()),
    T.StructField('books_borrowed', T.ArrayType(T.StringType())),
])


schema = T.StructType([
    T.StructField('library_name', T.StringType()),
    T.StructField('location', T.StringType()),
    T.StructField('books', T.ArrayType(book_schema)),
    T.StructField('members', T.ArrayType(member_schema)),
])

In [13]:
df_library = spark.read \
    .format('json') \
    .schema(schema) \
    .load('/public/trendytech/datasets/library_data.json')

In [14]:
df_library.printSchema()

root
 |-- library_name: string (nullable = true)
 |-- location: string (nullable = true)
 |-- books: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- book_id: string (nullable = true)
 |    |    |-- book_name: string (nullable = true)
 |    |    |-- author: string (nullable = true)
 |    |    |-- copies_available: integer (nullable = true)
 |-- members: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- member_id: string (nullable = true)
 |    |    |-- member_name: string (nullable = true)
 |    |    |-- age: integer (nullable = true)
 |    |    |-- books_borrowed: array (nullable = true)
 |    |    |    |-- element: string (containsNull = true)



In [15]:
df_library.show(truncate=False)

+-----------------+-----------+------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------+
|library_name     |location   |books                                                                                           |members                                                                    |
+-----------------+-----------+------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------+
|Central Library  |City Center|[[B001, The Great Gatsby, F. Scott Fitzgerald, 5], [B002, To Kill a Mockingbird, Harper Lee, 3]]|[[M001, John Smith, 28, [B001]], [M002, Emma Johnson, 35, []]]             |
|Community Library|Suburb     |[[B003, 1984, George Orwell, 2], [B004, Pride and Prejudice, Jane Austen, 4]]                   |[[M003, Michael Brown, 42, [B003, B004]], [M004, Sop

### Question-2:

In [16]:
!hadoop fs -head /public/trendytech/datasets/train.csv

train_number,train_name,seats_available,passenger_name,age,ticket_number,seat_number
123,Express,100,John,25,T123,A1
123,Express,100,Emma,30,T124,B2
456,Superfast,150,Michael,35,T125,C3
456,Superfast,150,Sophia,40,T126,D4
789,Local,50,William,28,T127,E5
789,Local,50,Sophia,32,T128,F6
789,Local,50,Oliver,45,T129,G7


In [17]:
schema = 'train_number int, train_name string, seats_available int, passenger_name string, age int, ticket_number string, seat_number string'

In [18]:
df_train = spark.read \
    .format('csv') \
    .option('header', 'true') \
    .schema(schema) \
    .load('/public/trendytech/datasets/train.csv')    

In [19]:
df_train.printSchema()

root
 |-- train_number: integer (nullable = true)
 |-- train_name: string (nullable = true)
 |-- seats_available: integer (nullable = true)
 |-- passenger_name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- ticket_number: string (nullable = true)
 |-- seat_number: string (nullable = true)



In [20]:
df_train.show(5)

+------------+----------+---------------+--------------+---+-------------+-----------+
|train_number|train_name|seats_available|passenger_name|age|ticket_number|seat_number|
+------------+----------+---------------+--------------+---+-------------+-----------+
|         123|   Express|            100|          John| 25|         T123|         A1|
|         123|   Express|            100|          Emma| 30|         T124|         B2|
|         456| Superfast|            150|       Michael| 35|         T125|         C3|
|         456| Superfast|            150|        Sophia| 40|         T126|         D4|
|         789|     Local|             50|       William| 28|         T127|         E5|
+------------+----------+---------------+--------------+---+-------------+-----------+
only showing top 5 rows



In [21]:
drop_cols = ['passenger_name', 'age']
df_train = df_train.drop(*drop_cols)

In [22]:
df_train.show(5)

+------------+----------+---------------+-------------+-----------+
|train_number|train_name|seats_available|ticket_number|seat_number|
+------------+----------+---------------+-------------+-----------+
|         123|   Express|            100|         T123|         A1|
|         123|   Express|            100|         T124|         B2|
|         456| Superfast|            150|         T125|         C3|
|         456| Superfast|            150|         T126|         D4|
|         789|     Local|             50|         T127|         E5|
+------------+----------+---------------+-------------+-----------+
only showing top 5 rows



In [23]:
dups_col = ['train_number', 'ticket_number']
df_train.dropDuplicates(dups_col).count()

7

In [24]:
df_train.select('train_name').distinct().show()

+----------+
|train_name|
+----------+
|   Express|
|     Local|
| Superfast|
+----------+



In [25]:
!hadoop fs -cat /public/trendytech/datasets/sales_data.json

{"store_id": 1, "product": "Apple", "quantity": 10, "revenue": 100.0}
{"store_id": 2, "product": "Banana", "quantity": 15, "revenue": 75.0}
{"store_id": 3, "product": "Orange", "quantity": 12, "revenue": 90.0}
{"store_id": 4, "product": "Mango", "quantity": 8, "revenue": 120.0}
{"store_id": 5, "product": "Grape", "quantity": 20, "revenue": 150.0}
{"store_id": 6, "product": "Watermelon", "quantity": 5, "revenue": 50.0}
{"store_id": 7, "product": "Strawberry", "quantity": 18, "revenue": 108.0}
{"store_id": 8, "product": "Pineapple", "quantity": 14, "revenue": 140.0}
{"store_id": 9, "product": "Cherry", "quantity": 7, "revenue": 105.0}
{"store_id": 10, "product": "Pear", "quantity": 9, "revenue": 81.0}
{"store_id": 11, "product": "Blueberry", "quantity": 11, "revenue": 88.0}
{"store_id": 12, "product": "Kiwi", "quantity": 16, "revenue": 128.0}
{"store_id": 13, "product": "Peach", "quantity": 13, "revenue": 91.0}
{"store_id": 14, "product": "Plum", "quantity": 6, "revenue": 54.0}
{"store_i

In [26]:
schema = T.StructType([
    T.StructField('store_id', T.IntegerType()),
    T.StructField('product', T.StringType()),
    T.StructField('quantity', T.IntegerType()),
    T.StructField('revenue', T.FloatType()),
])

In [27]:
df_sales = spark.read \
    .format('json') \
    .schema(schema) \
    .option('mode', 'permissive') \
    .load('/public/trendytech/datasets/sales_data.json')

In [28]:
df_sales.printSchema()

root
 |-- store_id: integer (nullable = true)
 |-- product: string (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- revenue: float (nullable = true)



In [29]:
df_sales.show(30)

+--------+----------+--------+-------+
|store_id|   product|quantity|revenue|
+--------+----------+--------+-------+
|       1|     Apple|      10|  100.0|
|       2|    Banana|      15|   75.0|
|       3|    Orange|      12|   90.0|
|       4|     Mango|       8|  120.0|
|       5|     Grape|      20|  150.0|
|       6|Watermelon|       5|   50.0|
|       7|Strawberry|      18|  108.0|
|       8| Pineapple|      14|  140.0|
|       9|    Cherry|       7|  105.0|
|      10|      Pear|       9|   81.0|
|      11| Blueberry|      11|   88.0|
|      12|      Kiwi|      16|  128.0|
|      13|     Peach|      13|   91.0|
|      14|      Plum|       6|   54.0|
|      15|     Lemon|      10|   70.0|
|      16| Raspberry|      17|  136.0|
|      17|   Coconut|       4|   80.0|
|      18|   Avocado|      11|   99.0|
|      19|Blackberry|       8|   64.0|
|      20|         G|    null|    NaN|
|    null|      null|    null|   null|
|      22|Watermelon|       5|   null|
+--------+----------+----

In [30]:
df_sales.count()

22

In [31]:
df_sales_dropmalformed = spark.read \
    .format('json') \
    .schema(schema) \
    .option('mode', 'dropmalformed') \
    .load('/public/trendytech/datasets/sales_data.json')

In [32]:
df_sales_dropmalformed.show(truncate=False)

+--------+----------+--------+-------+
|store_id|product   |quantity|revenue|
+--------+----------+--------+-------+
|1       |Apple     |10      |100.0  |
|2       |Banana    |15      |75.0   |
|3       |Orange    |12      |90.0   |
|4       |Mango     |8       |120.0  |
|5       |Grape     |20      |150.0  |
|6       |Watermelon|5       |50.0   |
|7       |Strawberry|18      |108.0  |
|8       |Pineapple |14      |140.0  |
|9       |Cherry    |7       |105.0  |
|10      |Pear      |9       |81.0   |
|11      |Blueberry |11      |88.0   |
|12      |Kiwi      |16      |128.0  |
|13      |Peach     |13      |91.0   |
|14      |Plum      |6       |54.0   |
|15      |Lemon     |10      |70.0   |
|16      |Raspberry |17      |136.0  |
|17      |Coconut   |4       |80.0   |
|18      |Avocado   |11      |99.0   |
|19      |Blackberry|8       |64.0   |
+--------+----------+--------+-------+



In [33]:
df_sales_dropmalformed.count()

21

In [34]:
!hadoop fs -ls /public/trendytech/datasets/

Found 12 items
drwxr-xr-x   - itv005857 supergroup          0 2023-05-18 17:40 /public/trendytech/datasets/customer_nested
-rw-r--r--   3 itv005857 supergroup       1319 2023-05-23 13:04 /public/trendytech/datasets/hospital.csv
-rw-r--r--   3 itv005857 supergroup        925 2023-05-23 13:05 /public/trendytech/datasets/library_data.json
drwxr-xr-x   - itv005857 supergroup          0 2023-05-28 04:30 /public/trendytech/datasets/orders
-rw-r--r--   3 itv005857 supergroup    7064041 2023-05-04 07:46 /public/trendytech/datasets/orders.json
-rw-r--r--   3 itv005857 supergroup        292 2023-05-18 10:50 /public/trendytech/datasets/orders_sample1.csv
-rw-r--r--   3 itv005857 supergroup        292 2023-05-18 10:50 /public/trendytech/datasets/orders_sample2.csv
-rw-r--r--   3 itv005857 supergroup        296 2023-05-18 10:50 /public/trendytech/datasets/orders_sample3.csv
drwxr-xr-x   - itv005857 supergroup          0 2023-05-04 07:54 /public/trendytech/datasets/ordersorc
drwxr-xr-x   - itv005857

In [35]:
!hadoop fs -head /public/trendytech/datasets/hospital.csv

patient_id,admission_date,discharge_date,diagnosis,doctor_id,total_cost
1,01-01-2022,2022-01-10,Pneumonia,101,5000.00
2,02-05-2022,2022-02-09,Appendicitis,102,7000.00
3,03-12-2022,2022-03-18,Fractured Arm,103,3500.00
4,04-02-2022,2022-04-08,Heart Attack,104,15000.00
5,05-05-2022,2022-05-07,Influenza,105,2500.00
6,06-10-2022,2022-06-15,Appendicitis,106,8000.00
7,07-20-2022,2022-07-25,Pneumonia,107,5500.00
8,08-25-2022,2022-09-01,Heart Attack,108,20000.00
9,09-15-2022,2022-09-22,Fractured Leg,109,6000.00
10,10-05-2022,2022-10-10,Appendicitis,110,7500.00
11,11-02-2022,2022-11-05,Influenza,111,2800.00
12,12-10-2022,2022-12-18,Pneumonia,112,6000.00
13,01-02-2023,2023-01-09,Heart Attack,113,18000.00
14,02-14-2023,2023-02-18,Appendicitis,114,7200.00
15,03-20-2023,2023-03-28,Fractured Arm,115,3800.00
16,04-05-2023,2023-04-11,Influenza,116,2700.00
17,05-08-2023,2023-05-11,Heart Attack,117,16000.00
18,06-15-2023,2023-06-20,Pneumonia,118,4800.00
19,07-22-2023,2023-07-27,Fractured Leg,119,6500.00


In [36]:
schema = T.StructType([
    T.StructField('patient_id', T.IntegerType()),
    T.StructField('admission_date', T.StringType()),
    T.StructField('discharge_date', T.DateType()),
    T.StructField('diagnosis', T.StringType()),
    T.StructField('doctor_id', T.IntegerType()),
    T.StructField('total_cost', T.FloatType()),
])

In [37]:
df_patient = spark.read \
    .format('csv') \
    .option('header', 'true') \
    .schema(schema) \
    .load('/public/trendytech/datasets/hospital.csv')

In [38]:
df_patient.printSchema()

root
 |-- patient_id: integer (nullable = true)
 |-- admission_date: string (nullable = true)
 |-- discharge_date: date (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- doctor_id: integer (nullable = true)
 |-- total_cost: float (nullable = true)



In [39]:
df_patient.show()

+----------+--------------+--------------+-------------+---------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|
+----------+--------------+--------------+-------------+---------+----------+
|         1|    01-01-2022|    2022-01-10|    Pneumonia|      101|    5000.0|
|         2|    02-05-2022|    2022-02-09| Appendicitis|      102|    7000.0|
|         3|    03-12-2022|    2022-03-18|Fractured Arm|      103|    3500.0|
|         4|    04-02-2022|    2022-04-08| Heart Attack|      104|   15000.0|
|         5|    05-05-2022|    2022-05-07|    Influenza|      105|    2500.0|
|         6|    06-10-2022|    2022-06-15| Appendicitis|      106|    8000.0|
|         7|    07-20-2022|    2022-07-25|    Pneumonia|      107|    5500.0|
|         8|    08-25-2022|    2022-09-01| Heart Attack|      108|   20000.0|
|         9|    09-15-2022|    2022-09-22|Fractured Leg|      109|    6000.0|
|        10|    10-05-2022|    2022-10-10| Appendicitis|      11

In [40]:
df_patient = df_patient.withColumn('admission_date', F.to_date('admission_date', 'MM-dd-yyyy'))

In [41]:
df_patient.printSchema()

root
 |-- patient_id: integer (nullable = true)
 |-- admission_date: date (nullable = true)
 |-- discharge_date: date (nullable = true)
 |-- diagnosis: string (nullable = true)
 |-- doctor_id: integer (nullable = true)
 |-- total_cost: float (nullable = true)



In [42]:
df_patient.show()

+----------+--------------+--------------+-------------+---------+----------+
|patient_id|admission_date|discharge_date|    diagnosis|doctor_id|total_cost|
+----------+--------------+--------------+-------------+---------+----------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|      101|    5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|      102|    7000.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|      103|    3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      104|   15000.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|      105|    2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|      106|    8000.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|      107|    5500.0|
|         8|    2022-08-25|    2022-09-01| Heart Attack|      108|   20000.0|
|         9|    2022-09-15|    2022-09-22|Fractured Leg|      109|    6000.0|
|        10|    2022-10-05|    2022-10-10| Appendicitis|      11

In [43]:
df_patient = df_patient \
    .drop('doctor_id') \
    .withColumnRenamed('total_cost', 'hospital_bill') \
    .withColumn('duration_of_stay', F.datediff('discharge_date', 'admission_date'))

In [44]:
df_patient.show()

+----------+--------------+--------------+-------------+-------------+----------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|duration_of_stay|
+----------+--------------+--------------+-------------+-------------+----------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|       5000.0|               9|
|         2|    2022-02-05|    2022-02-09| Appendicitis|       7000.0|               4|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|       3500.0|               6|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      15000.0|               6|
|         5|    2022-05-05|    2022-05-07|    Influenza|       2500.0|               2|
|         6|    2022-06-10|    2022-06-15| Appendicitis|       8000.0|               5|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|       5500.0|               5|
|         8|    2022-08-25|    2022-09-01| Heart Attack|      20000.0|               7|
|         9|    2022-09-15|    2

In [45]:
df_patient = df_patient \
    .withColumn('adjusted_total', 
                F.expr(
                    'case when diagnosis == "Heart Attack" then hospital_bill * 1.5 \
                          when diagnosis == "Appendicitis" then hospital_bill * 1.2 \
                    else hospital_bill end'))

In [46]:
df_patient.show()

+----------+--------------+--------------+-------------+-------------+----------------+--------------+
|patient_id|admission_date|discharge_date|    diagnosis|hospital_bill|duration_of_stay|adjusted_total|
+----------+--------------+--------------+-------------+-------------+----------------+--------------+
|         1|    2022-01-01|    2022-01-10|    Pneumonia|       5000.0|               9|        5000.0|
|         2|    2022-02-05|    2022-02-09| Appendicitis|       7000.0|               4|        8400.0|
|         3|    2022-03-12|    2022-03-18|Fractured Arm|       3500.0|               6|        3500.0|
|         4|    2022-04-02|    2022-04-08| Heart Attack|      15000.0|               6|       22500.0|
|         5|    2022-05-05|    2022-05-07|    Influenza|       2500.0|               2|        2500.0|
|         6|    2022-06-10|    2022-06-15| Appendicitis|       8000.0|               5|        9600.0|
|         7|    2022-07-20|    2022-07-25|    Pneumonia|       5500.0|   

In [47]:
cols = ["patient_id", "diagnosis", "hospital_bill", "adjusted_total"]
df_patient.select(*cols).show()

+----------+-------------+-------------+--------------+
|patient_id|    diagnosis|hospital_bill|adjusted_total|
+----------+-------------+-------------+--------------+
|         1|    Pneumonia|       5000.0|        5000.0|
|         2| Appendicitis|       7000.0|        8400.0|
|         3|Fractured Arm|       3500.0|        3500.0|
|         4| Heart Attack|      15000.0|       22500.0|
|         5|    Influenza|       2500.0|        2500.0|
|         6| Appendicitis|       8000.0|        9600.0|
|         7|    Pneumonia|       5500.0|        5500.0|
|         8| Heart Attack|      20000.0|       30000.0|
|         9|Fractured Leg|       6000.0|        6000.0|
|        10| Appendicitis|       7500.0|        9000.0|
|        11|    Influenza|       2800.0|        2800.0|
|        12|    Pneumonia|       6000.0|        6000.0|
|        13| Heart Attack|      18000.0|       27000.0|
|        14| Appendicitis|       7200.0|        8640.0|
|        15|Fractured Arm|       3800.0|        

In [48]:
spark.stop()