# AWS Glue Studio Notebook
##### You are now running an AWS Glue Studio notebook. To start using your notebook, you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [2]:
%idle_timeout 2880
%glue_version 5.0
%worker_type G.1X
%number_of_workers 5

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

# Name under which this Glue job is initialised, plus the argument mapping passed to it.
JOB_NAME = "HW17-DA-Part2"
args = dict(JOB_NAME=JOB_NAME)

# Reuse the existing Spark context if there is one, wrap it in a Glue context,
# and expose the Spark session used for all DataFrame work below.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# Create the job handle and initialise it with the job name and its arguments.
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

You are already connected to a glueetl session 06c17116-d14c-42c7-bd3f-de1d37a7c53e.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is None minutes.
idle_timeout has been set to 2880 minutes.


You are already connected to a glueetl session 06c17116-d14c-42c7-bd3f-de1d37a7c53e.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 5.0


You are already connected to a glueetl session 06c17116-d14c-42c7-bd3f-de1d37a7c53e.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous worker type: None
Setting new worker type to: G.1X


You are already connected to a glueetl session 06c17116-d14c-42c7-bd3f-de1d37a7c53e.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous number of workers: None
Setting new number of workers to: 5



#### Load the processed passenger data (Parquet) from Amazon S3 into a Spark DataFrame and preview it


In [3]:
# S3 prefix holding the processed input data in Parquet format.
S3_INPUT = "s3://hw17-part2/processed_data/"

# Read the Parquet dataset into a Spark DataFrame and preview the first rows.
df = spark.read.format("parquet").load(S3_INPUT)
df.show()

+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+-----------+
|PassengerID|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Embarked|family_size|
+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+-----------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       S|          2|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       C|          2|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|  7.925|       S|          1|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|   53.1|       S|          2|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|   8.05|       S|          1|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    0|51.8625|       S|          1|
|          8|       0|     3|Palsson, Master. ...|  male| 2.0|  

In [8]:
from pyspark.sql import functions as f

# Categorize the passengers into age groups.
# `when` branches are checked in order and the first match wins, so each branch
# only needs its upper bound. The last branch is an explicit `>= 65` test (not an
# `otherwise`) so that a missing Age still produces a null age group.
age_col = f.col("Age")
age_group_col = (
    f.when(age_col < 5, "baby")
    .when(age_col < 18, "teenagers")
    .when(age_col < 28, "youth")
    .when(age_col < 40, "adults")
    .when(age_col < 65, "middle-aged")
    .when(age_col >= 65, "elderly")
)

df_age_groups = df.withColumn("age_groups", age_group_col)
df_age_groups.show()

+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+-----------+-----------+
|PassengerID|Survived|Pclass|                Name|   Sex| Age|SibSp|Parch|   Fare|Embarked|family_size| age_groups|
+-----------+--------+------+--------------------+------+----+-----+-----+-------+--------+-----------+-----------+
|          1|       0|     3|Braund, Mr. Owen ...|  male|22.0|    1|    0|   7.25|       S|          2|      youth|
|          2|       1|     1|Cumings, Mrs. Joh...|female|38.0|    1|    0|71.2833|       C|          2|     adults|
|          3|       1|     3|Heikkinen, Miss. ...|female|26.0|    0|    0|  7.925|       S|          1|      youth|
|          4|       1|     1|Futrelle, Mrs. Ja...|female|35.0|    1|    0|   53.1|       S|          2|     adults|
|          5|       0|     3|Allen, Mr. Willia...|  male|35.0|    0|    0|   8.05|       S|          1|     adults|
|          7|       0|     1|McCarthy, Mr. Tim...|  male|54.0|    0|    

In [12]:
# Exploratory analysis.
# First, count the passengers who boarded the ship and how many of them survived
# in each age group; the survival rate follows from those two counts.
df_age_count = (
    df_age_groups
    .groupBy("age_groups")
    .agg(
        # count() ignores nulls, so only rows matching the `when` condition are counted.
        f.count(f.when(f.col("Survived") == 1, 1)).alias("Survivors"),
        f.count(f.when(f.col("Survived") == 0, 1)).alias("Non_Survivors"),
        f.count("*").alias("Total_People"),
    )
    .withColumn(
        "Survival_Rate",
        f.round(f.col("Survivors") / f.col("Total_People"), 4),
    )
)
df_age_count.show()

+-----------+---------+-------------+------------+-------------+
| age_groups|Survivors|Non_Survivors|Total_People|Survival_Rate|
+-----------+---------+-------------+------------+-------------+
|    elderly|        1|           10|          11|       0.0909|
|middle-aged|       60|           92|         152|       0.3947|
|      youth|       80|          144|         224|       0.3571|
|       baby|       27|           13|          40|        0.675|
|  teenagers|       34|           39|          73|       0.4658|
|     adults|       88|          126|         214|       0.4112|
+-----------+---------+-------------+------------+-------------+


In [20]:
# Second, compute the average fare and the survival rate for each passenger class.
df_class = (
    df_age_groups
    .groupBy("Pclass")
    .agg(
        # count() ignores nulls, so only rows matching the `when` condition are counted.
        f.count(f.when(f.col("Survived") == 1, 1)).alias("Survivors"),
        f.count(f.when(f.col("Survived") == 0, 1)).alias("Non_Survivors"),
        f.count("*").alias("Total_Class_People"),
        # Use the column's exact name ("Fare"): the lowercase spelling only resolves
        # while Spark's column resolution is case-insensitive.
        f.round(f.avg(f.col("Fare")), 4).alias("Average_Fare"),
    )
    .withColumn(
        "Survival_Rate",
        f.round(f.col("Survivors") / f.col("Total_Class_People"), 4),
    )
    .orderBy(f.asc("Pclass"))
)
df_class.show()

+------+---------+-------------+------------------+------------+-------------+
|Pclass|Survivors|Non_Survivors|Total_Class_People|Average_Fare|Survival_Rate|
+------+---------+-------------+------------------+------------+-------------+
|     1|      122|           64|               186|     87.9616|       0.6559|
|     2|       83|           90|               173|     21.4716|       0.4798|
|     3|       85|          270|               355|     13.2294|       0.2394|
+------+---------+-------------+------------------+------------+-------------+


In [32]:
# Third, compute the survival rate by gender.
df_gender = (
    df_age_groups
    .groupBy("Sex")
    .agg(
        f.count("*").alias("Total"),
        # count() ignores nulls, so only survivors are counted here.
        f.count(f.when(f.col("Survived") == 1, 1)).alias("Survivors"),
    )
    .withColumn(
        "Survival_Rate",
        f.round(f.col("Survivors") / f.col("Total"), 4),
    )
)
df_gender.show()

+------+-----+---------+-------------+
|   Sex|Total|Survivors|Survival_Rate|
+------+-----+---------+-------------+
|  male|  453|       93|       0.2053|
|female|  261|      197|       0.7548|
+------+-----+---------+-------------+


In [34]:
# Fourth, compute the survival rate by family size, smallest families first.
family_totals = df_age_groups.groupBy("family_size").agg(
    f.count("*").alias("Total"),
    # count() ignores nulls, so only survivors are counted here.
    f.count(f.when(f.col("Survived") == 1, 1)).alias("Survivors"),
)
df_family = family_totals.orderBy("family_size").withColumn(
    "Survival_Rate",
    f.round(f.col("Survivors") / f.col("Total"), 4),
)
df_family.show()

+-----------+-----+---------+-------------+
|family_size|Total|Survivors|Survival_Rate|
+-----------+-----+---------+-------------+
|          1|  404|      130|       0.3218|
|          2|  139|       76|       0.5468|
|          3|   93|       53|       0.5699|
|          4|   27|       21|       0.7778|
|          5|   11|        3|       0.2727|
|          6|   22|        3|       0.1364|
|          7|   12|        4|       0.3333|
|          8|    6|        0|          0.0|
+-----------+-----+---------+-------------+


#### Write the analysis results to Amazon S3 as CSV files


In [40]:
from awsglue.dynamicframe import DynamicFrame
S3_OUTPUT = "s3://hw17-part2/DA_results/"


def write_out(df_out, name=None):
    """Write a Spark DataFrame to S3 as CSV files with a header row.

    Args:
        df_out: Spark DataFrame to export.
        name: Optional sub-folder under ``S3_OUTPUT``. When given, the files are
            written to ``<S3_OUTPUT>/<name>/`` so that result sets with different
            schemas are not mixed in a single prefix. When omitted, the files are
            written directly under ``S3_OUTPUT``.
    """
    if name is None:
        target_path = S3_OUTPUT
    else:
        target_path = S3_OUTPUT.rstrip("/") + "/" + name + "/"

    # Convert the Spark DataFrame to a Glue DynamicFrame, the type the Glue writer takes.
    dyf_out = DynamicFrame.fromDF(df_out, glueContext, "dyf_out")
    glueContext.write_dynamic_frame.from_options(
        frame=dyf_out,
        connection_type="s3",
        format="csv",
        connection_options={"path": target_path},
        # `writeHeader` is the writer-side CSV option; `withHeader` applies to reading.
        format_options={"writeHeader": True},
    )


# Each result set goes to its own sub-folder so the CSV outputs stay distinguishable.
write_out(df_age_groups, "age_groups")
write_out(df_age_count, "age_count")
write_out(df_class, "class")
write_out(df_gender, "gender")
write_out(df_family, "family")




In [41]:
# Commit the Glue job that was initialised with job.init(...) in the setup cell.
job.commit()


