In [4]:
%pip install boto3


Collecting boto3
  Downloading boto3-1.39.14-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.40.0,>=1.39.14 (from boto3)
  Downloading botocore-1.39.14-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3)
  Downloading s3transfer-0.13.1-py3-none-any.whl.metadata (1.7 kB)
Collecting urllib3<1.27,>=1.25.4 (from botocore<1.40.0,>=1.39.14->boto3)
  Downloading urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)
Downloading boto3-1.39.14-py3-none-any.whl (139 kB)
Downloading botocore-1.39.14-py3-none-any.whl (13.9 MB)
   ---------------------------------------- 0.0/13.9 MB ? eta -:--:--
   -------------------------- ------------- 9.2/13.9 MB 47.4 MB/s eta 0:00:01
   ---------------------------------------- 13.9/13.9 MB 45.9 MB/s eta 0:00:00
Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.13.1-py3-none-any.whl 

In [None]:
import boto3
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType

spark = SparkSession.builder.appName("S3 iris").getOrCreate()
sc = spark.sparkContext

# get the iris data from the specific s3 bucket
s3 = boto3.client("s3")
bucket_name = "assignment14irisdatasets"
file_name = "iris.data"
s3.download_file(bucket_name, file_name, file_name)

# the schema of the iris data
schema = StructType([
    StructField("sepal_length", FloatType(), True),
    StructField("sepal_width", FloatType(), True),
    StructField("petal_length", FloatType(), True),
    StructField("petal_width", FloatType(), True),
    StructField("class", StringType(), True),
])

# read the iris data from the specific s3 bucket as dataframe
df = spark.read.csv(file_name, header = False, schema = schema)

# turn the dataframe into a temporary table
df.createOrReplaceTempView("iris")

# print some data to verify the data is read correctly
print(df.show())

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|      class|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
|         5.4|        3.9|         1.7|        0.4|Iris-setosa|
|         4.6|        3.4|         1.4|        0.3|Iris-setosa|
|         5.0|        3.4|         1.5|        0.2|Iris-setosa|
|         4.4|        2.9|         1.4|        0.2|Iris-setosa|
|         4.9|        3.1|         1.5|        0.1|Iris-setosa|
|         5.4|        3.7|         1.5|        0.2|Iris-setosa|
|         4.8|        3.4|         1.6|        0.2|Iris-setosa|
|         4.8|        3.0|         1.4| 

In [2]:
# run the SQL search
query = """SELECT class, COUNT(*) as count
           FROM iris 
           GROUP BY class"""
result = spark.sql(query)
result.show()

+---------------+-----+
|          class|count|
+---------------+-----+
| Iris-virginica|   50|
|    Iris-setosa|   50|
|Iris-versicolor|   50|
+---------------+-----+



In [3]:
#2. find the average sepal length of each class
query = """
        SELECT class, AVG(petal_length) as avg_petal_length
        FROM iris
        GROUP BY class
        """
result = spark.sql(query)
result.show()

+---------------+------------------+
|          class|  avg_petal_length|
+---------------+------------------+
| Iris-virginica| 5.551999988555909|
|    Iris-setosa|1.4639999961853027|
|Iris-versicolor| 4.259999980926514|
+---------------+------------------+



In [4]:
#3. find the species with maximum average sepal width
# calculate the average sepal width for each class and order by descending order, then select the top 1
query = """
        SELECT class, AVG(sepal_width) as avg_sepal_width
        FROM iris
        GROUP BY class
        ORDER BY avg_sepal_width DESC
        LIMIT 1
        """
result = spark.sql(query)
result.show()


+-----------+----------------+
|      class| avg_sepal_width|
+-----------+----------------+
|Iris-setosa|3.41800000667572|
+-----------+----------------+

