Query the two cities in STATION with the shortest and longest CITY names, as well as their respective lengths (i.e., the number of characters in the name). If more than one city shares the shortest or the longest name length, choose the one that comes first when ordered alphabetically.
The STATION table is described as follows:


![Output](https://s3.amazonaws.com/hr-challenge-images/9336/1449345840-5f0a551030-Station.jpg)


where LAT_N is the northern latitude and LONG_W is the western longitude.

Sample Input

For example, CITY has four entries: DEF, ABC, PQRS and WXY.

Sample Output

ABC 3
PQRS 4
Explanation

When ordered alphabetically, the CITY names are listed as ABC, DEF, PQRS, and WXY, with lengths 3, 3, 4, and 3. The longest name is PQRS, but there are 3 options for the shortest-named city. Choose ABC, because it comes first alphabetically.

Note
You can write two separate queries to get the desired output. It need not be a single query.

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

# Initialize Spark session
spark = SparkSession.builder.appName("StationTable").getOrCreate()

# Define the schema for the STATION table
schema = StructType([
    StructField("ID", IntegerType(), True),
    StructField("City", StringType(), True),
    StructField("State", StringType(), True),
    StructField("LAT_N", FloatType(), True),
    StructField("LONG_W", FloatType(), True)
])

# Create the sample data
data = [
    (1, "New York", "NY", 40.7128, -74.0060),
    (2, "Los Angeles", "CA", 34.0522, -118.2437),
    (3, "Chicago", "IL", 41.8781, -87.6298),
    (4, "Houston", "TX", 29.7604, -95.3698),
    (5, "Phoenix", "AZ", 33.4484, -112.0740),
    (6, "San Antonio", "TX", 29.4241, -98.4936),
    (7, "San Diego", "CA", 32.7157, -117.1611),
    (8, "Dallas", "TX", 32.7767, -96.7970),
    (9, "San Jose", "CA", 37.3382, -121.8863),
    (10, "Austin", "TX", 30.2672, -97.7431),
    (11, "Toronto", "ON", 43.65107, -79.347015),
    (12, "London", "ENG", 51.5074, -0.1278),
    (13, "Vancouver", "BC", 49.2827, -123.1207),
    (14, "London", "ENG", 51.5074, -0.1278),
    (15, "Vancouver", "BC", 49.2827, -123.1207)
]

# Create DataFrame
df = spark.createDataFrame(data, schema)

# Save the DataFrame as a temporary table
df.createOrReplaceTempView("STATION")

# Save as a permanent table (uncomment if needed)
# df.write.mode("overwrite").saveAsTable("STATION")

# Show the DataFrame
df.show()


+---+-----------+-----+--------+----------+
| ID|       City|State|   LAT_N|    LONG_W|
+---+-----------+-----+--------+----------+
|  1|   New York|   NY| 40.7128|   -74.006|
|  2|Los Angeles|   CA| 34.0522| -118.2437|
|  3|    Chicago|   IL| 41.8781|  -87.6298|
|  4|    Houston|   TX| 29.7604|  -95.3698|
|  5|    Phoenix|   AZ| 33.4484|  -112.074|
|  6|San Antonio|   TX| 29.4241|  -98.4936|
|  7|  San Diego|   CA| 32.7157| -117.1611|
|  8|     Dallas|   TX| 32.7767|   -96.797|
|  9|   San Jose|   CA| 37.3382| -121.8863|
| 10|     Austin|   TX| 30.2672|  -97.7431|
| 11|    Toronto|   ON|43.65107|-79.347015|
| 12|     London|  ENG| 51.5074|   -0.1278|
| 13|  Vancouver|   BC| 49.2827| -123.1207|
| 14|     London|  ENG| 51.5074|   -0.1278|
| 15|  Vancouver|   BC| 49.2827| -123.1207|
+---+-----------+-----+--------+----------+



In [0]:
%sql
-- Return the city with the shortest name and the city with the longest name,
-- together with the name lengths. Ties on length are broken alphabetically.
--
-- Each branch numbers the candidate rows alphabetically and keeps rn = 1.
-- Filtering on rn (rather than LIMIT 1 with no ORDER BY) makes the tie-break
-- explicit instead of depending on the physical order in which rows arrive.
WITH CityLengths AS (
    SELECT CITY, LENGTH(CITY) AS NameLength
    FROM STATION
)
SELECT CITY, NameLength
FROM (
    -- Candidates whose name length equals the global minimum.
    SELECT CITY, NameLength,
           ROW_NUMBER() OVER (ORDER BY CITY) AS rn
    FROM CityLengths
    WHERE NameLength = (SELECT MIN(NameLength) FROM CityLengths)
) AS shortest
WHERE rn = 1
UNION ALL
SELECT CITY, NameLength
FROM (
    -- Candidates whose name length equals the global maximum.
    SELECT CITY, NameLength,
           ROW_NUMBER() OVER (ORDER BY CITY) AS rn
    FROM CityLengths
    WHERE NameLength = (SELECT MAX(NameLength) FROM CityLengths)
) AS longest
WHERE rn = 1
-- Applies to the whole UNION ALL result: shortest name first.
ORDER BY NameLength;


CITY,NameLength
Austin,6
Los Angeles,11


In [0]:
# Spark's min/max are imported under aliases so that they do not shadow
# Python's built-in min()/max() for every cell that runs after this one.
from pyspark.sql.functions import col, length
from pyspark.sql.functions import max as spark_max, min as spark_min

# Add a column with the length of each city name
df_with_len = df.withColumn("len", length(col("City")))

# Get the minimum and maximum city name lengths in a single aggregation
# (one round trip to Spark instead of two separate collect() calls).
len_bounds = df_with_len.agg(
    spark_min("len").alias("min_len"),
    spark_max("len").alias("max_len"),
).first()
min_len, max_len = len_bounds["min_len"], len_bounds["max_len"]

# Among the cities at each extreme, order alphabetically and keep the first one
shortest_city = df_with_len.filter(col("len") == min_len).orderBy("City").limit(1)
longest_city = df_with_len.filter(col("len") == max_len).orderBy("City").limit(1)

# Combine results: shortest city first, then longest
shortest_city.union(longest_city).show()


+---+-----------+-----+-------+---------+---+
| ID|       City|State|  LAT_N|   LONG_W|len|
+---+-----------+-----+-------+---------+---+
| 10|     Austin|   TX|30.2672| -97.7431|  6|
|  2|Los Angeles|   CA|34.0522|-118.2437| 11|
+---+-----------+-----+-------+---------+---+

