In [3]:
!pip install pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder \
    .appName("Airbnb Analysis") \
    .getOrCreate()

csv_url = "https://raw.githubusercontent.com/tnavarrete-iedib/bigdata-24-25/refs/heads/main/listings.csv"
!wget -O listings.csv {csv_url}

df = spark.read.csv("listings.csv", header=True, inferSchema=True)

df.printSchema()
df.show(5)

df.createOrReplaceTempView("airbnb_listings")

query = """
SELECT
    neighbourhood as municipi,
    COUNT(*) as count
FROM
    airbnb_listings
WHERE
    room_type = 'Entire home/apt'
    AND license IS NOT NULL
    AND license != ''
    AND number_of_reviews >= 100
GROUP BY
    neighbourhood
ORDER BY
    neighbourhood ASC
"""


result = spark.sql(query)
result.show(100, False)

result_df = df.filter(
    (col("room_type") == "Entire home/apt") &
    (col("license").isNotNull()) &
    (col("license") != "") &
    (col("number_of_reviews") >= 100)
).groupBy("neighbourhood").count().orderBy("neighbourhood")

print("\nResultat utilitzant l'API de DataFrame:")
result_df.show(100, False)

spark.stop()

--2025-04-20 10:51:46--  https://raw.githubusercontent.com/tnavarrete-iedib/bigdata-24-25/refs/heads/main/listings.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 584697 (571K) [text/plain]
Saving to: ‘listings.csv’


2025-04-20 10:51:46 (12.2 MB/s) - ‘listings.csv’ saved [584697/584697]

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- host_id: string (nullable = true)
 |-- host_name: string (nullable = true)
 |-- neighbourhood_group: string (nullable = true)
 |-- neighbourhood: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- room_type: string (nullable = true)
 |-- price: string (nullable = true)
 |-- minimum_nights: string (nullable = true)
 |-- number_of_rev