# Import & Launch Session

In [1]:
import pandas as pd
import os

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, Row

### Utils

In [3]:
def print_context_infos(context):
    print("VERSION: ",context.version)
    print("PYTHON_VERSION: ", context.pythonVer)
    print("MASTER: ", context.master)
    print("SPARK_HOME: ", str(context.sparkHome))
    print("SPARK_USER: ", str(context.sparkUser()))
    print("APP_NAME: ", context.appName)
    print("APP_ID: ", context.applicationId)
    print("DEFAULT_PARALLESLISM: ", context.defaultParallelism)
    print("DEFAULT_PARTITION: ", context.defaultMinPartitions)

### Initialisation

In [4]:
cwd = os.getcwd()
print(cwd)

/Users/manulabricole/Documents/CDN/BigData


In [6]:
spark_session = SparkSession.builder \
    .appName("MySparkSession") \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .enableHiveSupport() \
    .getOrCreate()
print("Spark Web UI: http://localhost:4040")

Spark Web UI: http://localhost:4040


In [7]:
spark_context = spark_session.sparkContext
print_context_infos(spark_context)

VERSION:  3.4.1
PYTHON_VERSION:  3.11
MASTER:  local[*]
SPARK_HOME:  None
SPARK_USER:  manulabricole
APP_NAME:  MySparkSession
APP_ID:  local-1690275468195
DEFAULT_PARALLESLISM:  10
DEFAULT_PARTITION:  2


# Import & Load data

In [8]:
csv_file_path = os.path.join(str(cwd), "arbresremarquablesparis.csv")
print(csv_file_path)

/Users/manulabricole/Documents/CDN/BigData/arbresremarquablesparis.csv


In [20]:
rdd = spark_session.read.csv(csv_file_path, sep=';', header=True)

# Play & Explore DataFrame

In [40]:
from pyspark.sql.functions import col, mean, min, max

In [41]:
columns = rdd.columns
print(columns)

['Geo point', 'idbase', 'domanialite', 'arrondissement3', 'complement adresse', 'numero', 'adresse6', 'circonference en cm', 'hauteur en m', 'stade développement', 'pépinière', 'genre', 'espèce', 'varieteoucultivar', 'date de plantation', 'libellé Français', 'ID Base', 'ID arbre', 'Site', 'Adresse19', "Complément d'adresse", 'Arrondissement21', 'Domanialité', 'Dénomination usuelle', 'Dénomination botanique', 'Autorité taxonomique', 'Année de plantation', 'Qualification remarquable', 'Résumé', 'Descriptif', 'Numéro de délibération', 'Date de la délibération', 'Label national', 'Panonceau', 'Photo 1', 'Copyright 1']


#### Age

In [42]:
max_age = 2023 - int(rdd.agg(min("Année de plantation")).collect()[0][0])
print("max age: ", max_age)

max age:  421


#### Volume

In [43]:
rdd_volume = rdd.withColumn("Volume", col("circonference en cm")/100 * col("hauteur en m"))
max_volume = rdd_volume.agg(min("Volume")).collect()[0][0]
print(f"Max Volume: {max_volume} m3")

Max Volume: 0.8999999999999999 m3


#### Hauteur Moyenne

In [45]:
mean_h = rdd.agg(mean(col("hauteur en m"))).collect()[0][0]
print(f"La hauteur moyenne des arbres est de: {mean_h} m3")

La hauteur moyenne des arbres est de: 19.233333333333334 m3


#### Arrondissement avec le nombre d'arbres max

In [69]:
rdd_arrondissement = rdd.groupBy("arrondissement3").count()
max_trees_count = rdd_arrondissement.agg({"count": "max"}).collect()[0][0]
max_arbres_arrondissement = rdd_arrondissement.filter(col("count") == max_trees_count)
max_arbres_arrondissement.show()

+---------------+-----+
|arrondissement3|count|
+---------------+-----+
|PARIS 16E ARRDT|   30|
+---------------+-----+



23/07/25 11:56:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: arrondissement
 Schema: arrondissement3
Expected: arrondissement3 but found: arrondissement
CSV file: file:///Users/manulabricole/Documents/CDN/BigData/arbresremarquablesparis.csv
23/07/25 11:56:49 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: arrondissement
 Schema: arrondissement3
Expected: arrondissement3 but found: arrondissement
CSV file: file:///Users/manulabricole/Documents/CDN/BigData/arbresremarquablesparis.csv
