In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Start (or reuse) a Spark session configured with the "local" master,
# and keep a handle on its underlying SparkContext.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
sc = spark.sparkContext

### Chargement du dataset en DF

In [3]:
# Load the friends CSV (comma-separated, first line is the header), let Spark
# infer the column types, then preview the first rows.
df = spark.read.csv(
    "../Data/friends-with-header.csv",
    sep=",",
    header=True,
    inferSchema=True,
)
df.show()

+---+--------+---+-------------+
| id|    name|age|friendsNumber|
+---+--------+---+-------------+
|  0|    Will| 33|          385|
|  1|Jean-Luc| 26|            2|
|  2|    Hugh| 55|          221|
|  3|  Deanna| 40|          465|
|  4|   Quark| 68|           21|
|  5|  Weyoun| 59|          318|
|  6|  Gowron| 37|          220|
|  7|    Will| 54|          307|
|  8|  Jadzia| 38|          380|
|  9|    Hugh| 27|          181|
| 10|     Odo| 53|          191|
| 11|     Ben| 57|          372|
| 12|   Keiko| 54|          253|
| 13|Jean-Luc| 56|          444|
| 14|    Hugh| 43|           49|
| 15|     Rom| 36|           49|
| 16|  Weyoun| 22|          323|
| 17|     Odo| 35|           13|
| 18|Jean-Luc| 45|          455|
| 19|  Geordi| 60|          246|
+---+--------+---+-------------+
only showing top 20 rows



### Select

In [None]:
# Project the DataFrame onto the "name" and "age" columns.
# A single column can be selected the same way, e.g. df.select("name"),
# or with bracket syntax, df[["name"]].
df.select(["name", "age"]).show()

+--------+
|    name|
+--------+
|    Will|
|Jean-Luc|
|    Hugh|
|  Deanna|
|   Quark|
|  Weyoun|
|  Gowron|
|    Will|
|  Jadzia|
|    Hugh|
|     Odo|
|     Ben|
|   Keiko|
|Jean-Luc|
|    Hugh|
|     Rom|
|  Weyoun|
|     Odo|
|Jean-Luc|
|  Geordi|
+--------+
only showing top 20 rows



In [13]:
# NOTE(review): the wildcard import below also rebinds Python built-ins such as
# min and max to their Spark column-function namesakes for the rest of the
# notebook; later cells rely on the bare names it provides (col, min, max, avg).
from pyspark.sql.functions import *

# Select the "name" column and expose it under the alias "nom".
df.select(col("name").alias("nom")).show()

+--------+
|     nom|
+--------+
|    Will|
|Jean-Luc|
|    Hugh|
|  Deanna|
|   Quark|
|  Weyoun|
|  Gowron|
|    Will|
|  Jadzia|
|    Hugh|
|     Odo|
|     Ben|
|   Keiko|
|Jean-Luc|
|    Hugh|
|     Rom|
|  Weyoun|
|     Odo|
|Jean-Luc|
|  Geordi|
+--------+
only showing top 20 rows



In [18]:
# Compute the minimum, maximum and average of the "age" column in one pass.
# The aggregate functions are referenced through the `F` module alias so this
# cell does not depend on Python's built-in min()/max() having been shadowed by
# a wildcard import executed in an earlier cell.
# NOTE: despite its name, df_age is not a DataFrame: collect() returns a list of
# Row objects and [0] picks the single row produced by the global aggregation.
df_age = df.select(
    F.min("age").alias("age_min"),
    F.max("age").alias("age_max"),
    F.avg("age").alias("avg_age"),
).collect()[0]

# Fields of the Row are accessed by their alias.
df_age["age_min"]

18

### Filtrage

In [19]:
# Keep only people younger than 30 who also have fewer than 200 friends.
# Two chained where() calls are combined with a logical AND.
(
    df
    .where(col("age") < 30)
    .where(col("friendsNumber") < 200)
    .show()
)

+---+--------+---+-------------+
| id|    name|age|friendsNumber|
+---+--------+---+-------------+
|  1|Jean-Luc| 26|            2|
|  9|    Hugh| 27|          181|
| 24|  Julian| 25|            1|
| 26|  Julian| 22|          100|
| 46|    Morn| 25|           96|
| 47|   Brunt| 24|           49|
| 48|     Nog| 20|            1|
| 54|   Brunt| 19|            5|
| 60|  Geordi| 20|          100|
| 72|  Kasidy| 22|          179|
| 95|     Odo| 29|          173|
|112|    Morn| 25|           13|
|126|   Brunt| 26|           84|
|137|  Martok| 28|           32|
|144|   Miles| 22|           93|
|166| Lwaxana| 25|           10|
|171|  Weyoun| 29|          126|
|173|   Leeta| 23|          129|
|182|  Weyoun| 26|          145|
|201|    Ezri| 23|          174|
+---+--------+---+-------------+
only showing top 20 rows



In [20]:
# Rows whose "name" is exactly "Julian".
df.where(col("name") == "Julian").show()

+---+------+---+-------------+
| id|  name|age|friendsNumber|
+---+------+---+-------------+
| 24|Julian| 25|            1|
| 26|Julian| 22|          100|
|128|Julian| 34|          221|
|184|Julian| 44|           84|
|264|Julian| 29|          228|
|274|Julian| 64|          244|
|327|Julian| 20|           63|
|332|Julian| 55|          362|
|353|Julian| 64|          499|
|417|Julian| 37|          106|
|447|Julian| 38|           34|
|453|Julian| 44|          337|
+---+------+---+-------------+



In [None]:
# Names to keep.
liste_noms = ["Julian", "Jean-Luc", "Hugh"]

# Keep the rows whose "name" is one of liste_noms, ordered alphabetically by name.
# show() only prints the first 20 rows of the sorted result.
(
    df
    .where(col("name").isin(liste_noms))
    .orderBy("name")
    .show()
)

+---+----+---+-------------+
| id|name|age|friendsNumber|
+---+----+---+-------------+
|  2|Hugh| 55|          221|
|  9|Hugh| 27|          181|
| 14|Hugh| 43|           49|
| 41|Hugh| 67|          167|
| 58|Hugh| 59|          158|
| 82|Hugh| 57|          465|
| 86|Hugh| 55|          257|
|131|Hugh| 65|          309|
|199|Hugh| 38|          180|
|250|Hugh| 36|          342|
|277|Hugh| 46|          300|
|288|Hugh| 56|          354|
|316|Hugh| 64|          391|
|346|Hugh| 29|          329|
|375|Hugh| 66|          201|
|402|Hugh| 58|           98|
|413|Hugh| 45|          147|
|460|Hugh| 47|          400|
|483|Hugh| 57|           99|
|493|Hugh| 23|          357|
+---+----+---+-------------+
only showing top 20 rows



## Ajouter une colonne

In [28]:
# Derive a "BirthYear" column as (reference year - age).
# NOTE(review): the reference year 2025 is hard-coded — confirm it matches the
# date at which the ages in the dataset were recorded.
df = df.withColumn(
    "BirthYear",
    2025 - df.age,
)

In [29]:
# Preview the DataFrame, now including the BirthYear column (first 20 rows).
df.show(n=20)

+---+--------+---+-------------+---------+
| id|    name|age|friendsNumber|BirthYear|
+---+--------+---+-------------+---------+
|  0|    Will| 33|          385|     1992|
|  1|Jean-Luc| 26|            2|     1999|
|  2|    Hugh| 55|          221|     1970|
|  3|  Deanna| 40|          465|     1985|
|  4|   Quark| 68|           21|     1957|
|  5|  Weyoun| 59|          318|     1966|
|  6|  Gowron| 37|          220|     1988|
|  7|    Will| 54|          307|     1971|
|  8|  Jadzia| 38|          380|     1987|
|  9|    Hugh| 27|          181|     1998|
| 10|     Odo| 53|          191|     1972|
| 11|     Ben| 57|          372|     1968|
| 12|   Keiko| 54|          253|     1971|
| 13|Jean-Luc| 56|          444|     1969|
| 14|    Hugh| 43|           49|     1982|
| 15|     Rom| 36|           49|     1989|
| 16|  Weyoun| 22|          323|     2003|
| 17|     Odo| 35|           13|     1990|
| 18|Jean-Luc| 45|          455|     1980|
| 19|  Geordi| 60|          246|     1965|
+---+------