## Installation de PySpark :

#1. Installer Anaconda

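#2. Installer Java (la commande installe Java dans l'environnement conda actif ; la relancer après l'étape 4 pour que Java soit disponible dans `pyspark_env`) :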
conda install openjdk

#3. Créer un environnement dédié :
conda create -n pyspark_env python=3

#4. Activer l'environnement :
conda activate pyspark_env

#5. Installer pyspark :
pip install pyspark

#6. Installer findspark : 
conda install -c conda-forge findspark

#7. Dans le notebook, exécuter les commandes suivantes avant d'importer pyspark :
import findspark
findspark.init()
findspark.find()

In [1]:
import findspark

In [2]:
# Run findspark's setup first; pyspark itself is only imported in a later cell.
findspark.init()

In [3]:
# Display the location reported by findspark (recorded output: 'C:\\spark');
# presumably this is the Spark home directory -- TODO confirm.
findspark.find()

'C:\\spark'

In [4]:
from pyspark.sql import SparkSession

In [5]:
# Session object used by every later cell (spark.read, spark.sql); the
# application is registered under the name 'Basics'.
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [6]:
# Bare expression: lets the notebook display the session object.
spark

In [7]:
# Load people.json into a DataFrame. No schema is passed, so the column types
# come from the data itself (printSchema reports age: long, name: string).
df = spark.read.json("people.json")

In [8]:
# Print the DataFrame contents as a text table; the record that has no age
# is shown with NULL.
df.show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [9]:
# Print each column's name, type and nullability as a tree.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [10]:
# Column names as a plain Python list.
df.columns

['age', 'name']

In [11]:
# describe() returns another DataFrame (columns summary/age/name, all strings).
# Evaluating it bare only shows that header; call .show() to see the values.
df.describe()

DataFrame[summary: string, age: string, name: string]

In [12]:
# Display the summary statistics (count, mean, stddev, min, max) per column.
# The age count is 2 because one of the three records has no age.
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   NULL|
| stddev|7.7781745930520225|   NULL|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [13]:
from pyspark.sql.types import StructField, StringType, IntegerType, StructType

In [14]:
# Explicit column definitions for people.json: an integer `age` and a string
# `name`, both nullable.
sch = [
    StructField(name='age', dataType=IntegerType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
]

In [15]:
# Wrap the list of field definitions into the schema object handed to the reader.
struct = StructType(sch)

In [16]:
# Re-read the same file, this time imposing the hand-written schema instead of
# letting the reader infer the column types.
df = spark.read.schema(struct).json('people.json')

In [17]:
# Check that the explicit schema was applied: age is now integer, not long.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [18]:
# Re-read the file without an explicit schema. The JSON reader infers column
# types by itself whenever no schema is supplied, so no "inferSchema" option is
# needed here (that option belongs to the CSV reader and is ignored for JSON).
df = spark.read.json('people.json')

In [20]:
# With no explicit schema, age is reported as long again.
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [19]:
# Project a single column and display it.
df.select('age').show()

+----+
| age|
+----+
|NULL|
|  30|
|  19|
+----+



In [21]:
# head(2) returns the first two rows as a Python list of Row objects.
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [22]:
# head(2) returns a list, so [1] picks the second Row out of it.
df.head(2)[1]

Row(age=30, name='Andy')

In [23]:
# Project several columns at once by passing each name as its own argument.
df.select('age', 'name').show()

+----+-------+
| age|   name|
+----+-------+
|NULL|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [23]:
# IPython help lookup (trailing `?`): shows the signature and docstring of
# withColumn instead of calling it.
df.withColumn?

Signature: df.withColumn(colName: str, col: pyspark.sql.column.Column) -> 'DataFrame'
Docstring:
Returns a new :class:`DataFrame` by adding a column or replacing the
existing column that has the same name.

The column expression must be an expression over this :class:`DataFrame`; attempting to add
a column from some other :class:`DataFrame` will raise an error.

.. versionadded:: 1.3.0

.. versionchanged:: 3.4.0
    Supports Spark Connect.

Parameters
----------
colName : str
    string, name of the new column.
col : :class:`Column`
    a :class:`Column` expression for the new column.

Returns
-------
:class:`DataFrame`
    DataFrame with new or replaced column.

Notes
-----
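This method introduces a projection internally. Therefore, calling it multiple
times, for instance, via loops in order to add multiple columns can generate big
plans which can cause performance issues and even `StackOverflowException`.
To avoid this, use :func:`select` with multiple columns at once.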

In [24]:
# Add a derived column triple_age = age * 3 and display the result. The result
# is not assigned, so df keeps its two columns; a NULL age gives a NULL
# triple_age.
df.withColumn("triple_age", df["age"]*3).show()

+----+-------+----------+
| age|   name|triple_age|
+----+-------+----------+
|NULL|Michael|      NULL|
|  30|   Andy|        90|
|  19| Justin|        57|
+----+-------+----------+



In [25]:
# Display df with the `name` column renamed to `first_name`. The result is not
# assigned, so df itself still exposes the column as `name`.
df.withColumnRenamed('name', 'first_name').show()

+----+----------+
| age|first_name|
+----+----------+
|NULL|   Michael|
|  30|      Andy|
|  19|    Justin|
+----+----------+



In [26]:
# Register df under the name people_view so it can be queried via spark.sql().
df.createOrReplaceTempView('people_view')

In [27]:
# SQL query against the registered view: keep only the row named Justin.
spark.sql('SELECT * FROM people_view WHERE name="Justin"').show()

+---+------+
|age|  name|
+---+------+
| 19|Justin|
+---+------+



In [28]:
# SQL text kept in a variable so it can be handed to spark.sql() separately.
ma_req = 'SELECT * FROM people_view WHERE age>15'

In [29]:
# Run the stored query and display the result; the row whose age is NULL does
# not satisfy age>15 and is therefore absent.
spark.sql(ma_req).show()

+---+------+
|age|  name|
+---+------+
| 30|  Andy|
| 19|Justin|
+---+------+

