In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Data_Heure")
    .getOrCreate()
)

In [3]:
# Bare expression: the notebook renders the session object's representation.
spark

In [4]:
# Load the stock CSV; the first row holds the column names and column types
# are inferred from the data rather than read as plain strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("appl_stock.csv")
)

In [5]:
# Preview the loaded rows to check that the file was parsed as expected.
df.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

In [12]:
from pyspark.sql.functions import year, month, day, dayofmonth, dayofweek, dayofyear, format_number, date_format, weekofyear, hour

In [8]:
# Print the inferred column types; the date functions below operate on `Date`.
df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [11]:
# Day of the month extracted from the Date column.
df.select(
    dayofmonth("Date").alias("jour du mois")
).show()

+------------+
|jour du mois|
+------------+
|           4|
|           5|
|           6|
|           7|
|           8|
|          11|
|          12|
|          13|
|          14|
|          15|
|          19|
|          20|
|          21|
|          22|
|          25|
|          26|
|          27|
|          28|
|          29|
|           1|
+------------+
only showing top 20 rows



In [13]:
# Hour extracted from the Date column. The printed schema types `Date` as
# `date`, which carries no time of day, so every row comes back as 0.
df.select(
    hour("Date").alias("heure")
).show()

+-----+
|heure|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 20 rows



In [14]:
# Day of the year extracted from the Date column (1 February shows as 32).
df.select(
    dayofyear("Date").alias("jour de l'année")
).show()

+---------------+
|jour de l'année|
+---------------+
|              4|
|              5|
|              6|
|              7|
|              8|
|             11|
|             12|
|             13|
|             14|
|             15|
|             19|
|             20|
|             21|
|             22|
|             25|
|             26|
|             27|
|             28|
|             29|
|             32|
+---------------+
only showing top 20 rows



In [16]:
# Month number extracted from the Date column.
df.select(
    month("Date").alias("mois")
).show()

+----+
|mois|
+----+
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   1|
|   2|
+----+
only showing top 20 rows



In [17]:
# Calendar year extracted from the Date column.
df.select(
    year("Date").alias("année")
).show()

+-----+
|année|
+-----+
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
| 2010|
+-----+
only showing top 20 rows



In [21]:
# Gather the original Date column and every component derived from it into a
# single frame. Each (function, alias) pair below yields one output column,
# in the order listed. `heure` is 0 throughout because `Date` has no time part.
df_com = df.select(
    "Date",
    *[
        extract("Date").alias(label)
        for extract, label in (
            (dayofmonth, "jour du mois"),
            (hour, "heure"),
            (dayofyear, "jour de l'année"),
            (month, "mois"),
            (year, "année"),
        )
    ],
)
df_com.show()

+----------+------------+-----+---------------+----+-----+
|      Date|jour du mois|heure|jour de l'année|mois|année|
+----------+------------+-----+---------------+----+-----+
|2010-01-04|           4|    0|              4|   1| 2010|
|2010-01-05|           5|    0|              5|   1| 2010|
|2010-01-06|           6|    0|              6|   1| 2010|
|2010-01-07|           7|    0|              7|   1| 2010|
|2010-01-08|           8|    0|              8|   1| 2010|
|2010-01-11|          11|    0|             11|   1| 2010|
|2010-01-12|          12|    0|             12|   1| 2010|
|2010-01-13|          13|    0|             13|   1| 2010|
|2010-01-14|          14|    0|             14|   1| 2010|
|2010-01-15|          15|    0|             15|   1| 2010|
|2010-01-19|          19|    0|             19|   1| 2010|
|2010-01-20|          20|    0|             20|   1| 2010|
|2010-01-21|          21|    0|             21|   1| 2010|
|2010-01-22|          22|    0|             22|   1| 201