<a href="https://colab.research.google.com/github/RobDrie/IT-Tools-Spark/blob/main/Spark_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [86]:
!pip install pyspark



In [111]:
import pyspark
from pyspark.sql.functions import year, month, dayofweek, col, split
from pyspark.sql.functions import weekofyear

In [88]:
spark = pyspark.sql.SparkSession.builder.appName('Session1').getOrCreate()

## Load & Process gas price data with Spark
* Download 2010 to 2023 gas price data
* Download gas Stations file & Services file (2022 versions)

In [89]:
!git clone https://github.com/rvm-courses/GasPrices

fatal: destination path 'GasPrices' already exists and is not an empty directory.


In [94]:
def price_download(year_list):
  prices_df = None

  for year in year_list:
    file_path = f'GasPrices/Prix{year}.csv.gz'
    current_df = spark.read.option("header", "False").option('delimiter', ';').csv(file_path)

    if prices_df is None:
      prices_df = current_df
    else:
      #prices_df = prices_df.join(current_df, on='_c1')
      prices_df = prices_df.union(current_df)

  return prices_df

In [135]:
year_list = ['2019', '2020', '2021', '2022S1', '2022S2']
prices_df = price_download(year_list)

In [96]:
prices_df.describe().show()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [61]:
prices_df.describe().show()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

### Variable definitions
* id_pdv = points of sales
* cp = zip code
* pop = type of population
* latitude
* longitude
* date
* id carburant = gas id
* nom carburant = gas label
* prix = price in millieuros

In [136]:
# Set the column headers
old_names = prices_df.columns
new_names = ['id_pdv', 'cp', 'pop', 'latitude', 'longitude',
             'date', 'id_carburant', 'nom_carburant', 'prix']

for new_name, old_name in zip(new_names, old_names):
  prices_df = prices_df.withColumnRenamed(f'{old_name}', f'{new_name}')

In [137]:
# Split date in year, month, week of the year
split_datetime = split(prices_df['date'], 'T')
prices_df = prices_df.withColumn('Date', split_datetime.getItem(0))
split_date = split(prices_df['Date'], '-')

prices_df = prices_df.withColumn('Year', split_date.getItem(0))
prices_df = prices_df.withColumn('Month', split_date.getItem(1))
prices_df = prices_df.withColumn('Day', split_date.getItem(2))
prices_df = prices_df.withColumn('WeekOfYear', weekofyear(prices_df['date']))

In [131]:
prices_df.show(10)

+-------+-----+---+--------+---------+----------+------------+-------------+----+----+-----+---+----------+
| id_pdv|   cp|pop|latitude|longitude|      Date|id_carburant|nom_carburant|prix|Year|Month|Day|WeekOfYear|
+-------+-----+---+--------+---------+----------+------------+-------------+----+----+-----+---+----------+
|1000001|01000|  R| 4620114|   519791|2019-01-04|           1|       Gazole|1328|2019|   01| 04|         1|
|1000001|01000|  R| 4620114|   519791|2019-01-07|           1|       Gazole|1348|2019|   01| 07|         2|
|1000001|01000|  R| 4620114|   519791|2019-01-10|           1|       Gazole|1374|2019|   01| 10|         2|
|1000001|01000|  R| 4620114|   519791|2019-01-11|           1|       Gazole|1387|2019|   01| 11|         2|
|1000001|01000|  R| 4620114|   519791|2019-01-14|           1|       Gazole|1394|2019|   01| 14|         3|
|1000001|01000|  R| 4620114|   519791|2019-01-16|           1|       Gazole|1394|2019|   01| 16|         3|
|1000001|01000|  R| 4620114|

In [140]:
# Prepare latitude & longitude for mapping (Divide by the right power of 10)
# To map the latitude & longitude on a [0,1] scale, we identity two different constants

division_constant_latitude = 10**7
division_constant_longitude = 10**6
prices_df = prices_df.withColumn('latitude_adj', col('latitude') / division_constant_latitude)
prices_df = prices_df.withColumn('longitude_adj', col('longitude') / division_constant_longitude)

In [141]:
prices_df.show(5)

+-------+-----+---+--------+---------+----------+------------+-------------+----+----+-----+---+----------+------------+-------------+
| id_pdv|   cp|pop|latitude|longitude|      Date|id_carburant|nom_carburant|prix|Year|Month|Day|WeekOfYear|latitude_adj|longitude_adj|
+-------+-----+---+--------+---------+----------+------------+-------------+----+----+-----+---+----------+------------+-------------+
|1000001|01000|  R| 4620114|   519791|2019-01-04|           1|       Gazole|1328|2019|   01| 04|         1|   0.4620114|     0.519791|
|1000001|01000|  R| 4620114|   519791|2019-01-07|           1|       Gazole|1348|2019|   01| 07|         2|   0.4620114|     0.519791|
|1000001|01000|  R| 4620114|   519791|2019-01-10|           1|       Gazole|1374|2019|   01| 10|         2|   0.4620114|     0.519791|
|1000001|01000|  R| 4620114|   519791|2019-01-11|           1|       Gazole|1387|2019|   01| 11|         2|   0.4620114|     0.519791|
|1000001|01000|  R| 4620114|   519791|2019-01-14|      

In [142]:
# Make data available as a table in order to be able to use Spark SQL
prices_df.createOrReplaceTempView("Gas_prices")

In [143]:
# Through basic statistics, consider which gas types have some interest
#for the rest of the project

# Idenitify the different gas types
spark.sql("""
  SELECT DISTINCT nom_carburant
  FROM Gas_prices
""").show()

+-------------+
|nom_carburant|
+-------------+
|          E10|
|         SP98|
|          E85|
|       Gazole|
|         SP95|
|         GPLc|
|         NULL|
+-------------+



In [147]:
# Inspect summary statistics for different gas types
spark.sql("""
    SELECT
        nom_carburant,
        COUNT(*) as count,
        AVG(prix) as mean,
        STDDEV(prix) as stddev,
        MIN(prix) as min,
        MAX(prix) as max
    FROM
        Gas_prices
    GROUP BY
        nom_carburant
""").show()

+-------------+-------+------------------+------------------+-----+-----+
|nom_carburant|  count|              mean|            stddev|  min|  max|
+-------------+-------+------------------+------------------+-----+-----+
|         NULL|  14566|              NULL|              NULL| NULL| NULL|
|          E10|4357845|1061.6209274556575|  668.596370536261|0.001|  959|
|          E85|1141058|443.79118290393654|342.67485665722415|0.001|  999|
|         GPLc| 753286| 641.0837959314258| 392.8441353639444|0.019|  999|
|       Gazole|5273314| 999.6524478986081| 642.1946477857184|0.001|  999|
|         SP95|1327084|1071.6206721887988| 671.9647156310942|0.004|9.999|
|         SP98|4433584|1135.2852601017596| 702.3608904771925|0.001|  969|
+-------------+-------+------------------+------------------+-----+-----+

