# PREPROCESSOR

### 1. Create the Spark Session and Spark Context
The Spark session is the entry point into all functionality in Spark.
You can custom your session with spark.configuration using *.config("spark.some.config.option", "some-value")*

In [1]:
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Preprocessor") \
    .getOrCreate()

# Create a context from the specified session
sc = spark.sparkContext

### 2. Load data from HDFS

In [2]:
#First obtain the current time
from datetime import datetime

# datetime object containing current date and time
now = datetime.now()

current_year = str(now.year)
current_mont = str(now.month)
current_day  = str(now.day) #attenzione che qui restituisce 3 ma hdfs vuole 03! Idem per le ore.
current_hour = str(now.hour)
print(now.month)

3


In [10]:
#Load data of the last nn hours

# The path can be either a single text file or a directory storing text files
utenti_path = "hdfs://my-hdfs-namenodes:8020/HeraSDG/raw_data/utenti/*/*/*/*/*.json"
comportamenti_path = "hdfs://my-hdfs-namenodes:8020/HeraSDG/raw_data/comportamenti/*/*/*/*/*.json"
premi_path = "hdfs://my-hdfs-namenodes:8020/HeraSDG/raw_data/premi/*/*/*/*/*.json"
utenti = spark.read.json(utenti_path)
comportamenti = spark.read.json(comportamenti_path)
premi = spark.read.json(premi_path)

In [11]:
utenti.printSchema()
utenti.describe().show()

root
 |-- Data di Nascita: string (nullable = true)
 |-- Eta: long (nullable = true)
 |-- Provincia: string (nullable = true)
 |-- Sesso: string (nullable = true)
 |-- id_utente: string (nullable = true)

+-------+---------------+------------------+---------+-----+---------+
|summary|Data di Nascita|               Eta|Provincia|Sesso|id_utente|
+-------+---------------+------------------+---------+-----+---------+
|  count|          30609|             30609|    30609|30609|    30609|
|   mean|           null|54.232905354634255|     null| null|     null|
| stddev|           null| 20.99087701181164|     null| null|     null|
|    min|     1930-03-31|                18|         |     |         |
|    max|     2003-03-29|                90|       RE|    M|  HE_9999|
+-------+---------------+------------------+---------+-----+---------+



In [22]:
comportamenti.show(5)

+----------------+--------------------+---------+----------+
|Partner_erogante|       comportamento|id_utente|reward(tk)|
+----------------+--------------------+---------+----------+
|            HERA|Acquisto energia ...|  CA_2687|      2.85|
|            HERA|Autolettura consu...|   HE_267|       1.6|
|            HERA|Acquisto energia ...|    HE_32|      2.85|
|           CONAD|Acquisto di prodo...|  CO_3281|      2.55|
|            HERA|Invio elettronico...|   CO_522|      1.75|
+----------------+--------------------+---------+----------+
only showing top 5 rows



In [23]:
premi.show(5)

+----------------+---------+--------------------+----------+
|Partner_erogante|id_utente|              premio|prezzo(tk)|
+----------------+---------+--------------------+----------+
|           CONAD|  CA_1935|Buono Spesa 5€ Conad|         5|
|           CAMST|  HE_1994|Buono Spesa 5€ Camst|         5|
|           CONAD|  HE_1799|Buono Spesa 5€ Conad|         5|
|           CONAD|  CA_2284|Buono Spesa 5€ Conad|         5|
|           CONAD|   CA_627|Buono Spesa 5€ Conad|        10|
+----------------+---------+--------------------+----------+
only showing top 5 rows



### 3. Processing

#### 3.1 comportamenti

In [37]:
comportamenti.groupBy(comportamenti.id_utente, comportamenti.Partner_erogante).agg({'reward(tk)':'sum', 'Partner_erogante':'count'}).sort(comportamenti.id_utente).show(5)

+---------+----------------+---------------+-----------------------+
|id_utente|Partner_erogante|sum(reward(tk))|count(Partner_erogante)|
+---------+----------------+---------------+-----------------------+
| CA_10003|           CONAD|           2.55|                      1|
| CA_10003|            HERA|            5.7|                      2|
| CA_10003|           CAMST|            4.8|                      2|
| CA_10007|            HERA|            6.2|                      3|
| CA_10007|           CONAD|           2.55|                      1|
+---------+----------------+---------------+-----------------------+
only showing top 5 rows



#### 3.2 premi

In [36]:
premi.groupBy(premi.id_utente, premi.Partner_erogante).agg({'prezzo(tk)':'sum', 'Partner_erogante':'count'}).sort(premi.id_utente).show(5)

+---------+----------------+-----------------------+---------------+
|id_utente|Partner_erogante|count(Partner_erogante)|sum(prezzo(tk))|
+---------+----------------+-----------------------+---------------+
|         |           CONAD|                   9698|          96980|
|         |            HERA|                   9615|          96150|
| CA_10003|           CONAD|                      2|             10|
|  CA_1001|           CAMST|                      1|              5|
|  CA_1001|           CONAD|                      4|             25|
+---------+----------------+-----------------------+---------------+
only showing top 5 rows



440123

In [52]:
b = utenti.filter(a['Eta'] < 20)
b.count()

29555

In [None]:
# Creates a temporary view using the DataFrame
utenti.createOrReplaceTempView("utenti")
teenagerNamesDF = spark.sql("SELECT id_utente FROM utenti WHERE Eta BETWEEN 13 AND 19")
teenagerNamesDF.show()

### 4. Visualization

### 5. Clustering

In [None]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=2)
kmeans.setSeed(1)
kmeans.setMaxIter(10)
kmeans.getMaxIter()

df = peopleDF['id_utente', 'reward(tk)'].groupBy("id_utente").sum()

model = kmeans.fit(df)
print(model)

In [None]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
import time

# Create the PipelineOptions
options = PipelineOptions([
"--runner=DirectRunner",          #Portable Runner is needed to execute a Python Apache Beam pipeline on Spark
"--job_endpoint=localhost:8099",    #The job endpoint is the JobService, so the central instance where you submit your Beam pipeline. The JobService will create a Spark job for the pipeline and execute the job.
# "--environment_type=LOOPBACK",
"--hdfs_host=my-hdfs-namenodes",
"--hdfs_port=8020",
"--hdfs_user=lori"
])
with beam.Pipeline(options=options) as p:
    lines = (
        p 
        | beam.io.ReadFromText('hdfs://my-hdfs-namenodes:9870/HeraSDG/raw_data/comportamenti/2021/03/24/11/*.json') 
        | 'Print contents' >> beam.Map(print)
    )
#     output = (
#         p
#         | 'Create mock values' >> beam.Create([1,2,3,4,5,6])
#         | 'Sum all values' >> beam.CombineGlobally(sum)
#     )
#     output | beam.io.WriteToText(FlatMap(print)

#     output | beam.io.hadoopfilesystem.HadoopFileSystem._list("hdfs://funziona.txt")
# hdfs = beam.io.hadoopfilesystem.HadoopFileSystem(options)
# hdfs.create("hdfs://funziona.txt")

In [None]:
print(sc)