<h2>Fives CortX / Data preparation / Minimize volume</h2>

## Code Notebook

**By :** MOURABIT El Bachir

### 1- Imports and Initialization

In [2]:
import pyspark
from pyspark.sql import SparkSession

In [3]:
# Local Spark session (one worker thread) used for the CSV -> ORC conversion.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("CSV_to_ORC")
    .getOrCreate()
)

### 2- Read Milling

In [None]:
# Load the milling-modes CSV, treating its first line as the header row,
# then print the resulting schema.
df_milling = (
    spark.read
    .option("header", True)
    .csv("../csv_files_s3/milling_modes.csv")
)
df_milling.printSchema()

### 3- Read Vibration_A_B

In [23]:
# Read all vibration CSV files for axes A and B, rename the three columns,
# and sort by the date column.
#
# Only "*.csv" files are matched: the former "*.*" glob picked up every file
# with a dot in its name, whatever its format.
# NOTE(review): an earlier run showed rows such as date_AB="2"/"20" with null
# values at the top of the sorted frame - presumably from non-CSV or malformed
# input. Confirm they are gone after narrowing the glob.
# NOTE(review): no schema is supplied, so every column is read as a string and
# the ordering on date_AB is lexicographic.
df_vibration_A_B = (
    spark.read
    .csv("../csv_files_s3/vibration_axis_A_axis_B/*.csv", header=True)
    .toDF("date_AB", "value_A", "value_B")
    .orderBy("date_AB")
)

df_vibration_A_B.printSchema()
df_vibration_A_B.show(20)

root
 |-- date_AB: string (nullable = true)
 |-- value_A: string (nullable = true)
 |-- value_B: string (nullable = true)

+-------+-------+-------+
|date_AB|value_A|value_B|
+-------+-------+-------+
|      2|   null|   null|
|      2|   null|   null|
|      2|   null|   null|
|      2|   null|   null|
|      2|   null|   null|
|      2|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
|     20|   null|   null|
+-------+-------+-------+
only showing top 20 rows



### 4- Read Vibration_C

In [13]:
# Load the axis-C vibration CSV, treating its first line as the header row,
# then print the resulting schema.
df_vibration_C = (
    spark.read
    .option("header", True)
    .csv("../csv_files_s3/vibration_axis_C.csv")
)
df_vibration_C.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- values3: string (nullable = true)



### 5- Read Vibration_D

In [14]:
# Load the axis-D vibration CSV, treating its first line as the header row,
# then print the resulting schema.
df_vibration_D = (
    spark.read
    .option("header", True)
    .csv("../csv_files_s3/vibration_axis_D.csv")
)
df_vibration_D.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- values4: string (nullable = true)



### 6- Migrate from CSV to ORC

In [None]:
# Persist the milling-modes frame as ORC, replacing any previous export.
df_milling.write.orc("../orc_to_s3/milling_modes.orc", mode="overwrite")

In [9]:
# Persist the axis A/B vibration frame as ORC, replacing any previous export.
df_vibration_A_B.write.orc("../orc_to_s3/vibration_axis_A_axis_B.orc", mode="overwrite")

In [16]:
# Persist the axis-C vibration frame as ORC, replacing any previous export.
df_vibration_C.write.orc("../orc_to_s3/vibration_axis_C.orc", mode="overwrite")

In [17]:
# Persist the axis-D vibration frame as ORC, replacing any previous export.
df_vibration_D.write.orc("../orc_to_s3/vibration_axis_D.orc", mode="overwrite")

### 7- Verification

In [16]:
# Read the axis A/B ORC export back to verify the conversion.
# ORC files carry their own schema, so no "header" option is needed (it is a
# CSV-only setting and had no effect here).
# NOTE(review): a previous run of this cell printed columns _c0/values1/values2
# rather than date_AB/value_A/value_B - the ORC on disk may predate the column
# rename; re-run the write cell before trusting this check.
vibration_A_B_df = spark.read.orc("../orc_to_s3/vibration_axis_A_axis_B.orc")
vibration_A_B_df.printSchema()
vibration_A_B_df.show(20)

root
 |-- _c0: string (nullable = true)
 |-- values1: string (nullable = true)
 |-- values2: string (nullable = true)

+--------------------+--------------------+--------------------+
|                 _c0|             values1|             values2|
+--------------------+--------------------+--------------------+
|2018-11-21 18:06:...|  -1.700976359316765|-0.46908177869076406|
|2018-11-21 18:06:...| -0.3297466179063344| -0.1797920614084737|
|2018-11-21 18:06:...|  -1.178325579046026| -0.4208986619048354|
|2018-11-21 18:06:...| -1.0228619352056278| -0.5983779417665455|
|2018-11-21 18:06:...| -0.5285685917851948| -0.3060502723631234|
|2018-11-21 18:06:...| -0.8518715371757454| -0.5551414746568127|
|2018-11-21 18:06:...|-0.35253491147863425| -0.3593343224872544|
|2018-11-21 18:06:...| -1.2094980753115538|-0.17926240196702284|
|2018-11-21 18:06:...|  -1.413347896002105| -0.5362140526295736|
|2018-11-21 18:06:...|-0.22435992152724993|-0.18071471636144687|
|2018-11-21 18:06:...| -1.1090106441