<a href="https://colab.research.google.com/github/MarinaEstefania/data-engineering-bootcamp/blob/main/Notebooks/log_reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%capture
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.2/spark-3.2.2-bin-hadoop3.2.tgz
!tar xf spark-3.2.2-bin-hadoop3.2.tgz
!pip install -q findspark
!pip install pyspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.2.2-bin-hadoop3.2"

import findspark
findspark.init()
findspark.find()

from pyspark.sql import DataFrame, SparkSession, Window
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
from pyspark.sql.functions import *
from pyspark.sql.window import *

spark = SparkSession \
       .builder \
       .appName("review_logs") \
       .config('spark.jars.packages', 'com.databricks:spark-xml_2.12:0.15.0')\
       .getOrCreate()

spark

In [None]:
# Parse sample_data/log_reviews.csv with the XML reader: every <log> element
# (under the <reviewlog> root) becomes one row of the DataFrame.
# All fields are declared as nullable strings.
schemaLog = StructType([
    StructField(fieldName, StringType(), True)
    for fieldName in (
        "log_id",
        "logDate",
        "device",
        "location",
        "os",
        "ipAddress",
        "phoneNumber",
    )
])

df = (
    spark.read.format("xml")
    .option("rootTag", "reviewlog")
    .option("rowTag", "log")
    .load("sample_data/log_reviews.csv", schema=schemaLog)
)

# Discard the log_id column declared in the schema; a fresh sequential log_id
# is generated in a later cell.
dropLogIdDF = df.drop("log_id")




In [None]:
# Build the staged review-log DataFrame with the columns
# (log_id, log_date, device, os, location, browser, ip, phone_number).
# NOTE(review): the resulting column order is
# (log_date, device, location, os, ip, phone_number, browser, log_id), which differs
# from the order listed above — add a final .select(...) if the consumer depends on position.

# Distinct operating systems present in the logs (exploratory helper).
osValues = dropLogIdDF.select("os").dropDuplicates()

# Derive a browser from the operating system. Each OS is mapped exactly once
# (the original chain repeated the "Linux" branch, which could never be reached).
# Rows whose os matches none of the branches get a null browser.
withBrowserDF = dropLogIdDF.withColumn(
    "browser",
    when(col("os").isin("Apple iOS", "Apple MacOS"), "Safari")
    .when(col("os") == "Microsoft Windows", "Microsoft Edge")
    .when(col("os") == "Linux", "Firefox")
    .when(col("os") == "Google Android", "Google Chrome"),
)

# Rename the camelCase XML field names to the snake_case names of the stage schema.
renameColsDF = (
    withBrowserDF
    .withColumnRenamed("logDate", "log_date")
    .withColumnRenamed("ipAddress", "ip")
    .withColumnRenamed("phoneNumber", "phone_number")
)

# Generate a gap-free 1..N log_id: monotonically_increasing_id() gives an ordering key,
# row_number() over that ordering turns it into consecutive integers.
# The window has no partitionBy, so Spark moves all rows to a single partition for this step.
# NOTE(review): the name `window` shadows pyspark.sql.functions.window brought in by the
# star import; it is kept unchanged in case other cells reference it.
withIncreasingIDDF = renameColsDF.withColumn("monotonically_increasing_id", monotonically_increasing_id())
window = Window.orderBy(col('monotonically_increasing_id'))
reviewLogdDF = (
    withIncreasingIDDF
    .withColumn('log_id', row_number().over(window))
    .drop('monotonically_increasing_id')
)
reviewLogdDF.show(10)



+----------+--------+-------------+-----------------+------------+------------+--------------+------+
|  log_date|  device|     location|               os|          ip|phone_number|       browser|log_id|
+----------+--------+-------------+-----------------+------------+------------+--------------+------+
|04-25-2021|  Mobile|       Kansas|        Apple iOS|9.200.232.57|821-540-5777|        Safari|     1|
|03-13-2021|  Tablet|       Oregon|   Google Android|9.200.232.57|819-102-1320| Google Chrome|     2|
|09-30-2021|  Tablet|    Minnesota|        Apple iOS|9.200.232.57|989-156-0498|        Safari|     3|
|05-24-2021|  Tablet|     Arkansas|      Apple MacOS|9.200.232.57|225-837-9935|        Safari|     4|
|02-01-2021|  Tablet|New Hampshire|            Linux|9.200.232.57|243-842-4562|       Firefox|     5|
|07-23-2021|  Tablet|  Pensylvania|        Apple iOS|9.200.232.57|694-501-4352|        Safari|     6|
|10-13-2021|Computer|     New York|      Apple MacOS|9.200.232.57|430-449-7136|   

In [None]:

# Persist the staged review logs as Parquet in the stage area of the S3 bucket.
# The "header" option was removed: it is a CSV option and has no effect on Parquet
# output, where column names are stored in the file schema.
# NOTE(review): with the default save mode the write fails if the target path already
# exists — add .mode("overwrite") if re-running the notebook should replace the data.
reviewLogdDF.write.parquet("s3://manual-bucket-megc/stage-data/review_logs.parquet")

# Nueva sección