<a href="https://colab.research.google.com/github/MarcelaMonteiroMontenegroGallo/Python/blob/master/PySpark_no_Google_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# instalar as dependências
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [32]:
# configurar as variáveis de ambiente
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

# tornar o pyspark "importável"
import findspark
findspark.init('spark-2.4.4-bin-hadoop2.7')

In [62]:
from os.path import abspath
from pyspark.sql import SparkSession
from pyspark import SparkConf


In [63]:
# set default location for 
warehouse_location = abspath('spark-warehouse')

In [68]:
warehouse_location = abspath('spark-warehouse')


**init session**

In [69]:
    spark = SparkSession \
            .builder.master("local") \
            .appName("etl-yelp-py") \
            .config("spark.sql.warehouse.dir", abspath('spark-warehouse')) \
            .enableHiveSupport() \
            .getOrCreate()


**show configured parameters**



In [70]:
print(SparkConf().getAll())

[('spark.master', 'local[*]'), ('spark.submit.deployMode', 'client'), ('spark.ui.showConsoleProgress', 'true'), ('spark.app.name', 'pyspark-shell')]


**set log level**

In [71]:
spark.sparkContext.setLogLevel("INFO")

set dynamic input file [hard-coded]

can be changed for input parameters [spark-submit]

In [73]:
get_users_file = "/home/user.json"

In [74]:
df_user = spark.read \
        .format("json") \
        .option("inferSchema", "true") \
        .option("header", "true") \
        .json(get_users_file)

In [75]:
  print(df_user.rdd.getNumPartitions())

1


In [76]:
 df_user.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- coordinates: struct (nullable = true)
 |    |    |-- lat: double (nullable = true)
 |    |    |-- lng: double (nullable = true)
 |    |-- country: string (nullable = true)
 |    |-- state: string (nullable = true)
 |    |-- street_address: string (nullable = true)
 |    |-- street_name: string (nullable = true)
 |    |-- zip_code: string (nullable = true)
 |-- avatar: string (nullable = true)
 |-- credit_card: struct (nullable = true)
 |    |-- cc_number: string (nullable = true)
 |-- date_of_birth: string (nullable = true)
 |-- dt_current_timestamp: long (nullable = true)
 |-- email: string (nullable = true)
 |-- employment: struct (nullable = true)
 |    |-- key_skill: string (nullable = true)
 |    |-- title: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: long (nullable = true)
 |-- last_name: string (nullable = true)
 |-- 

In [77]:
 df_user.show()

+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+----------+-----------+----+----------+----------+--------------------+-----------------------+--------------------+--------------------+-------+-----------------+
|             address|              avatar|         credit_card|date_of_birth|dt_current_timestamp|               email|          employment|first_name|     gender|  id| last_name|  password|        phone_number|social_insurance_number|        subscription|                 uid|user_id|         username|
+--------------------+--------------------+--------------------+-------------+--------------------+--------------------+--------------------+----------+-----------+----+----------+----------+--------------------+-----------------------+--------------------+--------------------+-------+-----------------+
|[East Horaciobury...|https://robohash....|[6771-8988-9684-6...|   1983-03-24|       

In [78]:
df_user.count()

100

In [79]:
df_user.createOrReplaceTempView("user")

In [86]:
df_join = spark.sql("""
        SELECT username ,email
        FROM user AS u
      
    """)


In [87]:
df_join.explain()

== Physical Plan ==
*(1) Project [username#48, email#36]
+- *(1) FileScan json [email#36,username#48] Batched: false, Format: JSON, Location: InMemoryFileIndex[file:/home/user.json], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<email:string,username:string>


In [88]:
df_join.count()


100

In [89]:
df_join.show()

+-----------------+--------------------+
|         username|               email|
+-----------------+--------------------+
|   marcos.collier|marcos.collier@em...|
|      elina.hills|elina.hills@email...|
|enedina.schroeder|enedina.schroeder...|
|       colin.ryan|colin.ryan@email.com|
|     dallas.boyle|dallas.boyle@emai...|
|     grover.towne|grover.towne@emai...|
|   dexter.schmitt|dexter.schmitt@em...|
|    novella.weber|novella.weber@ema...|
|lesley.mccullough|lesley.mccullough...|
|      marti.marks|marti.marks@email...|
|  shawnna.keebler|shawnna.keebler@e...|
| guillermo.beahan|guillermo.beahan@...|
|    sid.bechtelar|sid.bechtelar@ema...|
|    merrill.upton|merrill.upton@ema...|
|      felipe.ward|felipe.ward@email...|
|     willie.walsh|willie.walsh@emai...|
|      jae.krajcik|jae.krajcik@email...|
|  wilfredo.bailey|wilfredo.bailey@e...|
|    josefa.marvin|josefa.marvin@ema...|
|       loyd.hintz|loyd.hintz@email.com|
+-----------------+--------------------+
only showing top

In [90]:
spark.stop()