In [1]:
import findspark

In [8]:
!pwd

/home/i-sip_iot/PycharmProjects/ETL_DataIngestion/spark


In [2]:
findspark.init('/home/i-sip_iot/spark-3.0.1-bin-hadoop2.7')

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

A SparkSession can be used create DataFrame, register DataFrame as tables, execute SQL over tables, cache tables, and read parquet files. To create a SparkSession, use the following builder pattern:

In [9]:
spark = SparkSession.builder.appName('Basics').getOrCreate()

#### builder
A class attribute having a Builder to construct SparkSession instances.

#### class Builder[source]
Builder for SparkSession.

#### appName(name)[source]
Sets a name for the application, which will be shown in the Spark web UI.

If no application name is set, a randomly generated name will be used.

Parameters
name – an application name

New in version 2.0.

#### getOrCreate()[source]
Gets an existing SparkSession or, if there is no existing one, creates a new one based on the options set in this builder.

This method first checks whether there is a valid global default SparkSession, and if yes, return that one. If no valid global default SparkSession exists, the method creates a new SparkSession and assigns the newly created SparkSession as the global default.

In [10]:
df = spark.read.json('people.json')

In [11]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [12]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [13]:
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [14]:
from pyspark.sql.types import StructField, IntegerType, StructType, StringType

In [15]:
dataSchema = [StructField('age', IntegerType(), True),
             StructField('name', StringType(), True)]

In [16]:
final_struct = StructType(fields=dataSchema)

Now READ the data with new SCHEMA

In [17]:
df = spark.read.json('people.json', schema=final_struct)

In [18]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [19]:
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [20]:
import findspark

In [21]:
findspark.init('/home/i-sip_iot/spark-3.0.1-bin-hadoop2.7')

In [22]:
import pyspark

In [24]:
from pyspark.sql import SparkSession

In [26]:
spark = SparkSession.builder.appName('test').getOrCreate()

In [28]:
df = spark.read.json('people.json')

In [30]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [32]:
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [38]:
df.head()

Row(age=None, name='Michael')

In [40]:
df.schema

StructType(List(StructField(age,LongType,true),StructField(name,StringType,true)))

In [70]:
from pyspark.sql.types import StringType, StructField, StructType, IntegerType 

In [71]:
dataSchema = [StructField('age',IntegerType(),True),
             StructField('name',StringType(),True)]

In [72]:
structofChoice = StructType(dataSchema)

In [73]:
df = spark.read.json('people.json', schema=structofChoice)

In [74]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [75]:
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [76]:
df.schema

StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))

In [77]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [78]:
df.select(['age']).show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [79]:
df.select(['age', 'name']).show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [80]:
df.withColumn('newage', df['age']).show()

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    30|
|  19| Justin|    19|
+----+-------+------+



In [81]:
df.withColumnRenamed('age', 'newage').show()

+------+-------+
|newage|   name|
+------+-------+
|  null|Michael|
|    30|   Andy|
|    19| Justin|
+------+-------+



In [86]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [96]:
df.createOrReplaceGlobalTempView('table1')

In [97]:
results = spark.sql("SELECT * FROM global_temp.table1")

In [98]:
results.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [99]:
res = spark.sql("SELECT * FROM global_temp.table1 WHERE age=30")

In [100]:
res.show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [1]:
import findspark

In [2]:
findspark.init('/home/i-sip_iot/spark-3.0.1-bin-hadoop2.7')

In [3]:
import pyspark

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('basic').getOrCreate()

In [6]:
df = spark.read.json('people.json')

In [7]:
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [8]:
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [12]:
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [13]:
df.schema

StructType(List(StructField(age,LongType,true),StructField(name,StringType,true)))

In [14]:
from pyspark.sql.types import StringType, StructField, StructType, IntegerType

In [20]:
data_schema = [StructField('age', IntegerType(), True), StructField('name', StringType(), True)]

In [23]:
structofChoice = StructType(data_schema)

In [24]:
df = spark.read.json('people.json', schema=structofChoice)

In [26]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [32]:
df.withColumn('naage', df['age']).show()

+----+-------+-----+
| age|   name|naage|
+----+-------+-----+
|null|Michael| null|
|  30|   Andy|   30|
|  19| Justin|   19|
+----+-------+-----+



In [34]:
df.withColumnRenamed('age', 'newage').show()

+------+-------+
|newage|   name|
+------+-------+
|  null|Michael|
|    30|   Andy|
|    19| Justin|
+------+-------+



In [36]:
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [37]:
df.createOrReplaceGlobalTempView('tab1')

In [45]:
data_ = spark.sql("SELECT * FROM global_temp.tab1 WHERE age=19").show()

+---+------+
|age|  name|
+---+------+
| 19|Justin|
+---+------+



In [44]:
data_.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



# READ CSV

In [47]:
import findspark

In [48]:
findspark.init('/home/i-sip_iot/spark-3.0.1-bin-hadoop2.7')

In [49]:
import pyspark

In [50]:
from pyspark.sql import SparkSession

In [51]:
spark = SparkSession.builder.appName('rcsv').getOrCreate()

In [53]:
df_n = spark.read.csv('appl_stock.csv', header=True, )

In [54]:
df_n.printSchema

<bound method DataFrame.printSchema of DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string]>

In [57]:
df_n.head(3)

[Row(_c0='Date', _c1='Open', _c2='High', _c3='Low', _c4='Close', _c5='Volume', _c6='Adj Close'),
 Row(_c0='2010-01-04', _c1='213.429998', _c2='214.499996', _c3='212.38000099999996', _c4='214.009998', _c5='123432400', _c6='27.727039'),
 Row(_c0='2010-01-05', _c1='214.599998', _c2='215.589994', _c3='213.249994', _c4='214.379993', _c5='150476200', _c6='27.774976000000002')]