In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start a Spark session for this notebook (or reuse one that already exists), labelled 'Basics'
spark = (
    SparkSession.builder
    .appName('Basics')
    .getOrCreate()
)

In [4]:
# Load the JSON file into a DataFrame (no schema is given, so Spark infers one)
df = spark.read.json('people.json')
# Display the rows as a text table
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [5]:
# Print the schema of the DataFrame: each column's name, type and nullability
df.printSchema()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)



In [6]:
# List all column names of the DataFrame
df.columns

['age', 'name']

In [7]:
# describe() returns a new DataFrame of summary statistics;
# on its own it only shows that DataFrame's repr - call .show() to see the values
df.describe()

DataFrame[summary: string, age: string, name: string]

In [8]:
# print() on a PySpark DataFrame only shows its column names and types, not the rows
# (unlike pandas, where print(df) shows the data); use df.show() to see the rows
print(df)

DataFrame[age: bigint, name: string]


In [9]:
# df.describe().show() displays count, mean, stddev, min and max for each column
df.describe().show()

+-------+------------------+-------+
|summary|               age|   name|
+-------+------------------+-------+
|  count|                 2|      3|
|   mean|              24.5|   null|
| stddev|7.7781745930520225|   null|
|    min|                19|   Andy|
|    max|                30|Michael|
+-------+------------------+-------+



In [10]:
# Importing data types
from pyspark.sql.types import StructField,StringType,IntegerType,StructType
# Describe each column by hand: its name, its Spark data type,
# and whether null values are allowed (True = nullable)
data_schema = [
    StructField(name='age', dataType=IntegerType(), nullable=True),
    StructField(name='name', dataType=StringType(), nullable=True),
]

In [11]:
# Wrap the list of field definitions in a StructType to form the full schema
final_struc = StructType(data_schema)
# Read the JSON file again, this time applying our schema instead of the inferred one
df = spark.read.schema(final_struc).json('people.json')
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- name: string (nullable = true)



In [12]:
# Display the rows read with the explicit schema
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [13]:
# Indexing a DataFrame by column name gives a Column object, not the column's data
type (df ['age'])

pyspark.sql.column.Column

In [14]:
# select() returns a new DataFrame containing only the requested columns;
# the repr shows their names and data types
df.select('age')

DataFrame[age: int]

In [15]:
# Calling select() with no column names gives a DataFrame with no columns
df.select()

DataFrame[]

In [16]:
# Display the values of the selected column with show()
df.select('age').show()

+----+
| age|
+----+
|null|
|  30|
|  19|
+----+



In [17]:
# Selecting a single column still gives a DataFrame (compare df['age'], which is a Column)
type(df.select('age'))

pyspark.sql.dataframe.DataFrame

In [18]:
# head(n) returns the first n rows as a list of Row objects
df.head(2)

[Row(age=None, name='Michael'), Row(age=30, name='Andy')]

In [19]:
# The result of head(2) is a plain Python list
type(df.head(2))

list

In [20]:
# Index into that list to get a single row
df.head(2)[0]

Row(age=None, name='Michael')

In [21]:
# Each element of the list is a Row object
type(df.head(2)[0])

pyspark.sql.types.Row

In [22]:
# withColumn(name, column) returns a new DataFrame with an extra column;
# here 'newage' is a copy of 'age'. The result is not assigned, so df is unchanged.
df.withColumn ('newage',df['age'])

DataFrame[age: int, name: string, newage: int]

In [23]:
# Same call as above, with show() to display the resulting rows
df.withColumn ('newage',df['age']).show()

+----+-------+------+
| age|   name|newage|
+----+-------+------+
|null|Michael|  null|
|  30|   Andy|    30|
|  19| Justin|    19|
+----+-------+------+



In [27]:
# Arithmetic on a Column applies to every row (null values stay null)
df.withColumn ('doubleage',df['age']*2).show()

+----+-------+---------+
| age|   name|doubleage|
+----+-------+---------+
|null|Michael|     null|
|  30|   Andy|       60|
|  19| Justin|       38|
+----+-------+---------+



In [28]:
# Rename a column: withColumnRenamed(existing_name, new_name)
df.withColumnRenamed('age','my_new_age').show()

+----------+-------+
|my_new_age|   name|
+----------+-------+
|      null|Michael|
|        30|   Andy|
|        19| Justin|
+----------+-------+



In [30]:
# createOrReplaceTempView('people') registers this DataFrame as a temporary view named
# 'people', replacing any existing view with that name. This is a temporary view that
# can be queried with SQL; it does not create a Hive table.
df.createOrReplaceTempView('people')
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [31]:
# With the temporary view registered, run SQL against it using spark.sql()
results = spark.sql('SELECT * FROM people')
results.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [32]:
# Filter rows with a WHERE clause in the SQL query
new_results = spark.sql('SELECT * FROM people WHERE age = 19')
new_results.show()

+---+------+
|age|  name|
+---+------+
| 19|Justin|
+---+------+

