In [1]:
import findspark
# Point findspark at the local Spark installation so that `pyspark` can be
# imported in the cells that follow.
# NOTE: this absolute path is specific to the machine the notebook was written
# on -- adjust it for your own environment.
findspark.init('/home/ec2-user/spark')

In [2]:
import pyspark
from pyspark import SparkContext

In [3]:
# Reuse the running SparkContext if there is one, otherwise start a new one.
# A bare SparkContext() raises "Cannot run multiple SparkContexts at once" when
# this cell is executed a second time in the same kernel; getOrCreate() keeps
# the cell safe to re-run.
sc = SparkContext.getOrCreate()

In [4]:
# A SparkSession is the entry point for creating DataFrames.
# getOrCreate() hands back the already-active session when there is one and
# builds a new session otherwise.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("My App")
    .config("spark.some.config.option", "some-value")
    .getOrCreate()
)

In [5]:
# Load the CSV file into a DataFrame and display it.
#   header=True      -- use the first line of the file as the column names
#   inferSchema=True -- let Spark work out each column's data type from the
#                       data instead of reading every column as a string
df = spark.read.csv(
    "/home/ec2-user/wc.csv",
    header=True,
    inferSchema=True,
)
df.show()

+------+---+------+
|  name|age|  city|
+------+---+------+
| harsh| 20|jaipur|
| ankit| 21| ajmer|
| dhruv| 24|  kota|
|akshay| 21| ajmer|
+------+---+------+



In [7]:
# select() is a transformation: it describes a new DataFrame that holds only
# the "name" column. show() is the action that actually runs the query; its
# argument limits the printed output to the first 2 rows.
df_name = df.select(df["name"])
df_name.show(n=2)

+-----+
| name|
+-----+
|harsh|
|ankit|
+-----+
only showing top 2 rows



In [8]:
# filter() is a transformation: keep only the rows whose age is greater than 20.
df_filter = df.filter(df.age > 20)
df_filter.show()

+------+---+-----+
|  name|age| city|
+------+---+-----+
| ankit| 21|ajmer|
| dhruv| 24| kota|
|akshay| 21|ajmer|
+------+---+-----+



In [9]:
# groupBy() is a transformation: rows sharing the same age fall into one group.
# count() on the grouped data is a transformation as well -- it yields a
# DataFrame with one row per group and the number of members in it.
# show() is the action that displays the result.
df_group = df.groupBy("age")
df_group.count().show()

+---+-----+
|age|count|
+---+-----+
| 20|    1|
| 24|    1|
| 21|    2|
+---+-----+



In [10]:
# orderBy() is a transformation: sort the rows by the "city" column
# (ascending, which is also the default order).
df_order = df.orderBy("city", ascending=True)
df_order.show()

+------+---+------+
|  name|age|  city|
+------+---+------+
| ankit| 21| ajmer|
|akshay| 21| ajmer|
| harsh| 20|jaipur|
| dhruv| 24|  kota|
+------+---+------+



In [11]:
# dropDuplicates() is a transformation: after projecting down to the
# (age, city) columns, keep a single copy of each distinct pair.
df_drop = (
    df.select("age", "city")
    .dropDuplicates()
)
df_drop.show()

+---+------+
|age|  city|
+---+------+
| 20|jaipur|
| 24|  kota|
| 21| ajmer|
+---+------+



In [12]:
# withColumnRenamed() is a transformation: it returns a new DataFrame in which
# the "age" column is called "umar"; df itself keeps its original column name.
df_rename = df.withColumnRenamed(existing="age", new="umar")
df_rename.show()

+------+----+------+
|  name|umar|  city|
+------+----+------+
| harsh|  20|jaipur|
| ankit|  21| ajmer|
| dhruv|  24|  kota|
|akshay|  21| ajmer|
+------+----+------+



# DataFrame Actions

In [15]:
# printSchema() -- prints the DataFrame's schema as a tree: each column's
# name, data type and nullability.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)



In [18]:
# columns -- an attribute (not a method call) holding the list of the
# DataFrame's column names.
df.columns

['name', 'age', 'city']

In [21]:
# describe() -- builds a DataFrame of summary statistics (count, mean, stddev,
# min, max) for the columns; show() is the action that displays it.
# String columns are included too: they get count, min and max, while mean
# and stddev come back as null.
df.describe().show()

+-------+------+------------------+-----+
|summary|  name|               age| city|
+-------+------+------------------+-----+
|  count|     4|                 4|    4|
|   mean|  null|              21.5| null|
| stddev|  null|1.7320508075688772| null|
|    min|akshay|                20|ajmer|
|    max| harsh|                24| kota|
+-------+------+------------------+-----+

