In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise start a new one
# registered under the application name 'DataFrameOps'.
spark = (
    SparkSession.builder
    .appName('DataFrameOps')
    .getOrCreate()
)

25/06/08 07:16:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [3]:
# Sample rows, one (id, name, age) tuple per person.
# NOTE(review): id 2 is used for both Bob and Charlie — confirm this is intentional.
data = [(1, 'Alice', 25), (2, 'Bob', 30), (2, 'Charlie', 36)]

# Explicit schema: every column is declared non-nullable (third argument False).
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ('id', IntegerType()),
        ('name', StringType()),
        ('age', IntegerType()),
    )
])

# Build the DataFrame from the rows using the schema declared above.
df = spark.createDataFrame(data, schema=schema)

In [4]:
# Print the rows of df as a text table.
df.show()

                                                                                

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  2|Charlie| 36|
+---+-------+---+



In [5]:
# Print each column's name, type, and nullability.
df.printSchema()

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)



In [14]:
# Derive new_age = age + 2 into a new DataFrame; df itself is not modified.
df2 = df.withColumn('new_age', df['age'] + 2)

In [15]:
# Print df2, which carries the extra new_age column.
df2.show()

+---+-------+---+-------+
| id|   name|age|new_age|
+---+-------+---+-------+
|  1|  Alice| 25|     27|
|  2|    Bob| 30|     32|
|  2|Charlie| 36|     38|
+---+-------+---+-------+



In [16]:
# Check the type and nullability of the derived new_age column.
df2.printSchema()

root
 |-- id: integer (nullable = false)
 |-- name: string (nullable = false)
 |-- age: integer (nullable = false)
 |-- new_age: integer (nullable = false)



In [18]:
# Summary statistics (count, mean, stddev, min, max) for each column of df.
df.describe().show()



+-------+------------------+-------+------------------+
|summary|                id|   name|               age|
+-------+------------------+-------+------------------+
|  count|                 3|      3|                 3|
|   mean|1.6666666666666667|   NULL|30.333333333333332|
| stddev|0.5773502691896257|   NULL| 5.507570547286102|
|    min|                 1|  Alice|                25|
|    max|                 2|Charlie|                36|
+-------+------------------+-------+------------------+



                                                                                

In [25]:
# Keep only the rows whose age is strictly greater than 30.
df.filter(df['age'] > 30).show()

[Stage 13:>                                                         (0 + 1) / 1]

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  2|Charlie| 36|
+---+-------+---+



                                                                                

In [32]:
# Keep only the rows whose name is exactly 'Alice'.
df.where(df['name'] == 'Alice').show()

+---+-----+---+
| id| name|age|
+---+-----+---+
|  1|Alice| 25|
+---+-----+---+



In [33]:
# Drop fully duplicated rows; the three sample rows all differ, so nothing is removed.
df.distinct().show()



+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  2|Charlie| 36|
+---+-------+---+



                                                                                

### Sorting, renaming & aggregation

In [34]:
# Show the rows sorted by name (ascending order by default).
df.orderBy('name').show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  2|Charlie| 36|
+---+-------+---+



In [35]:
# Show df with the 'age' column renamed to 'a'; the result is not assigned, so df is unchanged.
df.withColumnRenamed('age', 'a').show()

+---+-------+---+
| id|   name|  a|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  2|Charlie| 36|
+---+-------+---+



In [36]:
# Re-display df to confirm it still has its original 'age' column.
df.show()

+---+-------+---+
| id|   name|age|
+---+-------+---+
|  1|  Alice| 25|
|  2|    Bob| 30|
|  2|Charlie| 36|
+---+-------+---+



In [41]:
# Count the rows per id; id 2 occurs on two rows of the sample data.
df.groupBy('id').count().show()

+---+-----+
| id|count|
+---+-----+
|  1|    1|
|  2|    2|
+---+-----+



In [44]:
# Average of the age column over the whole DataFrame (no grouping).
df.agg({'age': 'avg'}).show()



+------------------+
|          avg(age)|
+------------------+
|30.333333333333332|
+------------------+



                                                                                

In [45]:
# Stop the Spark session; no further Spark calls are made after this cell.
spark.stop()