In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField,IntegerType ,StringType
import pyspark.sql.functions as func

### Creating SparkSession

In [2]:
# Reuse the active SparkSession if there is one; otherwise start a new one
# registered under the application name "FirstApp".
spark = (
    SparkSession.builder
    .appName("FirstApp")
    .getOrCreate()
)

### Defining schema for DataFrame

In [3]:
# Explicit schema for the friends data: four columns, all nullable.
# Declaring it up front avoids schema inference when the file is read.
myschema = (
    StructType()
    .add("userID", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("friends", IntegerType(), True)
)

### Creating DataFrame from CSV

In [4]:
# Load the CSV with the explicit schema defined in `myschema`.
# No "header" option is set, so the first line of the file is read as data.
people = spark.read.csv("./data/fakefriends.csv", schema=myschema)

In [5]:
# Preview the loaded data: first 20 rows, long cell values truncated
# (these are show()'s defaults, spelled out).
people.show(n=20, truncate=True)

+------+--------+---+-------+
|userID|    name|age|friends|
+------+--------+---+-------+
|     0|    Will| 33|    385|
|     1|Jean-Luc| 26|      2|
|     2|    Hugh| 55|    221|
|     3|  Deanna| 40|    465|
|     4|   Quark| 68|     21|
|     5|  Weyoun| 59|    318|
|     6|  Gowron| 37|    220|
|     7|    Will| 54|    307|
|     8|  Jadzia| 38|    380|
|     9|    Hugh| 27|    181|
|    10|     Odo| 53|    191|
|    11|     Ben| 57|    372|
|    12|   Keiko| 54|    253|
|    13|Jean-Luc| 56|    444|
|    14|    Hugh| 43|     49|
|    15|     Rom| 36|     49|
|    16|  Weyoun| 22|    323|
|    17|     Odo| 35|     13|
|    18|Jean-Luc| 45|    455|
|    19|  Geordi| 60|    246|
+------+--------+---+-------+
only showing top 20 rows



In [9]:
# People younger than 30, stamped with the time the query runs, ordered by userID.
# Note: current_timestamp() is evaluated each time an action runs on `op`,
# so `insert_ts` differs between separate show()/count() calls.
op = (
    people.filter(func.col("age") < 30)
    .select("userID", "name", "age", "friends")
    .withColumn("insert_ts", func.current_timestamp())
    .sort("userID")
)

In [10]:
# Action: executes the query and returns the number of rows in `op`
# (the people with age < 30).
op.count()

112

In [8]:
# Display the first 20 filtered rows; long values such as `insert_ts`
# are truncated (these are show()'s defaults, spelled out).
op.show(n=20, truncate=True)

+------+--------+---+-------+--------------------+
|userID|    name|age|friends|           insert_ts|
+------+--------+---+-------+--------------------+
|     1|Jean-Luc| 26|      2|2024-01-03 16:02:...|
|     9|    Hugh| 27|    181|2024-01-03 16:02:...|
|    16|  Weyoun| 22|    323|2024-01-03 16:02:...|
|    21|   Miles| 19|    268|2024-01-03 16:02:...|
|    24|  Julian| 25|      1|2024-01-03 16:02:...|
|    25|     Ben| 21|    445|2024-01-03 16:02:...|
|    26|  Julian| 22|    100|2024-01-03 16:02:...|
|    32|     Nog| 26|    281|2024-01-03 16:02:...|
|    35| Beverly| 27|    305|2024-01-03 16:02:...|
|    46|    Morn| 25|     96|2024-01-03 16:02:...|
|    47|   Brunt| 24|     49|2024-01-03 16:02:...|
|    48|     Nog| 20|      1|2024-01-03 16:02:...|
|    52| Beverly| 19|    269|2024-01-03 16:02:...|
|    54|   Brunt| 19|      5|2024-01-03 16:02:...|
|    60|  Geordi| 20|    100|2024-01-03 16:02:...|
|    66|  Geordi| 21|    477|2024-01-03 16:02:...|
|    72|  Kasidy| 22|    179|20

In [11]:
# Register `op` as the temporary view "peoples" so it can be queried by name
# through spark.sql; an existing view with the same name is replaced.
op.createOrReplaceTempView("peoples")

In [12]:
# Query the "peoples" temp view with SQL, projecting away userID,
# and display the first 20 rows of the result.
(
    spark.sql("select name,age,friends,insert_ts from peoples")
    .show()
)

+--------+---+-------+--------------------+
|    name|age|friends|           insert_ts|
+--------+---+-------+--------------------+
|Jean-Luc| 26|      2|2024-01-03 16:06:...|
|    Hugh| 27|    181|2024-01-03 16:06:...|
|  Weyoun| 22|    323|2024-01-03 16:06:...|
|   Miles| 19|    268|2024-01-03 16:06:...|
|  Julian| 25|      1|2024-01-03 16:06:...|
|     Ben| 21|    445|2024-01-03 16:06:...|
|  Julian| 22|    100|2024-01-03 16:06:...|
|     Nog| 26|    281|2024-01-03 16:06:...|
| Beverly| 27|    305|2024-01-03 16:06:...|
|    Morn| 25|     96|2024-01-03 16:06:...|
|   Brunt| 24|     49|2024-01-03 16:06:...|
|     Nog| 20|      1|2024-01-03 16:06:...|
| Beverly| 19|    269|2024-01-03 16:06:...|
|   Brunt| 19|      5|2024-01-03 16:06:...|
|  Geordi| 20|    100|2024-01-03 16:06:...|
|  Geordi| 21|    477|2024-01-03 16:06:...|
|  Kasidy| 22|    179|2024-01-03 16:06:...|
|   Brunt| 20|    384|2024-01-03 16:06:...|
|     Ben| 28|    311|2024-01-03 16:06:...|
|    Worf| 24|    492|2024-01-03