In [2]:
from pyspark.sql import SparkSession

# Create the SparkSession for this notebook, or reuse one that is already
# running. `SparkSession.builder` is the documented entry point; the Builder
# class is not meant to be instantiated directly.
spark: SparkSession = SparkSession.builder.appName("demo").getOrCreate()

In [3]:
# Build a small in-memory DataFrame: one (first_name, age) tuple per person.
# Only column names are supplied, so Spark infers the column types itself.
df = spark.createDataFrame(
    data=[("sue", 32), ("li", 3), ("bob", 75), ("heo", 13)],
    schema=["first_name", "age"],
)

In [4]:
# Print the contents of df as a text table.
df.show()

                                                                                

+----------+---+
|first_name|age|
+----------+---+
|       sue| 32|
|        li|  3|
|       bob| 75|
|       heo| 13|
+----------+---+



In [4]:
from pyspark.sql.functions import col, when

# Label each person by age bracket:
#   age < 13         -> "child"
#   13 <= age <= 19  -> "teenager"  (both bounds inclusive)
#   anything else    -> "adult"
life_stage_expr = (
    when(col("age") < 13, "child")
    .when((col("age") >= 13) & (col("age") <= 19), "teenager")
    .otherwise("adult")
)

# withColumn returns a new DataFrame; df itself is left unchanged.
df1 = df.withColumn("life_stage", life_stage_expr)

In [5]:
# Print df again after deriving df1: the original DataFrame still has only
# first_name and age, i.e. withColumn did not modify it.
df.show()

+----------+---+
|first_name|age|
+----------+---+
|       sue| 32|
|        li|  3|
|       bob| 75|
|       heo| 13|
+----------+---+



In [6]:
# Print df1, which carries the extra life_stage column.
df1.show()

+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       sue| 32|     adult|
|        li|  3|     child|
|       bob| 75|     adult|
|       heo| 13|  teenager|
+----------+---+----------+



In [7]:
# Keep only the rows whose life_stage is "teenager" or "adult" and print them.
(
    df1
    .filter(col("life_stage").isin("teenager", "adult"))
    .show()
)


+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       sue| 32|     adult|
|       bob| 75|     adult|
|       heo| 13|  teenager|
+----------+---+----------+



In [8]:
# Persist df1 as the table "some_people" so it can be queried with Spark SQL.
# mode("overwrite") replaces the table when it already exists. Without it the
# default save mode raises an error if the table is present, which makes this
# cell fail whenever the notebook is re-run.
df1.write.mode("overwrite").saveAsTable("some_people")


                                                                                

In [9]:
# Read the saved table back through Spark SQL and print it.
# Note the rows come back in a different order than df1.show() printed them.
spark.sql("select * from some_people").show()

+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       heo| 13|  teenager|
|       bob| 75|     adult|
|       sue| 32|     adult|
|        li|  3|     child|
+----------+---+----------+



In [10]:
# Append one row to the table with a SQL INSERT. The values are positional:
# first_name, age, life_stage. The statement returns an empty DataFrame,
# which is what the notebook displays as this cell's output.
# NOTE(review): running this cell more than once adds the row again each time.
spark.sql("INSERT INTO some_people VALUES ('frank', 4, 'child')")

DataFrame[]

In [11]:
# Print the table again to confirm the inserted 'frank' row is present.
spark.sql("select * from some_people").show()

+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       heo| 13|  teenager|
|     frank|  4|     child|
|       bob| 75|     adult|
|       sue| 32|     adult|
|        li|  3|     child|
+----------+---+----------+



In [12]:
# Filter in SQL: print only the rows whose life_stage is 'teenager'.
spark.sql("select * from some_people where life_stage='teenager'").show()


+----------+---+----------+
|first_name|age|life_stage|
+----------+---+----------+
|       heo| 13|  teenager|
+----------+---+----------+



In [13]:
# Open some_words.txt through the SparkContext as an RDD (one element per line).
# NOTE(review): the path is relative and the file is not created anywhere in
# this notebook — presumably it must already exist where Spark resolves
# relative paths; confirm before running on a fresh checkout.
text_file = spark.sparkContext.textFile("some_words.txt")

In [14]:
# Bare expression: display the RDD's repr (its source and lineage), not its contents.
text_file

some_words.txt MapPartitionsRDD[31] at textFile at NativeMethodAccessorImpl.java:0

In [17]:
# Classic word count over the lines of text_file:
#   1. flatMap    - split every line into words and flatten into one RDD of words
#   2. map        - pair each word with an initial count of 1
#   3. reduceByKey - sum the counts of identical words
# str.split() with no argument splits on any run of whitespace and never
# yields empty strings, so repeated spaces, tabs and blank lines do not
# produce a bogus '' "word" the way split(" ") would.
counts = (
    text_file
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

In [18]:
# Run the word-count job and return the (word, count) pairs as a Python list.
counts.collect()

[('these', 2),
 ('words', 3),
 ('are', 2),
 ('more', 1),
 ('in', 1),
 ('english', 1)]