In [1]:
# Importing Libraries
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import StructType, StructField,IntegerType,StringType
import pyspark.sql.functions as func

In [2]:
# Build the SparkSession used by every cell below, or reuse one that already exists.
# The application is named "FirstApp" and the Spark UI port is set explicitly to 4040.
spark = (
    SparkSession.builder
    .appName("FirstApp")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)


25/02/21 11:11:28 WARN Utils: Your hostname, krish-V resolves to a loopback address: 127.0.1.1; using 172.17.127.111 instead (on interface eth0)
25/02/21 11:11:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Explicit schema for the friends CSV: four nullable fields
# (userID, name, age, friends), all integers except `name`.
# NOTE(review): the field order is assumed to match the column order of the
# CSV file this schema is applied to - confirm against the data.
myschema = StructType([
    StructField("userID", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("friends", IntegerType(), True),
])

In [4]:
# Read the fake-friends CSV into a DataFrame, applying `myschema` to its columns.
fakefriends_path = "file:///home/krish/Spark_practice/fakefriends.csv"
people = (
    spark.read
    .format("csv")
    .schema(myschema)
    .option("path", fakefriends_path)
    .load()
)


In [5]:
# Transformation pipeline on `people`:
#   1. keep the four source columns,
#   2. restrict to rows with age below 30,
#   3. add an `insert_ts` column holding the current timestamp,
#   4. sort by userID.
under_30 = people.age < 30
output = (
    people
    .select("userID", "name", "age", "friends")
    .filter(under_30)
    .withColumn("insert_ts", func.current_timestamp())
    .orderBy("userID")
)

In [6]:
# Number of rows in the `output` DataFrame, i.e. the people with age below 30.
output.count()

112

In [13]:
# Register `output` under the temporary view name "peoples" so it can be
# referenced by name in spark.sql() queries; a view already registered under
# that name is replaced.
output.createOrReplaceTempView("peoples")

In [14]:
# Query the "peoples" temp view with Spark SQL, selecting name, age, friends
# and insert_ts, and print the result as a text table. show() prints only a
# leading sample of the rows (20 by default) and truncates long cell values.
spark.sql("select name,age,friends,insert_ts from peoples").show()

+--------+---+-------+--------------------+
|    name|age|friends|           insert_ts|
+--------+---+-------+--------------------+
|Jean-Luc| 26|      2|2025-02-20 17:58:...|
|    Hugh| 27|    181|2025-02-20 17:58:...|
|  Weyoun| 22|    323|2025-02-20 17:58:...|
|   Miles| 19|    268|2025-02-20 17:58:...|
|  Julian| 25|      1|2025-02-20 17:58:...|
|     Ben| 21|    445|2025-02-20 17:58:...|
|  Julian| 22|    100|2025-02-20 17:58:...|
|     Nog| 26|    281|2025-02-20 17:58:...|
| Beverly| 27|    305|2025-02-20 17:58:...|
|    Morn| 25|     96|2025-02-20 17:58:...|
|   Brunt| 24|     49|2025-02-20 17:58:...|
|     Nog| 20|      1|2025-02-20 17:58:...|
| Beverly| 19|    269|2025-02-20 17:58:...|
|   Brunt| 19|      5|2025-02-20 17:58:...|
|  Geordi| 20|    100|2025-02-20 17:58:...|
|  Geordi| 21|    477|2025-02-20 17:58:...|
|  Kasidy| 22|    179|2025-02-20 17:58:...|
|   Brunt| 20|    384|2025-02-20 17:58:...|
|     Ben| 28|    311|2025-02-20 17:58:...|
|    Worf| 24|    492|2025-02-20 17:58:...|
+--------+---+-------+--------------------+
only showing top 20 rows

# RDD

In [9]:
# Turn a small in-memory list of (name, age) tuples into an RDD, then collect
# it back into a Python list and print it.
name_age_pairs = [("Alice", 25), ("Bob", 30)]
rdd = spark.sparkContext.parallelize(name_age_pairs)
print(rdd.collect())

[('Alice', 25), ('Bob', 30)]


# DataFrame (DF)

In [10]:
# Build a two-row DataFrame from Row objects (columns: name, age) and print it.
# `Row` comes from the notebook's import cell at the top, so this cell does not
# carry its own import.
people_rows = [
    Row(name="Alice", age=25),
    Row(name="Bob", age=30),
]
df = spark.createDataFrame(people_rows)
df.show()


+-----+---+
| name|age|
+-----+---+
|Alice| 25|
|  Bob| 30|
+-----+---+

