<a href="https://colab.research.google.com/github/Nadiyapathan/pyspark/blob/main/spark_architecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

from pyspark.sql import SparkSession


# Obtain the notebook-wide SparkSession (reusing one if it already exists)
# under the application name "ColabSpark", then echo its repr as a sanity check.
spark = (
    SparkSession.builder
    .appName("ColabSpark")
    .getOrCreate()
)

print(spark)

<pyspark.sql.session.SparkSession object at 0x79c6dd633b90>


In [3]:

from pyspark.sql import Row

# Raw (id, name, age) records that are turned into Row objects below.
people = [
    (1, "Jabi", 39),
    (2, "Nadi", 34),
    (3, "Ayesha", 6),
    (4, "Ayaan", 3),
]

# One Row per record; the keyword names become the DataFrame's column names.
data = [Row(id=pid, name=pname, age=page) for pid, pname, page in people]

# Build the DataFrame from the Rows (no explicit schema is passed) and display it.
df = spark.createDataFrame(data)
df.show()

+---+------+---+
| id|  name|age|
+---+------+---+
|  1|  Jabi| 39|
|  2|  Nadi| 34|
|  3|Ayesha|  6|
|  4| Ayaan|  3|
+---+------+---+



In [4]:

# Projection: keep only the name and age columns.
df.select(df["name"], df["age"]).show()

# Row filter: keep only the people whose age is above 32.
df.where(df["age"] > 32).show()


+------+---+
|  name|age|
+------+---+
|  Jabi| 39|
|  Nadi| 34|
|Ayesha|  6|
| Ayaan|  3|
+------+---+

+---+----+---+
| id|name|age|
+---+----+---+
|  1|Jabi| 39|
|  2|Nadi| 34|
+---+----+---+



In [5]:

# Count how many rows share each distinct age, then display the result.
age_counts = df.groupBy("age").count()
age_counts.show()


+---+-----+
|age|count|
+---+-----+
| 34|    1|
| 39|    1|
|  6|    1|
|  3|    1|
+---+-----+



In [6]:

# Classic RDD word count over three short input lines.
lines = ["hello world", "hello PySpark", "hello Spark"]
rdd = spark.sparkContext.parallelize(lines)

# Split each line on single spaces and flatten into one RDD of words.
words = rdd.flatMap(lambda text: text.split(" "))

# Pair every word with 1, then sum the ones per distinct word.
pairs = words.map(lambda token: (token, 1))
word_counts = pairs.reduceByKey(lambda left, right: left + right)

# Bring the (word, count) pairs back to the driver; shown as the cell result.
word_counts.collect()

[('hello', 3), ('world', 1), ('PySpark', 1), ('Spark', 1)]

In [7]:

# Keep a short handle on the SparkContext behind the session.
sc = spark.sparkContext

# defaultParallelism is the default number of partitions/tasks Spark uses when
# the caller does not specify one; it is NOT a count of executors, so it is
# labelled accordingly.
print("Default Parallelism:", sc.defaultParallelism)


Number of Executors: 2


In [9]:
# Distribute five integers across exactly two partitions.
numbers = [1, 2, 3, 4, 5]
rdd = spark.sparkContext.parallelize(numbers, 2)

# Square every element (a per-element map transformation).
squared_rdd = rdd.map(lambda value: value ** 2)

# collect() gathers the squared values on the driver for printing.
squares = squared_rdd.collect()
print(squares)


[1, 4, 9, 16, 25]


In [10]:
# Show the master URL this SparkContext is connected to.
master_url = sc.master
print(master_url)



local[*]


In [11]:

# Mark the DataFrame for caching and immediately run an action (count) on it;
# the count result itself is discarded.
df.cache().count()

# Report the DataFrame's cached flag.
print("Is Cached?", df.is_cached)

Is Cached? True


In [12]:

from pyspark.sql.functions import col

# Column names for the student records below.
columns = ["id", "name", "subject"]

# (id, name, subject) tuples. Note that `data` and `df` are rebound here,
# replacing the values assigned in earlier cells.
data = [
    (1, "Alice", "Math"),
    (2, "Bob", "Science"),
    (3, "Charlie", "Math"),
    (4, "David", "Science"),
]
df = spark.createDataFrame(data, schema=columns)

# Redistribute the rows over two partitions before aggregating.
df_repartitioned = df.repartition(2)

# Count the rows per subject and display the result.
subject_counts = df_repartitioned.groupBy("subject").count()
subject_counts.show()

+-------+-----+
|subject|count|
+-------+-----+
|Science|    2|
|   Math|    2|
+-------+-----+



In [13]:
# Report how many partitions the current `rdd` is split into.
num_partitions = rdd.getNumPartitions()
print("Number of Partitions:", num_partitions)

def print_partition(index, iterator):
    """Pair a partition's label with the elements found in that partition.

    Despite its name, this function prints nothing; it only builds a value.

    Args:
        index: Partition index; converted with ``str`` to form the label.
        iterator: Iterable over the partition's elements; fully consumed.

    Returns:
        A one-element list ``[("Partition: <index>", [elements...])]``.
    """
    label = f"Partition: {index!s}"
    elements = [item for item in iterator]
    return [(label, elements)]

# Apply print_partition once per partition, so each partition yields a single
# (label, elements) pair, then collect those pairs on the driver and print them.
partition_rdd = rdd.mapPartitionsWithIndex(print_partition)
partition_contents = partition_rdd.collect()
print(partition_contents)


Number of Partitions: 2
[('Partition: 0', [1, 2]), ('Partition: 1', [3, 4, 5])]
