### Data Reading

In [0]:
# List the contents of the DBFS tables directory to look up the data file's name.
dbutils.fs.ls("/FileStore/tables/")

Out[6]: [FileInfo(path='dbfs:/FileStore/tables/sample.txt', name='sample.txt', size=59, modificationTime=1747504110000)]

In [0]:
# Load the sample file through the CSV reader, letting Spark infer column types.
# NOTE(review): header=True makes the reader treat the first line of the file as
# column names rather than data — confirm that is intended for this file.
df = (
    spark.read
    .format('csv')
    .option('inferSchema', True)
    .option('header', True)
    .load('/FileStore/tables/sample.txt')
)

In [0]:
import re
from pyspark import Row
from pyspark.sql.types import *
from pyspark.sql import SparkSession

In [0]:

# Obtain the SparkSession used by the RDD examples below. getOrCreate() hands
# back an already-running session when there is one instead of building another.
spark = (
    SparkSession.builder
    .master("local[4]")  # local mode with 4 worker threads
    .appName("RDDs")
    .config("spark.eventLog.logBlockUpdates.enabled", True)
    .getOrCreate()
)

# Both RDDs below are created from the session's SparkContext.
spark_context = spark.sparkContext

# RDD built from an in-memory Python collection
data = list(range(1, 6))
rdd = spark_context.parallelize(data)
print("RDD elements:", rdd.collect())

# RDD built from a text file: every line of the file becomes one element
text_rdd = spark_context.textFile("/FileStore/tables/sample.txt")
print("File contents:", text_rdd.collect())

# RDD Transformations
# Transformations are lazy; each collect() below is the action that actually
# runs the computation and returns the result to the driver.

# map: apply a function to every element
squared = rdd.map(lambda x: x ** 2)
print("Squares:", squared.collect())

# filter: keep only the elements for which the predicate is true
even = rdd.filter(lambda x: x % 2 == 0)
print("Even numbers:", even.collect())

# flatMap: turn each line into a list of words and flatten the lists into a
# single RDD. split() with no argument splits on any run of whitespace, so
# blank lines, tabs and repeated/leading/trailing spaces do not produce
# empty-string "words" (split(" ") would emit '' for each of those).
words = text_rdd.flatMap(lambda line: line.split())
print("Words:", words.collect())

# distinct: drop duplicate words; the result is not returned in input order
unique_words = words.distinct()
print("Unique words:", unique_words.collect())

# RDD Actions
# Each action triggers a job on `rdd` and returns a plain Python value.

# Number of elements in the RDD
element_count = rdd.count()
print("Count:", element_count)

# Sum of all elements
element_sum = rdd.sum()
print("Sum:", element_sum)

# The first element only
first_element = rdd.first()
print("First element:", first_element)

# The first three elements, returned as a list
leading_three = rdd.take(3)
print("Take 3:", leading_three)


RDD elements: [1, 2, 3, 4, 5]
File contents: ['Hello world', 'Hello Spark', 'This is an RDD lab', 'RDDs are cool']
Squares: [1, 4, 9, 16, 25]
Even numbers: [2, 4]
Words: ['Hello', 'world', 'Hello', 'Spark', 'This', 'is', 'an', 'RDD', 'lab', 'RDDs', 'are', 'cool']
Unique words: ['Hello', 'world', 'Spark', 'is', 'an', 'lab', 'are', 'This', 'RDD', 'RDDs', 'cool']
Count: 5
Sum: 15
First element: 1
Take 3: [1, 2, 3]
