In [3]:
# One-time setup: install PySpark if the kernel's environment does not have it yet.
# Left commented out so a normal top-to-bottom run does not reinstall the package;
# inside a notebook, prefer the `%pip install pyspark` magic so the install targets
# the running kernel's environment.
# pip install pyspark

In [1]:
from pyspark.sql import SparkSession

# Create the session (or reuse one that already exists) under the application
# name "PySparkBasics"; every later cell reaches Spark through `spark`.
spark = (
    SparkSession.builder
    .appName("PySparkBasics")
    .getOrCreate()
)


In [4]:
# Three (name, age) tuples used as a tiny in-memory dataset.
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 35),
]

# Turn the local list into an RDD, then bring all of its elements back to the
# driver so the notebook shows them as the cell's result.
rdd = spark.sparkContext.parallelize(data)
rdd.collect()

[('Alice', 25), ('Bob', 30), ('Charlie', 35)]

In [7]:
# Load "sample.txt" (a relative path) as an RDD of text lines.
rdd_from_file = spark.sparkContext.textFile("sample.txt")

# Preview only the first five lines instead of collecting the whole file.
rdd_from_file.take(5)

['Fraud Detection',
 '148 papers with code • 12 benchmarks • 12 datasets',
 '',
 'Fraud Detection is a vital topic that applies to many industries including the financial sectors, banking, government agencies, insurance, and law enforcement, and more. Fraud endeavors have detected a radical rise in current years, creating this topic more critical than ever. Despite struggles on the part of the troubled organizations, hundreds of millions of dollars are wasted to fraud each year. Because nearly a few samples confirm fraud in a vast community, locating these can be complex. Data mining and statistics help to predict and immediately distinguish fraud and take immediate action to minimize costs.',
 '']

In [8]:
# NOTE: this rebinds `rdd`; from here on it holds the integers 1..5 rather than
# the (name, age) tuples created in the earlier cell.
rdd = spark.sparkContext.parallelize([1, 2, 3, 4, 5])

# Transformation: double every element, then collect the results for printing.
mapped_rdd = rdd.map(lambda value: value * 2)
print(mapped_rdd.collect())

[2, 4, 6, 8, 10]


In [9]:
# Action: combine the elements of `rdd` pairwise with addition until a single
# total remains, then print it.
sum_of_elements = rdd.reduce(lambda left, right: left + right)
print(sum_of_elements)

15


In [10]:
from pyspark.sql import Row

# Each Row is one record; its keyword names ("name", "age") become the
# DataFrame's column names.
data = [
    Row(name="Alice", age=25),
    Row(name="Bob", age=30),
]

# Build the DataFrame from the rows and print it as a text table.
df = spark.createDataFrame(data)
df.show()


+-----+---+
| name|age|
+-----+---+
|Alice| 25|
|  Bob| 30|
+-----+---+



In [11]:
# Optional alternative data source (disabled): load "sample.csv", treating the
# first line as the header and letting Spark infer the column types.
# NOTE: uncommenting this rebinds `df`, so the following cells would operate on
# the CSV contents instead of the Row-based DataFrame built in the previous cell.
# df = spark.read.csv("sample.csv", header=True, inferSchema=True)
# df.show()

Basic DataFrame Operations

In [12]:
# Print the column names and their data types.
df.printSchema()

# Project a single column.
df.select("name").show()

# Keep only the rows whose age is greater than 25
# (`where` is an alias of `filter`; `df["age"]` is the same column as `df.age`).
df.where(df["age"] > 25).show()

# Count how many rows share each distinct age.
df.groupBy("age").count().show()


root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)

+-----+
| name|
+-----+
|Alice|
|  Bob|
+-----+

+----+---+
|name|age|
+----+---+
| Bob| 30|
+----+---+

+---+-----+
|age|count|
+---+-----+
| 25|    1|
| 30|    1|
+---+-----+



SQL Queries on DataFrames


In [13]:
# Register the DataFrame as a temporary view named "people" (replacing any
# existing view of that name) so it can be referenced from SQL text.
df.createOrReplaceTempView("people")

# Same filter as the DataFrame-API example, expressed as a SQL query.
result = spark.sql("SELECT * FROM people WHERE age > 25")
result.show()


+----+---+
|name|age|
+----+---+
| Bob| 30|
+----+---+



PySpark MLlib – Machine Learning in PySpark

In [14]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [15]:
# NOTE: this rebinds both `data` and `df`; the name/age DataFrame from the
# earlier cells is no longer reachable through `df` after this cell runs.
# Each tuple is (feature1, feature2, label).
data = [
    (1, 2, 3),
    (2, 3, 4),
    (3, 4, 5),
]
df = spark.createDataFrame(data, schema=["feature1", "feature2", "label"])

In [16]:
# Combine the two numeric feature columns into the single vector column
# ("features") that the regression cell below reads.
assembler = VectorAssembler(inputCols=["feature1", "feature2"], outputCol="features")

# This cell rebinds `df`, so on a second run `df` already contains "features"
# and transform() would fail because its output column exists. Dropping the
# column first (a no-op when it is absent) keeps the cell safe to re-run while
# leaving `df` bound to the assembled frame for the cells that follow.
df = assembler.transform(df.drop("features"))

In [17]:
# Configure a linear regression estimator that reads its inputs from the
# "features" vector column and its target from the "label" column.
lr = LinearRegression(
    featuresCol="features",
    labelCol="label",
)

# Fit the estimator on the assembled DataFrame; the fitted model is kept in `model`.
model = lr.fit(df)

Saving and Loading Data

In [20]:
# Optional (disabled): write `df` out as CSV with a header row.
# NOTE(review): by this point `df` carries the vector column "features" added by
# the VectorAssembler cell; the CSV writer is not expected to accept vector
# columns, so drop it first (e.g. df.drop("features")) — confirm before enabling.
# NOTE(review): Spark presumably creates "output.csv" as a directory of part
# files rather than a single file, and a second run would need an explicit
# write mode (e.g. overwrite) — confirm.
# df.write.csv("output.csv", header=True)

In [21]:
# Optional (disabled): read back the CSV produced by the disabled write cell
# above, using the first line as the header and letting Spark infer column types.
# NOTE: requires that the write cell has been run first; uncommenting rebinds `df`.
# df = spark.read.csv("output.csv", header=True, inferSchema=True)
# df.show()

PySpark Streaming
PySpark also supports real-time data streaming through Structured Streaming.

In [23]:
# Optional (disabled): Structured Streaming example that reads from a socket on
# localhost:9999 and echoes what it receives to the console.
# NOTE(review): presumably something must already be listening on that port
# (e.g. `nc -lk 9999`) before the stream is started — confirm.
# NOTE(review): awaitTermination() is expected to block the cell until the query
# stops, so the notebook will not proceed past it — confirm before enabling.
# NOTE: uncommenting rebinds `df` to the streaming DataFrame.
# df = spark.readStream.format("socket").option("host", "localhost").option("port", 9999).load()
# df.writeStream.format("console").start().awaitTermination()