In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start a Spark session for this notebook (or pick up the one that is already
# active) and print it as a quick sanity check that it came up.
spark = (
    SparkSession.builder
    .appName("find the genius")
    .getOrCreate()
)
print(spark)

25/03/24 13:49:57 WARN Utils: Your hostname, TTNPL-8203 resolves to a loopback address: 127.0.1.1; using 10.1.209.120 instead (on interface wlp0s20f3)
25/03/24 13:49:57 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/24 13:49:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<pyspark.sql.session.SparkSession object at 0x7b8e0b52f4f0>


In [4]:
# Sample rows: (id, department, salary, date). The date is kept as a plain
# string here, which is why the schema below reports it as `string`.
data = [
    (100, "HR", 5000, "2023-01-01"),
    (100, "Finance", 6000, "2023-02-01"),
    (200, "IT", 7000, "2023-03-01"),
    (300, "Sales", 5500, "2023-04-01"),
    (300, "Marketing", 6500, "2023-05-01"),
    (300, "HR", 7500, "2023-06-01"),
]

df = spark.createDataFrame(
    data,
    schema=["id", "department", "salary", "date"],
)
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- date: string (nullable = true)



In [6]:
from pyspark.sql.functions import count
# One row per id with the number of records that carry that id, in a column named "count".
df2 = df.groupBy("id").agg(count("id").alias("count"))

In [18]:
# Keep only the ids that occur exactly once, join back to recover their full
# rows, then drop the helper "count" column that the inner join carried along.
df.join(df2.filter(df2["count"] == 1), on="id").drop("count").show()

                                                                                

+---+----------+------+----------+
| id|department|salary|      date|
+---+----------+------+----------+
|200|        IT|  7000|2023-03-01|
+---+----------+------+----------+



In [21]:
# Same result through a left semi join: only df's own columns come back, so
# there is no "count" column to drop afterwards.
df.join(df2.filter(df2["count"] == 1), on="id", how="leftsemi").show()

                                                                                

+---+----------+------+----------+
| id|department|salary|      date|
+---+----------+------+----------+
|200|        IT|  7000|2023-03-01|
+---+----------+------+----------+



In [31]:
# Mixed numeric and non-numeric strings.
data = ["1", "2", "4", "5", "4", "acb", "nsjn", "bjbs"]
rdd = spark.sparkContext.parallelize(data)
# Wrap each string in a 1-tuple so every RDD element is a single-field record.
rdd2 = rdd.map(lambda value: (value,))

In [32]:
# Turn the RDD of 1-tuples into a DataFrame with a single column named "cols".
df = rdd2.toDF(schema=["cols"])

In [33]:
from pyspark.sql import functions as f

In [34]:
# Replace "cols" with its integer cast; the non-null filter in the next cell
# then leaves only the numeric entries.
df1 = df.withColumn("cols", df["cols"].cast("Integer"))

In [40]:
# Show only the rows whose cast value is not null.
df1.where(df1["cols"].isNotNull()).show()

+----+
|cols|
+----+
|   1|
|   2|
|   4|
|   5|
|   4|
+----+



In [41]:
# Three rows with nulls scattered across the three columns.
data = [
    (1, None, 10),
    (2, None, None),
    (None, 3, 30),
]
columns = ["column_one", "column_two", "column_three"]
df = spark.createDataFrame(data, schema=columns)

In [42]:

# Inspect the inferred column types of the frame that contains nulls.
df.printSchema()

root
 |-- column_one: long (nullable = true)
 |-- column_two: long (nullable = true)
 |-- column_three: long (nullable = true)



In [43]:
dict ={}
for column in df.columns:
    dict[column]=df.filter(df[column].isNull()).count()

In [50]:
df_nulls = spark.createDataFrame(data = tuple(dict.items()),schema = ["column","null_count"])

In [51]:
# Display the per-column null counts.
df_nulls.show()

+------------+----------+
|      column|null_count|
+------------+----------+
|  column_one|         1|
|  column_two|         2|
|column_three|         1|
+------------+----------+



In [66]:
# Two sample employees: (name, department, salary).
data = [("John", "HR", 5000), ("Smith", "Finance", 6000)]

In [67]:
# Build the employee frame with explicit column names.
df = spark.createDataFrame(data, schema=["name", "dept", "salary"])

In [73]:
# Add the same prefix to every column name.
prefix = "de_"

# Work out all new names first so the rename is a single toDF() call rather
# than one withColumnRenamed() per column.
prefixed_names = [prefix + name for name in df.columns]

# Print each old name followed by its new name.
for old_name, new_name in zip(df.columns, prefixed_names):
    print(old_name)
    print(new_name)

# NOTE: `df` is rebound to the renamed frame, so running this cell again
# prefixes the already-prefixed columns a second time.
df = df.toDF(*prefixed_names)

name
de_name
dept
de_dept
salary
de_salary


In [75]:

# Display the frame after the column rename.
df.show()

+-------+-------+---------+
|de_name|de_dept|de_salary|
+-------+-------+---------+
|   John|     HR|     5000|
|  Smith|Finance|     6000|
+-------+-------+---------+



#### Flattening an array column

In [76]:
# Each customer id is paired with the list of products that customer bought.
data = [
    (1, ["mobile", "PC", "tab"]),
    (2, ["mobile", "PC"]),
    (3, ["tab", "pen"]),
]
# NOTE(review): "cusomer_id" looks like a typo for "customer_id"; it is left
# as-is here because it is the column name shown in the outputs below.
df = spark.createDataFrame(data, schema=["cusomer_id", "product_purchased"])

In [77]:
# Confirm that the product column was inferred as an array of strings.
df.printSchema()

root
 |-- cusomer_id: long (nullable = true)
 |-- product_purchased: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [83]:
# explode() emits one output row per array element.
# NOTE(review): the target name "product_Purchased" differs from the source
# column only in letter case; the output below shows it replacing the array
# column rather than being added next to it.
df1 = df.withColumn("product_Purchased", f.explode(df["product_purchased"]))

In [84]:
# The source frame is untouched: withColumn returned a new DataFrame, so the
# arrays are still intact here.
df.show()

+----------+-----------------+
|cusomer_id|product_purchased|
+----------+-----------------+
|         1|[mobile, PC, tab]|
|         2|     [mobile, PC]|
|         3|       [tab, pen]|
+----------+-----------------+



In [85]:
# The flattened result: one row per (customer, product) pair.
df1.show()

+----------+-----------------+
|cusomer_id|product_Purchased|
+----------+-----------------+
|         1|           mobile|
|         1|               PC|
|         1|              tab|
|         2|           mobile|
|         2|               PC|
|         3|              tab|
|         3|              pen|
+----------+-----------------+



#### Cumulative sales in PySpark

In [86]:
from pyspark.sql.window import Window

In [132]:
# Sales records: (product_id, sales_date as an ISO string, sales_amount).
sales_data = [
    (1, '2024-03-01', 100),
    (1, '2024-03-02', 200),
    (2, '2024-03-01', 150),
    (2, '2024-03-03', 300),
]
df = spark.createDataFrame(
    sales_data,
    schema=["product_id", "sales_date", "sales_amount"],
)
# Convert the string dates to a real date column so they order and compare as dates.
df = df.withColumn("sales_date", df["sales_date"].cast("Date"))
df.printSchema()

root
 |-- product_id: long (nullable = true)
 |-- sales_date: date (nullable = true)
 |-- sales_amount: long (nullable = true)



In [133]:
# Per-product window, ordered by sale date; shared by the window functions below.
window_spec = (
    Window
    .partitionBy("product_id")
    .orderBy("sales_date")
)

In [134]:
# Running total of sales_amount within each product, in date order.
(
    df
    .withColumn("cumulative_sum", f.sum("sales_amount").over(window_spec))
    .show()
)

+----------+----------+------------+--------------+
|product_id|sales_date|sales_amount|cumulative_sum|
+----------+----------+------------+--------------+
|         1|2024-03-01|         100|           100|
|         1|2024-03-02|         200|           300|
|         2|2024-03-01|         150|           150|
|         2|2024-03-03|         300|           450|
+----------+----------+------------+--------------+



#### Previous sales and next sales

In [135]:
# lag/lead read the row before/after the current one inside each product's
# date-ordered window; the first and last rows of a product get NULL.
(
    df
    .withColumn("previous_sales", f.lag("sales_amount").over(window_spec))
    .withColumn("next_sales", f.lead("sales_amount").over(window_spec))
    .show()
)

+----------+----------+------------+--------------+----------+
|product_id|sales_date|sales_amount|previous_sales|next_sales|
+----------+----------+------------+--------------+----------+
|         1|2024-03-01|         100|          NULL|       200|
|         1|2024-03-02|         200|           100|      NULL|
|         2|2024-03-01|         150|          NULL|       300|
|         2|2024-03-03|         300|           150|      NULL|
+----------+----------+------------+--------------+----------+



In [136]:
# Latest sale date; the aggregate yields a single row with a single value.
max_date = df.agg(f.max("sales_date")).first()[0]

In [137]:
# Earliest sale date; the aggregate yields a single row with a single value.
min_date = df.agg(f.min("sales_date")).first()[0]

In [138]:
# Print the latest date first, then the earliest.
print(max_date,min_date)

2024-03-03 2024-03-01


In [180]:
from datetime import timedelta,datetime,date

In [159]:
# Collect every calendar day from the earliest to the latest sale date,
# both ends included, stepping one day at a time.
start_date, end_date = min_date, max_date
date_list = []
current_date = start_date
while end_date >= current_date:
    date_list.append(current_date)
    current_date = current_date + timedelta(days=1)

In [160]:
# Show the generated list of consecutive dates.
print(date_list)

[datetime.date(2024, 3, 1), datetime.date(2024, 3, 2), datetime.date(2024, 3, 3)]


In [161]:
# Turn the Python list of dates into a one-column DataFrame named "date".
rdd = spark.sparkContext.parallelize(date_list)
# Each date is wrapped in a 1-tuple so it becomes a single-field record.
rdd2 = rdd.map(lambda day: (day,))
df1 = rdd2.toDF(schema=["date"])

In [162]:
# Schema of the sales frame, to compare its date column with the calendar frame's.
df.printSchema()

root
 |-- product_id: long (nullable = true)
 |-- sales_date: date (nullable = true)
 |-- sales_amount: long (nullable = true)



In [163]:
# Schema of the calendar frame built from date_list.
df1.printSchema() 

root
 |-- date: date (nullable = true)



In [164]:
# Display the calendar dates.
df1.show()

+----------+
|      date|
+----------+
|2024-03-01|
|2024-03-02|
|2024-03-03|
+----------+



In [166]:
# Left anti join: keep only the calendar dates that have no matching sale.
# An empty result means every date in the range had at least one sale.
df1.join(df, on=df1["date"] == df["sales_date"], how="leftanti").show()

+----+
|date|
+----+
+----+



In [170]:
# Rank all sales from highest to lowest amount. The window has no partitionBy,
# which is what triggers Spark's "No Partition Defined" warning seen below.
(
    df
    .withColumn("rank", f.rank().over(Window.orderBy(f.desc("sales_amount"))))
    .show()
)

25/03/24 16:46:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/24 16:46:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/03/24 16:46:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+----------+----------+------------+----+
|product_id|sales_date|sales_amount|rank|
+----------+----------+------------+----+
|         2|2024-03-03|         300|   1|
|         1|2024-03-02|         200|   2|
|         2|2024-03-01|         150|   3|
|         1|2024-03-01|         100|   4|
+----------+----------+------------+----+



In [171]:
from pyspark.sql import Row

In [181]:
# A single extra sale to append; sales_date is a real date object so it
# matches the date-typed column of the sales frame.
new_row = Row(
    product_id=3,
    sales_date=date(2024, 3, 5),
    sales_amount=500,
)

In [182]:
# Wrap the row in a one-row DataFrame with the same column names as the sales frame.
new_df_rw = spark.createDataFrame(
    [new_row],
    schema=["product_id", "sales_date", "sales_amount"],
)

In [183]:
# Append the new row to the sales frame.
# unionByName matches columns by name; plain union() matches by position and
# would silently put values in the wrong columns if the two frames ever
# listed their columns in a different order.
df1 = df.unionByName(new_df_rw)

In [184]:
# Display the sales rows together with the appended one.
df1.show()

+----------+----------+------------+
|product_id|sales_date|sales_amount|
+----------+----------+------------+
|         1|2024-03-01|         100|
|         1|2024-03-02|         200|
|         2|2024-03-01|         150|
|         2|2024-03-03|         300|
|         3|2024-03-05|         500|
+----------+----------+------------+



In [185]:
# Stop the Spark session now that the notebook is finished.
spark.stop()