In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, minute, avg, count, max, sum, row_number
from pyspark.sql.window import Window

# Create the Spark session used by every later cell, or reuse one that is
# already running under the same application name.
spark = (
    SparkSession.builder
    .appName("WebTrafficAnalysis")
    .getOrCreate()
)

In [3]:
from datetime import datetime
from pyspark.sql import Row

# Sample page-visit log. A Row "factory" names the fields once so each record
# below only lists its values, in the same order as the field names.
PageVisit = Row("UserID", "Page", "Timestamp", "Duration", "Device", "Country")

web_data = [
    PageVisit(1, "Home", "2024-06-12 10:00:00", 35, "Mobile", "India"),
    PageVisit(2, "Products", "2024-06-12 10:02:00", 120, "Desktop", "USA"),
    PageVisit(3, "Cart", "2024-06-12 10:05:00", 45, "Tablet", "UK"),
    PageVisit(1, "Checkout", "2024-06-12 10:08:00", 60, "Mobile", "India"),
    PageVisit(4, "Home", "2024-06-12 10:10:00", 15, "Mobile", "Canada"),
    PageVisit(2, "Contact", "2024-06-12 10:15:00", 25, "Desktop", "USA"),
    PageVisit(5, "Products", "2024-06-12 10:20:00", 90, "Desktop", "India"),
]

# No explicit schema is passed, so column types are inferred from the Python
# values; Timestamp is still a plain string at this stage.
df_web = spark.createDataFrame(web_data)
df_web.show(truncate=False)

+------+--------+-------------------+--------+-------+-------+
|UserID|Page    |Timestamp          |Duration|Device |Country|
+------+--------+-------------------+--------+-------+-------+
|1     |Home    |2024-06-12 10:00:00|35      |Mobile |India  |
|2     |Products|2024-06-12 10:02:00|120     |Desktop|USA    |
|3     |Cart    |2024-06-12 10:05:00|45      |Tablet |UK     |
|1     |Checkout|2024-06-12 10:08:00|60      |Mobile |India  |
|4     |Home    |2024-06-12 10:10:00|15      |Mobile |Canada |
|2     |Contact |2024-06-12 10:15:00|25      |Desktop|USA    |
|5     |Products|2024-06-12 10:20:00|90      |Desktop|India  |
+------+--------+-------------------+--------+-------+-------+



In [4]:
# 1. Display schema (printed before the conversion below, so Timestamp still
#    shows up with its original type)
df_web.printSchema()

# 2. Convert Timestamp to proper type
parsed_timestamp = to_timestamp(col("Timestamp"), "yyyy-MM-dd HH:mm:ss")
df_web = df_web.withColumn("Timestamp", parsed_timestamp)
df_web.show()

# 3. Add SessionMinute column: the minute-of-hour component of Timestamp
session_minute = minute(col("Timestamp"))
df_web = df_web.withColumn("SessionMinute", session_minute)
df_web.show()

root
 |-- UserID: long (nullable = true)
 |-- Page: string (nullable = true)
 |-- Timestamp: string (nullable = true)
 |-- Duration: long (nullable = true)
 |-- Device: string (nullable = true)
 |-- Country: string (nullable = true)

+------+--------+-------------------+--------+-------+-------+
|UserID|    Page|          Timestamp|Duration| Device|Country|
+------+--------+-------------------+--------+-------+-------+
|     1|    Home|2024-06-12 10:00:00|      35| Mobile|  India|
|     2|Products|2024-06-12 10:02:00|     120|Desktop|    USA|
|     3|    Cart|2024-06-12 10:05:00|      45| Tablet|     UK|
|     1|Checkout|2024-06-12 10:08:00|      60| Mobile|  India|
|     4|    Home|2024-06-12 10:10:00|      15| Mobile| Canada|
|     2| Contact|2024-06-12 10:15:00|      25|Desktop|    USA|
|     5|Products|2024-06-12 10:20:00|      90|Desktop|  India|
+------+--------+-------------------+--------+-------+-------+

+------+--------+-------------------+--------+-------+-------+----------

In [5]:
# 4. Mobile users on Checkout page
is_mobile_checkout = (col("Device") == "Mobile") & (col("Page") == "Checkout")
df_web.where(is_mobile_checkout).show()

# 5. Sessions > 60 seconds (strictly greater: a 60-second visit is excluded)
is_long_session = col("Duration") > 60
df_web.where(is_long_session).show()

# 6. Indian users on Products page
is_india_products = (col("Country") == "India") & (col("Page") == "Products")
df_web.where(is_india_products).show()

+------+--------+-------------------+--------+------+-------+-------------+
|UserID|    Page|          Timestamp|Duration|Device|Country|SessionMinute|
+------+--------+-------------------+--------+------+-------+-------------+
|     1|Checkout|2024-06-12 10:08:00|      60|Mobile|  India|            8|
+------+--------+-------------------+--------+------+-------+-------------+

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+------+--------+-------------------+--------+-------+-------+-------------+
|     2|Products|2024-06-12 10:02:00|     120|Desktop|    USA|            2|
|     5|Products|2024-06-12 10:20:00|      90|Desktop|  India|           20|
+------+--------+-------------------+--------+-------+-------+-------------+

+------+--------+-------------------+--------+-------+-------+-------------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|
+-

In [6]:
# NOTE: `max` below is the Spark column aggregate imported from
# pyspark.sql.functions, not Python's builtin of the same name.

# 7. Avg duration per device
(
    df_web
    .groupBy("Device")
    .agg(avg(col("Duration")).alias("AvgDuration"))
    .show()
)

# 8. Sessions per country (counts rows whose UserID is not null)
(
    df_web
    .groupBy("Country")
    .agg(count(col("UserID")).alias("SessionCount"))
    .show()
)

# 9. Max duration per page
(
    df_web
    .groupBy("Page")
    .agg(max(col("Duration")).alias("MaxDuration"))
    .show()
)

+-------+------------------+
| Device|       AvgDuration|
+-------+------------------+
| Mobile|36.666666666666664|
| Tablet|              45.0|
|Desktop| 78.33333333333333|
+-------+------------------+

+-------+------------+
|Country|SessionCount|
+-------+------------+
|  India|           3|
|    USA|           2|
|     UK|           1|
| Canada|           1|
+-------+------------+

+--------+-----------+
|    Page|MaxDuration|
+--------+-----------+
|    Cart|         45|
|    Home|         35|
|Products|        120|
|Checkout|         60|
| Contact|         25|
+--------+-----------+



In [7]:
# 10. Rank pages by timestamp per user
# PageRank is the visit order within each user (1 = that user's earliest row).
# Timestamp alone does not give a total order: if a user has two rows with the
# same timestamp, row_number() may number them differently from run to run.
# Page is added as a secondary sort key so the numbering is deterministic.
user_window = Window.partitionBy("UserID").orderBy("Timestamp", "Page")
df_web = df_web.withColumn("PageRank", row_number().over(user_window))
df_web.show()

# 11. Total duration per user
# `sum` here is the Spark aggregate imported from pyspark.sql.functions,
# not Python's builtin.
df_web.groupBy("UserID").agg(sum("Duration").alias("TotalDuration")).show()

+------+--------+-------------------+--------+-------+-------+-------------+--------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|PageRank|
+------+--------+-------------------+--------+-------+-------+-------------+--------+
|     1|    Home|2024-06-12 10:00:00|      35| Mobile|  India|            0|       1|
|     1|Checkout|2024-06-12 10:08:00|      60| Mobile|  India|            8|       2|
|     2|Products|2024-06-12 10:02:00|     120|Desktop|    USA|            2|       1|
|     2| Contact|2024-06-12 10:15:00|      25|Desktop|    USA|           15|       2|
|     3|    Cart|2024-06-12 10:05:00|      45| Tablet|     UK|            5|       1|
|     4|    Home|2024-06-12 10:10:00|      15| Mobile| Canada|           10|       1|
|     5|Products|2024-06-12 10:20:00|      90|Desktop|  India|           20|       1|
+------+--------+-------------------+--------+-------+-------+-------------+--------+

+------+-------------+
|UserID|TotalDuration|
+------

In [8]:
# 12. Create temporary view so the DataFrame can be queried with Spark SQL
df_web.createOrReplaceTempView("traffic_view")

# 13. Top 2 longest sessions
top_sessions_sql = "SELECT * FROM traffic_view ORDER BY Duration DESC LIMIT 2"
spark.sql(top_sessions_sql).show()

# 14. Unique users per page
unique_users_sql = "SELECT Page, COUNT(DISTINCT UserID) AS UniqueUsers FROM traffic_view GROUP BY Page"
spark.sql(unique_users_sql).show()

+------+--------+-------------------+--------+-------+-------+-------------+--------+
|UserID|    Page|          Timestamp|Duration| Device|Country|SessionMinute|PageRank|
+------+--------+-------------------+--------+-------+-------+-------------+--------+
|     2|Products|2024-06-12 10:02:00|     120|Desktop|    USA|            2|       1|
|     5|Products|2024-06-12 10:20:00|      90|Desktop|  India|           20|       1|
+------+--------+-------------------+--------+-------+-------+-------------+--------+

+--------+-----------+
|    Page|UniqueUsers|
+--------+-----------+
|    Cart|          1|
|    Home|          2|
|Checkout|          1|
|Products|          2|
| Contact|          1|
+--------+-----------+



In [11]:
# Both exports below write under /content/drive/MyDrive, which only points at
# Google Drive once Drive has been mounted. Mount it here so this cell works on
# a fresh top-to-bottom run instead of relying on a mount cell having been
# executed earlier by hand. If Drive is already mounted, Colab should simply
# report that and carry on.
from google.colab import drive
drive.mount('/content/drive')

# For CSV: overwrite any previous export and include a header row
df_web.write.mode("overwrite").option("header", True).csv("/content/drive/MyDrive/web_traffic_csv")

# For Parquet: overwrite any previous export, partitioned on disk by Country
df_web.write.mode("overwrite").partitionBy("Country").parquet("/content/drive/MyDrive/web_traffic_parquet")

In [10]:
# Mount Google Drive at /content/drive. The export cell writes under
# /content/drive/MyDrive, so this must have run before that cell.
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive
