In [1]:
!pip --version
!pip install pyspark py4j

pip 24.1.2 from /usr/local/lib/python3.11/dist-packages/pip (python 3.11)


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, to_date, sum as _sum

spark = SparkSession.builder.appName('MyApp')\
    .config('spark.master', 'local[*]') \
    .getOrCreate()

df_serv_logs = spark.read.csv('/content/sample_data/web_server_logs.csv', header=True, inferSchema=True)

#количество запросов для каждого IP, top 10 самых активных IP.

df_top_ten_ip = df_serv_logs.groupBy('ip')\
  .agg(count('method').alias('request_count'))\
  .orderBy(col('request_count').desc())\
  .limit(10)

print('Top 10 active IP addresses:')
df_top_ten_ip.show()

#количество запросов для каждого метода.
df_counter_requests = df_serv_logs.groupBy('method') \
    .agg(count('method').alias('method_count')) \
    .orderBy('method_count')


print('Request count by HTTP method:')
df_counter_requests.show()

# Фильтрация, группировка и подсчёт запросов с кодом ответа 404

df_request_404 = df_serv_logs.filter(col('response_code') == '404') \
      .groupBy('response_code')\
      .agg(count('response_code').alias('count_404'))\
      .collect()[0]['count_404']

print(f'Number of 404 response codes: {df_request_404}')

# Группировка данных по дате, суммирование размеров ответов и сортировка по дате

df_group_date = df_serv_logs.select(to_date(col('timestamp')).alias('date'), 'response_size') \
    .groupBy('date') \
    .agg(_sum('response_size').alias('total_response_size')) \
    .orderBy('date')

print(f'Total response size by day:')
df_group_date.show()

Top 10 active IP addresses:
+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
| 97.112.121.128|            1|
| 80.217.143.120|            1|
| 216.127.142.11|            1|
|   168.92.46.63|            1|
| 85.124.111.115|            1|
|  7.182.110.213|            1|
|   77.37.152.79|            1|
| 165.148.226.15|            1|
|   216.61.45.89|            1|
|119.213.102.240|            1|
+---------------+-------------+

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       18550|
|   GET|       18778|
|   PUT|       18824|
|DELETE|       19299|
+------+------------+

Number of 404 response codes: 18984
Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2025-01-01|           15484129|
|2025-01-02|           15649464|
|2025-01-03|           15787867|
|2025-01-04|           15335916|
|2025-01-05|         