<a href="https://colab.research.google.com/github/MitulovValentin/Colab/blob/main/Analize_logs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=f7a1eb3d5e71c6e07c5917298ca6a3083e1e81926704599792d2ac005699dc59
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [4]:
!pip install faker

Collecting faker
  Downloading Faker-28.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-28.0.0-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-28.0.0


In [5]:
import csv
from faker import Faker
import random

fake = Faker()

num_records = 100000

http_methods = ['GET', 'POST', 'PUT', 'DELETE']
response_codes = [200, 301, 404, 500]

file_path = "web_server_logs.csv"

with open(file_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['ip', 'timestamp', 'method', 'url', 'response_code', 'response_size'])

    for _ in range(num_records):
        ip = fake.ipv4()
        timestamp = fake.date_time_this_year().isoformat()
        method = random.choice(http_methods)
        url = fake.uri_path()
        response_code = random.choice(response_codes)
        response_size = random.randint(100, 10000)

        writer.writerow([ip, timestamp, method, url, response_code, response_size])

print(f"Сгенерировано {num_records} записей и сохранено в {file_path}")


Сгенерировано 100000 записей и сохранено в web_server_logs.csv


In [51]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, date_format


spark = SparkSession.builder.appName("Web server logs").getOrCreate()

df = spark.read.csv("web_server_logs.csv", header=True, inferSchema=True)



+---------------+--------------------+------+--------------------+-------------+-------------+
|             ip|           timestamp|method|                 url|response_code|response_size|
+---------------+--------------------+------+--------------------+-------------+-------------+
|     204.7.23.6|2024-04-21 07:01:...|   PUT|                 app|          404|         7984|
| 25.127.151.238|2024-02-19 12:29:...|  POST|     search/blog/tag|          200|         8630|
|  37.176.13.192|2024-06-10 05:09:...|DELETE|                 app|          200|          886|
|  136.215.14.26|2024-03-29 08:25:...|   PUT|             explore|          500|         9747|
| 105.220.185.22|2024-04-21 14:42:...|   GET|   list/app/category|          301|         6666|
| 212.196.55.179|2024-04-10 13:20:...|   PUT|           tags/main|          500|         7344|
|   88.201.38.30|2024-04-08 09:07:...|  POST|main/category/search|          500|         5124|
|   12.224.108.8|2024-04-03 15:30:...|  POST|     

In [64]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, date_format


spark = SparkSession.builder.appName("Web server logs").getOrCreate()

df = spark.read.csv("web_server_logs.csv", header=True, inferSchema=True)

#1: Top 10 active IP adresses:
df_group_ip = df.groupBy("ip").agg({"response_code":"count"})
select_ip = df_group_ip.select("ip", col("count(response_code)").alias("request_count")) \
    .orderBy(col("count(response_code)").desc()) \
    .limit(10)

#2 Request count by HTTP method:
df_group_methods = df.groupBy("method").agg({"method":"count"})
select_methods = df_group_methods.select("method", col("count(method)").alias("method_count"))


#3 Count request with response code 404:
count_request_404 = df.filter(df.response_code == 404).count()

#4 Total response size by day:
df_group_day = df.groupBy(date_format("timestamp", "yyy-MM-dd").alias("date")).agg({"response_size":"sum"})


select_day = df_group_day.select(
      "date",
      col("sum(response_size)").alias("total_response_size")
      ) \
    .orderBy("date")

print("Top 10 active IP adresses:")
select_ip.show()
print("Request count by HTTP method:")
select_methods.show()
print(f"Number of 404 response code: {count_request_404}")
print("Total response size by day:")
select_day.show()

Top 10 active IP adresses:
+---------------+-------------+
|             ip|request_count|
+---------------+-------------+
| 161.158.116.15|            2|
|  2.145.118.227|            1|
|215.204.119.143|            1|
|    2.178.5.247|            1|
|  52.231.202.42|            1|
| 96.105.246.194|            1|
|  90.144.251.28|            1|
|  19.83.177.190|            1|
|   49.11.124.91|            1|
|  70.211.174.64|            1|
+---------------+-------------+

Request count by HTTP method:
+------+------------+
|method|method_count|
+------+------------+
|  POST|       24917|
|DELETE|       25143|
|   PUT|       25048|
|   GET|       24892|
+------+------------+

Number of 404 response code: 24864
Total response size by day:
+----------+-------------------+
|      date|total_response_size|
+----------+-------------------+
|2024-01-01|            2065757|
|2024-01-02|            1906854|
|2024-01-03|            2154029|
|2024-01-04|            2254936|
|2024-01-05|           