<a href="https://colab.research.google.com/github/IlyaDenisov88/dataenj/blob/main/PySpark/PySpark_Itog.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Генерация

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=b3d671322e835647209682ab617e60ff9268b14cbf9a90e5a59e43dadf75c24b
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3


In [2]:
!pip install faker

Collecting faker
  Downloading Faker-29.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-29.0.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-29.0.0


In [3]:
import csv
from faker import Faker
import random

# Fixed seed for both random sources used below, so repeated runs draw the
# same sequence of values.
SEED = 42

fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

num_records = 100000

http_methods = ['GET', 'POST', 'PUT', 'DELETE']
response_codes = [200, 301, 404, 500]

file_path = "web_server_logs.csv"

# newline='' leaves line-ending handling to the csv module; the explicit
# encoding keeps the file independent of the platform default.
with open(file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header row: these names become the column names when the file is read back.
    writer.writerow(['ip', 'timestamp', 'method', 'url', 'response_code', 'response_size'])

    for _ in range(num_records):
        ip = fake.ipv4()
        # NOTE(review): date_time_this_year() presumably depends on the current
        # date, so timestamps may still differ between runs on different days.
        timestamp = fake.date_time_this_year().isoformat()
        method = random.choice(http_methods)
        url = fake.uri_path()
        response_code = random.choice(response_codes)
        response_size = random.randint(100, 10000)

        writer.writerow([ip, timestamp, method, url, response_code, response_size])

print(f"Сгенерировано {num_records} записей и сохранено в {file_path}")

Сгенерировано 100000 записей и сохранено в web_server_logs.csv


# Spark

In [103]:
from pyspark.sql import SparkSession

In [104]:
# Obtain the Spark session for this notebook (getOrCreate reuses an existing
# session when one is already active).
spark = (
    SparkSession.builder
    .appName("Final_tasks")
    .getOrCreate()
)

In [105]:
# Read the CSV from the same relative path the generator cell writes to, so the
# load does not depend on the working directory being /content.
# No schema is supplied, so every column is read as a string.
df = spark.read.option("header", "true").csv("web_server_logs.csv")
df.dtypes

[('ip', 'string'),
 ('timestamp', 'string'),
 ('method', 'string'),
 ('url', 'string'),
 ('response_code', 'string'),
 ('response_size', 'string')]

In [106]:
from pyspark.sql.types import StringType, IntegerType, DateType

In [107]:
from pyspark.sql.functions import count, col, to_date

In [108]:
# Target type for each column that should not stay a string.
data_types = {
    "timestamp": DateType(),
    "response_code": IntegerType(),
    "response_size": IntegerType()
}
# Replace each listed column with its cast version in a single projection.
df = df.withColumns(
    {name: col(name).cast(target) for name, target in data_types.items()}
)

In [109]:
# Confirm that the casts above changed the column types.
df.dtypes

[('ip', 'string'),
 ('timestamp', 'date'),
 ('method', 'string'),
 ('url', 'string'),
 ('response_code', 'int'),
 ('response_size', 'int')]

In [110]:
# Preview the first rows of the typed DataFrame.
df.show()

+---------------+----------+------+--------------------+-------------+-------------+
|             ip| timestamp|method|                 url|response_code|response_size|
+---------------+----------+------+--------------------+-------------+-------------+
|    170.6.26.63|2024-06-04|DELETE|     wp-content/main|          301|         9625|
|170.209.183.250|2024-05-16|   GET|      posts/tag/tags|          500|         1126|
|  99.100.26.231|2024-07-03|DELETE|                main|          301|         7123|
| 193.216.128.75|2024-01-27|   GET|             explore|          301|         8541|
|    82.69.71.59|2024-08-04|   GET|                 tag|          404|         2772|
|  205.95.224.27|2024-05-30|  POST|wp-content/search...|          404|         6646|
|    24.74.81.85|2024-04-11|  POST|          categories|          301|         3238|
| 135.240.26.179|2024-05-23|DELETE|                 tag|          404|         5905|
| 178.220.17.153|2024-07-01|DELETE|  posts/category/app|         

1. Сгруппируйте данные по IP и посчитайте количество запросов для каждого IP; выведите 10 самых активных IP. Формат вывода — как на скриншоте ниже.

2. Сгруппируйте данные по HTTP-методу и посчитайте количество запросов для каждого метода.

3. Отфильтруйте данные и посчитайте количество запросов с кодом ответа 404.

4. Сгруппируйте данные по дате и просуммируйте размер ответов; результат отсортируйте по дате.



In [111]:
print("Top 10 active IP addresses: ")
# Number of requests per IP address.
requests_per_ip = df.groupBy("ip").agg(count("*").alias("request_count"))
# Highest request counts first; keep only the ten leading rows.
requests_per_ip.orderBy("request_count", ascending=False).limit(10).show()

Top 10 active IP addresses: 
+--------------+-------------+
|            ip|request_count|
+--------------+-------------+
| 198.15.176.55|            2|
|  99.18.129.10|            2|
| 19.173.74.240|            2|
|  122.65.43.57|            1|
| 72.166.75.216|            1|
|  107.58.77.42|            1|
|46.106.225.165|            1|
| 217.10.165.75|            1|
|79.146.119.219|            1|
|154.158.41.106|            1|
+--------------+-------------+



In [112]:
print("Request count by HTTP method: ")
# Number of requests per HTTP method, most frequent method first.
requests_per_method = df.groupBy("method").agg(count("*").alias("count_method"))
requests_per_method.orderBy("count_method", ascending=False).show()

Request count by HTTP method: 
+------+------------+
|method|count_method|
+------+------------+
|   PUT|       25249|
|   GET|       25104|
|DELETE|       24851|
|  POST|       24796|
+------+------------+



In [113]:
# Keep only the rows whose response code is 404 and count them.
not_found_total = df.where(col("response_code") == 404).count()
print("Total number of 404 response codes:", not_found_total)

Total number of 404 response codes: 25024


In [114]:
from pyspark.sql.functions import sum

In [115]:
print("Total response size by day: ")
(
    # The `timestamp` column holds dates after the cast, so expose it as `date`.
    df.withColumnRenamed("timestamp", "date")
    .groupBy("date")
    # `sum` is pyspark.sql.functions.sum imported in the previous cell, not the builtin.
    .agg(sum("response_size").alias("total_response_size"))
    .orderBy("date")
    .show()
)

Total response size by day: 
+----------+-------------------+
|      date|tolal_response_size|
+----------+-------------------+
|2024-01-01|            1819089|
|2024-01-02|            1914425|
|2024-01-03|            1806684|
|2024-01-04|            2058286|
|2024-01-05|            1827053|
|2024-01-06|            1900953|
|2024-01-07|            1990190|
|2024-01-08|            1908416|
|2024-01-09|            2080075|
|2024-01-10|            2030764|
|2024-01-11|            1993260|
|2024-01-12|            1900572|
|2024-01-13|            1852664|
|2024-01-14|            1791681|
|2024-01-15|            1853773|
|2024-01-16|            1864168|
|2024-01-17|            1894158|
|2024-01-18|            1710363|
|2024-01-19|            1866185|
|2024-01-20|            2053273|
+----------+-------------------+
only showing top 20 rows



In [116]:
# Shut down the Spark session now that all tasks have been computed.
spark.stop()