In [None]:
# Установка PySpark в Google Colab
!pip install pyspark




In [None]:

# Загрузка файла posts_sample.xml
!wget https://git.ai.ssau.ru/tk/big_data/raw/branch/master/data/posts_sample.xml

# Загрузка файла programming-languages.csv
!wget https://git.ai.ssau.ru/tk/big_data/raw/branch/master/data/programming-languages.csv


--2025-04-09 09:45:23--  https://git.ai.ssau.ru/tk/big_data/raw/branch/master/data/posts_sample.xml
Resolving git.ai.ssau.ru (git.ai.ssau.ru)... 91.222.131.161
Connecting to git.ai.ssau.ru (git.ai.ssau.ru)|91.222.131.161|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 74162295 (71M) [text/plain]
Saving to: ‘posts_sample.xml’


2025-04-09 09:48:42 (367 KB/s) - ‘posts_sample.xml’ saved [74162295/74162295]

--2025-04-09 09:48:42--  https://git.ai.ssau.ru/tk/big_data/raw/branch/master/data/programming-languages.csv
Resolving git.ai.ssau.ru (git.ai.ssau.ru)... 91.222.131.161
Connecting to git.ai.ssau.ru (git.ai.ssau.ru)|91.222.131.161|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 40269 (39K) [text/plain]
Saving to: ‘programming-languages.csv’


2025-04-09 09:48:43 (128 KB/s) - ‘programming-languages.csv’ saved [40269/40269]



In [None]:
import os
import sys
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Environment for PySpark: make both Python-interpreter variables point at the
# interpreter running this kernel, and pass the spark-xml package coordinates
# through the submit arguments. These are set before the session is created.
for env_name in ('PYSPARK_PYTHON', 'PYSPARK_DRIVER_PYTHON'):
    os.environ[env_name] = sys.executable
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.12:0.17.0 pyspark-shell'

# Create (or reuse) the SparkSession for this notebook.
spark = (
    SparkSession.builder
    .appName("Top Programming Languages 2010-2020 by Year")
    .getOrCreate()
)

In [None]:
# 1. Read the posts XML file and keep only rows with _PostTypeId == 1
#    (the rows this notebook counts as questions). Every <row> element of the
#    file becomes one record.
questions_df = (
    spark.read
    .format('xml')
    .option('rowTag', 'row')
    .option("timestampFormat", 'y/M/d H:m:s')
    .load('posts_sample.xml')
    .filter(F.col("_PostTypeId") == 1)
)

# Derive the year first and filter on it. Comparing _CreationDate against the
# bare date "2020-12-31" treats the upper bound as the very start of that day,
# which silently drops every post created later on 31 December 2020.
posts_df = (
    questions_df
    .withColumn("Year", F.year(F.col("_CreationDate")))
    .filter(F.col("Year").between(2010, 2020))
)
print("Первые 10 строк из posts_df :")
posts_df.show(10, truncate=False)
print("Общее количество вопросов в posts_df:", posts_df.count())

Первые 10 строк из posts_df :
+-----------------+------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
# 2. Load the CSV file with the list of programming languages
#    (first line is the header; column types are inferred).
languages_raw_df = spark.read.csv(
    'programming-languages.csv',
    header=True,
    inferSchema=True,
)

# Discard rows that have a missing value in any column, then keep just the
# lower-cased language name under the column name "language".
languages_df = languages_raw_df.dropna().select(
    F.lower(F.col("name")).alias("language")
)
print("Первые 10 строк из languages_df:")
languages_df.show(10, truncate=False)

Первые 10 строк из languages_df:
+----------+
|language  |
+----------+
|a# .net   |
|a# (axiom)|
|a-0 system|
|a+        |
|a++       |
|abap      |
|abc       |
|abc algol |
|abset     |
|absys     |
+----------+
only showing top 10 rows



In [None]:
# 3. Register both DataFrames as temporary views so the SQL query can refer
#    to them by table name.
for view_name, frame in (("posts", posts_df), ("languages", languages_df)):
    frame.createOrReplaceTempView(view_name)

In [None]:
# 4. SQL query: count language mentions per year and keep the top 10 of each year.
#
# - A language counts as mentioned when the lower-cased _Tags value contains
#   the literal token "<language>". INSTR performs a plain substring search,
#   so characters such as "%" or "_" inside a language name are matched
#   literally instead of acting as pattern wildcards.
# - ROW_NUMBER and the final sort use the language name as a secondary key so
#   that ties in the mention count are resolved the same way on every run;
#   without it, which of the tied languages lands in the top 10 is arbitrary.
query = """
WITH LanguageMentions AS (
    SELECT
        p.Year,
        l.language,
        COUNT(*) as mentions
    FROM posts p
    CROSS JOIN languages l
    WHERE INSTR(LOWER(p._Tags), CONCAT('<', l.language, '>')) > 0
    GROUP BY p.Year, l.language
),
RankedLanguages AS (
    SELECT
        Year,
        language,
        mentions,
        ROW_NUMBER() OVER (PARTITION BY Year ORDER BY mentions DESC, language) as rank
    FROM LanguageMentions
)
SELECT Year, language, mentions
FROM RankedLanguages
WHERE rank <= 10
ORDER BY Year, mentions DESC, language
"""

In [None]:
# 5. Run the ranking query and print its result.
#    Up to 110 rows are shown: at most 10 languages for each of the
#    11 years 2010-2020.
result_df = spark.sql(query)
print("Топ-10 языков программирования для каждого года (2010–2020):")
result_df.show(n=110, truncate=False)

Топ-10 языков программирования для каждого года (2010–2020):
+----+-----------+--------+
|Year|language   |mentions|
+----+-----------+--------+
|2010|java       |52      |
|2010|php        |46      |
|2010|javascript |44      |
|2010|python     |26      |
|2010|objective-c|23      |
|2010|c          |20      |
|2010|ruby       |12      |
|2010|delphi     |8       |
|2010|applescript|3       |
|2010|r          |3       |
|2011|php        |102     |
|2011|java       |93      |
|2011|javascript |83      |
|2011|python     |37      |
|2011|objective-c|34      |
|2011|c          |24      |
|2011|ruby       |20      |
|2011|perl       |9       |
|2011|delphi     |8       |
|2011|bash       |7       |
|2012|php        |154     |
|2012|javascript |132     |
|2012|java       |124     |
|2012|python     |69      |
|2012|objective-c|45      |
|2012|ruby       |27      |
|2012|c          |27      |
|2012|bash       |10      |
|2012|r          |9       |
|2012|scala      |6       |
|2013|php      

In [None]:
import shutil

# 6. Write the result as Parquet, replacing any output left by a previous run.
result_df.write.parquet("top_10_languages_by_year2.parquet", mode="overwrite")
print("Результат сохранён в top_10_languages_by_year2.parquet")

# Pack the Parquet output directory into a single ZIP archive.
shutil.make_archive(
    base_name="top_10_languages_by_year2",
    format='zip',
    root_dir="top_10_languages_by_year2.parquet",
)
print("Parquet-директория упакована в top_10_languages_by_year2.zip")

# 7. Stop the SparkSession; the result is already on disk.
spark.stop()

# Hand the ZIP archive to the browser for download (Google Colab only).
from google.colab import files
files.download('top_10_languages_by_year2.zip')

Результат сохранён в top_10_languages_by_year2.parquet
Parquet-директория упакована в top_10_languages_by_year2.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>