<h2>Lab5: Попередня обробка даних в Spark MLlib</h2>

In [99]:
from pyspark.sql import SparkSession
import os

# Point PySpark at the local Spark installation. setdefault() keeps a SPARK_HOME
# that is already exported in the environment, so the notebook also runs on
# machines where Spark is installed elsewhere; the path below is only a fallback.
os.environ.setdefault('SPARK_HOME', "/home/zaranik/.sdkman/candidates/spark/current")

# Launcher settings for the driver and the Python executable PySpark should use.
os.environ['PYSPARK_DRIVER_PYTHON'] = 'jupyter'
os.environ['PYSPARK_DRIVER_PYTHON_OPTS'] = 'lab'
os.environ['PYSPARK_PYTHON'] = 'python3'

Створення Spark-сесії

In [100]:
# Reuse the active Spark session if one exists; otherwise start a new one
# registered under the application name "MLLib".
spark = (
    SparkSession.builder
    .appName("MLLib")
    .getOrCreate()
)

Задання схеми даних

In [101]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

# Explicit schema for the CSV columns, built field by field; every column is nullable.
schema = (
    StructType()
    .add("week_ending", StringType(), True)
    .add("week_number", IntegerType(), True)
    .add("weekly_gross_overall", IntegerType(), True)
    .add("show", StringType(), True)
    .add("theatre", StringType(), True)
    .add("weekly_gross", IntegerType(), True)
    # May contain the literal text "NA", so it is read as a string
    .add("potential_gross", StringType(), True)
    .add("avg_ticket_price", DoubleType(), True)
    # May contain the literal text "NA", so it is read as a string
    .add("top_ticket_price", StringType(), True)
    .add("seats_sold", IntegerType(), True)
    .add("seats_in_theatre", IntegerType(), True)
    .add("pct_capacity", DoubleType(), True)
    .add("performances", IntegerType(), True)
    .add("previews", IntegerType(), True)
)


Зчитування даних із файлу CSV

In [102]:
# Load the CSV with the explicit schema (the first line is a header row) and
# replace missing 'weekly_gross' values with zero, then preview the result.
df = (
    spark.read
    .csv("./data/grosses.csv", header=True, schema=schema)
    .na.fill({"weekly_gross": 0.0})
)
df.show()

+-----------+-----------+--------------------+--------------------+--------------------+------------+---------------+----------------+----------------+----------+----------------+------------+------------+--------+
|week_ending|week_number|weekly_gross_overall|                show|             theatre|weekly_gross|potential_gross|avg_ticket_price|top_ticket_price|seats_sold|seats_in_theatre|pct_capacity|performances|previews|
+-----------+-----------+--------------------+--------------------+--------------------+------------+---------------+----------------+----------------+----------+----------------+------------+------------+--------+
| 1985-06-09|          1|             3915937|         42nd Street|   St. James Theatre|      282368|             NA|           30.42|              NA|      9281|            1655|       0.701|           8|       0|
| 1985-06-09|          1|             3915937|       A Chorus Line|Sam S. Shubert Th...|      222584|             NA|           27.25|      

Завдання 1

In [103]:
from pyspark.sql import functions as SF

# Find the minimum and maximum of 'weekly_gross' in a single aggregation,
# so the data is scanned once instead of running one Spark job per statistic.
min_d, max_d = df.agg(SF.min("weekly_gross"), SF.max("weekly_gross")).first()

print("min_d = ", min_d)
print("max_d = ", max_d)

# Number of border values; N borders delimit N - 1 equal-width intervals.
borders_count = 10

# Equal-width border values from min_d to max_d.
# The last border is set to max_d itself rather than recomputed as
# min_d + step * (borders_count - 1): floating-point rounding could leave the
# recomputed value slightly below max_d, and the Bucketizer would then treat
# the maximum as out of bounds.
step = (max_d - min_d) / (borders_count - 1)
borders = [min_d + step * i for i in range(borders_count - 1)] + [float(max_d)]
print(borders)


0
4041493
[0.0, 449054.77777777775, 898109.5555555555, 1347164.3333333333, 1796219.111111111, 2245273.888888889, 2694328.6666666665, 3143383.444444444, 3592438.222222222, 4041493.0]


In [104]:

from pyspark.ml.feature import Bucketizer, QuantileDiscretizer

# Equal-width grouping: map each 'weekly_gross' value to the interval
# delimited by the precomputed border values.
bucketer = (
    Bucketizer()
    .setSplits(borders)
    .setInputCol("weekly_gross")
    .setOutputCol("weekly_gross_bucket")
)
bucketed_data = bucketer.transform(df)

# Quantile-based grouping: the discretizer is fitted on the data and then
# applied on top of the already bucketed frame, so both columns appear together.
quantizer = (
    QuantileDiscretizer()
    .setNumBuckets(borders_count)
    .setInputCol("weekly_gross")
    .setOutputCol("quantized_weekly_gross")
)
quantizer_model = quantizer.fit(df)
quantized_data = quantizer_model.transform(bucketed_data)

# Preview both grouping columns
quantized_data.show()

+-----------+-----------+--------------------+--------------------+--------------------+------------+---------------+----------------+----------------+----------+----------------+------------+------------+--------+-------------------+----------------------+
|week_ending|week_number|weekly_gross_overall|                show|             theatre|weekly_gross|potential_gross|avg_ticket_price|top_ticket_price|seats_sold|seats_in_theatre|pct_capacity|performances|previews|weekly_gross_bucket|quantized_weekly_gross|
+-----------+-----------+--------------------+--------------------+--------------------+------------+---------------+----------------+----------------+----------+----------------+------------+------------+--------+-------------------+----------------------+
| 1985-06-09|          1|             3915937|         42nd Street|   St. James Theatre|      282368|             NA|           30.42|              NA|      9281|            1655|       0.701|           8|       0|            

Завдання 2

In [105]:
from pyspark.ml.feature import VectorAssembler, MinMaxScaler

# Step 1: MinMaxScaler works on a vector column, so wrap 'weekly_gross'
# into a one-element vector first.
assembler = (
    VectorAssembler()
    .setInputCols(["weekly_gross"])
    .setOutputCol("vector_data")
)
vector_data = assembler.transform(df)

# Step 2: Fit the scaler on the assembled vectors, then rescale them.
scaler = (
    MinMaxScaler()
    .setInputCol("vector_data")
    .setOutputCol("scaled_features")
)
scaler_model = scaler.fit(vector_data)
scaled_data = scaler_model.transform(vector_data)

# Preview the scaled values
scaled_data.show()


+-----------+-----------+--------------------+--------------------+--------------------+------------+---------------+----------------+----------------+----------+----------------+------------+------------+--------+-----------+--------------------+
|week_ending|week_number|weekly_gross_overall|                show|             theatre|weekly_gross|potential_gross|avg_ticket_price|top_ticket_price|seats_sold|seats_in_theatre|pct_capacity|performances|previews|vector_data|     scaled_features|
+-----------+-----------+--------------------+--------------------+--------------------+------------+---------------+----------------+----------------+----------+----------------+------------+------------+--------+-----------+--------------------+
| 1985-06-09|          1|             3915937|         42nd Street|   St. James Theatre|      282368|             NA|           30.42|              NA|      9281|            1655|       0.701|           8|       0| [282368.0]|[0.06986724955356...|
| 1985-0

Завдання 3

In [106]:
from pyspark.ml.feature import StringIndexer

# Fit a StringIndexer on 'weekly_gross' and add the resulting index
# as the 'weekly_gross_index' column.
indexer = (
    StringIndexer()
    .setInputCol("weekly_gross")
    .setOutputCol("weekly_gross_index")
)
indexer_model = indexer.fit(df)
indexed_data = indexer_model.transform(df)

# Preview the indexed data
indexed_data.show()

+-----------+-----------+--------------------+--------------------+--------------------+------------+---------------+----------------+----------------+----------+----------------+------------+------------+--------+------------------+
|week_ending|week_number|weekly_gross_overall|                show|             theatre|weekly_gross|potential_gross|avg_ticket_price|top_ticket_price|seats_sold|seats_in_theatre|pct_capacity|performances|previews|weekly_gross_index|
+-----------+-----------+--------------------+--------------------+--------------------+------------+---------------+----------------+----------------+----------+----------------+------------+------------+--------+------------------+
| 1985-06-09|          1|             3915937|         42nd Street|   St. James Theatre|      282368|             NA|           30.42|              NA|      9281|            1655|       0.701|           8|       0|           15393.0|
| 1985-06-09|          1|             3915937|       A Chorus Li

24/12/16 11:32:01 WARN DAGScheduler: Broadcasting large task binary with size 1157.0 KiB


Завдання 4

In [107]:
from pyspark.sql.functions import col
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer

# Step 1: Add a string copy of 'weekly_gross' for the tokenizer.
# The result gets its own name instead of overwriting `df`, so the frame used
# by the earlier cells keeps its original columns and this cell can be re-run.
df_with_str = df.withColumn("weekly_gross_str", col("weekly_gross").cast("string"))

# Step 2: Tokenize the 'weekly_gross_str' column
tokens = Tokenizer().setInputCol("weekly_gross_str").setOutputCol("weekly_gross_tokens")
tokenized_data = tokens.transform(df_with_str)

# Step 3: Remove English stop-words from the token lists
english_stop_words = StopWordsRemover.loadDefaultStopWords("english")
stops = StopWordsRemover().setStopWords(english_stop_words) \
                          .setInputCol("weekly_gross_tokens") \
                          .setOutputCol("filtered_weekly_gross_tokens")
filtered_data = stops.transform(tokenized_data)

# Step 4: Apply CountVectorizer with the vocabulary size set to 500
cv = CountVectorizer().setInputCol("filtered_weekly_gross_tokens") \
                      .setOutputCol("count_vector").setVocabSize(500)
fitted_cv = cv.fit(filtered_data)
count_vectorized_data = fitted_cv.transform(filtered_data)

# Step 5: Display a small, untruncated preview. Printing thousands of rows
# bloats the notebook file and buries the rest of the output.
count_vectorized_data.show(20, truncate=False)



+-----------+-----------+--------------------+----------------------------------------------------------------+----------------------------+------------+---------------+----------------+----------------+----------+----------------+------------+------------+--------+----------------+-------------------+----------------------------+-----------------+
|week_ending|week_number|weekly_gross_overall|show                                                            |theatre                     |weekly_gross|potential_gross|avg_ticket_price|top_ticket_price|seats_sold|seats_in_theatre|pct_capacity|performances|previews|weekly_gross_str|weekly_gross_tokens|filtered_weekly_gross_tokens|count_vector     |
+-----------+-----------+--------------------+----------------------------------------------------------------+----------------------------+------------+---------------+----------------+----------------+----------+----------------+------------+------------+--------+----------------+---------------

In [108]:
# Stop the Spark session now that all tasks have been run.
spark.stop()