In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, hour, dayofweek, when, unix_timestamp
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.regression import LinearRegression
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator, RegressionEvaluator, ClusteringEvaluator

In [2]:
spark = SparkSession.builder \
    .appName("MiBiciAnalysis") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()

df = spark.read.csv("mibici_trips.csv", header=True, inferSchema=True)

25/06/25 05:23:06 WARN Utils: Your hostname, Molphie resolves to a loopback address: 127.0.1.1; using 192.168.6.223 instead (on interface enp4s0)
25/06/25 05:23:06 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/25 05:23:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
data = df.withColumn(
    "duration_min",
    (unix_timestamp("Trip_end") - unix_timestamp("Trip_start")) / 60
).withColumn(
    "hour", hour("Trip_start")
).withColumn(
    "dayofweek", dayofweek("Trip_start")
).withColumn(
    "long_trip", when(col("duration_min") > 20, 1).otherwise(0)
)

In [4]:
sex_indexer = StringIndexer(inputCol="Sex", outputCol="Sex_idx", handleInvalid="keep")
data = sex_indexer.fit(data).transform(data)

# Заполнение пропусков в возрасте медианой
if data.filter(col("Age").isNull()).count() > 0:
    median_age = data.approxQuantile("Age", [0.5], 0.01)[0]
    data = data.fillna({"Age": median_age})

                                                                                

# Классификация
предсказать, будет ли поездка длинной (> 20 мин)

In [5]:
feature_cols = ["hour", "dayofweek", "Sex_idx", "Age"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
rf = RandomForestClassifier(labelCol="long_trip", featuresCol="features", seed=42)
pipeline = Pipeline(stages=[assembler, rf])

train, test = data.randomSplit([0.7, 0.3], seed=42)
model = pipeline.fit(train)
pred = model.transform(test)

acc = MulticlassClassificationEvaluator(labelCol="long_trip", metricName="accuracy").evaluate(pred)
auc = BinaryClassificationEvaluator(labelCol="long_trip").evaluate(pred)
print(f"Классификация\nAccuracy: {acc:.3f}\nAUC: {auc:.3f}")

25/06/25 05:23:34 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Trip_Id, User_Id, Sex, Birth_year, Trip_start, Trip_end, Origin_Id, Destination_Id, Age, Duration
 Schema: _c0, Trip_Id, User_Id, Sex, Birth_year, Trip_start, Trip_end, Origin_Id, Destination_Id, Age, Duration
Expected: _c0 but found: 
CSV file: file:///home/evgeniy/HSE/MLBD/Lab3/mibici_trips.csv
25/06/25 05:23:52 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Trip_Id, User_Id, Sex, Birth_year, Trip_start, Trip_end, Origin_Id, Destination_Id, Age, Duration
 Schema: _c0, Trip_Id, User_Id, Sex, Birth_year, Trip_start, Trip_end, Origin_Id, Destination_Id, Age, Duration
Expected: _c0 but found: 
CSV file: file:///home/evgeniy/HSE/MLBD/Lab3/mibici_trips.csv
25/06/25 05:23:55 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Trip_Id, User_Id, Sex, Birth_year, Trip_start, Trip_end, Origin_Id, Destination_Id, Age, Duration
 Schema: _c0, Trip_Id

Классификация
Accuracy: 0.892
AUC: 0.500


                                                                                

# Регрессия длительности поездки

In [6]:
reg_assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
reg = LinearRegression(featuresCol="features", labelCol="duration_min")
pipeline_reg = Pipeline(stages=[reg_assembler, reg])

train_reg, test_reg = data.randomSplit([0.7, 0.3], seed=43)
model_reg = pipeline_reg.fit(train_reg)
pred_reg = model_reg.transform(test_reg)

rmse = RegressionEvaluator(labelCol="duration_min", predictionCol="prediction", metricName="rmse").evaluate(pred_reg)
r2 = RegressionEvaluator(labelCol="duration_min", predictionCol="prediction", metricName="r2").evaluate(pred_reg)
print(f"\nРегрессия\nRMSE: {rmse:.3f}\nR²: {r2:.3f}")

25/06/25 05:26:38 WARN Instrumentation: [f0bb0db5] regParam is zero, which might cause numerical instability and overfitting.
25/06/25 05:26:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Trip_Id, User_Id, Sex, Birth_year, Trip_start, Trip_end, Origin_Id, Destination_Id, Age, Duration
 Schema: _c0, Trip_Id, User_Id, Sex, Birth_year, Trip_start, Trip_end, Origin_Id, Destination_Id, Age, Duration
Expected: _c0 but found: 
CSV file: file:///home/evgeniy/HSE/MLBD/Lab3/mibici_trips.csv
25/06/25 05:27:00 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , Trip_Id, User_Id, Sex, Birth_year, Trip_start, Trip_end, Origin_Id, Destination_Id, Age, Duration
 Schema: _c0, Trip_Id, User_Id, Sex, Birth_year, Trip_start, Trip_end, Origin_Id, Destination_Id, Age, Duration
Expected: _c0 but found: 
CSV file: file:///home/evgeniy/HSE/MLBD/Lab3/mibici_trips.csv
25/06/25 05:27:22 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Hea


Регрессия
RMSE: 963.574
R²: 0.000


                                                                                

# Кластеризация

In [7]:
clust_assembler = VectorAssembler(inputCols=["hour", "dayofweek", "duration_min", "Age"], outputCol="features")
clust_df = clust_assembler.transform(data)

kmeans = KMeans(featuresCol="features", k=5, seed=42)
kmeans_model = kmeans.fit(clust_df)
pred_clust = kmeans_model.transform(clust_df)
pred_clust.groupBy("prediction").count().show()

silhouette = ClusteringEvaluator(metricName="silhouette", featuresCol="features", predictionCol="prediction").evaluate(pred_clust)
cost = kmeans_model.summary.trainingCost

print(f"\nКластеризация\nSilhouette: {silhouette:.3f}\nWSSSE: {cost:.2f}")


25/06/25 05:28:49 WARN MemoryStore: Not enough space to cache rdd_188_16 in memory! (computed 96.3 MiB so far)
25/06/25 05:28:49 WARN BlockManager: Persisting block rdd_188_16 to disk instead.
25/06/25 05:28:49 WARN MemoryStore: Not enough space to cache rdd_188_17 in memory! (computed 96.3 MiB so far)
25/06/25 05:28:49 WARN BlockManager: Persisting block rdd_188_17 to disk instead.
25/06/25 05:28:49 WARN MemoryStore: Not enough space to cache rdd_188_12 in memory! (computed 96.3 MiB so far)
25/06/25 05:28:49 WARN BlockManager: Persisting block rdd_188_12 to disk instead.
25/06/25 05:28:49 WARN MemoryStore: Not enough space to cache rdd_188_15 in memory! (computed 96.3 MiB so far)
25/06/25 05:28:49 WARN BlockManager: Persisting block rdd_188_15 to disk instead.
25/06/25 05:28:49 WARN MemoryStore: Not enough space to cache rdd_188_14 in memory! (computed 96.3 MiB so far)
25/06/25 05:28:49 WARN BlockManager: Persisting block rdd_188_14 to disk instead.
25/06/25 05:28:50 WARN MemoryStore:

+----------+--------+
|prediction|   count|
+----------+--------+
|         2|     649|
|         0|25862409|
|         3|     309|
|         4|      92|
|         1|     231|
+----------+--------+






Кластеризация
Silhouette: 1.000
WSSSE: 805113861995.76


                                                                                