In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace, col, when
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator, RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import RandomForestClassifier

# Create (or reuse) the Spark session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("YouTubeChannelsAnalysis")
    .getOrCreate()
)

25/06/10 00:27:37 WARN Utils: Your hostname, Molphie resolves to a loopback address: 127.0.1.1; using 192.168.6.223 instead (on interface enp4s0)
25/06/10 00:27:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/10 00:27:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/06/10 00:27:38 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/06/10 00:27:38 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
# Load the channel table: the first row is the header and Spark infers
# the column types from the data.
df = spark.read.csv(
    "youtube_channels.csv",
    inferSchema=True,
    header=True,
)

                                                                                

In [3]:
# Columns that must be numeric (double) for every model below.
num_cols = [
    "subscriber_count", "total_views", "total_videos",
    "mean_views_last_30_videos", "median_views_last_30_videos",
    "std_views_last_30_videos", "videos_per_week"
]

# Spark type name of each column as currently loaded, e.g. "string", "int", "double".
dtype_by_col = dict(df.dtypes)

for c in num_cols:
    if dtype_by_col.get(c) == "string":
        # Text column: strip everything except digits and the decimal point
        # (thousands separators, units, stray symbols) before casting.
        numeric_expr = regexp_replace(col(c), "[^0-9.]", "")
    else:
        # Already numeric: cast directly. Passing a double through the regex
        # would first render it as text, and large/small doubles are rendered
        # in scientific notation ("1.5E7"); stripping the "E" would silently
        # turn that into 1.57. Skipping the regex also makes this cell safe
        # to re-run after the columns have been converted.
        numeric_expr = col(c)
    df = df.withColumn(c, numeric_expr.cast("double"))

# Rows in which any of the numeric columns is null after the cast are discarded.
df = df.dropna(subset=num_cols)

In [4]:
# 80/20 train/test split with a fixed seed.
train_df, test_df = df.randomSplit(
    weights=[0.8, 0.2],
    seed=42,
)

# Кластеризация

In [5]:
# Input columns for the clustering feature vector.
cluster_features = [
    "subscriber_count",
    "total_views",
    "total_videos",
    "median_views_last_30_videos",
    "std_views_last_30_videos",
    "videos_per_week",
]

In [6]:
# Clustering preprocessing: pack the selected columns into one vector
# ("features_raw"), then scale it into "features".
assembler_c = VectorAssembler(
    outputCol="features_raw",
    inputCols=cluster_features,
)
scaler_c = StandardScaler(
    outputCol="features",
    inputCol="features_raw",
)
pipeline_c = Pipeline(stages=[assembler_c, scaler_c])

In [7]:
# Fit the preprocessing on the training split only, then apply the fitted
# model to both splits.
pipeline_model_c = pipeline_c.fit(train_df)
train_c, test_c = (
    pipeline_model_c.transform(split) for split in (train_df, test_df)
)

                                                                                

In [8]:
# Fit a 4-cluster KMeans on the training vectors and assign clusters to the
# test vectors; the assignment is written to the "cluster" column.
kmeans = KMeans(
    featuresCol="features",
    predictionCol="cluster",
    k=4,
    seed=1,
)
kmeans_model = kmeans.fit(train_c)
test_clustered = kmeans_model.transform(test_c)

# Score the test-set assignments with the clustering evaluator.
evaluator_cluster = ClusteringEvaluator(
    featuresCol="features",
    predictionCol="cluster",
)
silhouette = evaluator_cluster.evaluate(test_clustered)
print(f"Silhouette Score (test set): {silhouette:.4f}")

25/06/10 00:28:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.

Silhouette Score (test set): 0.9992


                                                                                

# Регрессия

In [9]:
# Input columns for the regression feature vector.
reg_features = [
    "subscriber_count",
    "total_views",
    "total_videos",
    "median_views_last_30_videos",
    "std_views_last_30_videos",
    "videos_per_week",
]

# Regression preprocessing: assemble the columns into "features_raw",
# then scale into "features".
assembler_r = VectorAssembler(
    outputCol="features_raw",
    inputCols=reg_features,
)
scaler_r = StandardScaler(
    outputCol="features",
    inputCol="features_raw",
)
pipeline_r = Pipeline(stages=[assembler_r, scaler_r])

In [10]:
# Keep only rows that have a regression target in each split.
reg_train, reg_test = (
    split.dropna(subset=["mean_views_last_30_videos"])
    for split in (train_df, test_df)
)

In [11]:
# This cell overwrites reg_train / reg_test with their transformed versions.
# On a second run those frames would already carry the pipeline's output
# columns and the assembler would refuse to fit ("output column already
# exists"). Dropping the output columns first is a no-op on the first run
# and makes the cell safe to re-run.
reg_train_in = reg_train.drop("features_raw", "features")
reg_test_in = reg_test.drop("features_raw", "features")

# Fit the preprocessing on the training split only, then apply to both splits.
pipeline_model_r = pipeline_r.fit(reg_train_in)
reg_train = pipeline_model_r.transform(reg_train_in)
reg_test = pipeline_model_r.transform(reg_test_in)

                                                                                

In [12]:
# Fit a linear regression for the mean views of the last 30 videos on the
# training split and predict on the held-out split.
lr = LinearRegression(
    labelCol="mean_views_last_30_videos",
    featuresCol="features",
)
lr_model = lr.fit(reg_train)
reg_preds = lr_model.transform(reg_test)

# Two evaluators over the same label/prediction columns, differing only in metric.
evaluator_rmse = RegressionEvaluator(
    metricName="rmse",
    labelCol="mean_views_last_30_videos",
    predictionCol="prediction",
)
evaluator_r2 = RegressionEvaluator(
    metricName="r2",
    labelCol="mean_views_last_30_videos",
    predictionCol="prediction",
)

print(f"Linear Regression RMSE: {evaluator_rmse.evaluate(reg_preds):.2f}")
print(f"Linear Regression R²: {evaluator_r2.evaluate(reg_preds):.4f}")

25/06/10 00:29:10 WARN Instrumentation: [961cd030] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

Linear Regression RMSE: 232305.29




Linear Regression R²: 0.9428


                                                                                

# Классификация

In [13]:
# Approximate median of videos_per_week (relative error 0.01); it is the
# threshold that separates the two classes.
quantiles = df.approxQuantile("videos_per_week", [0.5], 0.01)
median_val = quantiles[0]

# label = 1 when the channel posts more often than the median, else 0.
above_median = col("videos_per_week") > median_val
df_with_label = df.withColumn("label", when(above_median, 1).otherwise(0))

                                                                                

In [14]:
# Attach the binary label to each split directly instead of joining it back
# from the labelled full frame by channel_id. The label depends only on the
# row's own videos_per_week and the threshold, so the result is the same for
# unique ids, but this avoids two shuffle joins and cannot drop rows with a
# null channel_id or multiply rows when a channel_id appears more than once.
label_expr = when(col("videos_per_week") > median_val, 1).otherwise(0)
train_df_l = train_df.withColumn("label", label_expr)
test_df_l = test_df.withColumn("label", label_expr)

In [15]:
# Input columns for the classifier; videos_per_week is left out because the
# label is derived from it.
clf_features = [
    "subscriber_count",
    "total_views",
    "total_videos",
    "median_views_last_30_videos",
    "std_views_last_30_videos",
    "mean_views_last_30_videos",
]

# Classification preprocessing: assemble into "features_raw", scale into "features".
assembler_f = VectorAssembler(
    outputCol="features_raw",
    inputCols=clf_features,
)
scaler_f = StandardScaler(
    outputCol="features",
    inputCol="features_raw",
)
pipeline_f = Pipeline(stages=[assembler_f, scaler_f])

In [16]:
# Fit the preprocessing on the labelled training split only, then apply the
# fitted model to both splits.
pipeline_model_f = pipeline_f.fit(train_df_l)
train_clf = pipeline_model_f.transform(train_df_l)
test_clf = pipeline_model_f.transform(test_df_l)

# 50-tree random forest. The seed is pinned (as for the split and KMeans
# above) so the reported metrics are reproducible across kernel restarts;
# without it the forest's random sampling is not fixed between sessions.
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    numTrees=50,
    seed=42,
)
rf_model = rf.fit(train_clf)
clf_preds = rf_model.transform(test_clf)

# Accuracy and F1 on the held-out split.
evaluator_acc = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
evaluator_f1 = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

print(f"Random Forest Accuracy: {evaluator_acc.evaluate(clf_preds):.4f}")
print(f"Random Forest F1 Score: {evaluator_f1.evaluate(clf_preds):.4f}")

                                                                                

Random Forest Accuracy: 0.8040


                                                                                

Random Forest F1 Score: 0.7754
