### Установка пакетов

In [1]:
!pip install pyspark



### Импорт библиотек

In [2]:
import json
import os
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql import SparkSession
from sklearn.datasets import make_classification

### Создание каталога data для хранения данных

In [3]:
# Create the output directory; exist_ok=True makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs('data', exist_ok=True)

### Создание экземпляра SparkSession

In [4]:
# Obtain the SparkSession for this notebook (getOrCreate reuses a session
# if one already exists).
spark = (
  SparkSession.builder
  .appName('Homework_3_lab2')
  .getOrCreate()
)

## Кластеризация (KMeans)

### Задание 1. Сгенерировать подходящие исходные данные для проведения обучения

#### 1.1. Определение функции записи данных в файл (формат LIBSVM)

In [5]:
def to_file(path, X, y):
  """Write a labelled dataset to ``path`` in LIBSVM text format.

  Every sample becomes one line of the form ``<label> 1:<v1> 2:<v2> ...``
  with 1-based feature indices. Lines are separated by a newline character
  and no trailing newline is written.

  Args:
    path: Destination file path; an existing file is overwritten.
    X: Sequence of rows, each row being a sequence of feature values.
    y: Sequence of labels, one per row of ``X``.

  Raises:
    ValueError: If ``X`` and ``y`` do not contain the same number of samples.
  """
  if len(X) != len(y):
    # Fail loudly instead of silently dropping rows or hitting an IndexError.
    raise ValueError(
      f"X and y must have the same length, got {len(X)} and {len(y)}"
    )

  lines = []

  for row, label in zip(X, y):
    # Use each row's own length so no feature is truncated and a short row
    # does not raise; LIBSVM feature indices start at 1.
    features = ' '.join(
      f"{index}:{value}" for index, value in enumerate(row, start=1)
    )
    lines.append(f"{label} {features}")

  with open(path, "w", encoding="utf-8") as file:
    file.write('\n'.join(lines))

#### 1.2. Генерация данных

In [6]:
# Generate a synthetic labelled dataset: 1000 samples, 15 features
# (12 informative + 3 redundant), 5 classes with 2 clusters per class.
# random_state is fixed so that Restart & Run All reproduces the same data
# and therefore the same clustering results.
X, y = make_classification(
  n_samples=1000,
  n_features=15,
  n_informative=12,
  n_redundant=3,
  n_clusters_per_class=2,
  n_classes=5,
  random_state=42
)

# Persist the dataset so it can be loaded by Spark from disk.
to_file('data/clusterization_data', X, y)

#### 1.3. Чтение данных из файла

In [7]:
# Load the generated file as a Spark DataFrame using the libsvm data source,
# then show the first row as a sanity check.
df = spark.read.load('data/clusterization_data', format='libsvm')
df.head()

Row(label=0.0, features=SparseVector(15, {0: -0.0548, 1: -1.8421, 2: -0.3111, 3: -1.8563, 4: 1.7194, 5: 1.7487, 6: 0.4585, 7: 1.9063, 8: -3.0565, 9: 2.1487, 10: 2.1825, 11: -2.7551, 12: -2.1576, 13: -5.6815, 14: -4.1576}))

### Задание 2. Провести кластеризацию с помощью pyspark.ml / pyspark.mllib

In [8]:
# Fit KMeans with k=5. The seed is fixed so that centroid initialisation,
# and therefore the fitted model, is reproducible between runs.
kmeans = KMeans().setK(5).setSeed(42)
model = kmeans.fit(df)

In [9]:
# Apply the fitted model to the data; the result carries an extra
# `prediction` field holding the assigned cluster index for each row.
predictions = model.transform(df)
# Display the first row as a quick check.
predictions.head()

Row(label=0.0, features=SparseVector(15, {0: -0.0548, 1: -1.8421, 2: -0.3111, 3: -1.8563, 4: 1.7194, 5: 1.7487, 6: 0.4585, 7: 1.9063, 8: -3.0565, 9: 2.1487, 10: 2.1825, 11: -2.7551, 12: -2.1576, 13: -5.6815, 14: -4.1576}), prediction=3)

In [10]:
# Score the clustering with ClusteringEvaluator's default settings
# (presumably the silhouette measure — confirm against the PySpark docs).
evaluation = ClusteringEvaluator().evaluate(predictions)
# Display the resulting score.
evaluation

0.1834406800196666

### Задание 3. Вывести и сохранить в файл полученные центры кластеров

In [11]:
# Retrieve the fitted cluster centers (displayed below as a list of
# NumPy arrays, one array per cluster).
centers = model.clusterCenters()
centers

[array([ 2.07343973,  0.28063027,  0.53107784, -0.80946533, -0.84016241,
         0.32100587, -0.34856495,  0.66882416,  0.53021513, -0.39921477,
        -1.28909384,  0.86802634, -0.28573687,  2.9679894 ,  1.57510421]),
 array([-7.68185403, -1.16618   , -0.77265416,  0.79711368,  1.75812087,
        -0.38748882, -0.27885783, -0.37499013,  1.7436419 ,  1.94492598,
         1.04533645,  0.56350837, -1.62988183, -4.94600059,  2.90165429]),
 array([-4.30742384e+00, -3.09775330e-01,  2.29952237e-01, -2.78838445e-01,
        -1.16793729e-01, -1.44100156e+00,  1.35002269e+00, -3.09411958e-01,
         7.75087359e-01,  8.03381122e-02,  2.82554937e-01,  1.50576013e+00,
        -2.00814419e-03,  1.75042644e+00,  8.50354363e-01]),
 array([ 0.4436992 , -0.08305864, -0.49697804, -0.445623  ,  0.52053708,
         0.60800912, -0.44110774,  0.32071661, -0.32623804,  1.13344187,
         0.43801357, -1.34384465, -0.32844806, -2.77488855, -0.99467163]),
 array([ 7.54008689,  1.13028103, -0.74450282, -

In [12]:
# Save the cluster centers: one center per line, coordinates separated by
# single spaces. The array's own string form is deliberately not used: it
# wraps long arrays over several lines and rounds values to print precision,
# which makes the file hard to parse back.
with open('data/clusterization_data_r_2', 'w', encoding='utf-8') as file:
  for center in centers:
    # repr(float(...)) keeps full double precision for every coordinate.
    file.write(' '.join(repr(float(value)) for value in center) + '\n')