#### **Машинное обучение на pySpark на примере логистической регрессии**
#### Задание: обучить модель с разными параметрами, разделить данные на тренировочную и тестовую выборки, посчитать точность модели на тестовой выборке

In [1]:
!pip install pyspark==3.0.1 py4j==0.10.9

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark==3.0.1
  Downloading pyspark-3.0.1.tar.gz (204.2 MB)
[K     |████████████████████████████████| 204.2 MB 31 kB/s 
[?25hCollecting py4j==0.10.9
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 18.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.1-py2.py3-none-any.whl size=204612243 sha256=eab8aaa090043f07218cc960dd15b363c06fa24399949d7c11eaae7f386be292
  Stored in directory: /root/.cache/pip/wheels/5e/34/fa/b37b5cef503fc5148b478b2495043ba61b079120b7ff379f9b
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.1


In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('PySpark_Tutorial')
    .getOrCreate()
)

In [3]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorIndexer, VectorAssembler
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.param import Param, Params

In [4]:
# Load the semicolon-separated CSV: the first row is treated as the header
# and column types are inferred from the values.
data = spark.read.csv(
    'train_case2.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

In [5]:
# Preview the first 10 rows of the loaded DataFrame.
data.show(n=10)

+---+-----+------+------+------+-----+-----+-----------+----+-----+----+------+------+
| id|  age|gender|height|weight|ap_hi|ap_lo|cholesterol|gluc|smoke|alco|active|cardio|
+---+-----+------+------+------+-----+-----+-----------+----+-----+----+------+------+
|  0|18393|     2|   168|  62.0|  110|   80|          1|   1|    0|   0|     1|     0|
|  1|20228|     1|   156|  85.0|  140|   90|          3|   1|    0|   0|     1|     1|
|  2|18857|     1|   165|  64.0|  130|   70|          3|   1|    0|   0|     0|     1|
|  3|17623|     2|   169|  82.0|  150|  100|          1|   1|    0|   0|     1|     1|
|  4|17474|     1|   156|  56.0|  100|   60|          1|   1|    0|   0|     0|     0|
|  8|21914|     1|   151|  67.0|  120|   80|          2|   2|    0|   0|     0|     0|
|  9|22113|     1|   157|  93.0|  130|   80|          3|   1|    0|   0|     1|     0|
| 12|22584|     2|   178|  95.0|  130|   90|          3|   3|    0|   0|     1|     1|
| 13|17668|     1|   158|  71.0|  110|   70

In [6]:
# Total number of rows in the dataset.
data.count()

70000

In [7]:
# Print the column names and the types Spark assigned to them.
data.printSchema()

root
 |-- id: integer (nullable = true)
 |-- age: integer (nullable = true)
 |-- gender: integer (nullable = true)
 |-- height: integer (nullable = true)
 |-- weight: double (nullable = true)
 |-- ap_hi: integer (nullable = true)
 |-- ap_lo: integer (nullable = true)
 |-- cholesterol: integer (nullable = true)
 |-- gluc: integer (nullable = true)
 |-- smoke: integer (nullable = true)
 |-- alco: integer (nullable = true)
 |-- active: integer (nullable = true)
 |-- cardio: integer (nullable = true)



In [30]:
# Combine the individual input columns into a single vector column, 'features'.
assemblerInputs = [
    'age',
    'gender',
    'height',
    'weight',
    'ap_hi',
    'ap_lo',
    'cholesterol',
    'gluc',
    'smoke',
    'alco',
    'active',
]

assembler = VectorAssembler(
    inputCols=assemblerInputs,
    outputCol='features',
)

In [31]:
# Append the assembled feature vector column to the DataFrame.
# The assembler's output column is dropped first so that this cell can be
# re-run safely: `data` is rebound here, and a second run would otherwise fail
# because the output column already exists. Dropping a column that is not
# present is a no-op, so the first run is unaffected.
data = assembler.transform(data.drop(assembler.getOutputCol()))

In [32]:
# Split the data into train/test parts with 0.7 / 0.3 weights.
# A fixed seed keeps the split repeatable between runs.

train, test = data.randomSplit([0.7, 0.3], seed=42)

In [33]:
# Configure the model: logistic regression that predicts 'cardio' from the
# assembled 'features' vector.

lr = LogisticRegression(
    featuresCol='features',
    labelCol='cardio',
    maxIter=10,
    regParam=0.01,
)

In [34]:
# Fit the configured logistic regression on the training split.
model = lr.fit(dataset=train)

In [35]:
# Apply the fitted model to the test split to obtain predictions.

predictions = model.transform(dataset=test)

In [38]:
# Check the quality of the trained model on the test predictions.
# The metric is named explicitly: area under the ROC curve.

from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(
    labelCol='cardio',
    metricName='areaUnderROC',
)
print('Evaluation:', evaluator.evaluate(predictions))

Evaluation: 0.7211768430962349
