## Лабораторная работа № 2
### Машинное обучение на больших данных
#### Цель и задачи работы:
1. Познакомиться с базовыми алгоритмами машинного обучения;
2. Познакомиться с реализацией машинного обучения в библиотеке Spark ML
3. Получить навыки разработки программного обеспечения для анализа данных с
использованием pyspark.
### Порядок выполнения работы:
1. Выполнить анализ выбранного датасета с помощью двух алгоритмов машинного
обучения: Линейной регрессии и Бустинга над решающими деревьями 
2. Выполнить обучение и валидацию модели, рассчитайте значения метрик классификации и
регрессии.
3. Выполнить подбор гиперпараметров моделей по сетке.

## Ход работы

#### Создание сессии и загрузка датасета

In [43]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, BooleanType, DateType, IntegerType, DoubleType

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer, MinMaxScaler
from pyspark.ml.classification import LogisticRegression, GBTClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator,RegressionEvaluator

from pyspark.ml.regression import LinearRegression

import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

spark = SparkSession.builder \
    .master("local[*]") \
    .appName('SOBDLab1') \
    .getOrCreate()
csv_file = 'data/cleared.csv'
data = spark.read.csv(csv_file, header=True)

#### Удаление признаков

In [44]:
data = data.drop('_c0','searchDate','flightDate')
data.printSchema()

root
 |-- startingAirport: string (nullable = true)
 |-- destinationAirport: string (nullable = true)
 |-- fareBasisCode: string (nullable = true)
 |-- travelDuration: string (nullable = true)
 |-- elapsedDays: string (nullable = true)
 |-- isBasicEconomy: string (nullable = true)
 |-- isRefundable: string (nullable = true)
 |-- isNonStop: string (nullable = true)
 |-- baseFare: string (nullable = true)
 |-- totalFare: string (nullable = true)
 |-- seatsRemaining: string (nullable = true)
 |-- totalTravelDistance: string (nullable = true)



#### Преобразование типов

In [45]:
data = data.withColumn('travelDuration', data.travelDuration.cast(IntegerType()))
data = data.withColumn('elapsedDays', data.elapsedDays.cast(IntegerType()))
data = data.withColumn('isBasicEconomy', data.isBasicEconomy.cast(BooleanType()))
data = data.withColumn('isNonStop', data.isNonStop.cast(BooleanType()))
data = data.withColumn('baseFare', data.baseFare.cast(DoubleType()))
data = data.withColumn('totalFare', data.totalFare.cast(DoubleType()))
data = data.withColumn('totalTravelDistance', data.totalTravelDistance.cast(IntegerType()))
data = data.withColumn('fareBasisCode',data.fareBasisCode.cast(IntegerType()))
data = data.withColumn('isRefundable',data.isRefundable.cast(BooleanType()))
data = data.withColumn('seatsRemaining',data.seatsRemaining.cast(IntegerType()))
data = data.withColumn('startingAirport',data.startingAirport.cast(IntegerType()))
data = data.withColumn('destinationAirport',data.destinationAirport.cast(IntegerType()))

data.printSchema()
data.show()

root
 |-- startingAirport: integer (nullable = true)
 |-- destinationAirport: integer (nullable = true)
 |-- fareBasisCode: integer (nullable = true)
 |-- travelDuration: integer (nullable = true)
 |-- elapsedDays: integer (nullable = true)
 |-- isBasicEconomy: boolean (nullable = true)
 |-- isRefundable: boolean (nullable = true)
 |-- isNonStop: boolean (nullable = true)
 |-- baseFare: double (nullable = true)
 |-- totalFare: double (nullable = true)
 |-- seatsRemaining: integer (nullable = true)
 |-- totalTravelDistance: integer (nullable = true)

+---------------+------------------+-------------+--------------+-----------+--------------+------------+---------+--------+---------+--------------+-------------------+
|startingAirport|destinationAirport|fareBasisCode|travelDuration|elapsedDays|isBasicEconomy|isRefundable|isNonStop|baseFare|totalFare|seatsRemaining|totalTravelDistance|
+---------------+------------------+-------------+--------------+-----------+--------------+------------

### Линейная регрессия

#### Подготовка данных

In [46]:
assembler = VectorAssembler(inputCols=['travelDuration','isBasicEconomy','isNonStop','totalTravelDistance'], outputCol='features')
final_data = assembler.transform(data)
final_data = final_data.select('features','baseFare')
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

#### Создание и тренировка модели

In [47]:
lr = LinearRegression(featuresCol='features',labelCol='baseFare',predictionCol='predicted_Fare')
lr_model = lr.fit(train_data)

#### Работа модели на тестовых данных и её оценка

In [48]:
predictions = lr_model.transform(test_data)

evaluator = RegressionEvaluator(labelCol="baseFare", predictionCol="predicted_Fare", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data: {:.3f}".format(rmse))

evaluator_r2 = RegressionEvaluator(labelCol="baseFare", predictionCol="predicted_Fare", metricName="r2")
r2 = evaluator_r2.evaluate(predictions)
print("R-squared (R2) on test data: {:.3f}".format(r2))

coefficients = lr_model.coefficients
intercept = lr_model.intercept

print("Coefficients: ", coefficients)
print("Intercept: {:.3f}".format(intercept))
print("pValues: ", lr_model.summary.pValues)

Root Mean Squared Error (RMSE) on test data: 115.844
R-squared (R2) on test data: 0.420
Coefficients:  [0.15674769829843269,-155.55484282745886,-13.63193854431306,0.05466642238033474]
Intercept: 168.887
pValues:  [0.0, 0.0, 0.0, 0.0, 0.0]


#### Гиперпараметры модели и кросс-валидация

In [51]:
grid_search = ParamGridBuilder() \
    .addGrid(lr.regParam, [0.0, 0.01, 0.1]) \
    .addGrid(lr.elasticNetParam, [0.5, 1.0]) \
    .build()
cv = CrossValidator(estimator=lr,
                    estimatorParamMaps=grid_search,
                    evaluator=evaluator)
cv_model = cv.fit(train_data)

In [56]:
print("meanAbsoluteError: ", cv_model.bestModel.summary.meanAbsoluteError)
cv_model.bestModel.extractParamMap()

91.21251167152272


{Param(parent='LinearRegression_1a0478d75fa3', name='aggregationDepth', doc='suggested depth for treeAggregate (>= 2).'): 2,
 Param(parent='LinearRegression_1a0478d75fa3', name='elasticNetParam', doc='the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.'): 0.5,
 Param(parent='LinearRegression_1a0478d75fa3', name='epsilon', doc='The shape parameter to control the amount of robustness. Must be > 1.0. Only valid when loss is huber'): 1.35,
 Param(parent='LinearRegression_1a0478d75fa3', name='featuresCol', doc='features column name.'): 'features',
 Param(parent='LinearRegression_1a0478d75fa3', name='fitIntercept', doc='whether to fit an intercept term.'): True,
 Param(parent='LinearRegression_1a0478d75fa3', name='labelCol', doc='label column name.'): 'baseFare',
 Param(parent='LinearRegression_1a0478d75fa3', name='loss', doc='The loss function to be optimized. Supported options: squaredError, huber.'): 'squaredEr

### Бинарная классификация

#### Подготовка данных

In [40]:
assembler = VectorAssembler(inputCols = ["travelDuration", "baseFare", "totalTravelDistance"], outputCol="features")
final_data = assembler.transform(data)
final_data = final_data.withColumn('isNonStop', data.isNonStop.cast(IntegerType()))
train_data, test_data = final_data.randomSplit([0.8, 0.2], seed=42)

#### Создание и тренировка модели

In [41]:
gbm = GBTClassifier(featuresCol='features',labelCol='isNonStop')
gbm_model = gbm.fit(train_data)

#### Работа модели на тестовых данных и её оценка

In [42]:
predictions = gbm_model.transform(test_data)

evaluator = BinaryClassificationEvaluator(labelCol="isNonStop")
print('Evaluation:', evaluator.evaluate(predictions))

Evaluation: 0.999950864433254
