# Linear Regression

General Pipeline:

- Importing Data
- Feature vectorization with the RFormula transformer
- Split into train and test
- Building the model
- Prediction on the test set
- Evaluation

## Importing

In [1]:
# findspark has to run before the first pyspark import: it locates the Spark
# installation and puts its Python libraries on sys.path. Importing pyspark
# first only works when Spark is already importable, in which case
# findspark would not be needed at all.
import findspark

findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Reuse an already-running session if there is one; otherwise create it.
spark = SparkSession.builder.appName("linearreg").getOrCreate()

In [21]:
from pyspark.ml.feature    import RFormula
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

## Loading Data

In [3]:
# Semicolon-delimited CSV with a header row; column types are inferred.
cars = (
    spark.read
    .format("csv")
    .options(sep=";", header=True, inferSchema=True)
    .load("../../data/Carros.csv")
)

# Preview the first two rows.
cars.show(n=2)

+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|Consumo|Cilindros|Cilindradas|RelEixoTraseiro|Peso|Tempo|TipoMotor|Transmissao|Marchas|Carburadors| HP|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
|     21|        6|        160|             39| 262| 1646|        0|          1|      4|          4|110|
|     21|        6|        160|             39|2875| 1702|        0|          1|      4|          4|110|
+-------+---------+-----------+---------------+----+-----+---------+-----------+-------+-----------+---+
only showing top 2 rows



## Data Preparation

In [4]:
# R-style formula: HP is the label; Consumo, Cilindros and Cilindradas are
# assembled into a single feature vector.
rformula = RFormula(
    formula='HP ~ Consumo + Cilindros + Cilindradas',
    featuresCol="features",
    labelCol="target"
)

# This cell rebinds `cars` to the transformed frame. Drop any output columns
# left over from a previous run so that re-executing the cell does not fail
# on an already-existing "features" column (dropping an absent column is a
# no-op, so the first run is unaffected).
cars_input = cars.drop("features", "target")
cars = rformula.fit(cars_input).transform(cars_input)

In [5]:
# Inspect the assembled feature vectors next to the label.
cars.select(["features", "target"]).show(n=10)

+------------------+------+
|          features|target|
+------------------+------+
|  [21.0,6.0,160.0]| 110.0|
|  [21.0,6.0,160.0]| 110.0|
| [228.0,4.0,108.0]|  93.0|
| [214.0,6.0,258.0]| 110.0|
| [187.0,8.0,360.0]| 175.0|
| [181.0,6.0,225.0]| 105.0|
| [143.0,8.0,360.0]| 245.0|
|[244.0,4.0,1467.0]|  62.0|
|[228.0,4.0,1408.0]|  95.0|
|[192.0,6.0,1676.0]| 123.0|
+------------------+------+
only showing top 10 rows



## Split into Train and Test

In [11]:
# 80/20 train/test split. The seed is fixed so that, for the same input data,
# the partitions (and every metric computed from them) are the same on each
# Restart & Run All instead of changing from run to run.
carsTrain, carsTest = cars.randomSplit([0.8, 0.2], seed=42)

In [12]:
# Row counts of the (train, test) partitions.
tuple(part.count() for part in (carsTrain, carsTest))

(27, 5)

## Model Development and Training

In [18]:
# Linear regression with squared-error loss, standardization enabled and a
# cap of 1000 iterations, reading the columns produced by the RFormula step.
lr = LinearRegression(
    maxIter=1000,
    loss="squaredError",
    standardization=True,
    labelCol="target",
    featuresCol="features",
)

# Fit on the training partition only.
model = lr.fit(dataset=carsTrain)

## Predicting on Test Set

In [20]:
# Score the held-out rows, then compare label and prediction side by side.
predictions = model.transform(dataset=carsTest)
predictions.select(["target", "prediction"]).show()

+------+------------------+
|target|        prediction|
+------+------------------+
| 245.0| 204.7824120717919|
| 175.0|200.68487056758397|
| 175.0|142.27497543894737|
|  93.0| 80.75316445047979|
|  52.0|  69.4597505585738|
+------+------------------+



## Model Evaluation

In [23]:
# Evaluator configured for RMSE between the "target" label and the
# "prediction" column of the scored test set.
evaluation = RegressionEvaluator(
    metricName="rmse",
    labelCol="target",
    predictionCol="prediction",
)

rmse = evaluation.evaluate(dataset=predictions)
print(rmse)

27.57869488537411
