# Regresión Lineal

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session for this notebook (or reuse an existing one);
# 'reg' is the application name.
spark = SparkSession.builder.appName('reg').getOrCreate()

In [3]:
from pyspark.ml.regression import LinearRegression

In [4]:
# Load the sample dataset using the 'libsvm' reader. The resulting DataFrame
# has a 'label' column and a 'features' vector column (see the table below).
train = spark.read.format('libsvm').load('sample_linear_regression_data.txt')

In [6]:
# Preview the first rows of the loaded data.
train.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+

In [7]:
# Configure the linear regression estimator: which column holds the feature
# vector, which holds the target, and the name of the output column that
# will receive the predictions.
# NOTE(review): 'precition' looks like a typo for 'prediction'. It is the
# column name that appears in every predictions table produced later, so
# renaming it means re-running the notebook to refresh those outputs.
lr = LinearRegression(featuresCol= 'features', labelCol='label', predictionCol='precition')

In [9]:
# Display the estimator; the notebook prints its identifier.
lr

LinearRegression_8516737c83e7

In [10]:
# Fit the model on the full dataset loaded above (no hold-out split yet),
# so the metrics read from this model are training metrics.
modelo = lr.fit(train)

In [12]:
# Learned weights, one per feature (10 values in the output below).
modelo.coefficients

DenseVector([0.0073, 0.8314, -0.8095, 2.4412, 0.5192, 1.1535, -0.2989, -0.5129, -0.6197, 0.6956])

In [14]:
# Learned intercept (bias) term of the fitted model.
modelo.intercept

0.14228558260358093

In [15]:
# Training summary of the fitted model: exposes the predictions and the
# error metrics computed on the data passed to fit().
summary = modelo.summary

In [16]:
# Show the training rows together with the model's prediction for each one
# (the prediction column is named 'precition', as configured on the estimator).
summary.predictions.show()

+-------------------+--------------------+--------------------+
|              label|            features|           precition|
+-------------------+--------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|  1.5211201432720063|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...| -0.6658770747591632|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|  0.1568703823211514|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|  0.6374146679690593|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|   2.372566473232916|
| -7.896274316726144|(10,[0,1,2,3,4,5,...| -1.9410651727650883|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|  2.2621027950886363|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|-0.00134792656609...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...| -3.0051104606414007|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|  3.5437265095387804|
| -5.082010756207233|(10,[0,1,2,3,4,5,...| -0.4889664122481736|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|  1.5073098457843013|
| 14.323146365332388|(10,[0,1,2,3,4,5,..

In [17]:
# Root mean squared error on the training data.
summary.rootMeanSquaredError

10.16309157133015

In [18]:
# Coefficient of determination (R^2) on the training data; the value below
# (about 0.028) means the model explains very little of the label variance.
summary.r2

0.027839179518600154

In [19]:
# Reload the same file into a separate DataFrame to demonstrate a proper
# train/test workflow.
datos = spark.read.format('libsvm').load('sample_linear_regression_data.txt')

In [20]:
# Randomly split the rows with weights 0.7 / 0.3. The result is a list of two
# DataFrames (see the output of the next cell).
# NOTE(review): no seed is passed, so the split is not reproducible across
# runs. This 'split' is only inspected; training uses a later, separate split.
split = datos.randomSplit([0.7, 0.3])

In [21]:
# Inspect the result of randomSplit: a list with two DataFrames.
split

[DataFrame[label: double, features: vector],
 DataFrame[label: double, features: vector]]

In [22]:
# Unpack a 70/30 random split directly into training and test DataFrames.
# NOTE(review): this runs a new random split (without a seed) instead of
# reusing 'split', and it rebinds 'train', which previously held the full
# dataset.
train, test = datos.randomSplit([0.7, 0.3])

In [24]:
# Summary statistics (count, mean, stddev, min, max) of the training labels.
train.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|               353|
|   mean|0.2590124978736433|
| stddev| 9.974358595949822|
|    min|-23.51088409032297|
|    max| 27.78383192005107|
+-------+------------------+



In [25]:
# Summary statistics (count, mean, stddev, min, max) of the test labels.
test.describe().show()

+-------+-------------------+
|summary|              label|
+-------+-------------------+
|  count|                148|
|   mean| 0.2518235687181994|
| stddev| 11.129920480502053|
|    min|-28.571478869743427|
|    max| 27.111027963108548|
+-------+-------------------+



In [26]:
# Fit a new model using only the training split, keeping the test split unseen.
modelo_correcto = lr.fit(train)

In [27]:
# Evaluate the model on the held-out test split; the result exposes the same
# kind of metrics as the training summary.
resultados_test = modelo_correcto.evaluate(test)

In [28]:
# Root mean squared error on the test split.
resultados_test.rootMeanSquaredError

10.967418101195255

In [29]:
# Preview the full dataset again (label and features columns).
datos.show()

+-------------------+--------------------+
|              label|            features|
+-------------------+--------------------+
| -9.490009878824548|(10,[0,1,2,3,4,5,...|
| 0.2577820163584905|(10,[0,1,2,3,4,5,...|
| -4.438869807456516|(10,[0,1,2,3,4,5,...|
|-19.782762789614537|(10,[0,1,2,3,4,5,...|
| -7.966593841555266|(10,[0,1,2,3,4,5,...|
| -7.896274316726144|(10,[0,1,2,3,4,5,...|
| -8.464803554195287|(10,[0,1,2,3,4,5,...|
| 2.1214592666251364|(10,[0,1,2,3,4,5,...|
| 1.0720117616524107|(10,[0,1,2,3,4,5,...|
|-13.772441561702871|(10,[0,1,2,3,4,5,...|
| -5.082010756207233|(10,[0,1,2,3,4,5,...|
|  7.887786536531237|(10,[0,1,2,3,4,5,...|
| 14.323146365332388|(10,[0,1,2,3,4,5,...|
|-20.057482615789212|(10,[0,1,2,3,4,5,...|
|-0.8995693247765151|(10,[0,1,2,3,4,5,...|
| -19.16829262296376|(10,[0,1,2,3,4,5,...|
|  5.601801561245534|(10,[0,1,2,3,4,5,...|
|-3.2256352187273354|(10,[0,1,2,3,4,5,...|
| 1.5299675726687754|(10,[0,1,2,3,4,5,...|
| -0.250102447941961|(10,[0,1,2,3,4,5,...|
+-------------------+--------------------+

In [30]:
# Keep only the 'features' column of the test split to simulate new,
# unlabeled data.
datos_sin_etiqueta = test.select('features')

In [31]:
# Preview the unlabeled data (features only).
datos_sin_etiqueta.show()

+--------------------+
|            features|
+--------------------+
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
|(10,[0,1,2,3,4,5,...|
+--------------------+
only showing top 20 rows



In [32]:
# Apply the trained model to the unlabeled data; the result keeps 'features'
# and adds the prediction column ('precition').
predicciones =  modelo_correcto.transform(datos_sin_etiqueta)

In [33]:
# Show the predictions produced for the unlabeled rows.
predicciones.show()

+--------------------+--------------------+
|            features|           precition|
+--------------------+--------------------+
|(10,[0,1,2,3,4,5,...|-0.26784426930785404|
|(10,[0,1,2,3,4,5,...| -0.7791306497701691|
|(10,[0,1,2,3,4,5,...|   1.540371140729913|
|(10,[0,1,2,3,4,5,...| -2.7606003541108857|
|(10,[0,1,2,3,4,5,...|    1.70832231956631|
|(10,[0,1,2,3,4,5,...|  0.1383175951200476|
|(10,[0,1,2,3,4,5,...| -0.4339905262636348|
|(10,[0,1,2,3,4,5,...| -1.2392179468223974|
|(10,[0,1,2,3,4,5,...|  -2.369612976017477|
|(10,[0,1,2,3,4,5,...|0.018274994357308408|
|(10,[0,1,2,3,4,5,...|  0.6805717566189617|
|(10,[0,1,2,3,4,5,...| -0.2949100176645775|
|(10,[0,1,2,3,4,5,...| -1.1264490590196696|
|(10,[0,1,2,3,4,5,...| -1.9350855670957159|
|(10,[0,1,2,3,4,5,...|  1.7009173841543392|
|(10,[0,1,2,3,4,5,...|  1.3823959672759722|
|(10,[0,1,2,3,4,5,...|   2.327705188326532|
|(10,[0,1,2,3,4,5,...| -1.7608593871493594|
|(10,[0,1,2,3,4,5,...|   2.672078340832507|
|(10,[0,1,2,3,4,5,...|  1.746669