# Regresión lineal con un archivo en formato CSV

In [0]:
from pyspark.sql import SparkSession
# Start (or reuse) the Spark session for this notebook under the app name 'moda_rl'.
spark = (
    SparkSession.builder
    .appName('moda_rl')
    .getOrCreate()
)

In [0]:
# DBFS path of the e-commerce customers CSV that the following cells load.
# NOTE(review): the path points at a user-specific upload folder — adjust it when running elsewhere.
ruta = "dbfs:/FileStore/shared_uploads/jgamarramoreno@gmail.com/Ecommerce_Customers.csv"

In [0]:
# Load the CSV: the first line supplies the column names and Spark infers each column's type.
data = spark.read.csv(
    ruta,
    header=True,
    inferSchema=True,
)

In [0]:
# Print the column names and the types that were inferred while reading the CSV.
data.printSchema()

root
 |-- Email: string (nullable = true)
 |-- Address: string (nullable = true)
 |-- Avatar: string (nullable = true)
 |-- Avg Session Length: double (nullable = true)
 |-- Time on App: double (nullable = true)
 |-- Time on Website: double (nullable = true)
 |-- Length of Membership: double (nullable = true)
 |-- Yearly Amount Spent: double (nullable = true)



In [0]:
# Preview the first rows of the loaded table.
data.show()

+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|               Email|             Address|          Avatar|Avg Session Length|       Time on App|   Time on Website|Length of Membership|Yearly Amount Spent|
+--------------------+--------------------+----------------+------------------+------------------+------------------+--------------------+-------------------+
|mstephenson@ferna...|835 Frank TunnelW...|          Violet| 34.49726772511229| 12.65565114916675| 39.57766801952616|  4.0826206329529615|  587.9510539684005|
|   hduke@hotmail.com|4547 Archer Commo...|       DarkGreen| 31.92627202636016|11.109460728682564|37.268958868297744|    2.66403418213262|  392.2049334443264|
|    pallen@yahoo.com|24645 Valerie Uni...|          Bisque|33.000914755642675|11.330278057777512|37.110597442120856|   4.104543202376424| 487.54750486747207|
|riverarebecca@gma...|1414 David Throug...|   

In [0]:
# Inspect the first record; called without arguments, head() returns a single Row.
data.head()

Out[7]: Row(Email='mstephenson@fernandez.com', Address='835 Frank TunnelWrightmouth, MI 82180-9605', Avatar='Violet', Avg Session Length=34.49726772511229, Time on App=12.65565114916675, Time on Website=39.57766801952616, Length of Membership=4.0826206329529615, Yearly Amount Spent=587.9510539684005)

In [0]:
# Print every field value of the first record, one value per line.
print(*data.head(), sep="\n")

mstephenson@fernandez.com
835 Frank TunnelWrightmouth, MI 82180-9605
Violet
34.49726772511229
12.65565114916675
39.57766801952616
4.0826206329529615
587.9510539684005


### Configuración del DataFrame para Machine Learning
Para que Spark MLlib pueda aceptar los datos, es necesario llevarlos a un DataFrame de dos columnas: la etiqueta (el valor a predecir) y las características (un único vector con las variables de entrada).

In [0]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

In [0]:
# List the column names so the numeric feature columns can be picked below.
data.columns

Out[11]: ['Email',
 'Address',
 'Avatar',
 'Avg Session Length',
 'Time on App',
 'Time on Website',
 'Length of Membership',
 'Yearly Amount Spent']

In [0]:
# Numeric columns that are packed together into one vector column.
columnas_entrada = [
    'Avg Session Length',
    'Time on App',
    'Time on Website',
    'Length of Membership',
]

# Assembler that writes the packed vector to a column named "features".
ensamblador = VectorAssembler(inputCols=columnas_entrada, outputCol="features")

In [0]:
# Apply the assembler: the result keeps the original columns and gains the "features" vector column.
data02 = ensamblador.transform(data)

In [0]:
# Preview the assembled feature vectors.
data02.select("features").show()

+--------------------+
|            features|
+--------------------+
|[34.4972677251122...|
|[31.9262720263601...|
|[33.0009147556426...|
|[34.3055566297555...|
|[33.3306725236463...|
|[33.8710378793419...|
|[32.0215955013870...|
|[32.7391429383803...|
|[33.9877728956856...|
|[31.9365486184489...|
|[33.9925727749537...|
|[33.8793608248049...|
|[29.5324289670579...|
|[33.1903340437226...|
|[32.3879758531538...|
|[30.7377203726281...|
|[32.1253868972878...|
|[32.3388993230671...|
|[32.1878120459321...|
|[32.6178560628234...|
+--------------------+
only showing top 20 rows



In [0]:
# Keep only the feature vector and the target column ('Yearly Amount Spent') for modelling.
data03 = data02.select("features",'Yearly Amount Spent')

#### División de los datos en conjuntos de entrenamiento y prueba

In [0]:
# Random split with weights 0.7 / 0.3 into training and test sets; a fixed seed is passed
# so that the split is intended to be repeatable between runs.
train_data, test_data = data03.randomSplit([0.7,0.3],seed=101)

In [0]:
# Summary statistics (count, mean, stddev, min, max) of the target in the training split.
train_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                345|
|   mean|  498.8667234086796|
| stddev|  76.00416522273527|
|    min|  275.9184206503857|
|    max|  765.5184619388373|
+-------+-------------------+



In [0]:
# Summary statistics (count, mean, stddev, min, max) of the target in the test split.
test_data.describe().show()

+-------+-------------------+
|summary|Yearly Amount Spent|
+-------+-------------------+
|  count|                155|
|   mean|  500.3096745374247|
| stddev|  86.48030978452388|
|    min| 256.67058229005585|
|    max|  744.2218671047146|
+-------+-------------------+



#### Modelo de Regresión Lineal

In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Linear regression estimator whose label (target) column is 'Yearly Amount Spent'.
# The features column is not specified here, so the estimator's default applies.
rl = LinearRegression(labelCol='Yearly Amount Spent')

In [0]:
# Fit the regression model on the training split.
modelo = rl.fit(train_data)

In [0]:
# Report the fitted intercept and the coefficient vector of the model.
plantilla = "Interceptor: {}, Coeficientes: {}"
print(plantilla.format(modelo.intercept, modelo.coefficients))

Interceptor: -1058.5341715432617, Coeficientes: [25.680250460821927,38.60458989543174,0.6972965442305288,61.58314310264303]


In [0]:
# Evaluate the fitted model on the held-out test split; the returned summary is used
# below for the residuals and the error metrics.
resultados_prueba = modelo.evaluate(test_data)

In [0]:
# Show the per-row residuals from the test evaluation.
resultados_prueba.residuals.show()

+-------------------+
|          residuals|
+-------------------+
| 11.093067201538247|
|-16.805503415539192|
| 10.627606863601159|
|-0.4826174601465709|
|  3.368835693335427|
|  4.389239879592537|
| 0.3747057350694831|
| -3.707613615190894|
|-1.7228627211574121|
| 17.594525423695416|
|-4.2611074733673036|
| -5.617477497649531|
|-17.185512431541213|
| -9.163794939416846|
| 1.1639615002861774|
|  8.809755194130332|
| -5.725321892938325|
|-16.933183999719006|
|  11.94115318559244|
| -5.345538836619994|
+-------------------+
only showing top 20 rows



In [0]:
# Keep only the feature vectors of the test split (no label column), as if the data were unlabeled.
datos_prueba_sin_etiq = test_data.select('features')

In [0]:
# Preview the unlabeled test features.
datos_prueba_sin_etiq.show()

+--------------------+
|            features|
+--------------------+
|[30.7377203726281...|
|[30.8162006488763...|
|[31.1695067987115...|
|[31.2606468698795...|
|[31.3091926408918...|
|[31.3584771924370...|
|[31.3895854806643...|
|[31.4252268808548...|
|[31.5761319713222...|
|[31.6098395733896...|
|[31.6253601348306...|
|[31.7242025238451...|
|[31.8164283341993...|
|[31.8279790554652...|
|[31.8293464559211...|
|[31.8512531286083...|
|[31.8745516945853...|
|[31.9048571310136...|
|[31.9262720263601...|
|[31.9453957483445...|
+--------------------+
only showing top 20 rows



In [0]:
# Score the unlabeled rows; the result carries a "prediction" column next to "features".
predicciones = modelo.transform(datos_prueba_sin_etiq)

In [0]:
# Preview the predicted values alongside their feature vectors.
predicciones.show()

+--------------------+------------------+
|            features|        prediction|
+--------------------+------------------+
|[30.7377203726281...|450.68767499469163|
|[30.8162006488763...| 282.8918443640082|
|[31.1695067987115...|416.72892393869165|
|[31.2606468698795...|421.80924871709794|
|[31.3091926408918...| 429.3518821465982|
|[31.3584771924370...|490.78671056988287|
|[31.3895854806643...| 409.6949053249134|
|[31.4252268808548...| 534.4743322699528|
|[31.5761319713222...| 542.9494467104857|
|[31.6098395733896...|426.95102422741274|
|[31.6253601348306...| 380.5980082302915|
|[31.7242025238451...|   509.00536478561|
|[31.8164283341993...| 518.3080039351976|
|[31.8279790554652...|449.16654248635837|
|[31.8293464559211...| 383.9883764876888|
|[31.8512531286083...|464.18249147266806|
|[31.8745516945853...| 398.0105661392058|
|[31.9048571310136...|490.88304142253514|
|[31.9262720263601...|380.26378025873396|
|[31.9453957483445...| 662.3654627742719|
+--------------------+------------

In [0]:
# Report the test-set error metrics, one per line. Each attribute is read from the
# evaluation summary right before it is printed.
for formato, atributo in (
    ("RMSE: {}", "rootMeanSquaredError"),
    ("MSE: {}", "meanSquaredError"),
    ("MAE: {}", "meanAbsoluteError"),
    ("R2: {}", "r2"),
):
    print(formato.format(getattr(resultados_prueba, atributo)))

RMSE: 10.163389007890343
MSE: 103.29447612570624
MAE: 8.150739808402452
R2: 0.9860987580768822
