In [1]:
from pyspark.sql import SparkSession
# Start the Spark session used by the rest of the notebook (an already
# running session is reused instead of creating a second one).
spark = (
    SparkSession.builder
    .appName("SparkML2")
    .getOrCreate()
)

In [2]:
# Load the tips CSV: the first row supplies the column names and the column
# types are inferred from the data rather than read as plain strings.
df = (
    spark.read
    .option("header", "True")
    .option("inferSchema", "True")
    .csv('tips.csv')
)

# Preview the first five rows to confirm the file parsed as expected.
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [3]:
from pyspark.ml.feature import StringIndexer

In [4]:
# Encode each categorical string column as a numeric index column with an
# "_idx" suffix, so it can later be placed in a feature vector.
indexer = StringIndexer(
    inputCols=["sex", "smoker", "day", "time"],
    outputCols=["sex_idx", "smoker_idx", "day_idx", "time_idx"],
)

# Fitting learns the label -> index mapping from the data; transforming
# appends the index columns while keeping the original string columns.
indexer_model = indexer.fit(df)
df_i = indexer_model.transform(df)
df_i.show(5)

+----------+----+------+------+---+------+----+-------+----------+-------+--------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_idx|smoker_idx|day_idx|time_idx|
+----------+----+------+------+---+------+----+-------+----------+-------+--------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    1.0|       0.0|    1.0|     0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|    1.0|       0.0|    1.0|     0.0|
+----------+----+------+------+---+------+----+-------+----------+-------+--------+
only showing top 5 rows



In [5]:
from pyspark.ml.feature import VectorAssembler
# Pack the numeric predictors and the indexed categorical columns into a
# single vector column named "features".
# NOTE(review): the *_idx columns are category codes; passing a multi-level
# one such as day_idx to a linear model as a plain number imposes an ordering
# on the categories -- consider one-hot encoding them first.
featureAssembler = VectorAssembler(
    inputCols=[
        "tip",
        "size",
        "sex_idx",
        "smoker_idx",
        "day_idx",
        "time_idx",
    ],
    outputCol="features",
)

In [6]:
# Append the assembled "features" vector column to the indexed DataFrame.
output = featureAssembler.transform(df_i)

In [7]:
# Preview the first five assembled feature vectors (show() truncates long values).
output.select("features").show(5)

+--------------------+
|            features|
+--------------------+
|[1.01,2.0,1.0,0.0...|
|[1.66,3.0,0.0,0.0...|
|[3.5,3.0,0.0,0.0,...|
|[3.31,2.0,0.0,0.0...|
|[3.61,4.0,1.0,0.0...|
+--------------------+
only showing top 5 rows



In [8]:
# Keep only what the regression needs: the feature vector and the
# total_bill column, which is used as the label when the model is fitted.
df_f = output.select("features", "total_bill")
df_f.show(5)

+--------------------+----------+
|            features|total_bill|
+--------------------+----------+
|[1.01,2.0,1.0,0.0...|     16.99|
|[1.66,3.0,0.0,0.0...|     10.34|
|[3.5,3.0,0.0,0.0,...|     21.01|
|[3.31,2.0,0.0,0.0...|     23.68|
|[3.61,4.0,1.0,0.0...|     24.59|
+--------------------+----------+
only showing top 5 rows



In [9]:
from pyspark.ml.regression import LinearRegression

# Hold out roughly 25% of the rows for evaluation. randomSplit samples at
# random, so a fixed seed is passed to keep the split -- and therefore the
# fitted coefficients and the test metrics -- the same on every rerun.
train_data, test_data = df_f.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` only
# ever refers to the fitted model that the later cells inspect and evaluate.
lr_estimator = LinearRegression(featuresCol="features", labelCol="total_bill")
regressor = lr_estimator.fit(train_data)

In [10]:
# Learned weights of the fitted model, one per assembled feature, in the order
# given to the VectorAssembler: tip, size, sex_idx, smoker_idx, day_idx, time_idx.
regressor.coefficients

DenseVector([3.1321, 3.3909, -1.7886, 2.171, -0.1493, -1.1389])

In [11]:
# Intercept (bias) term of the fitted linear model.
regressor.intercept

2.1161091566692467

In [12]:
# Score the fitted model on the held-out test split; the returned summary
# exposes the predictions and the error metric inspected in the cells below.
pred_results = regressor.evaluate(test_data)

In [13]:
# Compare the actual total_bill with the model's prediction for the first 10 test rows.
pred_results.predictions.show(10)

+--------------------+----------+------------------+
|            features|total_bill|        prediction|
+--------------------+----------+------------------+
|(6,[0,1],[1.25,2.0])|     10.07|12.812958497656139|
|(6,[0,1],[1.97,2.0])|     12.02| 15.06804268276521|
| (6,[0,1],[2.0,2.0])|     12.69|15.162004523811422|
| (6,[0,1],[2.0,3.0])|     16.31|18.552890839175465|
|(6,[0,1],[2.24,3.0])|     16.04|19.304585567545157|
|(6,[0,1],[2.34,4.0])|     17.81|23.008678019729906|
|(6,[0,1],[2.64,3.0])|     17.59|20.557410114827974|
| (6,[0,1],[3.0,2.0])|      14.0|18.294065892018466|
|(6,[0,1],[4.08,2.0])|     17.92| 21.67669216968207|
|[1.0,1.0,1.0,1.0,...|      3.07|  9.02151257369383|
+--------------------+----------+------------------+
only showing top 10 rows



In [14]:
# Mean squared error of the predictions on the test split (squared total_bill units).
pred_results.meanSquaredError

50.18278766450729