In [0]:
# Make sure the DBFS folder that holds this notebook's input data exists.
dbutils.fs.mkdirs('/FileStore/ML/')

Out[3]: True

In [0]:
# Load the tips CSV from DBFS: the first row supplies the column names and
# Spark infers each column's type from the data.
tips_reader = spark.read.options(header=True, inferSchema=True)
df = tips_reader.csv('dbfs:/FileStore/ML/tips.csv')

## "Tips" dataset (total_bill is the dependent variable; all the other columns are used as predictors)

In [0]:
# Render the raw tips data as an interactive table.
display(df)

total_bill,tip,sex,smoker,day,time,size
16.99,1.01,Female,No,Sun,Dinner,2
10.34,1.66,Male,No,Sun,Dinner,3
21.01,3.5,Male,No,Sun,Dinner,3
23.68,3.31,Male,No,Sun,Dinner,2
24.59,3.61,Female,No,Sun,Dinner,4
25.29,4.71,Male,No,Sun,Dinner,4
8.77,2.0,Male,No,Sun,Dinner,2
26.88,3.12,Male,No,Sun,Dinner,4
15.04,1.96,Male,No,Sun,Dinner,2
14.78,3.23,Male,No,Sun,Dinner,2


# Schema of the Dataset

In [0]:
# Print each column's name, inferred type and nullability.
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



# Columns in the Dataset

In [0]:
# List the column names; the encoding and assembling steps refer to columns by these names.
print(df.columns)

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']


# Label Encoding

In [0]:
from pyspark.ml.feature import StringIndexer

In [0]:
# Encode every categorical string column as a numeric label index.
# NOTE: despite the "_vector" suffix, the output columns hold scalar indices
# (0.0, 1.0, ...), not vectors.
categorical_cols = ['sex', 'smoker', 'day', 'time']
indexed_cols = ['sex_vector', 'smoker_vector', 'day_vector', 'time_vector']
indexer = StringIndexer(inputCols=categorical_cols, outputCols=indexed_cols)

# Learn the label -> index mapping from the data, then append the index columns.
indexer_model = indexer.fit(df)
df_r = indexer_model.transform(df)
df_r.display()

total_bill,tip,sex,smoker,day,time,size,sex_vector,smoker_vector,day_vector,time_vector
16.99,1.01,Female,No,Sun,Dinner,2,1.0,0.0,1.0,0.0
10.34,1.66,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
21.01,3.5,Male,No,Sun,Dinner,3,0.0,0.0,1.0,0.0
23.68,3.31,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
24.59,3.61,Female,No,Sun,Dinner,4,1.0,0.0,1.0,0.0
25.29,4.71,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0
8.77,2.0,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
26.88,3.12,Male,No,Sun,Dinner,4,0.0,0.0,1.0,0.0
15.04,1.96,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0
14.78,3.23,Male,No,Sun,Dinner,2,0.0,0.0,1.0,0.0


# Grouping the Independent Features and the Dependent Variable

In [0]:
from pyspark.ml.feature import VectorAssembler
# Pack the numeric and index-encoded predictor columns into a single vector
# column, which is the input format the regressor consumes.
feature_cols = ['tip', 'sex_vector', 'size', 'smoker_vector', 'day_vector', 'time_vector']
Vector_assembler = (
    VectorAssembler()
    .setInputCols(feature_cols)
    .setOutputCol("Independent_feature")
)
output = Vector_assembler.transform(df_r)

In [0]:
# truncate=False prints full cell contents so the assembled feature vectors are not cut off.
output.show(truncate=False)

+----------+----+------+------+---+------+----+----------+-------------+----------+-----------+--------------------------+
|total_bill|tip |sex   |smoker|day|time  |size|sex_vector|smoker_vector|day_vector|time_vector|Independent_feature       |
+----------+----+------+------+---+------+----+----------+-------------+----------+-----------+--------------------------+
|16.99     |1.01|Female|No    |Sun|Dinner|2   |1.0       |0.0          |1.0       |0.0        |[1.01,1.0,2.0,0.0,1.0,0.0]|
|10.34     |1.66|Male  |No    |Sun|Dinner|3   |0.0       |0.0          |1.0       |0.0        |[1.66,0.0,3.0,0.0,1.0,0.0]|
|21.01     |3.5 |Male  |No    |Sun|Dinner|3   |0.0       |0.0          |1.0       |0.0        |[3.5,0.0,3.0,0.0,1.0,0.0] |
|23.68     |3.31|Male  |No    |Sun|Dinner|2   |0.0       |0.0          |1.0       |0.0        |[3.31,0.0,2.0,0.0,1.0,0.0]|
|24.59     |3.61|Female|No    |Sun|Dinner|4   |1.0       |0.0          |1.0       |0.0        |[3.61,1.0,4.0,0.0,1.0,0.0]|
|25.29     |4.71

In [0]:
# Keep only what the regressor needs: the assembled feature vector and the label.
model_columns = ['Independent_feature', 'total_bill']
final_data = output.select(model_columns)

In [0]:
# Render the (features, label) pairs that will be fed to the model.
display(final_data)

Independent_feature,total_bill
"Map(vectorType -> dense, length -> 6, values -> List(1.01, 1.0, 2.0, 0.0, 1.0, 0.0))",16.99
"Map(vectorType -> dense, length -> 6, values -> List(1.66, 0.0, 3.0, 0.0, 1.0, 0.0))",10.34
"Map(vectorType -> dense, length -> 6, values -> List(3.5, 0.0, 3.0, 0.0, 1.0, 0.0))",21.01
"Map(vectorType -> dense, length -> 6, values -> List(3.31, 0.0, 2.0, 0.0, 1.0, 0.0))",23.68
"Map(vectorType -> dense, length -> 6, values -> List(3.61, 1.0, 4.0, 0.0, 1.0, 0.0))",24.59
"Map(vectorType -> dense, length -> 6, values -> List(4.71, 0.0, 4.0, 0.0, 1.0, 0.0))",25.29
"Map(vectorType -> dense, length -> 6, values -> List(2.0, 0.0, 2.0, 0.0, 1.0, 0.0))",8.77
"Map(vectorType -> dense, length -> 6, values -> List(3.12, 0.0, 4.0, 0.0, 1.0, 0.0))",26.88
"Map(vectorType -> dense, length -> 6, values -> List(1.96, 0.0, 2.0, 0.0, 1.0, 0.0))",15.04
"Map(vectorType -> dense, length -> 6, values -> List(3.23, 0.0, 2.0, 0.0, 1.0, 0.0))",14.78


In [0]:
from pyspark.ml.regression import LinearRegression

In [0]:
# Hold out roughly 25% of the rows for evaluation. A fixed seed keeps the split,
# and therefore the fitted model and the predictions shown below, the same on
# every Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=SPLIT_SEED)

# Linear regression of total_bill on the assembled feature vector.
liner_model = LinearRegression(featuresCol='Independent_feature', labelCol='total_bill')

In [0]:
# Fit the regression on the training split only; the test split is kept aside for evaluation.
model_test=liner_model.fit(train_data)

In [0]:
# Score the fitted model on the held-out split; the result exposes a `predictions` DataFrame.
predt_value=model_test.evaluate(test_data)

In [0]:
# Show held-out rows with the model's `prediction` next to the actual total_bill.
predt_value.predictions.show()

+--------------------+----------+------------------+
| Independent_feature|total_bill|        prediction|
+--------------------+----------+------------------+
|      (5,[0],[1.75])|     17.82|15.006173915122227|
|       (5,[0],[2.0])|     13.37|16.075088167398427|
|       (5,[0],[2.5])|     18.35|18.212916671950822|
|      (5,[0],[2.64])|     17.59|18.811508653225495|
|      (5,[0],[5.92])|     29.03| 32.83566364308922|
| (5,[0,1],[3.0,1.0])|     17.07| 19.05889468855092|
|(5,[0,2],[1.64,1.0])|     15.36|17.236852183184823|
|(5,[0,2],[1.76,1.0])|     11.24|17.749931024277398|
|(5,[0,2],[2.09,1.0])|     15.01| 19.16089783728198|
| (5,[0,2],[3.0,1.0])|     15.53|23.051745715567343|
| (5,[0,2],[3.0,1.0])|     15.69|23.051745715567343|
|(5,[0,2],[3.08,1.0])|     17.92|23.393798276295726|
|(5,[0,2],[3.41,1.0])|     26.59| 24.80476508930031|
|(5,[0,2],[3.76,1.0])|     18.29|26.301245042486986|
|(5,[0,2],[4.06,1.0])|     20.49|27.583942145218423|
|(5,[0,2],[10.0,1.0])|     50.81| 52.981344779