In [1]:
# Load every column of the source table into a Spark DataFrame, then print a
# preview of its rows.
trips_query = "select * from nyc_tlc_yellow_trips_2018_subset_1_csv"
df = spark.sql(trips_query)
df.show()

In [2]:
# Fetch a one-element list holding the first Row so a single record can be
# inspected in full (the list is the cell's displayed value).
df.limit(1).collect()

In [3]:
# Mark the DataFrame for caching; the cells below query `df` repeatedly.
# Kept as the first statement (not the last) so its return value is not
# echoed as the cell output.
df.cache()
# Print the column names and their Spark types.
df.printSchema()

In [4]:
# Compute Spark's summary statistics for the DataFrame, bring the small result
# into pandas, and flip it so each original column becomes a row.
summary_pdf = df.describe().toPandas()
summary_pdf.T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
vendor_id,10018,1.587841884607706,0.5194816168058155,1,4
passenger_count,10018,1.594529846276702,1.2407581980456013,0,6
trip_distance,10018,2.9677580355360424,3.9519640017902864,0.0,85.0
rate_code,10018,1.054402076262727,0.3626451671597622,1,5
store_and_fwd_flag,10018,,,N,Y
payment_type,10018,1.3190257536434418,0.4901336312431004,1,4
fare_amount,10018,13.10994509882212,12.063520102674282,-52.0,300.0
extra,10018,0.32795967258933917,0.45554479885339005,-4.5,4.5
mta_tax,10018,0.4966560191655021,0.045390474357824326,-0.5,0.5


In [5]:
import pandas as pd
from pandas.plotting import scatter_matrix

# Fixed seed so the sampled rows (and therefore the plot) are identical on
# every Restart & Run All; without it each run draws a different sample.
SAMPLE_SEED = 42

# Keep only the columns whose Spark type name is exactly 'int' or 'double'.
numeric_features = [name for name, dtype in df.dtypes if dtype in ('int', 'double')]

# Draw a sample without replacement (fraction 0.8) and collect it to pandas
# for plotting.
sampled_data = df.select(numeric_features).sample(False, 0.8, seed=SAMPLE_SEED).toPandas()
axs = scatter_matrix(sampled_data, figsize=(10, 10))

# Tidy the grid: horizontal row labels on the first column of plots, vertical
# column labels on the last row of plots, and no tick marks on those outer axes.
n = len(sampled_data.columns)
for i in range(n):
    left_ax = axs[i, 0]
    left_ax.yaxis.label.set_rotation(0)
    left_ax.yaxis.label.set_ha('right')
    left_ax.set_yticks(())
    bottom_ax = axs[n-1, i]
    bottom_ax.xaxis.label.set_rotation(90)
    bottom_ax.set_xticks(())

In [6]:
import six  # no longer needed by this cell; import kept in case other cells rely on it

# Print the correlation of each numeric column with total_amount.
#
# Columns are selected by their declared Spark type (df.dtypes) rather than by
# inspecting the first row's Python value. The first-row check launched one
# Spark job per column, raised IndexError on an empty DataFrame, and treated a
# string column whose first value is null as non-string, which then made
# df.stat.corr fail.
numeric_type_names = ('tinyint', 'smallint', 'int', 'bigint', 'float', 'double')
excluded_columns = ("pickup_datetime", "dropoff_datetime")

for col_name, col_type in df.dtypes:
    # Decimal types are reported with precision/scale, e.g. 'decimal(10,2)'.
    is_numeric = col_type in numeric_type_names or col_type.startswith('decimal')
    if is_numeric and col_name not in excluded_columns:
        print( "Correlation to total_amount for ", col_name, df.stat.corr('total_amount', col_name))

In [7]:
from pyspark.ml.feature import VectorAssembler

# Input columns that are packed into the single 'features' vector column.
feature_columns = [
    'trip_distance',
    'payment_type',
    'fare_amount',
    'rate_code',
    'extra',
    'tolls_amount',
    'tip_amount',
]
vectorAssembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

# Keep only the assembled feature vector and the label column, then preview
# the first three rows.
v_df = vectorAssembler.transform(df).select('features', 'total_amount')
v_df.show(3)

In [8]:
# Fixed seed so the train/test partition, and every metric computed from it,
# is the same on each run; an unseeded randomSplit reshuffles every time.
SPLIT_SEED = 42

# Randomly partition the rows using 0.7 / 0.3 weights for train / test.
splits = v_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)
train_df, test_df = splits

In [9]:
from pyspark.ml.regression import LinearRegression

# Configure a regularised linear regression that predicts total_amount from
# the assembled feature vector, then fit it on the training split.
lr = LinearRegression(
    featuresCol='features',
    labelCol='total_amount',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(train_df)

# Report the learned parameters.
coefficients_text = str(lr_model.coefficients)
intercept_text = str(lr_model.intercept)
print("Coefficients: " + coefficients_text)
print("Intercept: " + intercept_text)

In [10]:
# Fit metrics reported by the fitted model's own summary object.
trainingSummary = lr_model.summary
train_rmse = trainingSummary.rootMeanSquaredError
train_r2 = trainingSummary.r2
print("RMSE: %f" % train_rmse)
print("r2: %f" % train_r2)

In [11]:
from pyspark.ml.evaluation import RegressionEvaluator

# Score the held-out split and preview five predictions next to the true label.
lr_predictions = lr_model.transform(test_df)
lr_predictions.select("prediction", "total_amount", "features").show(5)

# Evaluate R^2 of the predictions against total_amount on the test split.
lr_evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="total_amount",
    metricName="r2",
)
test_r2 = lr_evaluator.evaluate(lr_predictions)
print("R Squared (R2) on test data = %g" % test_r2)