<a href="https://colab.research.google.com/github/HaJunYoo/Pyspark-tutorial/blob/main/taxi_fare_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install pyspark==3.3.1 py4j==0.10.9.5 
!pip install -q findspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
# Mount Google Drive; the trip CSV files read later live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [14]:
import findspark
# Run findspark's setup before the pyspark imports that follow.
# NOTE(review): with pyspark installed via pip this step may be redundant — confirm.
findspark.init()

from pyspark.sql import SparkSession
from pyspark import SparkConf

In [15]:
# One memory cap shared by the executor and the driver.
MAX_MEMORY = "8g"

# Build (or reuse) the session for this notebook.
spark = (
    SparkSession.builder
    .appName("taxi-fare-prediciton")
    .config("spark.executor.memory", MAX_MEMORY)
    .config("spark.driver.memory", MAX_MEMORY)
    .getOrCreate()
)

In [16]:
# Display the session object to confirm it was created.
spark

In [None]:
cd /content/drive/MyDrive/Spark/01-spark

In [None]:
pwd

In [None]:
# Glob matching every file in the trips data directory on the mounted Drive.
trip_files = "/content/drive/MyDrive/Spark/01-spark/data/trips/*"

In [None]:
# Load every trip CSV: the first row is the header and column types are inferred.
trips_df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(trip_files)
)

In [None]:
# Inspect the inferred column names and types.
trips_df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



In [None]:
# Register the raw trips as a temp view named "trips" so it can be queried with SQL.
trips_df.createOrReplaceTempView("trips")

In [17]:
# Select the single feature (trip_distance) and the label (total_amount), keeping only rows with:
#   - 0 < total_amount < 5000
#   - 0 < trip_distance < 500
#   - passenger_count < 4
#   - pickup date from 2021-01-01 up to (not including) 2021-08-01
query = """
SELECT 
    trip_distance,
    total_amount
FROM
    trips
WHERE
    total_amount < 5000
    AND total_amount > 0
    AND trip_distance > 0
    AND trip_distance < 500
    AND passenger_count < 4
    AND TO_DATE(tpep_pickup_datetime) >= '2021-01-01'
    AND TO_DATE(tpep_pickup_datetime) < '2021-08-01'
"""

In [18]:
# Run the filter query against the "trips" view.
data_df = spark.sql(query)
data_df.createOrReplaceTempView("data") # register the filtered result as a temp view

In [19]:
# Preview the filtered rows (show() prints the top 20 by default).
data_df.show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|          2.1|        11.8|
|          0.2|         4.3|
|         14.7|       51.95|
|         10.6|       36.35|
|         4.94|       24.36|
|          1.6|       14.15|
|          4.1|        17.3|
|          5.7|        21.8|
|          9.1|        28.8|
|          2.7|       18.95|
|         6.11|        24.3|
|         1.21|       10.79|
|          7.4|       33.92|
|         1.01|        10.3|
|         0.73|       12.09|
|         1.17|       12.36|
|         0.78|        9.96|
|         1.66|        12.3|
|         0.93|         9.3|
|         1.16|       11.84|
+-------------+------------+
only showing top 20 rows



In [20]:
# Summary statistics (count, mean, stddev, min, max) for both columns.
data_df.describe().show()

+-------+------------------+------------------+
|summary|     trip_distance|      total_amount|
+-------+------------------+------------------+
|  count|          13126271|          13126271|
|   mean|2.8820783305479263|17.973117241906895|
| stddev| 3.820284175387752|12.975829282992352|
|    min|              0.01|              0.01|
|    max|             475.5|            4973.3|
+-------+------------------+------------------+



In [21]:
# Random 80/20 train/test split; the seed is fixed so the split is repeatable.
train_df, test_df = data_df.randomSplit([0.8, 0.2], seed=1)

In [22]:
# Row counts of the two splits, train first and then test.
for split_df in (train_df, test_df):
    print(split_df.count())

10500537
2625734


In [23]:
from pyspark.ml.feature import VectorAssembler

In [24]:
# To feed the model, the training DataFrame is transformed with a VectorAssembler,
# which builds the "features" vector column from trip_distance.
vassembler = VectorAssembler(inputCols=["trip_distance"], outputCol="features")

In [25]:
# Add the "features" vector column to the training split.
vtrain_df = vassembler.transform(train_df)

In [26]:
# Check that the assembled "features" column sits alongside the original columns.
vtrain_df.show()

+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
+-------------+------------+--------+
only showing top 20 rows



In [27]:
from pyspark.ml.regression import LinearRegression

In [28]:
# Linear regression estimator: predicts total_amount from the assembled features.
lr = LinearRegression(
    featuresCol="features",    # vector column produced by the assembler
    labelCol="total_amount",   # regression target
    maxIter=50,                # cap on optimizer iterations
)

In [29]:
# Fit the regression on the training split.
model = lr.fit(vtrain_df)

In [30]:
# Apply the same feature assembly to the held-out test split.
vtest_df = vassembler.transform(test_df)

In [31]:
# Score the test split; the result gains a "prediction" column.
prediction = model.transform(vtest_df)

In [32]:
# Compare actual total_amount with the predicted value for the first rows.
prediction.show()

+-------------+------------+--------+-----------------+
|trip_distance|total_amount|features|       prediction|
+-------------+------------+--------+-----------------+
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.3|  [0.01]|9.425468862210433|
|         0.01|         3.8|  [0.01]|9.425468862

In [33]:
# Evaluate using the model's summary object.
# NOTE(review): per the PySpark docs, model.summary describes the fit on the
# training set (vtrain_df), so this RMSE is a training metric, not a score on
# vtest_df. Consider model.evaluate(vtest_df) for a held-out number.
model.summary.rootMeanSquaredError

6.284656849365227

In [34]:
# R^2 from the model's summary object.
# NOTE(review): per the PySpark docs this summary is computed on the training
# set, not on vtest_df — confirm that is the intended metric.
model.summary.r2

0.766032267370355

In [41]:
from pyspark.sql.types import DoubleType

# Hand-picked trip distances to score with the fitted model.
distance_list = [1.1, 2.0, 5.0, 7.7]

# Build a one-column DataFrame of doubles, then rename the column to
# "trip_distance" so it matches the assembler's input column.
distance_df = (
    spark.createDataFrame(distance_list, schema=DoubleType())
    .toDF("trip_distance")
)

In [42]:
# Confirm the sample distances loaded as a single trip_distance column.
distance_df.show()

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          2.0|
|          5.0|
|          7.7|
+-------------+



In [43]:
# Assemble the "features" vector column for the sample distances.
vdistance_df = vassembler.transform(distance_df)

In [44]:
# Inspect the assembled sample rows.
vdistance_df.show()

+-------------+--------+
|trip_distance|features|
+-------------+--------+
|          1.1|   [1.1]|
|          2.0|   [2.0]|
|          5.0|   [5.0]|
|          7.7|   [7.7]|
+-------------+--------+



In [45]:
# Predict for each sample distance and display the result.
model.transform(vdistance_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|12.669472738013738|
|          2.0|   [2.0]|15.348008048310046|
|          5.0|   [5.0]| 24.27645908263107|
|          7.7|   [7.7]| 32.31206501351999|
+-------------+--------+------------------+

