In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Tutorial

### Import required libraries

In [None]:
# A Spark Session is how you interact with Spark SQL to create Dataframes
from pyspark.sql import SparkSession
# PySpark functions
from pyspark.sql.functions import avg, col, count, desc, round, size, udf, to_timestamp, unix_timestamp, broadcast, pandas_udf, PandasUDFType, to_date
# These allow us to create a schema for our data
from pyspark.sql.types import ArrayType, IntegerType, StringType, DoubleType, BooleanType
from pyspark.ml.linalg import Vectors
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

import seaborn as sns
from geopandas import gpd
from shapely import wkt
from shapely.geometry import Point
import pandas as pd

from pyspark.ml.regression import LinearRegression, GBTRegressor, DecisionTreeRegressor, RandomForestRegressor
import matplotlib.pyplot as plt
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from shapely.ops import cascaded_union


### Initialize the SparkSession

To use Apache Spark with BigQuery, you must include the [spark-bigquery-connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector) when you initialize the SparkSession.

In [None]:
# Build the SparkSession step by step. The spark-bigquery-connector jar is
# registered through "spark.jars" so the "bigquery" data source is available.
session_builder = SparkSession.builder.appName("spark-bigquery-ml-nyc-trips-demo")
session_builder = session_builder.config(
    "spark.jars",
    "gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.25.2.jar",
)
# Optional settings, left disabled:
# session_builder = session_builder.config("spark.sql.debug.maxToStringFields", "500")
# session_builder = session_builder.config("spark.sql.autoBroadcastJoinThreshold", "-1")
spark = session_builder.getOrCreate()

### Fetch data from BigQuery

In [None]:
# Yellow-taxi trips for 2018 from the BigQuery public datasets.
taxi_reader = spark.read.format("bigquery")
taxi_df = taxi_reader.option(
    "table", "bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018"
).load()

# Citi Bike trips from the BigQuery public datasets.
bike_reader = spark.read.format("bigquery")
bike_df = bike_reader.option(
    "table", "bigquery-public-data.new_york_citibike.citibike_trips"
).load()

In [None]:
# Download the NYC taxi-zone polygons (GeoJSON export from NYC Open Data).
gdf_zone = gpd.read_file("https://data.cityofnewyork.us/api/geospatial/d3c5-ddgc?method=export&format=GeoJSON")
# Use an explicit 64-bit integer so the ids match Spark's 'long' on every platform.
gdf_zone['location_id'] = gdf_zone['location_id'].astype('int64')

# Ids of all taxi zones whose borough is Manhattan. Selecting the column with a
# boolean mask avoids materializing every row (geometry included) as a dict.
# `.tolist()` yields plain Python ints, which keeps the set cheap to ship to
# the Spark workers inside the `is_in_manhattan` UDF closure.
location_set = set(
    gdf_zone.loc[gdf_zone['borough'] == "Manhattan", 'location_id'].tolist()
)

### Perform Exploratory Data Analysis(EDA)

As we get started with a new problem, the first step is to gain an understanding of what the dataset contains. EDA is used to derive insights from the data. Data scientists and analysts try to find different patterns, relations, and anomalies in the data using some statistical graphs and other visualization techniques. It allows analysts to understand the data better before making any assumptions.

Check the data types for Taxi dataset first.

In [None]:
# Print the column names and data types of the raw taxi DataFrame.
taxi_df.printSchema()

Filter out unnecessary columns and check null counts of the fields.

In [None]:
# Keep only the columns needed for feature engineering and filtering.
taxi_df = taxi_df.select(
    "pickup_datetime",
    "dropoff_datetime",
    "trip_distance",
    "fare_amount",
    "pickup_location_id",
    "dropoff_location_id",
)
# Summary statistics (count, mean, stddev, min, max) for the kept columns.
taxi_df.describe().show()

From this summary, you are able to know a lot of information.
  - There are more than 112 million trip records for Yellow Taxi in 2018.
  - The current dataset has some abnormal values such as null and negative values in it.
  - `pickup_datetime` and `dropoff_datetime` are string format. To use it effectively, it needs to be re-formatted.
  - In previous years, the exact latitude and longitude were used for the pickup and the dropoff locations. This raised a lot of [privacy concerns](https://agkn.wordpress.com/2014/09/15/riding-with-the-stars-passenger-privacy-in-the-nyc-taxicab-dataset/), so the dataset now provides `pickup_location_id` and `dropoff_location_id` instead. Each id corresponds to one of the [NYC Taxi Zones](https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc), which are roughly based on the NYC Department of City Planning’s Neighborhood Tabulation Areas (NTAs) and are meant to approximate neighborhoods.
  - The maximum value of `pickup_location_id` and `dropoff_location_id` shows `99`. However, these might be wrong since the data type of both is string.

First, you can manipulate the time. `pickup_datetime` and `dropoff_datetime` are currently in string format, so by using the `to_timestamp()` and `unix_timestamp()` functions, you can get each pickup and dropoff datetime as a Unix timestamp.

Unix time is a way of representing time as the number of seconds since `January 1st, 1970 at 00:00:00 UTC`. Compared to the Timestamp type, Unix time can be represented as an integer, making it easier to parse and use across different systems.

After we get `start_time` and `end_time` by converting the original `pickup_datetime` and `dropoff_datetime`, we can derive more interesting columns from these two timestamps.

In [None]:
@udf(returnType=BooleanType())
def is_weekdays(timestamp):
    """
    The preprocessing function takes timestamp and returns whether the timestamp is weekdays or not.
    The day of the week is evaluated in EST (UTC-05), the same fixed offset that
    `timestamp_to_time_in_minutes` uses, so both features describe the same local clock.
    Args:
        timestamp: Unix Timestamp format that represent the time.
                (e.g., timestamp = 1659268800, represents "Sun, 31 Jul 2022 12:00:00 GMT")
    Returns:
        True when the timestamp falls on Monday through Friday in EST, False otherwise.
        A missing (None) timestamp is reported as False.
    """
    # Only None means "missing"; 0 is a valid timestamp and must not be skipped.
    if timestamp is None:
        return False
    # Shift to the fixed UTC-05 offset before splitting into days, otherwise
    # late-evening local trips are assigned to the next (UTC) calendar day.
    local_seconds = timestamp - 5 * 3600
    # 1970-01-01 was a Thursday, so "+ 4" makes 0 == Sunday ... 6 == Saturday.
    day_of_week = ((local_seconds // 86400) + 4) % 7
    return 0 < day_of_week < 6

@udf(returnType=IntegerType())
def timestamp_to_time_in_minutes(timestamp):
    """
    The preprocessing function takes timestamp and returns the time of day in minutes.
    Args:
        timestamp: Unix Timestamp format that represent the time.
                (e.g., if timestamp == 1659268800, represents "Sun, 31 Jul 2022 12:00:00 GMT")
    Returns:
        A number in the range 0..1439 that represents given time in minutes in EST (UTC-05),
        or None when the timestamp is missing.
                (e.g., if timestamp == 1659268800, returns 420 since it is 7:00 in EST)
    """
    # Only None means "missing"; 0 is a valid timestamp.
    if timestamp is None:
        return None
    # Apply the fixed UTC-05 offset BEFORE taking the remainder so that times
    # between 00:00 and 04:59 UTC wrap to the previous evening (e.g. 02:00 UTC
    # -> 1260) instead of producing a negative minute count.
    # NOTE(review): a fixed offset ignores daylight saving time.
    return ((timestamp - 5 * 3600) % 86400) // 60

@udf(returnType=DoubleType())
def preprocess_dist(start_lat, start_lon, end_lat, end_lon):
    """
    The preprocessing function takes two coordinates(latitude and longitude) and returns the Euclidean distance.
    The distance is planar and expressed in the units of the inputs
    (degrees when raw latitude/longitude are passed), not in miles or kilometers.
    Args:
        start_lat: The latitude of the start station.
        start_lon: The longitude of the start station.
        end_lat: The latitude of the end station.
        end_lon: The longitude of the end station.
    Returns:
        The Euclidean distance of given two coordinates, or None when any
        coordinate is missing.
    """
    # Rows with a null coordinate would make Point(...) raise and fail the
    # whole Spark task; propagate null instead so they can be dropped later.
    if any(value is None for value in (start_lat, start_lon, end_lat, end_lon)):
        return None
    return Point(start_lon, start_lat).distance(Point(end_lon, end_lat))

In [None]:
# Pickup / dropoff datetimes parsed to Unix timestamps (seconds).
pickup_epoch = unix_timestamp(to_timestamp(col('pickup_datetime')))
dropoff_epoch = unix_timestamp(to_timestamp(col('dropoff_datetime')))

taxi_df = (
    taxi_df
    .withColumn('start_time', pickup_epoch)
    .withColumn('end_time', dropoff_epoch)
    # Weekday flag and minute-of-day derived from the pickup time.
    .withColumn('is_weekdays', is_weekdays(col('start_time')))
    .withColumn('start_time_in_minute', timestamp_to_time_in_minutes(col('start_time')))
    # Trip duration in seconds.
    .withColumn('trip_duration', col('end_time') - col('start_time'))
)

In [None]:
# Print the schema again to confirm the derived columns were added.
taxi_df.printSchema()

Before we go deeper into the Taxi dataset, let's do similar work for the Citibike dataset.

In [None]:
# Print the column names and data types of the raw Citibike DataFrame.
bike_df.printSchema()

In [None]:
# Keep the trip, time, station-coordinate and user-type columns; the duration
# column is renamed so it matches the taxi DataFrame's `trip_duration`.
bike_df = bike_df.select(
    col("tripduration").alias("trip_duration"),
    "starttime",
    "stoptime",
    "start_station_latitude",
    "start_station_longitude",
    "end_station_latitude",
    "end_station_longitude",
    "usertype",
)
# bike_df.describe().show()

From this summary, there is also interesting information from the dataset's summary.
  - There are more than 53 million trip records for Citibike from 2013 to 2018.
  - The current dataset has some abnormal values.
  - `starttime` and `stoptime` are string format. To use it effectively, it needs to be re-formatted.
  - Unlike the Taxi dataset, starting and ending location has exact latitude and longitude, but since every bike is parked in their station, these coordinates represent the station.

In [None]:
%%time
# Convert the start/stop datetimes to Unix timestamps in place, then derive the
# weekday flag, the minute-of-day and the station-to-station distance.
bike_df = (
    bike_df
    .withColumn('starttime', unix_timestamp(to_timestamp(col('starttime'))))
    .withColumn('stoptime', unix_timestamp(to_timestamp(col('stoptime'))))
    .withColumn('is_weekdays', is_weekdays(col('starttime')))
    .withColumn('start_time_in_minute', timestamp_to_time_in_minutes(col('starttime')))
    .withColumn(
        'trip_distance',
        preprocess_dist(
            'start_station_latitude',
            'start_station_longitude',
            'end_station_latitude',
            'end_station_longitude',
        ),
    )
)


Check the distributions for the numerical columns. In PySpark, visualizing is expensive because the data is too large. For example, the NYC Taxi dataset in 2018 has more than 112M rows. Therefore, approximately 2% of the total data (approx. 2.2M rows) is extracted as a sample, which is enough for a 99% confidence level with a margin of error below 0.1%.

In [None]:
# Draw a ~2% sample for plotting. A fixed seed makes the sample (and therefore
# the plots below) reproducible across kernel restarts.
taxi_sample = taxi_df.sample(fraction=0.02, seed=42)

# Collect the sample to the driver as a pandas DataFrame and show its dtypes.
df = taxi_sample.toPandas()
df.info()

When a [Decimal Type](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.sql.types.DecimalType.html) column in PySpark is converted to a Pandas DataFrame, it becomes an object-type column, not a float-type column. To visualize these "object" columns, they need to be converted into the float type.

In [None]:
# Columns that arrive as pandas "object" dtype and must be cast before plotting.
FLOAT_TYPE_COLUMNS = ["trip_distance", "fare_amount", "pickup_location_id", "dropoff_location_id"]

# Drop the raw datetime strings and cast every listed column to float in one pass.
df = df.drop(columns=['pickup_datetime', 'dropoff_datetime']).astype(
    {name: float for name in FLOAT_TYPE_COLUMNS}
)
df.info()

Iterate through the columns and draw a box plot and a histogram for each one.

In [None]:
# One figure per column: box plot on the left, histogram on the right.
# The boolean `is_weekdays` column is not plotted.
for column in [name for name in df.columns if name != 'is_weekdays']:
    _, (box_ax, hist_ax) = plt.subplots(1, 2, figsize=(10, 4))
    df[column].plot(kind="box", ax=box_ax)
    df[column].plot(kind="hist", ax=hist_ax)
    plt.title(column)
    plt.show()

### Data Cleaning

In [None]:
# Inclusive Unix-second bounds used below to keep trips that start and end in 2018.
# Both values sit 18000 s (5 h) after the UTC year boundary, i.e. they are
# midnight at a fixed UTC-05 offset.
START_2018 = 1514782800 # Jan 1, 2018 00:00:00 (UTC-05)
END_2018 = 1546318800 # Jan 1, 2019 00:00:00 (UTC-05), one second past Dec 31, 2018 23:59:59

@pandas_udf('long')
def preprocess_zone_id(lat: pd.Series, lon: pd.Series) -> pd.Series:
    """
    Map each (latitude, longitude) pair to the location_id of the taxi zone it falls in.
    Args:
        lat: Latitudes of the points, one per row of the batch.
        lon: Longitudes of the points, aligned element-wise with `lat`.
    Returns:
        A Series with exactly one location_id per input row, in input order.
        Rows that match no zone are missing (NaN).
    """
    # Build the points on a fresh 0..n-1 index so the join result can be
    # realigned to the input unambiguously.
    gdf_points = gpd.GeoDataFrame(
        pd.DataFrame({'lat': lat.to_numpy(), 'lon': lon.to_numpy()}),
        crs='epsg:4326',
        geometry=[Point(xy) for xy in zip(lon, lat)],
    )
    gdf_joined = gpd.sjoin(gdf_points, gdf_zone, how='left')
    # A point that matches more than one zone polygon (shared border or
    # overlapping shapes) yields several joined rows. A pandas UDF must return
    # exactly as many rows as it received, so keep the first match per point.
    first_match = gdf_joined.loc[~gdf_joined.index.duplicated(keep='first'), 'location_id']
    return first_match.reindex(gdf_points.index)

@udf(returnType=BooleanType())
def is_in_manhattan(location_id):
    """
    Tell whether a taxi-zone id belongs to Manhattan.
    Args:
        location_id: Taxi-zone id to look up.
    Returns:
        True when `location_id` is a member of `location_set`, False otherwise.
    """
    found = location_id in location_set
    return found

In [None]:
# Drop rows containing nulls, cast the zone ids to integers and flag whether
# each end of the trip lies in Manhattan.
taxi_df = (
    taxi_df.dropna()
    .withColumn("dropoff_location_id", col("dropoff_location_id").cast('int'))
    .withColumn("pickup_location_id", col("pickup_location_id").cast('int'))
    .withColumn("is_start_manhattan", is_in_manhattan(col("pickup_location_id")))
    .withColumn("is_end_manhattan", is_in_manhattan(col("dropoff_location_id")))
)

# Successive filters are ANDed together: trips inside the 2018 bounds with a
# positive, bounded duration / distance / fare that connect two different
# Manhattan zones.
taxi_df = (
    taxi_df
    .where(col('start_time').between(START_2018, END_2018))
    .where(col('end_time').between(START_2018, END_2018))
    .where(col('start_time') < col('end_time'))
    .where((col('trip_duration') > 0) & (col('trip_duration') < 4000))
    .where((col('trip_distance') > 0.2) & (col('trip_distance') < 15))
    .where(col("pickup_location_id") != col("dropoff_location_id"))
    .where((col("fare_amount") > 0) & (col("fare_amount") < 500))
    .where(col("is_start_manhattan"))
    .where(col("is_end_manhattan"))
)

taxi_df.printSchema()
taxi_df.describe().show()

In [None]:
%%time
# Map each station coordinate to its taxi-zone id, then flag Manhattan zones.
bike_df = (
    bike_df
    .withColumn('start_zone_id', preprocess_zone_id(col('start_station_latitude'), col('start_station_longitude')))
    .withColumn('end_zone_id', preprocess_zone_id(col('end_station_latitude'), col('end_station_longitude')))
    .withColumn('is_start_manhattan', is_in_manhattan(col('start_zone_id')))
    .withColumn('is_end_manhattan', is_in_manhattan(col('end_zone_id')))
)

# The duration column was renamed from `tripduration` to `trip_duration` when
# the bike columns were selected, so the filters must use the new name;
# referencing `tripduration` here fails with an unresolved-column error.
bike_df = bike_df.where(
    (col('trip_duration') > 0)
    & (col("start_zone_id") != col("end_zone_id"))
    & (col('trip_duration') < 7200)
#     & (col('usertype') == "Subscriber")
    & (col('starttime') < col('stoptime'))
    & (col("is_start_manhattan") == True)
    & (col("is_end_manhattan") == True)
).dropna()

bike_df.printSchema()
bike_df.describe().show()

In [None]:
# Input columns assembled into the taxi model's feature vector.
taxi_feature_cols = [
    "is_weekdays",
    "start_time_in_minute",
    "dropoff_location_id",
    "pickup_location_id",
    "trip_distance",
]

In [None]:
# Input columns assembled into the bike model's feature vector. The raw station
# coordinates are disabled in favor of the derived zone ids.
bike_feature_cols = [
    "is_weekdays",
#     "start_station_longitude",
#     "start_station_latitude",
#     "end_station_longitude",
#     "end_station_latitude",
    "start_zone_id",
    "end_zone_id",
    "start_time_in_minute",
    "trip_distance",
]

In [None]:
# Pack the taxi feature columns into a single vector column.
taxi_assembler = VectorAssembler(inputCols=taxi_feature_cols, outputCol='features')
taxi_transformed_data = taxi_assembler.transform(taxi_df)

# Add a standardized copy of the feature vector.
# NOTE(review): the `gbt` regressor defined below reads "features", not
# "features_scaled" - confirm whether the scaled column is still needed.
standard_scaler = StandardScaler(inputCol="features", outputCol="features_scaled")
taxi_scaled_df = standard_scaler.fit(taxi_transformed_data).transform(taxi_transformed_data)

taxi_scaled_df.select("features", "features_scaled").show(10, truncate=False)
# 70/30 train/test split. The fixed seed keeps the split, and therefore the
# reported metrics, reproducible across runs.
(taxi_training_data, taxi_test_data) = taxi_scaled_df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Pack the bike feature columns into a single vector column.
bike_assembler = VectorAssembler(inputCols=bike_feature_cols, outputCol='features')
bike_transformed_data = bike_assembler.transform(bike_df)
# Add a standardized copy of the feature vector.
standard_scaler = StandardScaler(inputCol="features", outputCol="features_scaled")

bike_scaled_df = standard_scaler.fit(bike_transformed_data).transform(bike_transformed_data)
# 70/30 train/test split. The fixed seed keeps the split, and therefore the
# reported metrics, reproducible across runs.
(bike_training_data, bike_test_data) = bike_scaled_df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Gradient-boosted tree regressor shared by the taxi and bike models: it reads
# the unscaled "features" vector and predicts `trip_duration`.
gbt = GBTRegressor(
    featuresCol="features",
    labelCol="trip_duration",
    predictionCol="pred_trip_duration",
)

In [None]:
# Fit the GBT regressor on the taxi training split and score the test split.
taxi_gbt_model = gbt.fit(taxi_training_data)
taxi_gbt_predictions = taxi_gbt_model.transform(taxi_test_data)

In [None]:
# Fit the same GBT configuration on the bike training split and score the test split.
bike_gbt_model = gbt.fit(bike_training_data)
bike_gbt_predictions = bike_gbt_model.transform(bike_test_data)

In [None]:
# Two evaluators over the GBT label / prediction columns: R2 and RMSE.
evaluator_r2, evaluator_rmse = (
    RegressionEvaluator(
        labelCol=gbt.getLabelCol(),
        predictionCol=gbt.getPredictionCol(),
        metricName=metric_name,
    )
    for metric_name in ("r2", "rmse")
)

In [None]:
# Score the taxi GBT predictions with both evaluators.
taxi_gbt_accuracy_r2, taxi_gbt_accuracy_rmse = [
    evaluator.evaluate(taxi_gbt_predictions)
    for evaluator in (evaluator_r2, evaluator_rmse)
]

print(f"Taxi Test GBT R2 Accuracy = {taxi_gbt_accuracy_r2}")
print(f"Taxi Test GBT RMSE Accuracy = {taxi_gbt_accuracy_rmse}")

# Values recorded from earlier runs:
#   R2   = 0.708183954907601, 0.6598682899938532
#   RMSE = 245.99563606237268, 265.5806200025988


In [None]:
# Score the bike GBT predictions with both evaluators.
bike_gbt_accuracy_r2, bike_gbt_accuracy_rmse = [
    evaluator.evaluate(bike_gbt_predictions)
    for evaluator in (evaluator_r2, evaluator_rmse)
]

print(f"Bike Test GBT R2 Accuracy = {bike_gbt_accuracy_r2}")
print(f"Bike Test GBT RMSE Accuracy = {bike_gbt_accuracy_rmse}")

# Values recorded from earlier runs:
#   R2 = 0.540752577358068, RMSE = 341.8116656204376
# With the "Exclude subscriber" variant:
#   R2 = 0.4413798408518055, RMSE = 440.703561958477


In [None]:
# Re-print the bike GBT metrics computed in the previous cell.
print(f"Bike Test GBT R2 Accuracy = {bike_gbt_accuracy_r2}")
print(f"Bike Test GBT RMSE Accuracy = {bike_gbt_accuracy_rmse}")

In [None]:
# Refit the bike GBT model and recompute its test predictions.
# NOTE(review): this repeats the earlier bike fit/transform cell and overwrites
# `bike_gbt_model` / `bike_gbt_predictions` - confirm it is still needed.
bike_gbt_model = gbt.fit(bike_training_data)

# taxi_gbt_predictions = taxi_gbt_model.transform(taxi_test_data)
bike_gbt_predictions = bike_gbt_model.transform(bike_test_data)

In [None]:
# Re-score the refitted bike GBT predictions with both evaluators.
bike_gbt_accuracy_r2, bike_gbt_accuracy_rmse = [
    evaluator.evaluate(bike_gbt_predictions)
    for evaluator in (evaluator_r2, evaluator_rmse)
]
print(f"Bike Test GBT r2 Accuracy = {bike_gbt_accuracy_r2}")
print(f"Bike Test GBT rmse Accuracy = {bike_gbt_accuracy_rmse}")

# Values recorded from an earlier run:
#   r2 = 0.5394345501867752, rmse = 342.1333445936911

In [None]:
# Print the bike GBT R2 score once more (the value comes from `evaluator_r2`).
# Disabled residual / param-map inspection kept for reference:
# # bike_model.summary.residuals.show()
# taxi_lr_model.summary.residuals.show()
# # print(model.extractParamMap())
print(f"Bike Test GBT Accuracy = {bike_gbt_accuracy_r2}")

In [None]:
# Show the taxi GBT metrics computed earlier: R2 first, then RMSE.
print(taxi_gbt_accuracy_r2)
print(taxi_gbt_accuracy_rmse)

In [None]:
# RMSE evaluator for the taxi GBT predictions. The label and prediction column
# names are read from `gbt` so they always match the columns that
# `taxi_gbt_predictions` actually contains ("trip_duration" and
# "pred_trip_duration").
temp_evaluator = RegressionEvaluator(
    labelCol=gbt.getLabelCol(),
    predictionCol=gbt.getPredictionCol(),
    metricName="rmse"
)
taxi_gbt_rmse = temp_evaluator.evaluate(taxi_gbt_predictions)
print(taxi_gbt_rmse)


In [None]:
# NOTE(review): `citi_df` is not assigned anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel - confirm which DataFrame was
# intended (`.info()` is a pandas method, not a Spark one).
citi_df.info()
# Disabled coordinate/timestamp selection kept for reference:
# df = df.select(
#     col("pickup_longitude"),
#     col("pickup_latitude"),
#     col("dropoff_longitude"),
#     col("dropoff_latitude"),
#     unix_timestamp(to_timestamp(col("pickup_datetime"))).alias("pickup_datetime"),
#     unix_timestamp(to_timestamp(col("dropoff_datetime"))).alias("dropoff_datetime"),
# )

# df.show()

In [None]:
# Hyper-parameter grid for the GBT regressor: 3 x 3 x 3 = 27 combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [2, 5, 10])
    .addGrid(gbt.maxBins, [10, 20, 40])
    .addGrid(gbt.maxIter, [5, 10, 20])
    .build()
)

# 5-fold cross-validation that selects the combination with the best RMSE.
# The following cell calls `cv.fit(...)`, so `cv` must be defined here.
# NOTE(review): 27 combinations x 5 folds means 135 GBT fits - expect a long run.
cv = CrossValidator(
    estimator=gbt,
    evaluator=evaluator_rmse,
    estimatorParamMaps=param_grid,
    numFolds=5,
)

In [None]:
# Run cross-validation on the taxi training split (requires `cv`, a
# CrossValidator, to be defined by an earlier cell), then report the RMSE of
# the resulting model on the taxi test split.
cv_model = cv.fit(taxi_training_data)
print(cv_model)
gb_predictions = cv_model.transform(taxi_test_data)
print(f"RMSE:{evaluator_rmse.evaluate(gb_predictions)}")

After preprocessing, you can see the preprocessed_df's schema, the language column is separated into three string columns, `mono_language`, `mono_size`, and `poly_language`.

### Analyze

#### Which language is the most frequently used among the monoglot repos?
To answer this question, you can execute a query below with the preprocessed column, `mono_language`.

### Write back to the BigQuery

After analyzing these queries, we have several DataFrames. The ranking of monoglot repositories, the average bytes of monoglot repositories, and the frequency table of each language being used in a repository. 

In this project, these three DataFrames will be stored in BigQuery using the [spark-bigquery-connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector).

If there is no error above, congratulations! your DataFrame is successfully stored in your BigQuery.

You can find the data via [this link](https://console.cloud.google.com/bigquery) or execute the `bq` command-line tool as shown below.

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

### Delete Vertex AI Workbench - Managed Notebook

To delete the Vertex AI Workbench - Managed Notebook used in this project, follow the [Clean up](https://cloud.google.com/vertex-ai/docs/workbench/managed/create-managed-notebooks-instance-console-quickstart#clean-up) section of the `Managed notebooks` page.

### Delete a Dataproc Cluster

To delete a Dataproc Cluster, you can use this [Deleting a cluster](https://cloud.google.com/dataproc/docs/guides/manage-cluster#deleting_a_cluster) part of `Manage a cluster` page.

In [None]:
# Delete Google Cloud Storage bucket
# NOTE(review): BUCKET_URI is not assigned anywhere in the visible notebook -
# confirm it is defined before running this cell.
! gsutil rm -r $BUCKET_URI

In [None]:
# Delete BigQuery dataset
# NOTE(review): DATASET_NAME is not assigned anywhere in the visible notebook -
# confirm it is defined before running this cell.
! bq rm -r -f $DATASET_NAME

After you delete the BigQuery dataset, you can check your Datasets in BigQuery using the following command.

In [None]:
# Decision-tree baseline. The label column is `trip_duration` in both the taxi
# and the bike DataFrames (there is no `tripduration` column at this point),
# and the prediction column uses the same name as the GBT model so the shared
# evaluators can score these predictions too.
dt = DecisionTreeRegressor(
    featuresCol="features",
    labelCol="trip_duration",
    predictionCol="pred_trip_duration",
)

# Fit on each training split and score the matching test split.
taxi_dt_model = dt.fit(taxi_training_data)
taxi_dt_predictions = taxi_dt_model.transform(taxi_test_data)

bike_dt_model = dt.fit(bike_training_data)
bike_dt_predictions = bike_dt_model.transform(bike_test_data)

In [None]:
# Random-forest baseline. The label column is `trip_duration` in both the taxi
# and the bike DataFrames (there is no `tripduration` column at this point),
# and the prediction column uses the same name as the GBT model so the shared
# evaluators can score these predictions too.
rf = RandomForestRegressor(
    featuresCol="features",
    labelCol="trip_duration",
    predictionCol="pred_trip_duration",
)

# Fit on each training split and score the matching test split.
taxi_rf_model = rf.fit(taxi_training_data)
bike_rf_model = rf.fit(bike_training_data)

taxi_rf_predictions = taxi_rf_model.transform(taxi_test_data)
bike_rf_predictions = bike_rf_model.transform(bike_test_data)


In [None]:
# List the remaining BigQuery datasets with the `bq` command-line tool.
! bq ls