In [None]:
# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

## Tutorial

### Import required libraries

In [None]:
# A Spark Session is how you interact with Spark SQL to create Dataframes
from pyspark.sql import SparkSession
# PySpark functions
from pyspark.sql.functions import avg, col, count, desc, round, size, udf, to_timestamp, unix_timestamp, broadcast, pandas_udf, PandasUDFType, to_date
# These allow us to create a schema for our data
from pyspark.sql.types import ArrayType, IntegerType, StringType

from pyspark.ml.linalg import Vectors
from geopandas import gpd
from shapely import wkt
from shapely.geometry import Point
import pandas as pd
from pyspark.ml.regression import LinearRegression, GBTRegressor, DecisionTreeRegressor, RandomForestRegressor
import matplotlib.pyplot as plt
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

### Initialize the SparkSession

To use Apache Spark with BigQuery, you must include the [spark-bigquery-connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector) when you initialize the SparkSession.

In [None]:
# Build (or reuse) the SparkSession used by every cell below.
spark = (
    SparkSession.builder
    .appName("spark-bigquery-ml-nyc-trips-demo")
    # The spark-bigquery connector jar backs spark.read.format("bigquery").
    .config("spark.jars", "gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.25.2.jar")
    # Allow wide schemas to be rendered in full in Spark's debug strings.
    .config("spark.sql.debug.maxToStringFields", "500")
    .getOrCreate()
)

# Keep the notebook output readable: only warnings and errors are logged.
spark.sparkContext.setLogLevel("WARN")

### Fetch data from BigQuery

In [None]:
# Load the NYC taxi-zone polygons (GeoJSON export from NYC Open Data) into a GeoDataFrame.
# The UDFs defined later spatially join points against this frame.
gdf_zone = gpd.read_file("https://data.cityofnewyork.us/api/geospatial/d3c5-ddgc?method=export&format=GeoJSON")

# Cast the zone id to an explicit 64-bit integer. 'int64' is used instead of 'long'
# because the width of 'long' depends on the platform/NumPy version, while the
# 'long' pandas UDF return type used later is always 64-bit.
gdf_zone['location_id'] = gdf_zone['location_id'].astype('int64')

# DataFrame.info() writes its report to stdout and returns None, so wrapping it in
# print() would only append a stray "None" line.
gdf_zone.info()

In [None]:
# Taxi-zone location ids that this notebook treats as Manhattan. preprocessing_man()
# maps any id outside this set to -1.
MANHATTAN = {
    153, 128, 127, 243, 120, 244, 116, 152,
    42, 166, 41, 74, 24, 151, 238, 239,
    143, 142, 43, 75, 236, 263, 262, 237,
    141, 140, 50, 48, 163, 230, 161, 162,
    229, 233, 170, 164, 100, 68, 246, 186,
    90, 234, 107, 137, 224, 4, 79, 113,
    114, 249, 158, 125, 211, 144, 148, 232,
    45, 231, 209, 87, 13, 261, 12, 88,
}

# 2018 yellow-taxi trips from the BigQuery public dataset.
taxi_reader = spark.read.format("bigquery")
taxi_df = taxi_reader.option(
    "table", "bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2018"
).load()
taxi_df.printSchema()

# Citi Bike trips from the BigQuery public dataset.
bike_reader = spark.read.format("bigquery")
bike_df = bike_reader.option(
    "table", "bigquery-public-data.new_york_citibike.citibike_trips"
).load()
bike_df.show(5)

# Taxi-zone geometries as stored in BigQuery.
zone_reader = spark.read.format("bigquery")
taxi_zone_geom = zone_reader.option(
    "table", "bigquery-public-data.new_york_taxi_trips.taxi_zone_geom"
).load()

taxi_zone_geom.show()

# convert string starttime to unix timestamp

In [None]:
@udf(returnType=IntegerType())
def preprocessing_days_of_week(timestamp):
    """Map a Unix timestamp in seconds to a day-of-week index.

    The epoch day (1970-01-01) was a Thursday and gets index 4, so the result
    is 0 = Sunday ... 6 = Saturday. Falsy input (None or 0) yields None.
    NOTE(review): the day boundary follows the epoch value as given; whether
    that matches New York local time depends on how the timestamp was parsed.
    """
    if not timestamp:
        return None
    days_since_epoch = timestamp // 86400  # 86400 seconds per day
    return (days_since_epoch + 4) % 7

@udf(returnType=IntegerType())
def preprocessing_time(timestamp):
    """Return the minute of the day (0-1439) for a Unix timestamp in seconds.

    Callers pass the output of unix_timestamp(), which is in seconds. Taking
    that value modulo 1440 (the number of *minutes* in a day) yields a number
    that repeats every 24 minutes, not a time-of-day feature. The seconds are
    therefore reduced to seconds-within-day first and then converted to minutes.

    Returns None only for a missing timestamp; 0 (midnight at the epoch) is a
    valid input and maps to minute 0.
    """
    if timestamp is None:
        return None
    seconds_into_day = timestamp % 86400  # 86400 seconds per day
    return seconds_into_day // 60

@udf(returnType=StringType())
def preprocessing_borough(lat, lon):
    """Return the borough of the taxi zone that contains one (lat, lon) point.

    This is a row-at-a-time UDF, so `lat` and `lon` are single scalar values.
    They cannot be zipped like sequences, and the function must return one
    string rather than a pandas Series. Use the pandas UDFs below for
    vectorized lookups.

    Returns None when a coordinate is missing or no zone in `gdf_zone`
    intersects the point.
    """
    if lat is None or lon is None:
        return None
    location = Point(lon, lat)  # shapely points are (x, y) == (lon, lat)
    # "intersects" mirrors the default predicate of gpd.sjoin used elsewhere.
    matching_zones = gdf_zone[gdf_zone.intersects(location)]
    if matching_zones.empty:
        return None
    return str(matching_zones.iloc[0]['borough'])

@udf(returnType=IntegerType())
def preprocessing_man(zone):
    """Keep a zone id if it is in the MANHATTAN set; otherwise return -1.

    A missing id (None) is not in the set, so it also maps to -1.
    """
    if zone in MANHATTAN:
        return zone
    return -1

@pandas_udf('string')
def preprocess_zone_name(lat: pd.Series, lon: pd.Series) -> pd.Series:
    """Vectorized lookup of the borough containing each (lat, lon) point.

    Args:
        lat: latitudes (WGS84 / EPSG:4326) for one Arrow batch.
        lon: longitudes for the same rows.

    Returns:
        One value per input row, in input order; missing where no zone matches.

    NOTE(review): despite the function name, the value returned is the
    'borough' column of gdf_zone, not a zone name - confirm this is intended.
    """
    point_var = [Point(xy) for xy in zip(lon, lat)]
    gdf_points = gpd.GeoDataFrame(pd.DataFrame({'lat': lat, 'lon': lon}), crs='epsg:4326', geometry=point_var)
    gdf_joined = gpd.sjoin(gdf_points, gdf_zone, how='left')
    # A point that intersects more than one polygon (e.g. on a shared boundary)
    # yields several joined rows for the same input index. A Series-to-Series
    # pandas UDF must return exactly one value per input row, so keep the first
    # match per point and restore the original row order.
    first_match = gdf_joined.loc[~gdf_joined.index.duplicated(keep='first'), 'borough']
    return first_match.reindex(gdf_points.index)

@pandas_udf('long')
def preprocess_zone_id(lat: pd.Series, lon: pd.Series) -> pd.Series:
    """Vectorized lookup of the taxi-zone location id containing each point.

    Args:
        lat: latitudes (WGS84 / EPSG:4326) for one Arrow batch.
        lon: longitudes for the same rows.

    Returns:
        One 'location_id' per input row, in input order; missing where no zone
        in gdf_zone matches the point.
    """
    point_var = [Point(xy) for xy in zip(lon, lat)]
    gdf_points = gpd.GeoDataFrame(pd.DataFrame({'lat': lat, 'lon': lon}), crs='epsg:4326', geometry=point_var)
    gdf_joined = gpd.sjoin(gdf_points, gdf_zone, how='left')
    # A point that intersects more than one polygon (e.g. on a shared boundary)
    # yields several joined rows for the same input index. A Series-to-Series
    # pandas UDF must return exactly one value per input row, so keep the first
    # match per point and restore the original row order.
    first_match = gdf_joined.loc[~gdf_joined.index.duplicated(keep='first'), 'location_id']
    return first_match.reindex(gdf_points.index)

In [None]:
# Convert the pickup/dropoff datetimes to epoch seconds so they can be used arithmetically.
taxi_df = taxi_df.withColumn('starttime', unix_timestamp(to_timestamp(col('pickup_datetime'))))
taxi_df = taxi_df.withColumn('endtime', unix_timestamp(to_timestamp(col('dropoff_datetime'))))

# Calendar features derived from the pickup time, plus the label (trip duration in seconds).
taxi_df = taxi_df.withColumn('days_of_week', preprocessing_days_of_week(col('starttime')))
taxi_df = taxi_df.withColumn('time', preprocessing_time(col('starttime')))
taxi_df = taxi_df.withColumn('tripduration', col('endtime') - col('starttime'))

# Spark DataFrames are immutable: withColumn() returns a NEW DataFrame, so the result
# has to be assigned back. Without the assignment the Manhattan mapping is silently
# dropped and the ">= 0" filters below never exclude anything.
# Ids are cast to int first so they compare equal to the integers in MANHATTAN;
# non-Manhattan (or missing) ids become -1.
taxi_df = taxi_df.withColumn("pickup_location_id", preprocessing_man(col("pickup_location_id").cast('int')))
taxi_df = taxi_df.withColumn("dropoff_location_id", preprocessing_man(col("dropoff_location_id").cast('int')))

# Keep only the label and candidate feature columns, and drop rows with any null among them.
taxi_df = taxi_df.select(
    col("tripduration"),
    col("days_of_week"),
    col("time"),
    col("pickup_location_id").cast("int").alias("pickup_location_id"),
    col("dropoff_location_id").cast("int").alias("dropoff_location_id"),
    col("trip_distance"),
    col("fare_amount"),
).dropna()

# Keep trips of 6-60 minutes between two different Manhattan zones
# (ids mapped to -1 above are rejected by the ">= 0" conditions).
taxi_df = taxi_df.where(
    (col('tripduration') > 360)
    & (col("pickup_location_id") != col("dropoff_location_id"))
    & (col('tripduration') < 3600)
    & (col("pickup_location_id") >= 0)
    & (col("dropoff_location_id") >= 0)
)

taxi_df.printSchema()
taxi_df.show()

In [None]:
# Feature engineering for the Citi Bike trips, expressed as a single pipeline:
#   1. replace 'starttime' with epoch seconds and derive day-of-week / time features,
#   2. spatially join the start/end stations to taxi zones,
#   3. map non-Manhattan zone ids to -1.
bike_df = (
    bike_df
    .withColumn('starttime', unix_timestamp(to_timestamp(col('starttime'))))
    .withColumn('days_of_week', preprocessing_days_of_week(col('starttime')))
    .withColumn('time', preprocessing_time(col('starttime')))
    .withColumn(
        'start_zone_name',
        preprocess_zone_name(col('start_station_latitude'), col('start_station_longitude')),
    )
    .withColumn(
        'end_zone_name',
        preprocess_zone_name(col('end_station_latitude'), col('end_station_longitude')),
    )
    .withColumn(
        'start_zone_id',
        preprocess_zone_id(col('start_station_latitude'), col('start_station_longitude')),
    )
    .withColumn(
        'end_zone_id',
        preprocess_zone_id(col('end_station_latitude'), col('end_station_longitude')),
    )
    .withColumn("start_zone_id", preprocessing_man(col("start_zone_id")))
    .withColumn("end_zone_id", preprocessing_man(col("end_zone_id")))
)

# Keep the label and candidate features, drop rows with nulls, then keep subscriber
# trips of 5-60 minutes between two different Manhattan zones. Finally rename the
# station coordinates to the pickup_*/dropoff_* names used by the feature list later on.
bike_df = (
    bike_df
    .select(
        "tripduration",
        "days_of_week",
        "time",
        'usertype',
        "start_station_longitude",
        "start_station_latitude",
        "end_station_longitude",
        "end_station_latitude",
        'start_zone_id',
        'end_zone_id',
    )
    .dropna()
    .where(col('tripduration') > 300)
    .where(col("start_zone_id") != col("end_zone_id"))
    .where(col('tripduration') < 3600)
    .where(col('usertype') == "Subscriber")
    .where(col("start_zone_id") >= 0)
    .where(col("end_zone_id") >= 0)
    .withColumnRenamed("start_station_longitude", "pickup_longitude")
    .withColumnRenamed("start_station_latitude", "pickup_latitude")
    .withColumnRenamed("end_station_longitude", "dropoff_longitude")
    .withColumnRenamed("end_station_latitude", "dropoff_latitude")
)

bike_df.printSchema()

In [None]:
# Summary statistics for the preprocessed Citi Bike columns.
bike_df.describe().show()

In [None]:
# Summary statistics for the preprocessed taxi columns.
taxi_df.describe().show()

In [None]:
# Correlation matrix
# # convert to vector column first
# # took too much time
# vector_col = "corr_features"
# assembler = VectorAssembler(inputCols=citi_df.columns, outputCol=vector_col)
# df_vector = assembler.transform(citi_df).select(vector_col)

# matrix = Correlation.corr(df_vector, vector_col)
# cor_np = matrix.collect()[0][matrix.columns[0]].toArray()


# plt.matshow(cor_np)
# print(cor_np)
# plt.show()

In [None]:
# Columns assembled into the taxi feature vector.
taxi_feature_cols = [
    "days_of_week",
    "time",
    "dropoff_location_id",
    "pickup_location_id",
    "fare_amount",
]
# Pack the feature columns into a single vector column named 'features'.
taxi_assembler = VectorAssembler(inputCols=taxi_feature_cols, outputCol='features')
taxi_transformed_data = taxi_assembler.transform(taxi_df)

# Add a standardized copy of the vector as 'features_scaled'.
# NOTE(review): the regressors in this notebook are configured with
# featuresCol="features", so 'features_scaled' is currently not consumed by them.
standard_scaler = StandardScaler(inputCol="features", outputCol="features_scaled")
taxi_scaled_df = standard_scaler.fit(taxi_transformed_data).transform(taxi_transformed_data)
# taxi_scaled_df.select("features", "features_scaled").show(10, truncate=False)

# 70/30 train/test split. A fixed seed keeps the split (and therefore the reported
# metrics) from changing on every run over the same input.
(taxi_training_data, taxi_test_data) = taxi_scaled_df.randomSplit([0.7, 0.3], seed=42)

In [None]:
# Columns assembled into the Citi Bike feature vector.
bike_feature_cols = [
    "days_of_week",
    "time",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "start_zone_id",
    "end_zone_id",
]
# Pack the feature columns into a single vector column named 'features'.
bike_assembler = VectorAssembler(inputCols=bike_feature_cols, outputCol='features')
bike_transformed_data = bike_assembler.transform(bike_df)

# Add a standardized copy of the vector as 'features_scaled'.
# NOTE(review): the regressors in this notebook are configured with
# featuresCol="features", so 'features_scaled' is currently not consumed by them.
standard_scaler = StandardScaler(inputCol="features", outputCol="features_scaled")
bike_scaled_df = standard_scaler.fit(bike_transformed_data).transform(bike_transformed_data)

# 70/30 train/test split. A fixed seed keeps the split (and therefore the reported
# metrics) from changing on every run over the same input.
(bike_training_data, bike_test_data) = bike_scaled_df.randomSplit([0.7, 0.3], seed=42)


In [None]:
# Decision-tree regressor predicting trip duration from the assembled feature vector.
# The same estimator configuration is fitted separately on each dataset; every fit()
# call returns its own model.
dt = DecisionTreeRegressor(
    labelCol="tripduration",
    featuresCol="features",
    predictionCol="pred_tripduration",
)

# Train one model per dataset ...
taxi_dt_model = dt.fit(taxi_training_data)
bike_dt_model = dt.fit(bike_training_data)

# ... and score the held-out rows; predictions land in 'pred_tripduration'.
taxi_dt_predictions = taxi_dt_model.transform(taxi_test_data)
bike_dt_predictions = bike_dt_model.transform(bike_test_data)

In [None]:
# Gradient-boosted-trees regressor (10 boosting iterations) predicting trip duration
# from the assembled feature vector.
gbt = GBTRegressor(
    labelCol="tripduration",
    featuresCol="features",
    predictionCol="pred_tripduration",
    maxIter=10,
)

# Train one model per dataset.
taxi_gbt_model = gbt.fit(taxi_training_data)
bike_gbt_model = gbt.fit(bike_training_data)

# Score the held-out rows; predictions land in 'pred_tripduration'.
taxi_gbt_predictions = taxi_gbt_model.transform(taxi_test_data)
bike_gbt_predictions = bike_gbt_model.transform(bike_test_data)

In [None]:
# Random-forest regressor predicting trip duration from the assembled feature vector.
rf = RandomForestRegressor(
    labelCol="tripduration",
    featuresCol="features",
    predictionCol="pred_tripduration",
)

# Train one model per dataset.
taxi_rf_model = rf.fit(taxi_training_data)
bike_rf_model = rf.fit(bike_training_data)

# Score the held-out rows; predictions land in 'pred_tripduration'.
taxi_rf_predictions = taxi_rf_model.transform(taxi_test_data)
bike_rf_predictions = bike_rf_model.transform(bike_test_data)

In [None]:
# bike_summary = bike_model.summary


# print(bike_summary.totalIterations)
# print(bike_summary.objectiveHistory)
# print(bike_summary.rootMeanSquaredError)
# print(bike_summary.r2)

In [None]:
# Score every model on its test split. The metric is R^2 (coefficient of
# determination); the "*_accuracy" variable names and the printed "Accuracy" labels
# refer to that R^2 value.
evaluator = RegressionEvaluator(
    metricName="r2",
    labelCol="tripduration",
    predictionCol="pred_tripduration",
)

# Taxi models.
taxi_dt_accuracy = evaluator.evaluate(taxi_dt_predictions)
taxi_gbt_accuracy = evaluator.evaluate(taxi_gbt_predictions)
taxi_rf_accuracy = evaluator.evaluate(taxi_rf_predictions)
print(f"Taxi Test DT Accuracy = {taxi_dt_accuracy}")
print(f"Taxi Test GBT Accuracy = {taxi_gbt_accuracy}")
print(f"Taxi Test RF Accuracy = {taxi_rf_accuracy}")

# Citi Bike models.
bike_dt_accuracy = evaluator.evaluate(bike_dt_predictions)
bike_gbt_accuracy = evaluator.evaluate(bike_gbt_predictions)
bike_rf_accuracy = evaluator.evaluate(bike_rf_predictions)
print(f"Bike Test DT Accuracy = {bike_dt_accuracy}")
print(f"Bike Test GBT Accuracy = {bike_gbt_accuracy}")
print(f"Bike Test RF Accuracy = {bike_rf_accuracy}")

In [None]:
# Re-print the Citi Bike GBT score computed in the evaluation cell
# (an R^2 value, despite the "Accuracy" label).
# The commented lines below are leftovers from an earlier linear-regression experiment.
# # bike_model.summary.residuals.show()
# taxi_lr_model.summary.residuals.show()
# # print(model.extractParamMap())
print(f"Bike Test GBT Accuracy = {bike_gbt_accuracy}")

In [None]:
# Taxi R^2 scores from the evaluation cell: GBT first, then decision tree.
print(taxi_gbt_accuracy)
print(taxi_dt_accuracy)

In [None]:
# Root-mean-squared error (in seconds, the unit of 'tripduration') of the taxi GBT model.
# The regressors in this notebook are configured with
# predictionCol="pred_tripduration", so the evaluator must read that column;
# the default name "prediction" does not exist in the prediction DataFrames.
temp_evaluator = RegressionEvaluator(
    labelCol="tripduration",
    predictionCol="pred_tripduration",
    metricName="rmse"
)
taxi_gbt_rmse = temp_evaluator.evaluate(taxi_gbt_predictions)
print(taxi_gbt_rmse)


### Preprocessing

Based on the schema printed above, data of the GitHub Activity is not stored in primitive types, but is instead stored in arrays. 

To work more effectively with the data, you need to preprocess it to primitive types and separate data for monoglot repos and polyglot repos. Once you create preprocessed columns, it makes our future tasks much faster.

You can see three Python functions with the `@udf` annotation with their return type below. The annotation `@udf` is a short form of [User Defined Function](https://spark.apache.org/docs/3.1.3/api/python/reference/api/pyspark.sql.functions.udf.html), which is used to extend the functions of the PySpark framework. 

In [None]:
# Show the schema of the preprocessed Citi Bike data.
# The previous version called citi_df.info(): `citi_df` is never defined in this
# notebook and Spark DataFrames expose printSchema() rather than pandas' info(),
# so the cell failed on a fresh "Restart & Run All".
# NOTE(review): bike_df is assumed to be the DataFrame that was meant here - confirm.
bike_df.printSchema()

In [None]:
# Show the schema of the preprocessed taxi data.
# The previous version referenced `df`, which is never defined in this notebook.
# NOTE(review): taxi_df is assumed to be the DataFrame that was meant here - confirm.
taxi_df.printSchema()

After preprocessing, you can see the preprocessed_df's schema, the language column is separated into three string columns, `mono_language`, `mono_size`, and `poly_language`.

### Analyze

#### Which language is the most frequently used among the monoglot repos?
To answer this question, you can execute a query below with the preprocessed column, `mono_language`.

### Write back to BigQuery

After analyzing these queries, we have several DataFrames. The ranking of monoglot repositories, the average bytes of monoglot repositories, and the frequency table of each language being used in a repository. 

In this project, these three DataFrames will be stored in BigQuery using the [spark-bigquery-connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector).

If there is no error above, congratulations! Your DataFrame has been successfully stored in BigQuery.

You can find the data in the [BigQuery console](https://console.cloud.google.com/bigquery) or by running the `bq` command-line tool as shown below.

## Cleaning up

To clean up all Google Cloud resources used in this project, you can [delete the Google Cloud
project](https://cloud.google.com/resource-manager/docs/creating-managing-projects#shutting_down_projects) you used for the tutorial.

Otherwise, you can delete the individual resources you created in this tutorial:

### Delete Vertex AI Workbench - Managed Notebook

To delete the Vertex AI Workbench managed notebook used in this project, follow the [Clean up](https://cloud.google.com/vertex-ai/docs/workbench/managed/create-managed-notebooks-instance-console-quickstart#clean-up) section of the `Managed notebooks` page.

### Delete a Dataproc Cluster

To delete a Dataproc cluster, follow the [Deleting a cluster](https://cloud.google.com/dataproc/docs/guides/manage-cluster#deleting_a_cluster) section of the `Manage a cluster` page.

In [None]:
# Delete Google Cloud Storage bucket
# NOTE(review): BUCKET_URI is not defined anywhere in the visible notebook - set it
# (or skip this cell) before running, otherwise the command has no target.
! gsutil rm -r $BUCKET_URI

In [None]:
# Delete BigQuery dataset
# NOTE(review): DATASET_NAME is not defined anywhere in the visible notebook - set it
# (or skip this cell) before running. -r removes the tables inside the dataset and
# -f skips the confirmation prompt.
! bq rm -r -f $DATASET_NAME

After you delete the BigQuery dataset, you can check your Datasets in BigQuery using the following command.

In [None]:
# List the remaining BigQuery datasets to confirm the deletion.
! bq ls