<a href="https://colab.research.google.com/github/raccoonback/2024-spark/blob/main/koseungbin2024_spark_study_final_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [데이터셋]
- https://www.kaggle.com/datasets/brllrb/uber-and-lyft-dataset-boston-ma


In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!cp /content/drive/MyDrive/colab_notebooks/spark/spark-3.5.1-bin-hadoop3.tgz /content/
!tar xf spark-3.5.1-bin-hadoop3.tgz
!pip install -q findspark

!cp -r /content/drive/MyDrive/colab_notebooks/sample_data /content/

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.1-bin-hadoop3"

In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .master("local[*]") \
    .getOrCreate()

spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

In [3]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, BooleanType


raw_df = spark.read.csv(
    './sample_data/yellow_taxi/rideshare_kaggle.csv',
    header=True,
    sep=',',
    inferSchema=True
)

# raw data
print("[raw data]")
raw_df.show()

[raw data]
+--------------------+----------------+----+---+-----+-------------------+----------------+----------------+--------------------+--------+--------------------+------------+-----+--------+----------------+--------+---------+-----------+-------------------+---------------+--------------------+---------------+-----------------+--------+---------+--------+------------+----------+---------------+-------------------+--------------+------------------+-----------------------+---------------------------+----------------------+--------------------------+--------------------+--------+--------+-----------+----------+-------+------------+-----+-----------+----------+---------+------------------+-----------+--------------+------------------+--------------+------------------+----------------------+--------------------------+----------------------+--------------------------+
|                  id|       timestamp|hour|day|month|           datetime|        timezone|          source|         

In [28]:
from pyspark.sql.functions import col, avg, count, year, datediff, expr, min, max, to_date, month, hour, cast, sum, rank, concat, lit
from pyspark.sql.window import Window
import pyspark.pandas as ps


# 공급자 측면 조사
print("[공급자 측면 조사]")

##  평균 가격이 높았던 출발지와 도착지
print("[평균 가격이 많았던 출발지와 도착지]")
result_df = raw_df \
  .withColumn('src_to_dest', concat(col('source'), lit('->'), col('destination'))) \
  .groupBy('src_to_dest') \
  .agg(avg('price').alias('avg_price')) \
  .orderBy('avg_price', ascending=False);

result_df.show();

### 결과 그래프
ps \
  .DataFrame(result_df) \
  .set_index('src_to_dest') \
  .avg_price \
  .plot \
  .bar() \
  .show();


## 월(month) 별 택시 회사 매출 비교
print("[월(month) 별 택시 회사 매출 비교]")
result_df = raw_df \
  .withColumn('year', year(col('datetime'))) \
  .withColumn('month', month(col('datetime'))) \
  .withColumn('year_month', concat(col('year'), lit('-'), col('month'))) \
  .withColumn('year_month_by_cab_type', concat(col('year_month'), lit('/'), col('cab_type'))) \
  .groupBy('year_month_by_cab_type') \
  .agg(sum('price').alias('total_price')) \
  .orderBy("year_month_by_cab_type");

result_df.show()

### 결과 그래프
ps \
  .DataFrame(result_df) \
  .set_index('year_month_by_cab_type') \
  .plot \
  .bar(y = 'total_price') \
  .show();

## 매 시간(hour) 별 가장 콜이 많았던 장소
print("[매 시간(hour) 별 가장 콜이 많았던 장소]")
windowSpec = Window \
      .partitionBy('date_hour') \
      .orderBy(col('count').desc()) \
      .rowsBetween(Window.unboundedPreceding, Window.currentRow)

result_df = raw_df \
  .withColumn('date', to_date(col('datetime'))) \
  .withColumn('hour', hour(col('datetime'))) \
  .withColumn('date_hour', concat(col('date'), lit('/'), col('hour'))) \
  .groupBy('date_hour', 'source') \
  .agg(count('source').alias('count')) \
  .withColumn('rank', rank().over(windowSpec)) \
  .select('date_hour', 'source') \
  .orderBy("date_hour") \
  .filter(col('rank') == 1)

result_df.orderBy('date_hour').show()

### 결과 그래프
ps \
  .DataFrame(result_df) \
  .plot \
  .scatter(x = 'date_hour', y = 'source', s = 'count') \
  .show();


### 검증용
# raw_df.withColumn('date', to_date(col('datetime'))) \
#   .withColumn('hour', hour(col('datetime'))) \
#   .groupBy('date', 'hour', 'source') \
#   .agg(count('source').alias('count')) \
#   .orderBy('date', 'hour', col('count').desc()) \
#   .show(40)

[공급자 측면 조사]
[평균 가격이 많았던 출발지와 도착지]
+--------------------+------------------+
|         src_to_dest|         avg_price|
+--------------------+------------------+
|Financial Distric...|25.029096477794795|
|Boston University...|24.039182282793867|
|Financial Distric...|23.626237623762375|
|Fenway->Financial...|23.088291746641076|
|Northeastern Univ...|22.499134948096884|
|Financial Distric...|21.520358306188925|
|Theatre District-...| 20.76215277777778|
|Boston University...|20.310986964618248|
| North End->Back Bay|19.762027491408936|
| Back Bay->North End| 19.73857404021938|
|South Station->Ba...|19.439338235294116|
|Fenway->North Sta...|19.414495114006513|
|Northeastern Univ...|19.221014492753625|
|North Station->Bo...|19.073701842546065|
|Theatre District-...|19.066096423017107|
|North Station->No...| 19.06513409961686|
|West End->Northea...|18.954682779456192|
|Boston University...| 18.91390728476821|
|Boston University...| 18.82960413080895|
|North Station->Fe...|18.549924357034797|


[월(month) 별 택시 회사 매출 비교]
+----------------------+-----------+
|year_month_by_cab_type|total_price|
+----------------------+-----------+
|          2018-11/Lyft|   154411.5|
|          2018-11/Uber|   141712.5|
|          2018-12/Lyft|   204014.0|
|          2018-12/Uber|   193503.0|
+----------------------+-----------+



[매 시간(hour) 별 가장 콜이 많았던 장소]
+-------------+--------------------+
|    date_hour|              source|
+-------------+--------------------+
|2018-11-26/10|            Back Bay|
|2018-11-26/10|   Boston University|
|2018-11-26/11|Northeastern Univ...|
|2018-11-26/12|            Back Bay|
|2018-11-26/13|    Theatre District|
|2018-11-26/14|       North Station|
|2018-11-26/15|    Haymarket Square|
|2018-11-26/16|       North Station|
|2018-11-26/17|       South Station|
|2018-11-26/17|              Fenway|
|2018-11-26/18|    Theatre District|
|2018-11-26/19|  Financial District|
|2018-11-26/20|Northeastern Univ...|
|2018-11-26/21|         Beacon Hill|
|2018-11-26/22|Northeastern Univ...|
|2018-11-26/23|       North Station|
| 2018-11-26/3|    Theatre District|
| 2018-11-26/3|    Haymarket Square|
| 2018-11-26/3|       North Station|
| 2018-11-26/3|       South Station|
+-------------+--------------------+
only showing top 20 rows



In [34]:
from pyspark.sql.functions import col, avg, count, year, datediff, expr, min, max, to_date, month, hour, cast, sum, rank
from pyspark.sql.window import Window

# 수요자 측면 조사
print("[수요자 측면 조사]")

## 각 회사별 출발지, 목적지 평균 가격
print("[각 회사별 출발지, 목적지 평균 가격]")
result_df = raw_df \
  .withColumn('src_to_dest_by_cab_type', concat(col('cab_type'), lit('('), col('source'), lit('->'), col('destination'), lit(')'))) \
  .groupBy('cab_type', 'source', 'destination', 'src_to_dest_by_cab_type') \
  .agg(avg('price').alias('avg_price')) \
  .orderBy('source', 'destination', 'cab_type')

result_df.show(20)

### 결과 그래프
ps \
  .DataFrame(result_df) \
  .set_index('src_to_dest_by_cab_type') \
  .plot \
  .bar(y = 'avg_price') \
  .show();


## 각 출발지 별로 콜이 잘 잡히는 회사
print("[각 출발지 별로 콜이 잘 잡히는 회사]")

windowSpec = Window \
      .partitionBy('source') \
      .orderBy(col('count').desc()) \
      .rowsBetween(Window.unboundedPreceding, Window.currentRow)

raw_df \
  .groupBy('source', 'cab_type') \
  .count().alias('count') \
  .withColumn('rank', rank().over(windowSpec)) \
  .filter(col('rank') == 1) \
  .select('source', 'cab_type', 'count') \
  .orderBy('source') \
  .show()

### 검증용
# raw_df \
#   .groupBy('source', 'cab_type') \
#   .count().alias('count') \
#   .orderBy('source', 'cab_type', 'count') \
#   .show(50)

## 가성비가 좋은 택시 회사
print("[가성비가 좋은 택시 회사]")
raw_df \
  .groupBy('cab_type') \
  .agg(avg(col('price') / col('distance')).alias('avg_price_per_distance')) \
  .orderBy('avg_price_per_distance') \
  .show()

[수요자 측면 조사]
[각 회사별 출발지, 목적지 평균 가격]
+--------+-----------+--------------------+-----------------------+------------------+
|cab_type|     source|         destination|src_to_dest_by_cab_type|         avg_price|
+--------+-----------+--------------------+-----------------------+------------------+
|    Lyft|   Back Bay|   Boston University|   Lyft(Back Bay->Bo...|14.235887096774194|
|    Uber|   Back Bay|   Boston University|   Uber(Back Bay->Bo...|  13.0688202247191|
|    Lyft|   Back Bay|              Fenway|   Lyft(Back Bay->Fe...| 14.97003745318352|
|    Uber|   Back Bay|              Fenway|   Uber(Back Bay->Fe...|13.309602649006623|
|    Lyft|   Back Bay|    Haymarket Square|   Lyft(Back Bay->Ha...|18.686974789915965|
|    Uber|   Back Bay|    Haymarket Square|   Uber(Back Bay->Ha...|16.791208791208792|
|    Lyft|   Back Bay|           North End|   Lyft(Back Bay->No...|21.535714285714285|
|    Uber|   Back Bay|           North End|   Uber(Back Bay->No...|18.203389830508474|
|    Lyf

[각 출발지 별로 콜이 잘 잡히는 회사]
+--------------------+--------+-----+
|              source|cab_type|count|
+--------------------+--------+-----+
|            Back Bay|    Uber| 2135|
|         Beacon Hill|    Uber| 1905|
|   Boston University|    Uber| 2056|
|              Fenway|    Uber| 2112|
|  Financial District|    Uber| 2058|
|    Haymarket Square|    Uber| 2171|
|           North End|    Uber| 2061|
|       North Station|    Uber| 1990|
|Northeastern Univ...|    Uber| 2139|
|       South Station|    Uber| 1871|
|    Theatre District|    Uber| 2252|
|            West End|    Uber| 2076|
+--------------------+--------+-----+

[가성비가 좋은 택시 회사]
+--------+----------------------+
|cab_type|avg_price_per_distance|
+--------+----------------------+
|    Uber|     9.682674403228246|
|    Lyft|      9.71325494447119|
+--------+----------------------+

