In [1]:
from pyspark import SparkContext, SparkConf
import pyspark.sql as sql
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

In [9]:
# Obtain the Spark session for this lab: getOrCreate() returns the session that
# is already active, or builds one with the master and application name below.
spark = (
    SparkSession.builder
    .master('yarn')
    .appName('LR1_Kusakina')
    .getOrCreate()
)

In [10]:
# Bare expression: the notebook shows the session object as this cell's output,
# which is a quick check that the session was created.
spark

In [16]:
# One-time upload of the stations file to HDFS; kept commented out so that
# re-running the notebook does not repeat it. `put` needs the local source and
# the HDFS destination as two separate arguments -- written without the space
# it fails with "No such file or directory".
# NOTE(review): destination directory assumed to be /user/dashakys -- confirm.
#!hadoop fs -put station.csv /user/dashakys

put: `station.csv/user/dashakys': No such file or directory


In [86]:
# Load the trips table. The first line of the file is a header, column types are
# inferred from the data, and timestamps are parsed with the 'M/d/y H:m' pattern.
trip = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("timestampFormat", 'M/d/y H:m')
    .csv("trip.csv")
)

In [87]:
# Print the inferred column names and types to check that the date columns were
# parsed as timestamps rather than left as strings.
trip.printSchema()

root
 |-- id: integer (nullable = true)
 |-- duration: integer (nullable = true)
 |-- start_date: timestamp (nullable = true)
 |-- start_station_name: string (nullable = true)
 |-- start_station_id: integer (nullable = true)
 |-- end_date: timestamp (nullable = true)
 |-- end_station_name: string (nullable = true)
 |-- end_station_id: integer (nullable = true)
 |-- bike_id: integer (nullable = true)
 |-- subscription_type: string (nullable = true)
 |-- zip_code: string (nullable = true)



In [20]:
# Load the stations table. Same reader settings as for trips, except that the
# date column carries no time of day, hence the date-only 'M/d/y' pattern.
station = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .option("timestampFormat", 'M/d/y')
    .csv("station.csv")
)

In [21]:
# Print the inferred column names and types of the stations table.
station.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- dock_count: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- installation_date: timestamp (nullable = true)



1. Найти велосипед с максимальным временем пробега.

Посчитала, что время пробега велосипеда, как пробег у машины, складывается из длительностей всех его поездок, поэтому искала велосипед с максимальной суммарной длительностью поездок.

In [171]:
from pyspark.sql.functions import col

# Task 1: a bike's "run time" is taken as the total duration of all its trips.
# The aggregates must be Spark's (F.sum / F.max): the Python builtins of the
# same name cannot aggregate a DataFrame column and fail on a fresh kernel.
res = trip.groupBy("bike_id").agg(F.sum("duration").alias("Max_duration"))

# One-row aggregate holding the largest per-bike total.
max_ = res.agg(F.max("Max_duration")).first()[0]

# Filtering on the maximum (instead of sorting and taking the top row) keeps
# every bike that ties for first place.
answer = res.filter(col("Max_duration") == max_)
answer.show()

+-------+------------+
|bike_id|Max_duration|
+-------+------------+
|    535|    36229902|
+-------+------------+



2. Найти наибольшее геодезическое расстояние между станциями.

Использовала найденную в интернете формулу гаверсинусов (haversine) для расстояния по дуге большого круга и применила её к широте и долготе станций.

In [189]:
from math import sin, cos, radians, asin,  sqrt

# Earth radius that scales the central angle into a distance; the result of
# calculate_distance is in the same unit (presumably kilometres -- confirm).
R_of_earth = 6371.0


def calculate_distance(lat1, long1, lat2, long2):
    """Return the great-circle distance between two points (haversine formula).

    Parameters
    ----------
    lat1, long1 : float or None
        Latitude and longitude of the first point, in degrees.
    lat2, long2 : float or None
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float or None
        ``R_of_earth`` multiplied by the central angle between the points, or
        ``None`` when any coordinate is ``None`` (so that a missing coordinate
        yields a missing distance instead of an exception when the function is
        used as a Spark UDF).
    """
    if lat1 is None or long1 is None or lat2 is None or long2 is None:
        return None

    lat1 = radians(lat1)
    long1 = radians(long1)
    lat2 = radians(lat2)
    long2 = radians(long2)

    dlong = long2 - long1
    dlat = lat2 - lat1

    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlong / 2)**2
    # Mathematically 0 <= a <= 1 for valid coordinates, but rounding can push
    # it marginally outside that range, which would make sqrt/asin raise.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0
    S = 2 * asin(sqrt(a))

    return R_of_earth * S

In [196]:
# Task 2: largest distance between any two stations.
# Second copy of the station table with its coordinates renamed, so that both
# ends of a pair can be addressed by name after the cross join.
station_2 = (
    station
    .withColumn("lat_2", col("lat"))
    .withColumn("long_2", col("long"))
    .drop("lat", "long")
)
# Every station paired with every station (including itself, distance 0).
station_joined = station.crossJoin(station_2)

# Wrap the plain Python function so Spark can apply it row by row.
calculate_distance_udf = F.udf(calculate_distance, DoubleType())
station_joined_distance = station_joined.withColumn(
    "distance",
    calculate_distance_udf(col("lat"), col("long"), col("lat_2"), col("long_2")),
)

# Only the maximum is needed, so aggregate instead of globally sorting all pairs.
answer = station_joined_distance.agg(F.max("distance")).first()[0]
print("Max distance = ", answer)

Max distance =  69.92087595428183


3. Найти путь велосипеда с максимальным временем пробега через станции.

In [248]:
# Task 3: find the bike with the longest single trip and report where that trip
# started and ended. F.max is Spark's column aggregate (the builtin max cannot
# aggregate a DataFrame column).
res = trip.groupBy("bike_id").agg(F.max("duration").alias("Max_duration"))
max_ = res.agg(F.max("Max_duration")).first()[0]
ans = res.filter(col("Max_duration") == max_).first()["bike_id"]

# All trips of that bike, longest first.
answer = (
    trip
    .filter(col("bike_id") == ans)
    .select("bike_id", "start_station_name", "end_station_name", "duration")
    .orderBy(col("duration").desc())
)
answer.show()

# Fetch only the top row, once, instead of collecting the whole result twice.
longest_trip = answer.first()
print("Путь максимальной длины велосипеда с номером ", ans, "начался в ", longest_trip["start_station_name"], "и закончился в ", longest_trip["end_station_name"])

+-------+--------------------+--------------------+--------+
|bike_id|  start_station_name|    end_station_name|duration|
+-------+--------------------+--------------------+--------+
|    535|South Van Ness at...|       2nd at Folsom|17270400|
|    535|South Van Ness at...|       2nd at Folsom|17270400|
|    535|  Powell Street BART|Civic Center BART...|   87638|
|    535|San Francisco Cal...|   Steuart at Market|   33659|
|    535|San Francisco Cal...|   Steuart at Market|   33659|
|    535|      Market at 10th|San Francisco Cal...|   25909|
|    535|       2nd at Folsom|Mechanics Plaza (...|   25179|
|    535|Powell at Post (U...|Powell at Post (U...|   24920|
|    535|   Market at Sansome|   Market at Sansome|   22787|
|    535|Broadway St at Ba...|Harry Bridges Pla...|   22363|
|    535|Broadway St at Ba...|Harry Bridges Pla...|   22363|
|    535|Grant Avenue at C...|Grant Avenue at C...|   21153|
|    535|Grant Avenue at C...|Grant Avenue at C...|   21153|
|    535|     Post at Ke

4. Найти количество велосипедов в системе.

In [198]:
# Task 4: each bike that appears in the trips table is counted once.
distinct_bikes = trip.select("bike_id").distinct()
print(distinct_bikes.count())

700


5. Найти пользователей, потративших на поездки более 3 часов.

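Аналогично заданию 1, считала суммарное время: искала пользователей, потративших на поездки в сумме более 3 часов. В данных нет идентификатора пользователя, поэтому пользователь приближённо определяется парой (zip_code, subscription_type).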

In [231]:
# Task 5: "users" whose trips add up to more than three hours.
# The threshold is 3 * 60 * 60, i.e. `duration` is presumably in seconds -- confirm.
three_hours = 3 * 60 * 60

# The trips table has no user id column, so a user is approximated by the
# (zip_code, subscription_type) pair. F.sum is Spark's column aggregate (the
# builtin sum cannot aggregate a DataFrame column).
answer = (
    trip
    .groupBy("zip_code", "subscription_type")
    .agg(F.sum("duration").alias("Sum_duration"))
    .filter(col("Sum_duration") > three_hours)
)
answer.show()
print (answer.count())

+--------+-----------------+------------+
|zip_code|subscription_type|Sum_duration|
+--------+-----------------+------------+
|   95125|       Subscriber|      847657|
|   94619|       Subscriber|      495300|
|   67202|         Customer|       10844|
|   80220|         Customer|      106868|
|   92869|         Customer|       16332|
|   30307|         Customer|       47337|
|   94502|         Customer|       99790|
|   95008|         Customer|     1281308|
|      55|         Customer|     1311638|
|   50324|         Customer|       11700|
|   22000|         Customer|       18586|
|   80219|         Customer|       23142|
|   13000|         Customer|       64020|
|      99|         Customer|       11818|
|    2465|         Customer|       24808|
|   49518|         Customer|       33839|
|   29464|         Customer|      104615|
|    3141|         Customer|       14423|
|   94035|         Customer|      165289|
|   94010|       Subscriber|     5459413|
+--------+-----------------+------