In [1]:
total_df = spark.read.json('/mnt/dacoursedatabricksstg/dacoursedatabricksdata/busFile')

In [2]:
total_df.count()

In [3]:
total_df = total_df[total_df.atStop == True]

In [4]:
total_df.count()

In [5]:
total_df = total_df[['_id','atStop','busStop','congestion','delay','justLeftStop','justStopped','latitude' ,'longitude','vehicleId','vehicleSpeed','actualDelay','timestamp','journeyPatternId']]

In [6]:
#correct original data and add new fields
from pyspark.sql.functions import udf
from pyspark.sql.types import LongType,TimestampType,StringType,DateType

def get_weekday(s):
  """Return ``s.weekday()``; for a datetime this is Monday=0 ... Sunday=6."""
  day_index = s.weekday()
  return day_index

def get_hour(s):
  """Return the ``hour`` attribute of ``s``."""
  hour_of_day = s.hour
  return hour_of_day

def get_is_weekend(s):
  """Return 1 when the weekday index ``s`` is Saturday or Sunday, else 0.

  ``s`` is the value produced by ``get_weekday`` (``datetime.weekday()``),
  which numbers days Monday=0 ... Sunday=6, so the weekend is 5 and 6.
  Any other value, including None, yields 0.
  """
  return 1 if s in (5, 6) else 0

def get_id(s):
  """Return the value stored under the ``'$oid'`` key of ``s``."""
  oid_value = s['$oid']
  return oid_value

def get_month(s):
  """Return the ``month`` attribute of ``s``."""
  month_number = s.month
  return month_number

def get_only_date(s):
  """Return the calendar-date part of ``s`` as a ``YYYY-MM-DD`` string."""
  calendar_day = s.date()
  return calendar_day.isoformat()

def get_just_left(s):
  """Map a truthy ``s`` to 1 and a falsy ``s`` (including None) to 0."""
  return int(bool(s))

def get_just_stopped(s):
  """Map a truthy ``s`` to 1 and a falsy ``s`` (including None) to 0."""
  return int(bool(s))

def get_congestion(s):
  """Map a truthy ``s`` to 1 and a falsy ``s`` (including None) to 0."""
  return int(bool(s))

def get_timestamp(s):
    """Parse ``s['$numberLong']`` as an int after dropping its last three characters.

    The value is a digit string; removing the final three digits divides the
    number by 1000 (e.g. milliseconds -> seconds, assuming that is the source unit).
    """
    raw_digits = s['$numberLong']
    truncated = raw_digits[:-3]
    return int(truncated)

# Wrap the plain-Python helpers as Spark UDFs with explicit return types.
get_timestamp_udf = udf(get_timestamp, LongType())
get_weekday_udf = udf(get_weekday, LongType())
get_hour_udf = udf(get_hour, LongType())
get_is_weekend_udf = udf(get_is_weekend, LongType())
get_id_udf = udf(get_id, StringType())
get_month_udf = udf(get_month, LongType())
get_only_date_udf = udf(get_only_date, StringType())
get_just_left_udf = udf(get_just_left, LongType())
get_just_stopped_udf = udf(get_just_stopped, LongType())
get_congestion_udf = udf(get_congestion, LongType())

# Derive the new columns. Order matters: 'date' is built first because the
# calendar columns read it, and 'is_weekend' reads 'weekday'.
total_df = (
    total_df
    .withColumn('date', get_timestamp_udf('timestamp').cast('timestamp'))
    .withColumn('weekday', get_weekday_udf('date'))
    .withColumn('month', get_month_udf('date'))
    .withColumn('only_date', get_only_date_udf('date'))
    .withColumn('is_weekend', get_is_weekend_udf('weekday'))
    .withColumn('hour', get_hour_udf('date'))
    .withColumn('id', get_id_udf('_id'))
    .withColumn('just_left', get_just_left_udf('justLeftStop'))
    .withColumn('just_stopped', get_just_stopped_udf('justStopped'))
    # Use the dedicated congestion UDF (the values are the same 0/1 mapping).
    .withColumn('conges', get_congestion_udf('congestion'))
    # The raw source columns are no longer needed once their derived forms exist.
    .drop('_id', 'atStop', 'justLeftStop', 'justStopped', 'congestion', 'timestamp')
)

In [7]:
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql.functions import lit

# small_list = total_df.take(1000)
# small_df = spark.createDataFrame(small_list, total_df.columns)

# Keep only transitions between stops: rows whose stop differs from the stop
# of the same vehicle's previous record (rows with no previous record are
# dropped too, because the comparison with a NULL origin is not true).
my_window = Window.partitionBy('vehicleId').orderBy('date')
previous_stop = f.lag(f.col('busStop').cast("bigint")).over(my_window)
learn_df = total_df.withColumn('origin', previous_stop)
learn_df = learn_df.filter(f.col('busStop') != f.col('origin'))

# Seconds elapsed since the previous retained row of the same vehicle;
# a missing predecessor counts as 0 and is filtered out right after.
learn_df = learn_df.withColumn('prev_ts', f.lag(f.col('date').cast("bigint")).over(my_window))
elapsed_seconds = f.col('date').cast("bigint") - f.col('prev_ts')
learn_df = learn_df.withColumn('time_to_reach_next', f.coalesce(elapsed_seconds, f.lit(0)))
learn_df = learn_df.filter(f.col('time_to_reach_next') != 0)

# Shape into training data: the current stop becomes the destination.
learn_df = learn_df.withColumnRenamed('busStop', 'dest').drop('prev_ts')

# Build the textual segment key "<origin>-><dest>".
def get_from_to(frm,to):
  """Return ``str(frm)`` and ``str(to)`` joined by ``'->'``."""
  return '->'.join((str(frm), str(to)))

get_from_to_udf = udf(get_from_to, StringType())
segment_key = get_from_to_udf(f.col('origin'), f.col('dest'))
learn_df = learn_df.withColumn('from_to', segment_key)

# Number of rows sharing the same segment key.
w = Window.partitionBy('from_to')
learn_df = learn_df.withColumn('seg_count', f.count('from_to').over(w))

In [8]:
learn_df = learn_df[learn_df.time_to_reach_next < 7200] #drop rows with more then two hours between stops
learn_df = learn_df[learn_df.time_to_reach_next > 30]  #drop rows with less then 30 seconds between stops
learn_df = learn_df.filter(learn_df.seg_count > 100) #take only comoon segments 
learn_df = learn_df.filter(learn_df.date < lit('2018-08-01')) #save last two months of the data for testing in streaming (August\July 2018)

In [9]:
learn_df.count()

In [10]:
segment_window = Window.partitionBy('from_to')

# Mean and standard deviation of the travel time within each segment.
learn_df = (
    learn_df
    .withColumn('mean_seg', f.mean('time_to_reach_next').over(segment_window))
    .withColumn('stdev_seg', f.stddev('time_to_reach_next').over(segment_window))
)

# Remove outliers: keep rows strictly within 3 standard deviations of the
# segment mean, on both sides.
seg_upper = f.col('mean_seg') + 3 * f.col('stdev_seg')
seg_lower = f.col('mean_seg') - 3 * f.col('stdev_seg')
learn_df = learn_df.filter(
    (f.col('time_to_reach_next') < seg_upper) & (f.col('time_to_reach_next') > seg_lower)
)

# Max / min travel time per segment, computed after the outlier removal.
learn_df = (
    learn_df
    .withColumn('max_in_seg', f.max('time_to_reach_next').over(segment_window))
    .withColumn('min_in_seg', f.min('time_to_reach_next').over(segment_window))
)

# Travel time of the chronologically previous row in the same segment.
segment_by_date = Window.partitionBy('from_to').orderBy('date')
learn_df = learn_df.withColumn('prev_time', f.lag('time_to_reach_next').over(segment_by_date))

# Coordinates of the same vehicle's previous retained row, used for distances.
my_window = Window.partitionBy('vehicleId').orderBy('date')
learn_df = (
    learn_df
    .withColumn('prev_lat', f.lag('latitude').over(my_window))
    .withColumn('prev_lon', f.lag('longitude').over(my_window))
)
# Replace remaining NULLs with 0 in the columns na.fill(0) applies to, then
# drop the rows whose prev_lat is 0 (which includes those with no previous position).
learn_df = learn_df.na.fill(0)
learn_df = learn_df.filter(f.col('prev_lat') != 0)

In [11]:
from math import radians, cos, sin, asin, sqrt

@f.udf("float")
def get_distance(longit_a, latit_a, longit_b, latit_b):
    """Haversine (great-circle) distance in metres between points A and B.

    Positional order is (lat_a, lon_a, lat_b, lon_b), in degrees. The
    parameter names are kept for compatibility but are misleading: the calls
    in this notebook pass the latitude column first, so ``longit_*`` receives
    a latitude and ``latit_*`` a longitude. The body maps them accordingly,
    so the cosine term of the formula is applied to the latitudes.

    Returns the sentinel 9999.0 when any coordinate is None.
    """
    # Local import: keeps the math versions bound even if a star import of
    # pyspark.sql.functions has rebound sin/cos/sqrt/... at notebook level.
    from math import asin, cos, radians, sin, sqrt

    if None in (longit_a, latit_a, longit_b, latit_b):
        # Float literal: the UDF is declared "float", and a Python int return
        # value may come back as NULL instead of being converted.
        return 9999.0

    lat_a, lon_a, lat_b, lon_b = map(radians, (longit_a, latit_a, longit_b, latit_b))
    half_dlat = (lat_b - lat_a) / 2
    half_dlon = (lon_b - lon_a) / 2
    hav = sin(half_dlat) ** 2 + cos(lat_a) * cos(lat_b) * sin(half_dlon) ** 2
    # Clamp against rounding slightly above 1, which would make asin() raise.
    central_angle = 2 * asin(min(1.0, sqrt(hav)))
    radius = 6371  # mean Earth radius in km
    return float(central_angle * radius * 1000)
  
@f.udf("float")
def get_dis_from_center(longit_a, latit_a):
  """Haversine distance in metres from a point to the fixed city-centre point.

  Positional order is (lat, lon), in degrees. The parameter names are kept for
  compatibility but are misleading: the call in this notebook passes the
  latitude column first, and the centre constants below are given as
  (latitude, longitude). The body maps them accordingly, so the cosine term
  of the formula is applied to the latitudes.

  Returns the sentinel 9999.0 when either coordinate is None.
  """
  # Local import: keeps the math versions bound even if a star import of
  # pyspark.sql.functions has rebound sin/cos/sqrt/... at notebook level.
  from math import asin, cos, radians, sin, sqrt

  center_lat, center_lon = 53.3422665, -6.2554468 #city center coordinates
  if longit_a is None or latit_a is None:
    # Float literal: the UDF is declared "float", and a Python int return
    # value may come back as NULL instead of being converted.
    return 9999.0

  lat_a, lon_a, lat_b, lon_b = map(radians, (longit_a, latit_a, center_lat, center_lon))
  half_dlat = (lat_b - lat_a) / 2
  half_dlon = (lon_b - lon_a) / 2
  hav = sin(half_dlat) ** 2 + cos(lat_a) * cos(lat_b) * sin(half_dlon) ** 2
  # Clamp against rounding slightly above 1, which would make asin() raise.
  central_angle = 2 * asin(min(1.0, sqrt(hav)))
  radius = 6371  # mean Earth radius in km
  return float(central_angle * radius * 1000)


learn_df = (
    learn_df
    # Distance between the previous and the current position of the vehicle.
    .withColumn(
        'distnce_between',
        get_distance(f.col('prev_lat'), f.col('prev_lon'), f.col('latitude'), f.col('longitude')),
    )
    # Midpoint of the two positions, and its distance from the city centre.
    .withColumn('lat_between', (f.col('prev_lat') + f.col('latitude')) / 2)
    .withColumn('lon_between', (f.col('prev_lon') + f.col('longitude')) / 2)
    .withColumn('dis_from_center', get_dis_from_center(f.col('lat_between'), f.col('lon_between')))
    # Both distances must be below 100000 (metres, as returned by the UDFs).
    .filter(f.col('distnce_between') < 100000)
    .filter(f.col('dis_from_center') < 100000)
    # Distance divided by travel time; 55 m/s is roughly 200 km/h.
    .withColumn('speed_in_seg', f.col('distnce_between') / f.col('time_to_reach_next'))
    .filter(f.col('speed_in_seg') < 55)
    .drop('prev_lat', 'prev_lon', 'lat_between', 'lon_between')
)

In [12]:
# Per (journeyPatternId, origin, dest): how many rows, and the mean of 'date'.
cols = ['journeyPatternId', 'origin','dest']
w = Window.partitionBy(cols)
ranked = (
    learn_df
    .withColumn('specific', f.count('dest').over(w))
    .withColumn('avg_ts', f.mean('date').over(w))
)

# Per (journeyPatternId, origin): total rows, then the share taken by each dest.
cols = ['journeyPatternId', 'origin']
w = Window.partitionBy(cols)
ranked = (
    ranked
    .withColumn('total', f.count('origin').over(w))
    .withColumn('precentege', f.col('specific') / f.col('total'))
)

# One row per (journeyPatternId, origin, dest) carrying its share and mean timestamp.
true_paths = ranked.groupBy('journeyPatternId','origin','dest').max('precentege','avg_ts')
# top_score is computed before the 25% filter, most_relevent_ts after it;
# both use the (journeyPatternId, origin) window defined above.
true_paths = true_paths.withColumn('top_score', f.max('max(precentege)').over(w))
true_paths = true_paths.filter(true_paths['max(precentege)'] > 0.25)  # common transitions only (over 25%)
true_paths = true_paths.withColumn('most_relevent_ts', f.max('max(avg_ts)').over(w))

# Among the remaining candidates keep the one with the latest mean timestamp.
true_paths = true_paths.filter(true_paths['most_relevent_ts'] == true_paths['max(avg_ts)'])
true_paths = (
    true_paths
    .withColumnRenamed('max(precentege)', 'seg_score')
    .drop('max(avg_ts)', 'most_relevent_ts')
)

# true_paths.show()

# true_paths.show()

In [13]:
# true_paths.write.saveAsTable("true_paths_v2")
# Replace the in-memory true_paths with the copy stored in the warehouse.
# NOTE(review): no format is given to load(), so the session's default data
# source is used; header/inferSchema may have no effect there - confirm.
true_paths = (
    spark.read
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load('/user/hive/warehouse/true_paths_v2')
)

In [14]:
# Pull at most 100000 rows to the driver and rebuild a small DataFrame from
# them, reusing learn_df's column names.
small_list = learn_df.limit(100000).collect()
small_df = spark.createDataFrame(small_list, schema=learn_df.columns)

In [15]:
import six
# Print the correlation of every non-string column with the label.
# The column types come from the schema (dtypes) instead of fetching one row
# per column: that saves a Spark job per column and does not break when the
# frame is empty or a column's first value is NULL.
for col_name, col_type in small_df.drop('date').dtypes:
    if col_type != 'string':
        print( "Correlation for ", col_name, small_df.stat.corr('time_to_reach_next', col_name))

In [16]:
import matplotlib.pyplot as plt

small_pandas = small_df.toPandas()
# Drop identifiers, categorical flags and calendar columns before correlating.
small_pandas = small_pandas.drop(['vehicleId','only_date','id','journeyPatternId','just_stopped','hour','is_weekend','just_left','month','weekday','date','conges','from_to','seg_count'], axis = 1)

# The figure is named `fig`, not `f`: `f` is the notebook's alias for
# pyspark.sql.functions and must not be rebound to a matplotlib Figure.
fig = plt.figure(figsize=(10, 10))
plt.matshow(small_pandas.corr(), fignum=fig.number)
plt.xticks(range(small_pandas.shape[1]), small_pandas.columns, fontsize=8, rotation=45)
plt.yticks(range(small_pandas.shape[1]), small_pandas.columns, fontsize=8)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
display(fig)

In [17]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml import Pipeline

df = learn_df

numericCols =['delay','vehicleSpeed','actualDelay','max_in_seg','min_in_seg','stdev_seg','distnce_between','dis_from_center','hour','speed_in_seg','latitude','longitude','mean_seg','prev_time']
assembler = VectorAssembler(inputCols = numericCols, outputCol="features")
df = assembler.transform(df)
df = df.select([ 'features','time_to_reach_next'])

In [18]:
df.count()

In [19]:
# 99% / 1% split. A fixed seed keeps the split, and therefore the metrics
# reported by the model cells, repeatable across runs.
train, test = df.randomSplit([0.99, 0.01], seed=42)
# print("Training Dataset Count: " + str(train.count()))
# print("Test Dataset Count: " + str(test.count()))

In [20]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Elastic-net regularised linear regression on the training split.
lr = LinearRegression(
    labelCol='time_to_reach_next',
    featuresCol='features',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
lr_model = lr.fit(train)

# Preview a few predictions on the test split.
lr_predictions = lr_model.transform(test)
lr_predictions.select("prediction", "time_to_reach_next", "features").show(5)

# R2 comes from an evaluator over the predictions ...
lr_evaluator = RegressionEvaluator(metricName="r2", labelCol="time_to_reach_next", predictionCol="prediction")
lr_r2 = lr_evaluator.evaluate(lr_predictions)
print("R Squared (R2) on test data = %g" % lr_r2)

# ... while RMSE is read from the model's own evaluation summary.
test_result = lr_model.evaluate(test)
lr_rmse = test_result.rootMeanSquaredError
print("Root Mean Squared Error (RMSE) on test data = %g" % lr_rmse)

In [21]:
from pyspark.ml.regression import DecisionTreeRegressor
# Decision-tree regressor on the same train/test split.
dt = DecisionTreeRegressor(labelCol='time_to_reach_next', featuresCol='features')
dt_model = dt.fit(train)
dt_predictions = dt_model.transform(test)
dt_predictions.select("prediction", "time_to_reach_next", "features").show(5)

# Report R2 first, then RMSE; the evaluator is rebuilt for each metric.
for dt_metric, dt_report in (
    ("r2", "R Squared (R2) on test data = %g"),
    ("rmse", "Root Mean Squared Error (RMSE) on test data = %g"),
):
    dt_evaluator = RegressionEvaluator(labelCol="time_to_reach_next", predictionCol="prediction", metricName=dt_metric)
    print(dt_report % dt_evaluator.evaluate(dt_predictions))

In [22]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Gradient-boosted trees (10 iterations) on the same train/test split.
gbt = GBTRegressor(labelCol='time_to_reach_next', featuresCol='features', maxIter=10)
gbt_model = gbt.fit(train)
gbt_predictions = gbt_model.transform(test)
gbt_predictions.select("prediction", "time_to_reach_next", "features").show(5)

# Report R2 first, then RMSE; the evaluator is rebuilt for each metric.
for gbt_metric, gbt_report in (
    ("r2", "R Squared (R2) on test data = %g"),
    ("rmse", "Root Mean Squared Error (RMSE) on test data = %g"),
):
    gbt_evaluator = RegressionEvaluator(labelCol="time_to_reach_next", predictionCol="prediction", metricName=gbt_metric)
    print(gbt_report % gbt_evaluator.evaluate(gbt_predictions))

In [23]:
# gbt_model.save('/user/hive/warehouse/streaming_gbt_model')

# from pyspark.ml.regression import GBTRegressionModel
# new_model = GBTRegressionModel.load('/user/hive/warehouse/super_gbt_model')

In [24]:
# all_seg_freq = learn_df.groupBy('from_to').count().orderBy('count', ascending=False)
# all_seg_freq.write.saveAsTable("all_segments_freq")
# Load the stored per-segment row counts and bring them to the driver.
all_segments_freq = spark.read.options(header='true', inferSchema = 'true').load('/user/hive/warehouse/all_segments_freq')
pandas_freq = all_segments_freq.toPandas()
filtered_freqs = pandas_freq[pandas_freq['count'] > 100]
print('all segments in data: ', len(pandas_freq))
print('segments with over 100 records: ', len(filtered_freqs))

# Rank explicitly before taking the 50 most frequent segments: the row order
# of a table read back from storage is not guaranteed to be the order it was
# written in.
top_50 = (
    filtered_freqs
    .sort_values('count', ascending=False)['from_to']
    .head(50)
    .tolist()
)

In [25]:
# Log-scaled histogram of the per-segment row counts.
segment_hist_ax = filtered_freqs['count'].plot(kind='hist', bins=100, log=True, title='Segment Histogram')
display(segment_hist_ax)

In [26]:
# Rank segments by count (descending) and plot count against rank.
filtered_freqs = filtered_freqs.sort_values('count', ascending=False)
filtered_freqs['nums'] = range(len(filtered_freqs))
segment_count_ax = filtered_freqs.plot(
    x='nums', y='count', legend=True, title='Segments Count', xlim=[-1000, 24000]
)
display(segment_count_ax)

In [27]:
from pyspark.ml.regression import GBTRegressor

# Train and score one GBT model per segment in top_50.
# The feature list, assembler, estimator and evaluator are identical for
# every segment, so they are built once outside the loop.
numericCols = ['delay','vehicleSpeed','actualDelay','max_in_seg','min_in_seg','stdev_seg','distnce_between','dis_from_center','hour','speed_in_seg','latitude','longitude','mean_seg','prev_time']
assembler = VectorAssembler(inputCols = numericCols, outputCol="features")
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'time_to_reach_next', maxIter=10)
gbt_evaluator = RegressionEvaluator(labelCol="time_to_reach_next", predictionCol="prediction", metricName="rmse")

total_errors = 0
for seg in top_50:
  df = learn_df[learn_df.from_to == seg]
  df = assembler.transform(df).select([ 'features','time_to_reach_next'])
  # Fixed seed so the per-segment RMSE values are repeatable across runs.
  train, test = df.randomSplit([0.7, 0.3], seed=42)
  gbt_model = gbt.fit(train)
  gbt_predictions = gbt_model.transform(test)
  temp_error = gbt_evaluator.evaluate(gbt_predictions)
  print("segment:", seg, "Root Mean Squared Error (RMSE) on test data = %g" %temp_error)
  total_errors += temp_error

# Report the accumulated error as a mean; skipped when there are no segments.
if top_50:
  print("Mean RMSE over %d segments = %g" % (len(top_50), total_errors / len(top_50)))

In [28]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# Fit and score a GBT model on the current `train` / `test`.
# NOTE(review): this cell uses whatever `train` / `test` are bound when it
# runs; in notebook order that is the split left by the last iteration of the
# per-segment loop in the previous cell, not the full data set - confirm
# that is intended.
gbt = GBTRegressor(labelCol='time_to_reach_next', featuresCol='features', maxIter=10)
gbt_model = gbt.fit(train)
gbt_predictions = gbt_model.transform(test)
gbt_predictions.select("prediction", "time_to_reach_next", "features").show(5)

# Report R2 first, then RMSE; the evaluator is rebuilt for each metric.
for gbt_metric, gbt_report in (
    ("r2", "R Squared (R2) on test data = %g"),
    ("rmse", "Root Mean Squared Error (RMSE) on test data = %g"),
):
    gbt_evaluator = RegressionEvaluator(labelCol="time_to_reach_next", predictionCol="prediction", metricName=gbt_metric)
    print(gbt_report % gbt_evaluator.evaluate(gbt_predictions))

In [29]:
accidents_df = spark.read.csv("/FileStore/tables/accidents.csv", header = True)
events_df = spark.read.csv("/FileStore/tables/events.csv", header = True)

In [30]:
accidents_df = accidents_df.withColumnRenamed('date', 'only_date')
accidents_df = accidents_df.withColumn('date_time', accidents_df.date_time.cast('timestamp'))
accidents_df = accidents_df.withColumn('lat', accidents_df.lat.cast('float'))
accidents_df = accidents_df.withColumn('lon', accidents_df.lon.cast('float'))

accidents_df.show(5)

In [31]:
events_df = events_df.withColumnRenamed('date', 'only_date')
events_df = events_df.withColumn('lat', events_df.lat.cast('float'))
events_df = events_df.withColumn('lon', events_df.lon.cast('float'))

events_df.show(5)

In [32]:
import pyspark.sql.functions as f

# Pair every learn_df row with the accidents recorded on the same calendar day.
df_joined = learn_df.join(accidents_df,learn_df.only_date == accidents_df.only_date, how = 'left')
# Keep pairs closer than 1000 (metres, as returned by get_distance).
df_joined = df_joined.withColumn('distnce_from_acc',get_distance(f.col('lat'),f.col('lon'),f.col('latitude'),f.col('longitude')))
df_joined = df_joined.filter(df_joined.distnce_from_acc < 1000)

# Minutes between the observation and the accident (positive = after it).
df_joined = df_joined.withColumn('time_from_acc', (df_joined.date.cast("bigint") - df_joined.date_time.cast("bigint"))  / 60)
# time_from_acc is in minutes, so the window of 900 s before to 1800 s after
# the accident is expressed as -15 .. 30 here.
df_joined = df_joined.filter(df_joined.time_from_acc < 30).filter(df_joined.time_from_acc > -15)

# One flag per learn_df row, however many accidents matched it.
df_joined = df_joined.drop_duplicates(subset=['id'])
df_joined = df_joined.withColumn('near_accident', lit(1))
df_joined = df_joined.withColumnRenamed('id','acc_id')
accidints_ids = df_joined.select(['acc_id','near_accident']) #get all relevant id's

# Attach the flag to the full data set; fillna(0) turns the missing flag of
# unmatched rows into 0.
learn_with_accidents = learn_df.join(accidints_ids,learn_df.id == accidints_ids.acc_id, how = 'left_outer')
learn_with_accidents = learn_with_accidents.drop('acc_id')
learn_with_accidents = learn_with_accidents.fillna(0)

In [33]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# GBT on the base features plus the near_accident flag.
df = learn_with_accidents
numericCols = ['delay','vehicleSpeed','actualDelay','max_in_seg','min_in_seg','stdev_seg','distnce_between','dis_from_center','hour','speed_in_seg','latitude','longitude','mean_seg','prev_time','near_accident']
assembler = VectorAssembler(inputCols = numericCols, outputCol="features")
df = assembler.transform(df)
df = df.select([ 'features','time_to_reach_next'])
# Fixed seed so the reported RMSE is repeatable across runs.
train, test = df.randomSplit([0.7, 0.3], seed=42)
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'time_to_reach_next', maxIter=10)
gbt_model = gbt.fit(train)
gbt_predictions = gbt_model.transform(test)
gbt_evaluator = RegressionEvaluator(labelCol="time_to_reach_next", predictionCol="prediction", metricName="rmse")
temp_error = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" %temp_error)

In [34]:
import pyspark.sql.functions as f

# Pair every learn_df row with the events recorded on the same calendar day
# and compute the distance between the observation and the event.
df_joined = (
    learn_df
    .join(events_df, learn_df.only_date == events_df.only_date, how='left')
    .withColumn(
        'distnce_from_eve',
        get_distance(f.col('lat'), f.col('lon'), f.col('latitude'), f.col('longitude')),
    )
)
# Keep pairs closer than 1000 (metres, as returned by get_distance).
df_joined = df_joined.filter(df_joined.distnce_from_eve < 1000)

# One flag per learn_df row, however many events matched it.
df_joined = (
    df_joined
    .drop_duplicates(subset=['id'])
    .withColumn('near_event', lit(1))
    .withColumnRenamed('id', 'eve_id')
)
events_ids = df_joined.select(['eve_id', 'near_event'])  # ids of all rows near an event

# Attach the flag to the full data set; fillna(0) turns the missing flag of
# unmatched rows into 0.
learn_with_events = (
    learn_df
    .join(events_ids, learn_df.id == events_ids.eve_id, how='left_outer')
    .drop('eve_id')
    .fillna(0)
)

In [35]:
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

# GBT on the base features plus the near_event flag.
df = learn_with_events
numericCols = ['delay','vehicleSpeed','actualDelay','max_in_seg','min_in_seg','stdev_seg','distnce_between','dis_from_center','hour','speed_in_seg','latitude','longitude','mean_seg','prev_time','near_event']
assembler = VectorAssembler(inputCols = numericCols, outputCol="features")
df = assembler.transform(df)
df = df.select([ 'features','time_to_reach_next'])
# Fixed seed so the reported RMSE is repeatable across runs.
train, test = df.randomSplit([0.7, 0.3], seed=42)
gbt = GBTRegressor(featuresCol = 'features', labelCol = 'time_to_reach_next', maxIter=10)
gbt_model = gbt.fit(train)
gbt_predictions = gbt_model.transform(test)
gbt_evaluator = RegressionEvaluator(labelCol="time_to_reach_next", predictionCol="prediction", metricName="rmse")
temp_error = gbt_evaluator.evaluate(gbt_predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" %temp_error)

In [36]:
segment_data = learn_df.groupBy('from_to').mean('mean_seg','stdev_seg','max_in_seg','min_in_seg','prev_time','distnce_between','dis_from_center','speed_in_seg')


In [37]:
segment_data = segment_data.withColumnRenamed('avg(mean_seg)', 'mean_seg')
segment_data = segment_data.withColumnRenamed('avg(stdev_seg)', 'stdev_seg')
segment_data = segment_data.withColumnRenamed('avg(max_in_seg)', 'max_in_seg')
segment_data = segment_data.withColumnRenamed('avg(min_in_seg)', 'min_in_seg')
segment_data = segment_data.withColumnRenamed('avg(prev_time)', 'prev_time')
segment_data = segment_data.withColumnRenamed('avg(distnce_between)', 'distnce_between')
segment_data = segment_data.withColumnRenamed('avg(dis_from_center)', 'dis_from_center')
segment_data = segment_data.withColumnRenamed('avg(speed_in_seg)', 'speed_in_seg')

segment_data.write.saveAsTable("segment_data")