In [78]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [79]:
spark = (
    SparkSession.builder.appName("preprocessing of taxi data")
    .config("spark.sql.repl.eagerEval.enabled", True) 
    .config("spark.sql.parquet.cacheMetadata", "true")
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

In [80]:
aggregate_sdf = spark.read.parquet('../data/curated/analysed')

In [81]:
aggregate_sdf

PULocationID,fare_amount,extra,tip_amount,duration (minutes),date,time,year,month,day,average_temperature,precip,time_float,cos_time,sin_time,date_float,cos_date,sin_date,month_float,cos_month,sin_month,trip_value,holiday,high_value
238,6.0,0.5,2.0,4.367,2016-04-01,00:41:18,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.5882757,False,True
166,17.0,0.5,3.65,19.2,2016-04-01,00:24:47,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.898875,False,False
164,5.5,0.5,1.35,4.133,2016-04-01,00:45:45,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.4511493,False,True
170,5.5,0.5,1.7,3.517,2016-04-01,00:19:40,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.7865226,False,True
158,10.0,0.5,2.8,12.733,2016-04-01,00:04:05,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.85233647,False,False
170,21.0,0.5,4.45,24.783,2016-04-01,00:30:36,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.8544244,False,False
246,5.5,0.5,1.35,5.817,2016-04-01,00:42:39,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.031047,False,False
164,7.0,0.5,1.65,6.433,2016-04-01,00:53:36,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.1606405,False,True
161,4.5,0.5,1.2,3.7,2016-04-01,00:50:50,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,1.3673513,False,True
48,22.5,0.5,4.75,33.267,2016-04-01,00:13:26,2016,4,1,70.34,0.37,0.020833334,0.9914449,0.13052619,0.5744048,-0.8926982,-0.45065498,0.25,6.123234000000001e-17,1.0,0.68067455,False,False


In [82]:
cnt_cond = lambda cond: F.sum(F.when(cond, 1).otherwise(0))
aggregate_sdf = aggregate_sdf.groupBy("PULocationID", "date", "time_float") \
                             .agg(F.first("average_temperature") ,
                                  F.first("time"),
                                  F.first("precip"),
                                  F.first("cos_time"),
                                  F.first("sin_time"),
                                  F.first("date_float"),
                                  F.first("cos_date"),
                                  F.first("sin_date"),
                                  F.first("month_float"),
                                  F.first("cos_month"),
                                  F.first("sin_month"),
                                  F.first("trip_value"),
                                  F.first("holiday"),
                                  cnt_cond(F.col('high_value') == True)
                                 )

In [83]:

aggregate_sdf = aggregate_sdf.withColumnRenamed("first(average_temperature)", "average_temperature") \
                             .withColumnRenamed("first(time)", "time") \
                             .withColumnRenamed("first(precip)", "precip") \
                             .withColumnRenamed("first(cos_time)", "cos_time")	\
                             .withColumnRenamed("first(sin_time)", "sin_time")	\
                             .withColumnRenamed("first(date_float)", "date_float")	\
                             .withColumnRenamed("first(cos_date)", "cos_date")	\
                             .withColumnRenamed("first(sin_date)", "sin_date")	\
                             .withColumnRenamed("first(month_float)", "month_float") \
                             .withColumnRenamed("first(cos_month)", "cos_month") \
                             .withColumnRenamed("first(holiday)", "holiday") \
                             .withColumnRenamed("first(trip_value)", "trip_value") \
                             .withColumnRenamed("sum(CASE WHEN (high_value = true) THEN 1 ELSE 0 END)", "high_value_trips") \
                             .withColumnRenamed("first(sin_month)", "sin_month") \
                             .withColumnRenamed("first(trip_value)", "trip_value") 
                            

In [84]:
aggregate_sdf.write.mode('overwrite').parquet('../data/curated/aggregated')

                                                                                

In [85]:
train_sdf = aggregate_sdf.filter(F.col("date") <= "2016-12-31")
valid_sdf = aggregate_sdf.filter((F.col("date") > "2016-12-31") & (F.col("date") <= "2017-02-28"))
test_sdf = aggregate_sdf.filter((F.col("date") > "2017-02-28") & (F.col("date") <= "2017-05-31"))


In [86]:
print(train_sdf.count())
print(valid_sdf.count())
print(test_sdf.count())

                                                                                

927997


                                                                                

141709




221574


                                                                                

In [87]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

train = train_sdf.drop("date", "first(trip_value)").toPandas()
valid = valid_sdf.drop("date", "first(trip_value)").toPandas()
test = test_sdf.drop("date", "first(trip_value)").toPandas()


                                                                                

In [88]:
xname =  ['PULocationID',
          'average_temperature',
          'precip',
          'time_float',
          'cos_time',
          'sin_time',
          'date_float',
          'cos_date',
          'sin_date',
          'month_float',
          'cos_month',
          'sin_month',
          'holiday']
xtrain = train[xname]
xvalid = valid[xname]
xtest = test[xname]

ytrain = np.log10(train['high_value_trips'] + 1)
yvalid = np.log10(valid['high_value_trips'] + 1)
ytest = np.log10(test['high_value_trips'] + 1)


In [90]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

ohe = OneHotEncoder(handle_unknown='ignore')

Train_data_to_ohe = xtrain[['PULocationID']]
Train_data_ohe = ohe.fit_transform(Train_data_to_ohe).toarray()

Train_data_ohe = pd.DataFrame(Train_data_ohe, columns=list(ohe.get_feature_names_out(['PULocationID'])))

xtrain = xtrain.drop(['PULocationID'], axis=1)

for col in Train_data_ohe.columns:
    new_col = Train_data_ohe[col]
    new_col.index = range(len(new_col))

    xtrain[str(col)] = new_col


Val_data_to_ohe = xvalid[['PULocationID']]
Val_data_ohe = ohe.transform(Val_data_to_ohe).toarray()

Val_data_ohe = pd.DataFrame(Val_data_ohe, columns=list(ohe.get_feature_names_out(['PULocationID'])))

xvalid = xvalid.drop(['PULocationID'], axis=1)

for col in Val_data_ohe.columns:
    new_col = Val_data_ohe[col]
    new_col.index = range(len(new_col))

    xvalid[str(col)] = new_col


Test_data_to_ohe = xtest[['PULocationID']]
Test_data_ohe = ohe.transform(Test_data_to_ohe).toarray()

Test_data_ohe = pd.DataFrame(Test_data_ohe, columns=list(ohe.get_feature_names_out(['PULocationID'])))

xtest = xtest.drop(['PULocationID'], axis=1)

for col in Test_data_ohe.columns:
    new_col = Test_data_ohe[col]
    new_col.index = range(len(new_col))

    xtest[str(col)] = new_col

KeyError: "None of [Index(['PULocationID'], dtype='object')] are in the [columns]"

In [66]:
from sklearn.metrics import mean_squared_error


reg = RandomForestRegressor(n_estimators=50, max_depth=30, n_jobs=-1, warm_start=True)
print("fitting")
reg.fit(xtrain,ytrain)
print("training")
training_accuracy = reg.score(xtrain, ytrain)
valid_accuracy = reg.score(xvalid, yvalid)
testing_accuracy = reg.score(xtest, ytest)
rmsetrain = np.sqrt(mean_squared_error(reg.predict(xtrain),ytrain))
rmsevalid = np.sqrt(mean_squared_error(reg.predict(xvalid),yvalid))
rmsetest = np.sqrt(mean_squared_error(reg.predict(xtest),ytest))
print("R^2 (train) = %0.3f, R^2 (valid) = %0.3f, R^2 (test) = %0.3f, RMSE (train) = %0.3f, RMSE (valid) = %0.3f, RMSE (test) = %0.3f" % 
        (training_accuracy, valid_accuracy, testing_accuracy, rmsetrain, rmsevalid, rmsetest))

fitting


KeyboardInterrupt: 

In [15]:
import pandas as pd
a = pd.DataFrame(np.round(np.power(10,np.column_stack((reg.predict(xvalid),yvalid))) - 1,decimals=0).astype(int))

In [34]:
import operator


dict_feat_imp = dict(zip(list(xtrain.columns.values),reg.feature_importances_))
sorted_features = sorted(dict_feat_imp.items(), key=operator.itemgetter(1), reverse=True)
sorted_features

[('time_float', 0.03697545899976474),
 ('PULocationID_236', 0.03204937126547357),
 ('PULocationID_237', 0.03204697070688485),
 ('PULocationID_170', 0.03195155559292659),
 ('PULocationID_186', 0.03118746370183021),
 ('PULocationID_141', 0.031104004756603423),
 ('PULocationID_90', 0.031076594300570064),
 ('PULocationID_163', 0.031030844232069758),
 ('PULocationID_231', 0.031019879762767422),
 ('PULocationID_48', 0.030998195862123715),
 ('PULocationID_239', 0.030987504882957432),
 ('PULocationID_229', 0.030970360678122085),
 ('PULocationID_249', 0.03096913766526675),
 ('PULocationID_230', 0.03096546493322501),
 ('PULocationID_140', 0.030955799543883104),
 ('PULocationID_162', 0.030703997217411138),
 ('PULocationID_68', 0.03058781490437246),
 ('PULocationID_138', 0.030457719981457045),
 ('PULocationID_164', 0.03043747087585426),
 ('PULocationID_79', 0.030405712018221922),
 ('PULocationID_132', 0.0303147771010766),
 ('PULocationID_107', 0.03024318307140166),
 ('PULocationID_234', 0.03023279

In [95]:
from sklearn.feature_selection import f_regression

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
#%%
# F-test
f = SelectKBest(f_regression, k=len(xtrain.columns))
f.fit(xtrain, ytrain)
#%%
# get the ranked features names and scores
kbest_scores1 = pd.DataFrame({'features': f.get_feature_names_out(), 'scores': f.scores_})
kbest_scores1 = kbest_scores1.sort_values('scores', ascending=False)
kbest_scores1



Unnamed: 0,features,scores
248,PULocationID_237,17791.006959
247,PULocationID_236,17750.548582
181,PULocationID_170,17251.440011
173,PULocationID_162,15999.901749
250,PULocationID_239,15712.925675
...,...,...
114,PULocationID_103,1.813995
121,PULocationID_110,0.813602
1,precip,0.470941
235,PULocationID_224,0.036190


In [92]:
import xgboost
xtrain["precip"] = pd.to_numeric(xtrain['precip'], errors='coerce')
xtest["precip"] = pd.to_numeric(xtrain['precip'], errors='coerce')
r = xgboost.XGBRegressor()
print("training")
r.fit(xtrain, ytrain)
print("testing")
training_accuracy = r.score(xtrain, ytrain)
print(training_accuracy)

training
testing
0.8799348469849553


In [94]:

ypred = r.predict(xtest)
mse = mean_squared_error(ytest, ypred)

print("MSE: %.2f" % mse)
print("RMSE: %.2f" % (mse**(1/2.0)))
testing_accuracy = r.score(xtest, ytest)
print(testing_accuracy)

MSE: 0.06
RMSE: 0.24
0.8710911130434095


In [59]:
# from sklearn.model_selection import GridSearchCV


# xgb = xgboost.XGBRegressor(
#  learning_rate =0.1,
#  max_depth=5,
#  min_child_weight=1,
#  gamma=0,
#  subsample=0.8,
#  colsample_bytree=0.8,
#  nthread=4,
#  scale_pos_weight=1,
#  seed=27)

# param = {
#  'max_depth':range(3,10),
#  'min_child_weight':range(1,10),
#  'gamma':[i/10.0 for i in range(0,5)],
#  'subsample':[i/10.0 for i in range(6,10)],
#  'colsample_bytree':[i/10.0 for i in range(6,10)],
#  'reg_alpha':[0, 0.001, 0.005, 0.01, 0.05],
#  'n_estimators': range(10, 100, 10)
# }

# gsearch = GridSearchCV(xgb, param_grid = param, n_jobs=-1, cv=5)
# gsearch.fit(xtrain, ytrain)
# print(gsearch.grid_scores_, gsearch.best_params_, gsearch.best_score_)



KeyboardInterrupt: 