# This is a basic Starter Kernel for the New York City Taxi Fare Prediction Playground Competition 
Here we'll use a simple linear model based on the travel vector from the taxi's pickup location to dropoff location which predicts the `fare_amount` of each ride.

This kernel uses some `pandas` and mostly `numpy` for the critical work.  There are many higher-level libraries you could use instead, for example `sklearn` or `statsmodels`.  

In [6]:
%pip install pandas
%pip install numpy
%pip install matplotlib
%pip install seaborn
%pip install tensorflow
%pip install keras
%pip install scikit-learn

# Initial Python environment setup...
import numpy as np # linear algebra
import pandas as pd # CSV file I/O (e.g. pd.read_csv)
import os # reading the input files we have access to

train_df =  pd.read_csv('train.csv', nrows = 10_000_000)
train_df.dtypes



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [7]:
# absolute differences in latitude and longitude
def add_travel_vector_features(df):
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()

add_travel_vector_features(train_df)

# remove NaNs
train_df = train_df.dropna(how = 'any', axis = 'rows')
print('New size: %d' % len(train_df))

# remove obvious outliers
train_df = train_df[(train_df.abs_diff_longitude < 5.0) & (train_df.abs_diff_latitude < 5.0)]


New size: 9999931


In [8]:
# describe the dataset
print(train_df.describe().transpose())

# auffaellig sind 0 passenger als min! genauso wie abs diff long & abs diff lat
train_df = train_df[(train_df.abs_diff_longitude != 0) & (train_df.abs_diff_latitude != 0)]
train_df = train_df[(train_df.passenger_count != 0)]
# also raus damit xD
print('New size: %d' % len(train_df))

# todo: export the datetime to a useful format
# New York ist UTC-5 als Zeitzone!
# print(train_df['pickup_datetime'][0])

                        count       mean        std          min        25%  \
fare_amount         9979187.0  11.330602   9.772747  -107.750000   6.000000   
pickup_longitude    9979187.0 -72.569888  10.751870 -3348.349457 -73.992087   
pickup_latitude     9979187.0  39.951586   6.587642 -3488.079513  40.734980   
dropoff_longitude   9979187.0 -72.569024  10.751762 -3348.349457 -73.991402   
dropoff_latitude    9979187.0  39.951931   6.587692 -3488.079513  40.734093   
passenger_count     9979187.0   1.684880   1.323364     0.000000   1.000000   
abs_diff_longitude  9979187.0   0.022498   0.038505     0.000000   0.005789   
abs_diff_latitude   9979187.0   0.021107   0.028934     0.000000   0.006574   

                          50%        75%          max  
fare_amount          8.500000  12.500000  1273.310000  
pickup_longitude   -73.981832 -73.967171  3442.185068  
pickup_latitude     40.752663  40.767142  2973.980450  
dropoff_longitude  -73.980177 -73.963753  3442.185068  
dropoff_

In [18]:
from sklearn import model_selection

# Url vom Beispiel - https://www.tensorflow.org/tutorials/keras/regression
# splitte die labels von den features
# print(train_df.head())

# einfaches modell ohne timestamps
train_labels = np.array(train_df['fare_amount'])
train_features_df = train_df.iloc[:, [7,8,9]]
print(train_features_df.head())
train_features = train_features_df.to_numpy()

# Split the data into training and testing sets
# x = features, y = labels
train_x, val_x, train_y, val_y = model_selection.train_test_split(train_features, train_labels, test_size=0.2, random_state=42)


   passenger_count  abs_diff_longitude  abs_diff_latitude
0                1            0.002701           0.009041
1                1            0.036780           0.070701
2                2            0.008504           0.010708
3                1            0.004437           0.024949
4                1            0.011440           0.015754


In [23]:
import tensorflow as tf
print(tf.__version__)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping
import keras


# normalisierungslayer --> will irgendwie gerade nicht 
# normalizer = tf.keras.layers.Normalization(axis=None)

model = keras.Sequential()
model.add(keras.layers.Dense(3, activation='relu', input_shape=[3]))
model.add(Dropout(0.25))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(Dropout(0.25))
model.add(keras.layers.Dense(64, activation='relu'))
model.add(keras.layers.Dense(1))

model.compile(
    loss='mean_squared_error', 
    optimizer=tf.keras.optimizers.legacy.Adam(0.001),
    metrics=[keras.metrics.RootMeanSquaredError()])

model.summary()

2.15.0
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_16 (Dense)            (None, 3)                 12        
                                                                 
 dropout_8 (Dropout)         (None, 3)                 0         
                                                                 
 dense_17 (Dense)            (None, 64)                256       
                                                                 
 dropout_9 (Dropout)         (None, 64)                0         
                                                                 
 dense_18 (Dense)            (None, 64)                4160      
                                                                 
 dense_19 (Dense)            (None, 1)                 65        
                                                                 
Total params: 4493 (17.55 KB)
Trainable params:

In [24]:
# train the model
epochs = 100
batch_size = 2048


history = model.fit(
    train_x,
    train_y,
    verbose=1,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(val_x, val_y))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Evaluate the model






# model.evaluate(X_test, y_test)

In [None]:
# load test data
test_df = pd.read_csv('/Users/lucamainitz/Library/Mobile Documents/com~apple~CloudDocs/HAWK/Machine_Learning/test.csv')
print(test_df.head())

test_df = test_df.iloc[:, [2,3,4,5,6]]

# implement prediction for submission
test_x = test_df.to_numpy()
test_y = test_df.iloc[:, 2].to_numpy()
print(test_y)

results = model.evaluate(x_test, y_test, batch_size=128)


                           key          pickup_datetime  pickup_longitude  \
0  2015-01-27 13:08:24.0000002  2015-01-27 13:08:24 UTC        -73.973320   
1  2015-01-27 13:08:24.0000003  2015-01-27 13:08:24 UTC        -73.986862   
2  2011-10-08 11:53:44.0000002  2011-10-08 11:53:44 UTC        -73.982524   
3  2012-12-01 21:12:12.0000002  2012-12-01 21:12:12 UTC        -73.981160   
4  2012-12-01 21:12:12.0000003  2012-12-01 21:12:12 UTC        -73.966046   

   pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count  
0        40.763805         -73.981430         40.743835                1  
1        40.719383         -73.998886         40.739201                1  
2        40.751260         -73.979654         40.746139                1  
3        40.767807         -73.990448         40.751635                1  
4        40.789775         -73.988565         40.744427                1  


NameError: name 'x_test' is not defined

In [None]:
# create subset for visualisation 

# berechnen der absoluten strecke luftlinie

train_df = train_df