In [1]:
import numpy as np
import pandas as pd   


In [2]:
import numpy as np
import pandas as pd   
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import torch
import itertools

import pyro
import pyro.distributions as dist
from pyro.contrib.autoguide import AutoDiagonalNormal, AutoMultivariateNormal
from pyro.infer import MCMC, NUTS, HMC, SVI, Trace_ELBO
from pyro.optim import Adam, ClippedAdam

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# fix random generator seed (for reproducibility of results)
np.random.seed(42)

# matplotlib options
palette = itertools.cycle(sns.color_palette())
plt.style.use('ggplot')
%matplotlib inline
plt.rcParams['figure.figsize'] = (16, 10)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv("Data/2018.csv")

In [4]:
df.FL_DATE = pd.to_datetime(df.FL_DATE, format="%Y-%m-%d")
df = df.set_index('FL_DATE')

In [5]:
df = df.loc['2018-01-01':'2018-03-01']

In [None]:
df.tail()

In [None]:
df.dtypes

In [6]:
df = df.fillna(0)

In [None]:
plt.hist(df["TAXI_OUT"])



In [7]:
X_features = np.concatenate([pd.get_dummies(df[x]) for x in ["ORIGIN", "DEST"]], axis=1)
df = df.drop(columns=['ORIGIN', 'DEST'])
X = np.concatenate([df[["TAXI_OUT"]], X_features], axis=1)
print(X_features.shape)
y = df["CARRIER_DELAY"].values


(1110848, 674)


In [None]:
print(X_features.shape)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


# standardize input features
X_train_mean = X_train.mean(axis=0)
X_train_std = X_train.std(axis=0)
X_train = (X_train - X_train_mean) / X_train_std

X_test_mean = X_test.mean(axis=0)
X_test_std = X_test.std(axis=0)
X_test = (X_test - X_test_mean) / X_test_std

# standardize target
y_train_mean = y_train.mean()
y_train_std = y_train.std()
y_train = (y_train - y_train_mean) / y_train_std

y_test_mean = y_test.mean()
y_test_std = y_test.std()
y_test = (y_test - y_test_mean) / y_test_std


In [9]:
print("num train: %d" % len(y_train))
print("num test: %d" % len(y_test))

num train: 744268
num test: 366580


In [10]:
def compute_error(trues, predicted):
    corr = np.corrcoef(predicted, trues)[0,1]
    mae = np.mean(np.abs(predicted - trues))
    rae = np.sum(np.abs(predicted - trues)) / np.sum(np.abs(trues - np.mean(trues)))
    rmse = np.sqrt(np.mean((predicted - trues)**2))
    r2 = max(0, 1 - np.sum((trues-predicted)**2) / np.sum((trues - np.mean(trues))**2))
    return corr, mae, rae, rmse, r2

In [11]:

regr = linear_model.LinearRegression()
regr.fit(X_train, y_train)
y_hat = regr.predict(X_test)




corr, mae, rae, rmse, r2 = compute_error(y_test, y_hat)
print("CorrCoef: %.3f\nMAE: %.3f\nRMSE: %.3f\nR2: %.3f" % (corr, mae, rmse, r2))

CorrCoef: 0.001
MAE: 356234964.496
RMSE: 591449137.985
R2: 0.000


REALLY bad model

In [None]:
X.shape[1]

First a model with TAXI_OUT, DESTINATION and ORIGIN as inpot with same regression coefficients

In [12]:
def model(X, obs=None):
    #gp = pyro.sample("gp", dist.Gamma(k,theta))
    alpha = pyro.sample("alpha", dist.Normal(0., 1.))                   # Prior for the bias/intercept
    beta  = pyro.sample("beta", dist.Normal(torch.zeros(X.shape[1]), 
                                            torch.ones(X.shape[1])).to_event())    # Priors for the regression coeffcients
    sigma = pyro.sample("sigma", dist.HalfCauchy(5.))                   # Prior for the variance
    
    with pyro.plate("data"):
        y = pyro.sample("y", dist.Normal(alpha + X.matmul(beta), sigma), obs=obs)
        
    return y

In [13]:
# Prepare data for Pyro model
X_train_small = torch.tensor(X_train[:100,:]).float()
y_train_small = torch.tensor(y_train[:100]).float()

In [None]:
X_train_small

In [14]:
# Run inference in Pyro
nuts_kernel = NUTS(model)
mcmc = MCMC(nuts_kernel, num_samples=1000, warmup_steps=200, num_chains=1)
mcmc.run(X_train_small, y_train_small)

# Show summary of inference results
mcmc.summary()


Sample: 100%|██████████| 1200/1200 [03:34,  5.59it/s, step size=4.75e-02, acc. prob=0.926]



                mean       std    median      5.0%     95.0%     n_eff     r_hat
     alpha      0.12      0.79      0.11     -1.18      1.43   1306.14      1.00
   beta[0]     -0.11      0.51     -0.11     -0.96      0.68   1156.14      1.00
   beta[1]     -0.02      1.01      0.01     -1.57      1.71   3278.12      1.00
   beta[2]     -0.01      1.01     -0.02     -1.69      1.60   1969.67      1.00
   beta[3]     -0.01      0.34     -0.02     -0.55      0.54    801.82      1.00
   beta[4]     -0.02      1.02     -0.02     -1.65      1.67   2786.35      1.00
   beta[5]     -0.01      1.00     -0.01     -1.59      1.64   1857.33      1.00
   beta[6]     -0.02      1.01     -0.02     -1.57      1.64   2396.03      1.00
   beta[7]      0.00      0.97     -0.02     -1.60      1.51   2201.83      1.00
   beta[8]      0.01      0.12      0.01     -0.18      0.21   1259.36      1.00
   beta[9]     -0.01      0.98      0.04     -1.67      1.66   1789.76      1.00
  beta[10]     -0.01      1

In [None]:
mcmc.run(X_train_small, y_train_small)
