In [15]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from scipy.stats import pearsonr

In [6]:
# Load the cleaned bus data; the 'Month' column is parsed as dates
# and becomes the DataFrame's index.
bus = pd.read_csv(
    'cleaned_bus.csv',
    parse_dates=True,
    index_col='Month',
)

In [17]:
# Reference values: statsmodels' ACF for lags 0 through 12,
# computed directly rather than via FFT.
sm_acf = sm.tsa.acf(
    bus,
    fft=False,
    nlags=12,
)
sm_acf

array([1.        , 0.96185762, 0.92394043, 0.89402625, 0.86212294,
       0.82747604, 0.7949652 , 0.7697359 , 0.74331566, 0.71953315,
       0.69907988, 0.68922447, 0.68267552])

In [19]:
# Hand-rolled ACF: correlate the series with a lagged copy of itself using
# scipy.stats.pearsonr. Shifting by k introduces k leading nulls, so the first
# k rows are cut from both the shifted copy and the unshifted series to keep
# the two samples aligned and null-free.
trimmed_acf = np.array([
    pearsonr(
        bus.riders.shift(k).iloc[k:],  # lagged copy, nulls removed
        bus.riders.iloc[k:],           # matching rows of the original
    )[0]                               # element 0 is r; element 1 is the p-value
    for k in range(13)
])
trimmed_acf

array([1.        , 0.97784005, 0.95790248, 0.94820686, 0.93529963,
       0.92144877, 0.91243471, 0.90775495, 0.90613805, 0.90499072,
       0.90268042, 0.91201503, 0.92676713])

In [20]:
# The two estimates disagree; show the lag-by-lag gap (statsmodels minus trimmed Pearson).
np.subtract(sm_acf, trimmed_acf)

array([ 0.        , -0.01598243, -0.03396205, -0.05418061, -0.07317669,
       -0.09397274, -0.11746951, -0.13801905, -0.16282239, -0.18545757,
       -0.20360053, -0.22279057, -0.24409161])

In [21]:
# Hypothesis: perhaps acf pads the shifted series with zeroes instead of dropping rows?
# The statsmodels source doesn't appear to do that, but it is cheap to check:
# fill the nulls created by the shift with 0 and correlate against the full series.
zeroed_acf = np.array([
    pearsonr(bus.riders.shift(k).fillna(0), bus.riders)[0]  # keep r, discard the p-value
    for k in range(13)
])
zeroed_acf

array([1.        , 0.95881238, 0.92946761, 0.91543468, 0.90219115,
       0.89313096, 0.89021813, 0.89252098, 0.89759803, 0.90131633,
       0.90374304, 0.91321243, 0.9245165 ])

In [22]:
# Still no match: lag-by-lag gap between statsmodels and the zero-filled Pearson version.
np.subtract(sm_acf, zeroed_acf)

array([ 0.        ,  0.00304524, -0.00552718, -0.02140843, -0.04006821,
       -0.06565492, -0.09525293, -0.12278508, -0.15428237, -0.18178318,
       -0.20466316, -0.22398796, -0.24184098])

In [None]:
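# so why does statsmodels give a different ACF than calculating directly with Pearson's r?
# likely answer: they are two different estimators.
#   - pearsonr centres and scales each pair of segments with that segment's own
#     mean and standard deviation, so every lag is normalised separately.
#   - sm.tsa.acf subtracts the mean of the *whole* series from both copies and
#     divides each lag's autocovariance by the lag-0 autocovariance; with the
#     default adjusted=False the sum of the n - k products is still divided by n,
#     which shrinks the estimate further as the lag k grows.
# if the series trends, the segment means drift away from the overall mean, which
# would be consistent with the gap widening at higher lags.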