In [243]:
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pandas as pd
import scipy.stats
import numpy as np
from math import sqrt

In [244]:
#set up the metric calculations
def CalcRSqaured(observed, estimated):
    """Calculate the r^2 from a series of observed and estimated target values
    inputs:
    Observed: Series of actual observed values
    estimated: Series of predicted values"""
    
    r, p = scipy.stats.pearsonr(observed, estimated)
    R2 = r **2
    
    return R2

def CalcRMSE(observed, estimated):
    """Calculate Root Mean Square Error between a series of observed and estimated values
    inputs:
    Observed: Series of actual observed values
    estimated: Series of predicted values"""
    
    res = (observed -estimated)**2
    RMSE = round(sqrt(res.mean()), 3)
    
    return RMSE

In [245]:
cdatasub = pd.read_csv("london_flows.csv")

In [246]:
file_network = cdatasub

In [247]:
file_network = file_network.drop(file_network[file_network.distance==0].index, axis=0)
file_network = file_network.drop(file_network[file_network.jobs==0].index, axis=0)
file_network = file_network.drop(file_network[file_network.population==0].index, axis=0)

In [248]:
cdatasubmat = cdatasub.pivot_table(values ="flows", index="station_origin", columns = "station_destination",
                                    aggfunc=np.sum, margins=True)
cdatasubmat

  cdatasubmat = cdatasub.pivot_table(values ="flows", index="station_origin", columns = "station_destination",
  cdatasubmat = cdatasub.pivot_table(values ="flows", index="station_origin", columns = "station_destination",
  cdatasubmat = cdatasub.pivot_table(values ="flows", index="station_origin", columns = "station_destination",


station_destination,Abbey Road,Acton Central,Acton Town,Aldgate,Aldgate East,All Saints,Alperton,Amersham,Anerley,Angel,...,Wimbledon,Wimbledon Park,Wood Green,Wood Lane,Wood Street,Woodford,Woodgrange Park,Woodside Park,Woolwich Arsenal,All
station_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey Road,,,,,,,,,,,...,,,,,,,,,32.0,599
Acton Central,,,,,,,,,,,...,,,,,,,0.0,,,1224
Acton Town,,,,3.0,17.0,,35.0,0.0,,11.0,...,77.0,3.0,6.0,9.0,,0.0,,0.0,,3745
Aldgate,,,0.0,,0.0,,,0.0,,17.0,...,0.0,,4.0,8.0,,0.0,,0.0,,2886
Aldgate East,,,2.0,0.0,,,0.0,0.0,,20.0,...,24.0,0.0,0.0,12.0,,1.0,,1.0,,3172
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodford,,,2.0,5.0,47.0,,,,,22.0,...,2.0,,1.0,,,,,,,4868
Woodgrange Park,,0.0,,,,,,,,,...,,,,,,,,,,530
Woodside Park,,,1.0,26.0,11.0,,0.0,,,59.0,...,0.0,,0.0,,,,,,,3093
Woolwich Arsenal,20.0,,,,,7.0,,,,,...,,,,,,,,,,7892


In [249]:
x_variables = ["population", "jobs", "distance"]
log_x_vars = []
for x in x_variables:
    file_network[f"log_{x}"] = np.log(file_network[x])
    log_x_vars.append(f"log_{x}")

formula = 'flows ~ station_origin + log_jobs + distance-1'

# run a production constrained sim.
prodSim = smf.glm(formula=formula, data=file_network, family=sm.families.Poisson()).fit()

# let's have a look at it's summary.
print(prodSim.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:                  flows   No. Observations:                61413
Model:                            GLM   Df Residuals:                    61013
Model Family:                 Poisson   Df Model:                          399
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -9.0994e+05
Date:                Sat, 06 Apr 2024   Deviance:                   1.6477e+06
Time:                        12:47:10   Pearson chi2:                 2.40e+06
No. Iterations:                     8   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                                                  coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------

In [250]:
#create some Oi and Dj columns in the dataframe and store row and column totals in them:
#to create O_i, take file_network ...then... group by origcodenew ...then... summarise by calculating the sum of Total
O_i = pd.DataFrame(file_network.groupby(["station_origin"])["flows"].agg(np.sum))
O_i.rename(columns={"flows":"O_i"}, inplace = True)
file_network = file_network.merge(O_i, on = "station_origin", how = "left" )

D_j = pd.DataFrame(file_network.groupby(["station_destination"])["flows"].agg(np.sum))
D_j.rename(columns={"flows":"D_j"}, inplace = True)
file_network = file_network.merge(D_j, on = "station_destination", how = "left" )

  O_i = pd.DataFrame(file_network.groupby(["station_origin"])["flows"].agg(np.sum))
  D_j = pd.DataFrame(file_network.groupby(["station_destination"])["flows"].agg(np.sum))


In [251]:
# we can do this by pulling out the parameter values.
coefs = pd.DataFrame(prodSim.params)
coefs.reset_index(inplace=True)
coefs.rename(columns={0: "alpha_i", "index": "coef"}, inplace=True)
coefs

Unnamed: 0,coef,alpha_i
0,station_origin[Abbey Road],-2.914322
1,station_origin[Acton Central],-1.162092
2,station_origin[Acton Town],-1.613081
3,station_origin[Aldgate],-2.943047
4,station_origin[Aldgate East],-2.854752
...,...,...
395,station_origin[Woodgrange Park],-0.896422
396,station_origin[Woodside Park],-1.149110
397,station_origin[Woolwich Arsenal],0.518041
398,log_jobs,0.755222


In [252]:
to_repl = ["(station_origin)", "\[", "\]"]
for x in to_repl:
    coefs["coef"] = coefs["coef"].str.replace(x, "",regex=True)
coefs

Unnamed: 0,coef,alpha_i
0,Abbey Road,-2.914322
1,Acton Central,-1.162092
2,Acton Town,-1.613081
3,Aldgate,-2.943047
4,Aldgate East,-2.854752
...,...,...
395,Woodgrange Park,-0.896422
396,Woodside Park,-1.149110
397,Woolwich Arsenal,0.518041
398,log_jobs,0.755222


In [253]:
# then once you have done this you can join them back into the dataframes
file_network = file_network.merge(coefs, left_on="station_origin", right_on="coef", how="left")
file_network.drop(columns=["coef"], inplace=True)

# check this has worked
file_network.head()

Unnamed: 0,station_origin,station_destination,flows,population,jobs,distance,log_population,log_jobs,log_distance,O_i,D_j,alpha_i
0,Abbey Road,Bank and Monument,0,599,78549,8131.525097,6.395262,11.271478,9.003504,599,78549,-2.914322
1,Abbey Road,Beckton,1,599,442,8510.121774,6.395262,6.09131,9.049012,599,442,-2.914322
2,Abbey Road,Blackwall,3,599,665,3775.448872,6.395262,6.499787,8.236275,599,665,-2.914322
3,Abbey Road,Canary Wharf,1,599,58772,5086.51422,6.395262,10.981421,8.534348,599,58772,-2.914322
4,Abbey Road,Canning Town,37,599,15428,2228.923167,6.395262,9.643939,7.709274,599,15428,-2.914322


In [254]:
alpha_i = prodSim.params[0:-2]
gamma = prodSim.params[-2]
beta = -prodSim.params[-1]

  gamma = prodSim.params[-2]
  beta = -prodSim.params[-1]


In [255]:
alpha_i

station_origin[Abbey Road]         -2.914322
station_origin[Acton Central]      -1.162092
station_origin[Acton Town]         -1.613081
station_origin[Aldgate]            -2.943047
station_origin[Aldgate East]       -2.854752
                                      ...   
station_origin[Wood Street]        -0.942621
station_origin[Woodford]           -0.633605
station_origin[Woodgrange Park]    -0.896422
station_origin[Woodside Park]      -1.149110
station_origin[Woolwich Arsenal]    0.518041
Length: 398, dtype: float64

In [256]:
gamma

0.7552215895693627

In [257]:
beta

0.00015316619346473007

In [258]:
file_network["prodsimest1"] = np.round(prodSim.mu,0)

In [259]:
cdatasubmatS = file_network.pivot_table(values ="prodsimest1", index="station_origin", columns = "station_destination",
                                    aggfunc=np.sum, margins=True)
cdatasubmatS

  cdatasubmatS = file_network.pivot_table(values ="prodsimest1", index="station_origin", columns = "station_destination",
  cdatasubmatS = file_network.pivot_table(values ="prodsimest1", index="station_origin", columns = "station_destination",
  cdatasubmatS = file_network.pivot_table(values ="prodsimest1", index="station_origin", columns = "station_destination",


station_destination,Abbey Road,Acton Central,Acton Town,Aldgate,Aldgate East,All Saints,Alperton,Amersham,Anerley,Angel,...,Wimbledon,Wimbledon Park,Wood Green,Wood Lane,Wood Street,Woodford,Woodgrange Park,Woodside Park,Woolwich Arsenal,All
station_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey Road,,,,,,,,,,,...,,,,,,,,,7.0,595.0
Acton Central,,,,,,,,,,,...,,,,,,,0.0,,,1226.0
Acton Town,,,,13.0,13.0,,14.0,0.0,,16.0,...,13.0,3.0,2.0,20.0,,0.0,,1.0,,3744.0
Aldgate,,,1.0,,37.0,,,0.0,,27.0,...,2.0,,2.0,2.0,,1.0,,1.0,,2885.0
Aldgate East,,,1.0,40.0,,,0.0,0.0,,29.0,...,2.0,0.0,3.0,2.0,,1.0,,1.0,,3160.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodford,,,1.0,41.0,52.0,,,,,30.0,...,2.0,,6.0,,,,,,,4867.0
Woodgrange Park,,1.0,,,,,,,,,...,,,,,,,,,,532.0
Woodside Park,,,2.0,19.0,19.0,,0.0,,,32.0,...,3.0,,4.0,,,,,,,3100.0
Woolwich Arsenal,34.0,,,,,36.0,,,,,...,,,,,,,,,,7893.0


In [260]:
CalcRSqaured(file_network["flows"],file_network["prodsimest1"])

0.46806484435428586

In [261]:
CalcRMSE(file_network["flows"],file_network["prodsimest1"])

96.263

## Scenario A

In [262]:
def new_sal(row):
    if row["station_destination"] == "Canary Wharf":
        val = row["jobs"]/2
    else:
        val = row["jobs"]
    return val


file_network["jobs_scenario_A"] = file_network.apply(new_sal, axis=1)
file_network.head(5)

Unnamed: 0,station_origin,station_destination,flows,population,jobs,distance,log_population,log_jobs,log_distance,O_i,D_j,alpha_i,prodsimest1,jobs_scenario_A
0,Abbey Road,Bank and Monument,0,599,78549,8131.525097,6.395262,11.271478,9.003504,599,78549,-2.914322,78.0,78549.0
1,Abbey Road,Beckton,1,599,442,8510.121774,6.395262,6.09131,9.049012,599,442,-2.914322,1.0,442.0
2,Abbey Road,Blackwall,3,599,665,3775.448872,6.395262,6.499787,8.236275,599,665,-2.914322,4.0,665.0
3,Abbey Road,Canary Wharf,1,599,58772,5086.51422,6.395262,10.981421,8.534348,599,58772,-2.914322,99.0,29386.0
4,Abbey Road,Canning Town,37,599,15428,2228.923167,6.395262,9.643939,7.709274,599,15428,-2.914322,56.0,15428.0


In [265]:
file_network_A = file_network
file_network_A = file_network_A.drop(columns=["jobs"])
file_network_A = file_network_A.drop(columns=["log_jobs"])
file_network_A["jobs"] = file_network_A["jobs_scenario_A"]

In [266]:
x_variables = ["jobs"]
log_x_vars = []
for x in x_variables:
    file_network_A[f"log_{x}"] = np.log(file_network_A[x])
    log_x_vars.append(f"log_{x}")

file_network_A

Unnamed: 0,station_origin,station_destination,flows,population,distance,log_population,log_distance,O_i,D_j,alpha_i,prodsimest1,jobs_scenario_A,jobs,log_jobs
0,Abbey Road,Bank and Monument,0,599,8131.525097,6.395262,9.003504,599,78549,-2.914322,78.0,78549.0,78549.0,11.271478
1,Abbey Road,Beckton,1,599,8510.121774,6.395262,9.049012,599,442,-2.914322,1.0,442.0,442.0,6.091310
2,Abbey Road,Blackwall,3,599,3775.448872,6.395262,8.236275,599,665,-2.914322,4.0,665.0,665.0,6.499787
3,Abbey Road,Canary Wharf,1,599,5086.514220,6.395262,8.534348,599,58772,-2.914322,99.0,29386.0,29386.0,10.288274
4,Abbey Road,Canning Town,37,599,2228.923167,6.395262,7.709274,599,15428,-2.914322,56.0,15428.0,15428.0,9.643939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61408,Woolwich Arsenal,Tower Gateway,127,7892,13401.795549,8.973605,9.503144,7892,3342,0.518041,99.0,3342.0,3342.0,8.114325
61409,Woolwich Arsenal,West Ham,608,7892,8701.454361,8.973605,9.071245,7892,5487,0.518041,295.0,5487.0,5487.0,8.610137
61410,Woolwich Arsenal,West India Quay,6,7892,9536.720451,8.973605,9.162905,7892,400,0.518041,36.0,400.0,400.0,5.991465
61411,Woolwich Arsenal,West Silvertown,81,7892,5355.248554,8.973605,8.585832,7892,893,0.518041,125.0,893.0,893.0,6.794587


In [272]:
# to check everything works, recreate the original estimates
file_network_A["prodsimestA2"] = np.exp(file_network_A["alpha_i"]+gamma*file_network_A["log_jobs"] - beta*file_network_A["distance"])


# round
file_network_A["prodsimestA2"] = round(file_network_A["prodsimestA2"],0)

cdatasubmatA2 = file_network_A.pivot_table(values ="prodsimestA2", index="station_origin", columns = "station_destination",
                                    aggfunc=np.sum, margins=True)
cdatasubmatA2

  cdatasubmatA2 = file_network_A.pivot_table(values ="prodsimestA2", index="station_origin", columns = "station_destination",
  cdatasubmatA2 = file_network_A.pivot_table(values ="prodsimestA2", index="station_origin", columns = "station_destination",
  cdatasubmatA2 = file_network_A.pivot_table(values ="prodsimestA2", index="station_origin", columns = "station_destination",


station_destination,Abbey Road,Acton Central,Acton Town,Aldgate,Aldgate East,All Saints,Alperton,Amersham,Anerley,Angel,...,Wimbledon,Wimbledon Park,Wood Green,Wood Lane,Wood Street,Woodford,Woodgrange Park,Woodside Park,Woolwich Arsenal,All
station_origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abbey Road,,,,,,,,,,,...,,,,,,,,,7.0,555.0
Acton Central,,,,,,,,,,,...,,,,,,,0.0,,,1226.0
Acton Town,,,,13.0,13.0,,14.0,0.0,,16.0,...,13.0,3.0,2.0,20.0,,0.0,,1.0,,3730.0
Aldgate,,,1.0,,37.0,,,0.0,,27.0,...,2.0,,2.0,2.0,,1.0,,1.0,,2854.0
Aldgate East,,,1.0,40.0,,,0.0,0.0,,29.0,...,2.0,0.0,3.0,2.0,,1.0,,1.0,,3117.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Woodford,,,1.0,41.0,52.0,,,,,30.0,...,2.0,,6.0,,,,,,,4765.0
Woodgrange Park,,1.0,,,,,,,,,...,,,,,,,,,,532.0
Woodside Park,,,2.0,19.0,19.0,,0.0,,,32.0,...,3.0,,4.0,,,,,,,3083.0
Woolwich Arsenal,34.0,,,,,36.0,,,,,...,,,,,,,,,,7290.0


In [273]:
#calculate some new Dj^gamma and d_ij^beta values
Dj2_gamma = file_network_A["log_jobs"]**gamma
dist_beta = file_network_A["distance"]**-beta
#calcualte the first stage of the Ai values
file_network_A["Ai1"] = Dj2_gamma * dist_beta
#now do the sum over all js bit
A_i = pd.DataFrame(file_network_A.groupby(["station_origin"])["Ai1"].agg(np.sum))
#now divide into 1
A_i["Ai1"] = 1/A_i["Ai1"]
A_i.rename(columns={"Ai1":"A_i"}, inplace=True)
#and write the A_i values back into the dataframe
file_network_A = file_network_A.merge(A_i, left_on="station_origin", right_index=True, how="left")

  A_i = pd.DataFrame(file_network_A.groupby(["station_origin"])["Ai1"].agg(np.sum))


In [278]:
#to check everything works, recreate the original estimates
file_network_A["prodsimest3"] = file_network_A["A_i"]*file_network_A["O_i"]*Dj2_gamma*dist_beta
#round
file_network_A["prodsimest3"] = round(file_network_A["prodsimest3"])
#check
file_network_A[["prodsimest1", "prodsimest3"]]

Unnamed: 0,prodsimest1,prodsimest3
0,78.0,26.0
1,1.0,16.0
2,4.0,17.0
3,99.0,24.0
4,56.0,23.0
...,...,...
61408,99.0,220.0
61409,295.0,230.0
61410,36.0,175.0
61411,125.0,192.0


In [68]:
cdatasubmat1 = file_network.pivot_table(values="prodsimest1", index="station_origin", columns="station_destination",
                                        aggfunc=np.sum, margins=True)
cdatasubmat1

KeyError: 'prodsimest1'

In [20]:
CalcRSqaured(file_network["flows"], file_network["prodsimest1"])

0.3740090678013128

In [21]:
CalcRMSE(file_network["flows"], file_network["prodsimest1"])


104.106