In [15]:
import numpy as np
import pandas as pd
import datetime
from dateutil.parser import parse
import requests, re, time
import pandas_datareader
import pickle
import math

import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.lines import Line2D
import matplotlib.patches as mpatches
import seaborn as sns 

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
import linearmodels as ln
from pystout import pystout



import json
from pandas.io.json import json_normalize
from bs4 import BeautifulSoup
from selenium import webdriver
import time

import geopy.distance

In [16]:
# Load the apartment data set from disk. The file is opened in a `with` block so
# the handle is closed as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
# NOTE(review): absolute, machine-specific path - consider a configurable data dir.
with open('/Users/holger/Documents/Python/Harmsen_Repo/Metro-Study/Pickles/Apartment Data 4.pkl', 'rb') as pickle_file:
    Apartments = pickle.load(pickle_file)

# Calendar year of each sale, read from the `.year` attribute of 'Date_sold'.
Apartments['Year Sold'] = [i.year for i in Apartments['Date_sold']]
# Each 'Closest Metro' entry is indexed by key; pull out the two fields used later.
Apartments['Closest Metro dist'] = [i['Distance'] for i in Apartments['Closest Metro']]
Apartments['Cityring'] = [i['Cityring'] for i in Apartments['Closest Metro']]
# Floor area backed out as (Price / 1000) / Price_sq_m_1000.
# Presumably 'Price_sq_m_1000' is the price per square metre in thousands - TODO confirm.
Apartments['sq_m'] = (Apartments['Price']/1000)/Apartments['Price_sq_m_1000']

In [17]:
# Work on an independent (deep) copy so the loaded `Apartments` frame is not
# modified by the feature engineering in the following cells.
Event = Apartments.copy(deep=True)

In [18]:
# Generating Opened-dummy: 1 for apartments sold in 2020, 0 otherwise.
# Built with one vectorised comparison rather than a row-by-row loop using
# chained assignment (Event['Opened'][i] = 1): the chained form raises
# SettingWithCopyWarning, does not write through under pandas copy-on-write,
# and is slow on large frames.
Event['Opened'] = (Event['Year Sold'] == 2020).astype('int64')

In [19]:
# Generating Close-dummy: 1 when the closest metro station is nearer than the
# threshold, 0 otherwise (missing distances compare as False and stay 0).
# NOTE(review): the unit of 'Closest Metro dist' is not visible here -
# presumably kilometres, i.e. a 500 m cut-off; confirm against the data prep.
CLOSE_TO_METRO_THRESHOLD = 0.5

# Vectorised comparison instead of a row loop with chained assignment
# (Event['Close to Metro'][i] = 1), which raises SettingWithCopyWarning and
# does not write through under pandas copy-on-write.
Event['Close to Metro'] = (Event['Closest Metro dist'] < CLOSE_TO_METRO_THRESHOLD).astype('int64')

In [20]:
# Generating location dummies:
# Name of the closest metro station, taken from the 'Metro' key of each entry.
Event['Metro_loc'] = [i['Metro'] for i in Event['Closest Metro']]
# Dependent variable of the regression: natural log of the sale price.
Event['log_Price'] = np.log(Event['Price'])

# One indicator column per area and per sale year.
# The dtype is pinned to uint8 (the historical default): pandas >= 2.0 returns
# bool columns by default, which DataFrame.describe() drops from its numeric
# summary and which turn the OLS design matrix into an object array.
area_dummy = pd.get_dummies(Event['Area'], dtype=np.uint8)
year_dummy = pd.get_dummies(Event['Year Sold'], dtype=np.uint8)

Deleting the dummy columns of the reference categories. Among the area dummies, the apartments without a defined area (the 'NaN' column, renamed 'Other') serve as the reference category: this column is removed before the regression, so all area dummies are evaluated relative to it.
Among the year dummies I delete both 2019 and 2020: 2020 because that year is already represented by the Opened dummy, and 2019 so that it serves as the reference year. All year dummies are thus evaluated relative to 2019.

In [21]:
# Rename the 'NaN' area column to 'Other': pop() removes the column and returns
# it, and the assignment re-attaches it as the last column under the new name.
area_dummy['Other'] = area_dummy.pop('NaN')

In [22]:
# Column labels are converted to strings for the pystout-module to work later.
year_dummy.columns = year_dummy.columns.map(str)

# Drop '2019' (the reference year) and '2020' (captured by the Opened-variable instead).
for excluded_year in ('2019', '2020'):
    del year_dummy[excluded_year]

In [23]:
# Generating regression-data:
# Short aliases for the three indicator columns that enter the interactions.
opened = Event['Opened']
close_to_metro = Event['Close to Metro']
in_cityring = Event['Cityring']

# Main effects first, then the pairwise interactions, then the triple interaction.
X = pd.DataFrame({
    'Rooms': Event['Rooms'],
    'Close': close_to_metro,
    'Cityring': in_cityring,
    'Opened': opened,
    'sq_m': Event['sq_m'],
    'OpenedXClose': opened * close_to_metro,
    'OpenedXCityring': opened * in_cityring,
    'CityringXClose': close_to_metro * in_cityring,
    'OpenedXCityringXClose': close_to_metro * opened * in_cityring,
})

# Attach the area and year dummies by matching on the row index.
for dummy_block in (area_dummy, year_dummy):
    X = pd.merge(X, dummy_block, left_index=True, right_index=True)

## Describing data:

In [24]:
# Summary statistics with one row per regressor; only 'mean' and 'std' are kept.
# Removing irrelevant description-variables in a single drop.
unused_statistics = ['min', '25%', '50%', '75%', 'max', 'count']
describe = X.describe().T.drop(columns=unused_statistics)

In [25]:
# Setting up a table of descriptions laid out as three side-by-side blocks.
# Row positions in `describe` at which a new block starts.
SECOND_BLOCK_START = 17
THIRD_BLOCK_START = 30


def _describe_block(stats, start=None, stop=None):
    """Return rows ``start:stop`` of ``stats`` as a frame with a fresh 0..n-1 index.

    The variable names held in the index are moved into a 'Variable' column.
    The slice is copied explicitly, so adding that column never writes into a
    view of ``stats`` (which raised SettingWithCopyWarning in the original
    copy-pasted version of this cell).
    """
    block = stats.iloc[start:stop].copy()
    block['Variable'] = block.index.to_list()
    return block.reset_index(drop=True)


first = _describe_block(describe, stop=SECOND_BLOCK_START)
second = _describe_block(describe, SECOND_BLOCK_START, THIRD_BLOCK_START)
third = _describe_block(describe, start=THIRD_BLOCK_START)

In [26]:
# Place the three blocks next to each other, matching rows on their position
# index. The outer join keeps every row even though the blocks differ in length.
# Overlapping column names from the first two blocks receive the '_x' / '_y'
# suffixes; the third block's columns keep their plain names.
table = (
    first
    .merge(second, how='outer', left_index=True, right_index=True)
    .merge(third, how='outer', left_index=True, right_index=True)
)

In [27]:
# Select the columns block by block and give them numbered labels, then round
# the numeric columns to three decimals.
column_labels = {
    'Variable_x': 'var1', 'mean_x': 'mean1', 'std_x': 'std1',
    'Variable_y': 'var2', 'mean_y': 'mean2', 'std_y': 'std2',
    'Variable': 'var3', 'mean': 'mean3', 'std': 'std3',
}
table_sorted = (
    table[list(column_labels)]
    .rename(columns=column_labels)
    .round(decimals=3)
)

In [28]:
# Show the assembled summary table as this cell's output.
table_sorted

Unnamed: 0,var1,mean1,std1,var2,mean2,std2,var3,mean3,std3
0,Rooms,2.795,1.126,1994.0,0.003,0.057,2007.0,0.016,0.125
1,Close,0.435,0.496,1995.0,0.005,0.068,2008.0,0.011,0.106
2,Cityring,0.419,0.493,1996.0,0.007,0.083,2009.0,0.016,0.125
3,Opened,0.099,0.299,1997.0,0.008,0.089,2010.0,0.02,0.139
4,sq_m,84.954,38.281,1998.0,0.008,0.086,2011.0,0.022,0.145
5,OpenedXClose,0.046,0.21,1999.0,0.007,0.086,2012.0,0.028,0.165
6,OpenedXCityring,0.044,0.206,2000.0,0.008,0.087,2013.0,0.033,0.179
7,CityringXClose,0.213,0.41,2001.0,0.007,0.086,2014.0,0.054,0.227
8,OpenedXCityringXClose,0.024,0.153,2002.0,0.01,0.097,2015.0,0.068,0.252
9,FRB,0.037,0.188,2003.0,0.011,0.106,2016.0,0.078,0.268


In [60]:
# Render the summary table as LaTeX source and print it for the assignment.
latex_table = table_sorted.to_latex()
print(latex_table)

\begin{tabular}{llrrlrrlrr}
\toprule
{} &                   var1 &   mean1 &    std1 &  var2 &  mean2 &   std2 &  var3 &  mean3 &   std3 \\
\midrule
0  &                  Rooms &   2.795 &   1.126 &  1994 &  0.003 &  0.057 &  2007 &  0.016 &  0.125 \\
1  &                  Close &   0.435 &   0.496 &  1995 &  0.005 &  0.068 &  2008 &  0.011 &  0.106 \\
2  &               Cityring &   0.419 &   0.493 &  1996 &  0.007 &  0.083 &  2009 &  0.016 &  0.125 \\
3  &                 Opened &   0.099 &   0.299 &  1997 &  0.008 &  0.089 &  2010 &  0.020 &  0.139 \\
4  &                   sq\_m &  84.954 &  38.281 &  1998 &  0.008 &  0.086 &  2011 &  0.022 &  0.145 \\
5  &           OpenedXClose &   0.046 &   0.210 &  1999 &  0.007 &  0.086 &  2012 &  0.028 &  0.165 \\
6  &        OpenedXCityring &   0.044 &   0.206 &  2000 &  0.008 &  0.087 &  2013 &  0.033 &  0.179 \\
7  &         CityringXClose &   0.213 &   0.410 &  2001 &  0.007 &  0.086 &  2014 &  0.054 &  0.227 \\
8  &  OpenedXCityringXClos

## Regressing on data:

In [61]:
# The 'Other' location dummy is dropped from the regressors so that it acts as
# the control (reference) category. X is modified in place.
X.drop(columns=['Other'], inplace=True)

In [62]:
# Regressing log price on the regressors in X:
# first add an intercept column to the design matrix.
X = sm.add_constant(X)
model = sm.OLS(endog=Event['log_Price'], exog=X)
# Fit with the HC1 heteroskedasticity-robust covariance estimator.
results_1 = model.fit(cov_type='HC1')
regression_summary = results_1.summary()
print(regression_summary)

OLS Regression Results                            
Dep. Variable:              log_Price   R-squared:                       0.649
Model:                            OLS   Adj. R-squared:                  0.649
Method:                 Least Squares   F-statistic:                     1653.
Date:                Wed, 05 Aug 2020   Prob (F-statistic):               0.00
Time:                        08:34:32   Log-Likelihood:                -29856.
No. Observations:               56949   AIC:                         5.980e+04
Df Residuals:                   56907   BIC:                         6.017e+04
Df Model:                          41                                         
Covariance Type:                  HC1                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    13.7503      0.009   1476.681    

In [63]:
# Export the regression results to a LaTeX table file via pystout.
# NOTE(review): absolute, machine-specific output path - consider a configurable dir.
# NOTE(review): 'displacement' and 'mpg' do not match any regressor built in the
#   visible cells; they look like leftover example labels - confirm and remove.
# NOTE(review): the group label says "Price per sq. m", but the dependent variable
#   is log_Price = log(Price), and the group spans columns 1-3 although only one
#   model is passed - confirm the intended label and span.
pystout(models=[results_1],
        file='/Users/holger/Documents/Python/Harmsen_Repo/Metro-Study/Tex files/Metro Regressions 1.tex',
        digits=3,
        endog_names=['Opening'],
        varlabels={'const':'Constant','displacement':'Disp','mpg':'MPG'},
        mgroups={'y = log(Price per sq. m, DKK)':[1,3]},
        # Raw string: '\s' is not a valid escape sequence in a normal string literal
        # (warning today, error in future Python); the runtime value is unchanged.
        modstat={'nobs':'Obs','rsquared_adj':r'Adj. R\sym{2}','fvalue':'F-stat'}
        )