# Data Extraction

In [2]:
# Connect to folder where source file are stored
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [3]:
# Install required packages
!pip install osmium

Collecting osmium
  Downloading osmium-3.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.3 MB)
[?25l[K     |▎                               | 10 kB 19.5 MB/s eta 0:00:01[K     |▌                               | 20 kB 25.4 MB/s eta 0:00:01[K     |▉                               | 30 kB 13.7 MB/s eta 0:00:01[K     |█                               | 40 kB 9.0 MB/s eta 0:00:01[K     |█▎                              | 51 kB 5.1 MB/s eta 0:00:01[K     |█▋                              | 61 kB 5.2 MB/s eta 0:00:01[K     |█▉                              | 71 kB 5.4 MB/s eta 0:00:01[K     |██                              | 81 kB 6.0 MB/s eta 0:00:01[K     |██▍                             | 92 kB 4.6 MB/s eta 0:00:01[K     |██▋                             | 102 kB 5.0 MB/s eta 0:00:01[K     |██▉                             | 112 kB 5.0 MB/s eta 0:00:01[K     |███▏                            | 122 kB 5.0 MB/s eta 0:00:01[K     |███▍                      

In [4]:
import osmium as osm
import pandas as pd

# Defining the Handler to extract all the wanted informations
# (Type; id; Version; Timestamp; Latitude/Longitude; Amenity type.)
class TimelineHandler(osm.SimpleHandler):
    def __init__(self):
        osm.SimpleHandler.__init__(self)
        self.elemtimeline = []
        
    def node(self, n):
        if 'amenity' in n.tags and 'name' in n.tags:
            self.elemtimeline.append(["node",
                                      n.id,
                                      n.version,
                                      pd.Timestamp(n.timestamp),
                                      n.location.lat,
                                      n.location.lon,
                                      n.tags["amenity"],
                                      n.tags["name"]])
        elif 'amenity' in n.tags:
            self.elemtimeline.append(["node",
                                      n.id,
                                      n.version,
                                      pd.Timestamp(n.timestamp),
                                      n.location.lat,
                                      n.location.lon,
                                      n.tags["amenity"],
                                      "N/A"])

In [7]:
tlhandler = TimelineHandler()
# change file name to the OSM data download for the specific place under study
file_name = "./gdrive/MyDrive/COMP0158/greater-london-internal.osh.pbf"
tlhandler.apply_file(file_name)
colnames = ['type','id','Version','TS',"Lat","Lon",'amenity','name']
elements = pd.DataFrame(tlhandler.elemtimeline, columns=colnames)
elements = elements.sort_values(by=['type','TS'],ascending=False)
elements = elements.reset_index(drop=True)

In [8]:
elements.head()

Unnamed: 0,type,id,Version,TS,Lat,Lon,amenity,name
0,node,185743749,7,2021-05-09 23:14:56+00:00,51.550833,-0.138445,post_box,
1,node,303198052,3,2021-05-09 23:14:56+00:00,51.550804,-0.14039,bicycle_parking,
2,node,8715968899,1,2021-05-09 23:14:56+00:00,51.550761,-0.1356,public_bookcase,Leighton Road Community Book Swap
3,node,8716017943,1,2021-05-09 23:14:56+00:00,51.550776,-0.140567,bicycle_parking,
4,node,8716017952,1,2021-05-09 23:14:56+00:00,51.550088,-0.140706,waste_basket,


# Classification Logic


1.   If a record is of Version 1, it is classified as ***New Opening***.

2.   If there is a change in the amenity type/name of POI compared to its previous version, it is regarded as ***New Opening***, and the business it represented previously counted as ***Closure***.
3.   All other records that do not meet 1 or 2 are considered as ***Updates***.

Overall, the idea is to create flags where appropraite and then we will be able to extract the three measurement accordingly.

In [9]:
# import libraries
import pandas as pd
import numpy as np
import datetime as dt
import math
import scipy.stats as sp
import matplotlib.pyplot as plt
from pandas.tseries.offsets import MonthEnd
from pandas.tseries.offsets import MonthBegin

## Amendement Tag

In [10]:
# Name changing flag
# To check if there is a any change in the name of the POI
def flag(version,id,df):
    if version > 1:
        " Comparing the name of business units and check if there is a change in their names"
        try:
            return df[(df['id']==id) & (df['Version']==version-1)]['name'].values[0] !=  df[(df['id']==id) & (df['Version']==version)]['name'].values[0]
        except:
            return "N/A"
    else:
        return "N/A"

def name_change_flag(df):
    return df.apply(lambda row:flag(row.Version, row.id, df),axis=1)

In [11]:
ele_name_change_flags = name_change_flag(elements)
elements["amend_tag"] = ele_name_change_flags

In [12]:
elements.head()

Unnamed: 0,type,id,Version,TS,Lat,Lon,amenity,name,amend_tag
0,node,185743749,7,2021-05-09 23:14:56+00:00,51.550833,-0.138445,post_box,,False
1,node,303198052,3,2021-05-09 23:14:56+00:00,51.550804,-0.14039,bicycle_parking,,False
2,node,8715968899,1,2021-05-09 23:14:56+00:00,51.550761,-0.1356,public_bookcase,Leighton Road Community Book Swap,
3,node,8716017943,1,2021-05-09 23:14:56+00:00,51.550776,-0.140567,bicycle_parking,,
4,node,8716017952,1,2021-05-09 23:14:56+00:00,51.550088,-0.140706,waste_basket,,


## Closure Tag

In [13]:
# Helper function:
# To Check if two lists have any value in common
def no_common_member(a, b):
    a_set = set(a)
    b_set = set(b)
    if (a_set & b_set):
        return False 
    else:
        return True

In [14]:
# Closure flag
def closure(version,amend_flg,id,df):
    " Comparing the type of business units and check if there is a change in their names"
    if amend_flg == True:
        try:

            # Detect any change in amenity type
            if ((df[(df["id"] == id) & (df['Version']==version-1)]["amenity"].values[0] 
                 != df[(df["id"] == id) & (df['Version']==version)]["amenity"].values[0])
            and (df[(df["id"] == id) & (df['Version']==version-1)]["name"].values[0] 
                 != df[(df["id"] == id) & (df['Version']==version)]["name"].values[0])):
                return df[(df["id"] == id) & (df['Version']==version-1)]["amenity"].values[0]
                 
            # Detect if a new business has opened and the previous one closed
            elif no_common_member(df[(df["id"] == id) & (df['Version']==version-1)]["name"].values[0].lower().split(),
                                   df[(df["id"] == id) & (df['Version']==version)]["name"].values[0].lower().split()):
                return df[(df["id"] == id) & (df['Version']==version-1)]["amenity"].values[0]
                 
            # If not above, classified as no closures happened
            else:
                return False
        except:
            return False
    else:
        return False

def closure_flag(df):
    return df.apply(lambda row:closure(row.Version, row.amend_tag, row.id, df),axis=1)

# Previous POI name
def closure_name(version,amend_flg,name,id,df):
    " Comparing the type of business units and check if there is a change in their names"
    if (amend_flg == True) and (type(df[(df["id"] == id) & (df['Version']==version-1)]["name"].values[0]) == str):
        try:
            return df[(df["id"] == id) & (df['Version']==version-1)]["name"].values[0]
        except:
            return False
    else:
        return False

def closure_name_func(df):
    return df.apply(lambda row:closure_name(row.Version, row.amend_tag, row.name, row.id, df),axis=1)

In [15]:
closure_flags = closure_flag(elements)
elements["closure_tag"] = closure_flags
closure_names = closure_name_func(elements)
elements["previous_name"] = closure_names

## Amenity Grouping

In [17]:
# Data prepared for amenity categories grouping
amenity_group = pd.read_excel('Amenity_Grouping.xlsx', index_col=None)

# Restructuring the data for grouping purpose
amenity_dict = {}
for group in amenity_group:
    items = str(amenity_group[str(group)].values[0]).split(",")
    for item in items:
        amenity_dict[item.lower()] = str(group)

# Amenity Categorization where all amenity type could not be classified are set to be Other
elements["amenity_group"] = elements['amenity'].map(amenity_dict).fillna('Other')

# Amenity Categorization where all previous amenity type could not be classified are set to be Other
elements["amenity_group"] = elements['amenity'].map(amenity_dict).fillna('Other')
elements["pre_amenity_group"] = elements['closure_tag'].map(amenity_dict).fillna("Other")

In [18]:
elements.head()

Unnamed: 0,type,id,Version,TS,Lat,Lon,amenity,name,amend_tag,closure_tag,previous_name,amenity_group,pre_amenity_group
0,node,185743749,7,2021-05-09 23:14:56+00:00,51.550833,-0.138445,post_box,,False,False,False,Public_Service,Other
1,node,303198052,3,2021-05-09 23:14:56+00:00,51.550804,-0.14039,bicycle_parking,,False,False,False,Transportation,Other
2,node,8715968899,1,2021-05-09 23:14:56+00:00,51.550761,-0.1356,public_bookcase,Leighton Road Community Book Swap,,False,False,Entertainment_Arts_Culture,Other
3,node,8716017943,1,2021-05-09 23:14:56+00:00,51.550776,-0.140567,bicycle_parking,,,False,False,Transportation,Other
4,node,8716017952,1,2021-05-09 23:14:56+00:00,51.550088,-0.140706,waste_basket,,,False,False,Waste_Management,Other


**Classification Logic**


*   Add: Version 1 + with Closure Tag not False
*   Delete: All Closure Tag
*   Update: Amend_Tag True/False + Closure Tag False





## Bulk Import Cleaning

In [19]:
# Bulk Import
# Data Donation Cleansing
elements = elements[(elements["amenity"]!="bench") | (elements["closure_tag"] != "False")]
elements = elements[(elements["amenity"]!="bicycle_parking") | (elements["closure_tag"] != "False")]
elements = elements[(elements["amenity"]!="waste_basket") | (elements["closure_tag"] != "False")]
elements = elements[(elements["amenity"]!="telephone") | (elements["closure_tag"] != "False")]
elements = elements[(elements["amenity"]!="post_box") | (elements["closure_tag"] != "False")]
elements = elements.reset_index(drop=True)

In [20]:
elements.head()

Unnamed: 0,type,id,Version,TS,Lat,Lon,amenity,name,amend_tag,closure_tag,previous_name,amenity_group,pre_amenity_group
0,node,185743749,7,2021-05-09 23:14:56+00:00,51.550833,-0.138445,post_box,,False,False,False,Public_Service,Other
1,node,303198052,3,2021-05-09 23:14:56+00:00,51.550804,-0.14039,bicycle_parking,,False,False,False,Transportation,Other
2,node,8715968899,1,2021-05-09 23:14:56+00:00,51.550761,-0.1356,public_bookcase,Leighton Road Community Book Swap,,False,False,Entertainment_Arts_Culture,Other
3,node,8716017943,1,2021-05-09 23:14:56+00:00,51.550776,-0.140567,bicycle_parking,,,False,False,Transportation,Other
4,node,8716017952,1,2021-05-09 23:14:56+00:00,51.550088,-0.140706,waste_basket,,,False,False,Waste_Management,Other


In [21]:
# Save the file for further use in order to save computational time
elements.to_csv("London_Data_v6.csv",index=False)