# Connect to Google Drive

(This step can be skipped if everything is on local)

In [1]:
# Connect to folder where source file are stored
from google.colab import drive
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


# Classification

## Classification Logic

1.   If a record is of Version 1, it is classified as ***New Opening***.

2.   If there is a change in the amenity type/name of POI compared to its previous version, it is regarded as ***New Opening***, and the business it represented previously counted as ***Closure***.
3.   All other records that do not meet 1 or 2 are considered as ***Updates***.

Overall, the idea is to create flags where appropraite and then we will be able to extract the three measurement accordingly.

## Set-up

In [2]:
# import libraries
import pandas as pd
import numpy as np
import datetime as dt
import math
import scipy.stats as sp
import matplotlib.pyplot as plt
from pandas.tseries.offsets import MonthEnd
from pandas.tseries.offsets import MonthBegin

## Read Extracted Data

In [6]:
elements = pd.read_csv("./gdrive/MyDrive/Target Folder/London_Extracted.csv")
elements.head()

Unnamed: 0,type,id,Version,TS,Lat,Lon,amenity,name
0,node,185743749,7,2021-05-09 23:14:56+00:00,51.550833,-0.138445,post_box,
1,node,303198052,3,2021-05-09 23:14:56+00:00,51.550804,-0.14039,bicycle_parking,
2,node,8715968899,1,2021-05-09 23:14:56+00:00,51.550761,-0.1356,public_bookcase,Leighton Road Community Book Swap
3,node,8716017943,1,2021-05-09 23:14:56+00:00,51.550776,-0.140567,bicycle_parking,
4,node,8716017952,1,2021-05-09 23:14:56+00:00,51.550088,-0.140706,waste_basket,


## Amendement Tag

In [24]:
# Filtering out all POIs with Name changing
# To check if there is a any change in the name of the POI
def flag(version,id,df):
    if version > 1:
        " Comparing the name of business units and check if there is a change in their names"
        try:
            return str(df[(df['id']==id) & (df['Version']==version-1)]['name'].values[0]) !=  str(df[(df['id']==id) & (df['Version']==version)]['name'].values[0])
        except:
            return "N/A"
    else:
        return "N/A"

def name_change_flag(df):
    return df.apply(lambda row:flag(row.Version, row.id, df),axis=1)

ele_name_change_flags = name_change_flag(elements)
elements["amend_tag"] = ele_name_change_flags

In [25]:
elements.head()

Unnamed: 0,type,id,Version,TS,Lat,Lon,amenity,name,amend_tag,closure_tag,previous_name,amenity_group,pre_amenity_group
0,node,185743749,7,2021-05-09 23:14:56+00:00,51.550833,-0.138445,post_box,,False,False,False,Public_Service,Other
1,node,303198052,3,2021-05-09 23:14:56+00:00,51.550804,-0.14039,bicycle_parking,,False,False,False,Transportation,Other
2,node,8715968899,1,2021-05-09 23:14:56+00:00,51.550761,-0.1356,public_bookcase,Leighton Road Community Book Swap,,False,False,Entertainment_Arts_Culture,Other
3,node,8716017943,1,2021-05-09 23:14:56+00:00,51.550776,-0.140567,bicycle_parking,,,False,False,Transportation,Other
4,node,8716017952,1,2021-05-09 23:14:56+00:00,51.550088,-0.140706,waste_basket,,,False,False,Waste_Management,Other


## Closure Tag

In [11]:
# Helper function:
# To Check if two lists have any value in common
def no_common_member(a, b):
    a_set = set(a)
    b_set = set(b)
    if (a_set & b_set):
        return False 
    else:
        return True

# Closure flag
def closure(version,amend_flg,id,df):
    " Comparing the type of business units and check if there is a change in their names"
    if amend_flg == True:
        try:

            # Detect any change in amenity type
            if ((df[(df["id"] == id) & (df['Version']==version-1)]["amenity"].values[0] 
                 != df[(df["id"] == id) & (df['Version']==version)]["amenity"].values[0])
            and (df[(df["id"] == id) & (df['Version']==version-1)]["name"].values[0] 
                 != df[(df["id"] == id) & (df['Version']==version)]["name"].values[0])):
                return df[(df["id"] == id) & (df['Version']==version-1)]["amenity"].values[0]
                 
            # Detect if a new business has opened and the previous one closed
            elif no_common_member(df[(df["id"] == id) & (df['Version']==version-1)]["name"].values[0].lower().split(),
                                   df[(df["id"] == id) & (df['Version']==version)]["name"].values[0].lower().split()):
                return df[(df["id"] == id) & (df['Version']==version-1)]["amenity"].values[0]
                 
            # If not above, classified as no closures happened
            else:
                return False
        except:
            return False
    else:
        return False

def closure_flag(df):
    return df.apply(lambda row:closure(row.Version, row.amend_tag, row.id, df),axis=1)

# Previous POI name
def closure_name(version,amend_flg,name,id,df):
    " Comparing the type of business units and check if there is a change in their names"
    if (amend_flg == True) and (type(df[(df["id"] == id) & (df['Version']==version-1)]["name"].values[0]) == str):
        try:
            return df[(df["id"] == id) & (df['Version']==version-1)]["name"].values[0]
        except:
            return False
    else:
        return False

def closure_name_func(df):
    return df.apply(lambda row:closure_name(row.Version, row.amend_tag, row.name, row.id, df),axis=1)

In [12]:
closure_flags = closure_flag(elements)
elements["closure_tag"] = closure_flags
closure_names = closure_name_func(elements)
elements["previous_name"] = closure_names
elements.head()

Unnamed: 0,type,id,Version,TS,Lat,Lon,amenity,name,amend_tag,closure_tag,previous_name
0,node,185743749,7,2021-05-09 23:14:56+00:00,51.550833,-0.138445,post_box,,True,False,False
1,node,303198052,3,2021-05-09 23:14:56+00:00,51.550804,-0.14039,bicycle_parking,,True,False,False
2,node,8715968899,1,2021-05-09 23:14:56+00:00,51.550761,-0.1356,public_bookcase,Leighton Road Community Book Swap,,False,False
3,node,8716017943,1,2021-05-09 23:14:56+00:00,51.550776,-0.140567,bicycle_parking,,,False,False
4,node,8716017952,1,2021-05-09 23:14:56+00:00,51.550088,-0.140706,waste_basket,,,False,False


## Amenity Grouping

In [13]:
# Data prepared for amenity categories grouping
amenity_group = pd.read_excel('Amenity_Grouping.xlsx', index_col=None)

# Restructuring the data for grouping purpose
amenity_dict = {}
for group in amenity_group:
    items = str(amenity_group[str(group)].values[0]).split(",")
    for item in items:
        amenity_dict[item.lower()] = str(group)

# Amenity Categorization where all amenity type could not be classified are set to be Other
elements["amenity_group"] = elements['amenity'].map(amenity_dict).fillna('Other')

# Amenity Categorization where all previous amenity type could not be classified are set to be Other
elements["amenity_group"] = elements['amenity'].map(amenity_dict).fillna('Other')
elements["pre_amenity_group"] = elements['closure_tag'].map(amenity_dict).fillna("Other")

In [14]:
elements.head()

Unnamed: 0,type,id,Version,TS,Lat,Lon,amenity,name,amend_tag,closure_tag,previous_name,amenity_group,pre_amenity_group
0,node,185743749,7,2021-05-09 23:14:56+00:00,51.550833,-0.138445,post_box,,True,False,False,Public_Service,Other
1,node,303198052,3,2021-05-09 23:14:56+00:00,51.550804,-0.14039,bicycle_parking,,True,False,False,Transportation,Other
2,node,8715968899,1,2021-05-09 23:14:56+00:00,51.550761,-0.1356,public_bookcase,Leighton Road Community Book Swap,,False,False,Entertainment_Arts_Culture,Other
3,node,8716017943,1,2021-05-09 23:14:56+00:00,51.550776,-0.140567,bicycle_parking,,,False,False,Transportation,Other
4,node,8716017952,1,2021-05-09 23:14:56+00:00,51.550088,-0.140706,waste_basket,,,False,False,Waste_Management,Other


## Bulk Import Cleaning

In [15]:
# Bulk Import
# Data Donation Cleansing
elements = elements[(elements["amenity"]!="bench") | (elements["closure_tag"] != "False")]
elements = elements[(elements["amenity"]!="bicycle_parking") | (elements["closure_tag"] != "False")]
elements = elements[(elements["amenity"]!="waste_basket") | (elements["closure_tag"] != "False")]
elements = elements[(elements["amenity"]!="telephone") | (elements["closure_tag"] != "False")]
elements = elements[(elements["amenity"]!="post_box") | (elements["closure_tag"] != "False")]
elements = elements.reset_index(drop=True)
elements.head()

Unnamed: 0,type,id,Version,TS,Lat,Lon,amenity,name,amend_tag,closure_tag,previous_name,amenity_group,pre_amenity_group
0,node,185743749,7,2021-05-09 23:14:56+00:00,51.550833,-0.138445,post_box,,True,False,False,Public_Service,Other
1,node,303198052,3,2021-05-09 23:14:56+00:00,51.550804,-0.14039,bicycle_parking,,True,False,False,Transportation,Other
2,node,8715968899,1,2021-05-09 23:14:56+00:00,51.550761,-0.1356,public_bookcase,Leighton Road Community Book Swap,,False,False,Entertainment_Arts_Culture,Other
3,node,8716017943,1,2021-05-09 23:14:56+00:00,51.550776,-0.140567,bicycle_parking,,,False,False,Transportation,Other
4,node,8716017952,1,2021-05-09 23:14:56+00:00,51.550088,-0.140706,waste_basket,,,False,False,Waste_Management,Other


## Save File

In [26]:
# Save the file for further use in order to save computational time
elements.to_csv("./gdrive/MyDrive/Target Folder/London_Data_classified.csv",index=False)