In [1]:
import pandas as pd
import glob
import os
import pyarrow.parquet as pq
import numpy as np
import unidecode
import re
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer, util
import torch

  from .autonotebook import tqdm as notebook_tqdm


# FOURSQUARE - CATEGORIES

## Categories

In [2]:
# Import categories file from the FourSquare website
table_cat = pq.read_table(r"Database\FourSquare\categories\categories.zstd.parquet")

# Convert to a pandas DataFrame if necessary
df_cat = table_cat.to_pandas()


In [3]:
# Simplify the dataframe by keeping only category levels and names (removing level IDs)
df_cat_fs = df_cat[["category_level", "category_name", "level1_category_name", "level2_category_name", "level3_category_name", "level4_category_name", "level5_category_name", "level6_category_name"]]

# Rename columns to more descriptive English names
df_cat_fs = df_cat_fs.rename(columns={"category_level":"Tag_depth","category_name":"Tag", "level1_category_name":"Depth_1", "level2_category_name": "Depth_2", "level3_category_name": "Depth_3", "level4_category_name": "Depth_4", "level5_category_name": "Depth_5", "level6_category_name": "Depth_6"})

# Convert all values in Tag and Depth columns to lowercase
df_cat_fs['Tag'] = df_cat_fs['Tag'].str.lower()
df_cat_fs['Depth_1'] = df_cat_fs['Depth_1'].str.lower()
df_cat_fs['Depth_2'] = df_cat_fs['Depth_2'].str.lower()
df_cat_fs['Depth_3'] = df_cat_fs['Depth_3'].str.lower()
df_cat_fs['Depth_4'] = df_cat_fs['Depth_4'].str.lower()
df_cat_fs['Depth_5'] = df_cat_fs['Depth_5'].str.lower()
df_cat_fs['Depth_6'] = df_cat_fs['Depth_6'].str.lower()

# Reorder the dataframe columns in the desired order
df_cat_fs = df_cat_fs[["Tag", "Tag_depth","Depth_1", "Depth_2", "Depth_3",  "Depth_4",  "Depth_5", "Depth_6"]]

# Remove the row where Tag is 'restaurant' and Tag_depth is 3
df_cat_fs = df_cat_fs[~((df_cat_fs["Tag"] == "restaurant") & (df_cat_fs["Tag_depth"] == 3))]

# Display the resulting dataframe
df_cat_fs


Unnamed: 0,Tag,Tag_depth,Depth_1,Depth_2,Depth_3,Depth_4,Depth_5,Depth_6
0,kaiseki restaurant,5,dining and drinking,restaurant,asian restaurant,japanese restaurant,kaiseki restaurant,
1,art museum,3,arts and entertainment,museum,art museum,,,
2,rental car location,3,travel and transportation,transport hub,rental car location,,,
3,shabu-shabu restaurant,5,dining and drinking,restaurant,asian restaurant,japanese restaurant,shabu-shabu restaurant,
4,peking duck restaurant,5,dining and drinking,restaurant,asian restaurant,chinese restaurant,peking duck restaurant,
...,...,...,...,...,...,...,...,...
1240,lake,2,landmarks and outdoors,lake,,,,
1241,barbershop,3,business and professional services,health and beauty service,barbershop,,,
1242,hospice,2,health and medicine,hospice,,,,
1243,chinese aristocrat restaurant,5,dining and drinking,restaurant,asian restaurant,chinese restaurant,chinese aristocrat restaurant,


In [None]:
# Export the cleaned dataframe to a CSV file with ';' as separator and without the index column
df_cat_fs.to_csv('Database/Clean_categories/categories_FS_clean.csv', index=False, sep=";")

# OSM - CATEGORIES

In [57]:
# Load OSM categories from a CSV file
df_cate_osm = pd.read_csv('Database/OSM/osm_categories.csv', sep=',')

# Rename columns to make them clearer
df_cate_osm = df_cate_osm.rename(columns={"Key_category":"Subcategory_before_table",
                                          'Key_subcategory':"Subcategory_in_table",
                                          'Key':"Main_category"})

# Keep only the relevant columns in the desired order
df_cate_osm = df_cate_osm[["Main_category", "Subcategory_before_table", "Subcategory_in_table", "Value", "Description", "Element"]]

# Display the dataframe
df_cate_osm

Unnamed: 0,Main_category,Subcategory_before_table,Subcategory_in_table,Value,Description,Element
0,aerialway,aerialway,aerialway,cable_car,"A cable car run. Just one or two large cars. The traction cable forms a loop, but the cars do not loop around, they just move up and down on their own side, rolling along static cables over which they are suspended.",way
1,aerialway,aerialway,aerialway,gondola,An aerialway where the cabins go around in a circle.,way
2,aerialway,aerialway,aerialway,mixed_lift,"A mixed lift, containing both gondolas and chairs.",way
3,aerialway,aerialway,aerialway,chair_lift,An open chairlift run. These have one or more seats or benches and are open to the outside air.,way
4,aerialway,aerialway,aerialway,drag_lift,An overhead tow-line for skiers and riders.,way
...,...,...,...,...,...,...
1303,waterway,barriers on waterways,barriers on waterways,lock_gate,A gate of a lock,node / way
1304,waterway,other features on waterways,other features on waterways,soakhole,The point at which a river or stream percolates into or through the soil,node
1305,waterway,other features on waterways,other features on waterways,turning_point,"A place to turn the driving direction for vessels, where the boats are longer than the river/canal is wide.",node
1306,waterway,other features on waterways,other features on waterways,water_point,A place to fill fresh water holding tanks of a boat.,node


### Remove everything that is not a point of interest, such as attributes:

In [58]:
# Remove rows that are not points of interest as well as obsolete (deprecated) points of interest

# Create a mask for rows where either 'Subcategory_before_table' or 'Subcategory_in_table'
# contains the word 'attributes' (case-insensitive)
masque_attributes = (
    df_cate_osm['Subcategory_before_table'].str.contains(r'\battributes\b', case=False, na=False) |
    df_cate_osm['Subcategory_in_table'].str.contains(r'\battributes\b', case=False, na=False)
)

# Create a mask for rows where 'Description' contains the word 'deprecated' (case-insensitive)
masque_deprecated = (
    df_cate_osm['Description'].str.contains(r'\bdeprecated\b', case=False, na=False)
)

# Combine the two masks (attributes OR deprecated)
mask_att_dep = masque_attributes | masque_deprecated

# Keep only rows that are NOT flagged by the mask
df_osm_pre_clean = df_cate_osm[~mask_att_dep]
df_osm_pre_clean


# Alternative step-by-step version (commented out):
# First remove 'attributes'
# df_osm_without_attributes = df_cate_osm[~masque_attributes]

# Then remove 'deprecated' from the filtered result
# df_osm_without_att_and_deprecated = df_osm_without_attributes[~masque_deprecated]

# Final cleaned dataframe
# df_osm_pre_clean = df_osm_without_att_and_deprecated


Unnamed: 0,Main_category,Subcategory_before_table,Subcategory_in_table,Value,Description,Element
0,aerialway,aerialway,aerialway,cable_car,"A cable car run. Just one or two large cars. The traction cable forms a loop, but the cars do not loop around, they just move up and down on their own side, rolling along static cables over which they are suspended.",way
1,aerialway,aerialway,aerialway,gondola,An aerialway where the cabins go around in a circle.,way
2,aerialway,aerialway,aerialway,mixed_lift,"A mixed lift, containing both gondolas and chairs.",way
3,aerialway,aerialway,aerialway,chair_lift,An open chairlift run. These have one or more seats or benches and are open to the outside air.,way
4,aerialway,aerialway,aerialway,drag_lift,An overhead tow-line for skiers and riders.,way
...,...,...,...,...,...,...
1303,waterway,barriers on waterways,barriers on waterways,lock_gate,A gate of a lock,node / way
1304,waterway,other features on waterways,other features on waterways,soakhole,The point at which a river or stream percolates into or through the soil,node
1305,waterway,other features on waterways,other features on waterways,turning_point,"A place to turn the driving direction for vessels, where the boats are longer than the river/canal is wide.",node
1306,waterway,other features on waterways,other features on waterways,water_point,A place to fill fresh water holding tanks of a boat.,node


### Focus on rows that have three different values in the first three columns, which normally should not happen


In [59]:
# Select the first three columns by position
df_subset = df_osm_pre_clean.iloc[:, :3]

# Create a mask where each row has 3 unique values (all different)
mask = df_subset.apply(lambda row: len(set(row)) == 3, axis=1)

# Apply the mask to filter the dataframe
df_tree_different = df_osm_pre_clean[mask]

# Print the number of rows that passed the filter
print(len(df_tree_different))

# Display the filtered dataframe
df_tree_different

15


Unnamed: 0,Main_category,Subcategory_before_table,Subcategory_in_table,Value,Description,Element
525,footway,highway,when sidewalk/crosswalk is tagged as a separate way,sidewalk,Sidewalk that runs typically along residential road. Use in combination with highway=footway or highway=path,way
526,footway,highway,when sidewalk/crosswalk is tagged as a separate way,crossing,"Crosswalk that connects two sidewalks on the opposite side of the road. Often recognized by painted markings on the road, road sign or traffic lights. Use in combination with highway=footway or highway=path. Useful information is presence of tactile_paving=*, wheelchair=* suitability and kerb=* represented as a node on the crosswalk way.",way
527,footway,highway,when sidewalk/crosswalk is tagged as a separate way,traffic_island,"The way between two crossings, safespot for pedestrians, has micromapping characteristics as a detailed alternative to =*. Use in combination with highway=footway or highway=path.",way
528,sidewalk,highway,when sidewalk (or pavement) is tagged on the main roadway (see sidewalks),both | left | right | no,"Specifies that the highways has sidewalks on both sides, on one side or no sidewalk at all",way
530,cycleway,highway,cycleway tagged on the main roadway or lane (see bicycle),lane,A lane is a route that lies within the roadway,way
533,cycleway,highway,cycleway tagged on the main roadway or lane (see bicycle),track,"A track provides a route that is separated from traffic. In the United States, this term is often used to refer to bike lanes that are separated from lanes for cars by pavement buffers, bollards, parking lanes, and curbs. Note that a cycle track may alternatively be drawn as a separate way next to the road which is tagged as highway=cycleway.",way
535,cycleway,highway,cycleway tagged on the main roadway or lane (see bicycle),share_busway,There is a bus lane that cyclists are permitted to use.,way
537,cycleway,highway,cycleway tagged on the main roadway or lane (see bicycle),shared_lane,"Cyclists share a lane with motor vehicles, there are markings reminding about this. In some places these markings are known as ""sharrows"" ('sharing arrows') and this is the tag to use for those.",way
541,parking:left / :right / :both\n(hereafter: parking:side),highway,street parking tagged on the main roadway (see street parking),lane | street_side | on_kerb | half_on_kerb | shoulder | no | separate | yes,Primary key to record parking along the street. Describes the parking position of parked vehicles in the street.,way
542,parking:sideorientation=*,highway,street parking tagged on the main roadway (see street parking),parallel | diagonal | perpendicular,To specify the orientation of parked vehicles if there is street parking.,way


In [60]:
# Fix the 15 problematic rows

#--------------ROWS TO MODIFY-------------------

# Filter to identify rows related to 'footway'
mask_footway = (
    (df_osm_pre_clean['Main_category'] == 'footway') &
    (df_osm_pre_clean['Subcategory_before_table'] == 'highway') &
    (df_osm_pre_clean['Subcategory_in_table'].str.startswith('when sidewalk/crosswalk is tagged as a separate way'))
)

# Replace the first three columns for these rows
df_osm_pre_clean.loc[mask_footway, ['Main_category', 'Subcategory_before_table', 'Subcategory_in_table']] = [
    'highway', 'footway', 'footway'
]

# Filter to identify rows related to 'cycleway'
mask_cycleway = (
    (df_osm_pre_clean['Main_category'] == 'cycleway') &
    (df_osm_pre_clean['Subcategory_before_table'] == 'highway') &
    (df_osm_pre_clean['Subcategory_in_table'].str.startswith('cycleway tagged on the main roadway or lane (see bicycle)'))
)

# Replace the first three columns for these rows
df_osm_pre_clean.loc[mask_cycleway, ['Main_category', 'Subcategory_before_table', 'Subcategory_in_table']] = [
    'highway', 'cycleway', 'cycleway'
]

# Filter to identify rows related to 'emergency'
mask_emergency = (
    (df_osm_pre_clean['Main_category'] == 'emergency') &
    (df_osm_pre_clean['Subcategory_before_table'] == 'highway') &
    (df_osm_pre_clean['Subcategory_in_table'].str.startswith('other highway features'))
)

# Replace the first three columns for these rows
df_osm_pre_clean.loc[mask_emergency, ['Main_category', 'Subcategory_before_table', 'Subcategory_in_table']] = [
    'emergency', 'highway', 'highway'
]

#------------------ROWS TO DELETE--------------------------------------

# Remove rows that are no longer wanted

# Sidewalk-related rows
mask_sidewalk = (
    (df_osm_pre_clean['Main_category'] == 'sidewalk') &
    (df_osm_pre_clean['Subcategory_before_table'] == 'highway') &
    (df_osm_pre_clean['Subcategory_in_table'].str.startswith('when sidewalk (or pavement) is tagged on the main roadway (see sidewalks)'))
)

# Parking-related rows (variant 1)
mask_parking = (
    (df_osm_pre_clean['Main_category'] == "parking:left / :right / :both\n(hereafter: parking:side)")
)

# Parking-related rows (variant 2)
mask_parking2 = (
    (df_osm_pre_clean['Main_category'] == "parking:sideorientation=*")
)

# Landuse-related rows under railway
mask_landuse = (
    (df_osm_pre_clean['Main_category'] == 'landuse') &
    (df_osm_pre_clean['Subcategory_before_table'] == 'railway') &
    (df_osm_pre_clean['Subcategory_in_table'].str.startswith('infrastructure'))
)

# Public transport-related rows under railway
mask_publi_transport = (
    (df_osm_pre_clean['Main_category'] == 'public_transport') &
    (df_osm_pre_clean['Subcategory_before_table'] == 'railway') &
    (df_osm_pre_clean['Subcategory_in_table'].str.startswith('stations and stops'))
)

# Combine all deletion masks
combined_mask = mask_sidewalk | mask_parking | mask_parking2 | mask_landuse | mask_publi_transport

# Keep only rows that do not match any of the deletion masks
df_osm_clean =  df_osm_pre_clean[~combined_mask]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_osm_pre_clean.loc[mask_footway, ['Main_category', 'Subcategory_before_table', 'Subcategory_in_table']] = [
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_osm_pre_clean.loc[mask_cycleway, ['Main_category', 'Subcategory_before_table', 'Subcategory_in_table']] = [
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_osm_pre_clean.loc[mask_emergency, ['Main_category', 'Subcategory_before_table', 'Subcategory_in_table']] = [


### Fix the issue of having two columns for the subcategory

In [61]:
# Observation: where the subcategories differ, we keep the 'in_table' version
(df_osm_clean['Subcategory_before_table'] != df_osm_clean['Subcategory_in_table']).sum()

# Display the rows where the subcategories are different
df_osm_clean[df_osm_clean['Subcategory_before_table'] != df_osm_clean['Subcategory_in_table']]

Unnamed: 0,Main_category,Subcategory_before_table,Subcategory_in_table,Value,Description,Element
27,amenity,amenity,sustenance,bar,"Bar is a purpose-built commercial establishment that sells alcoholic drinks to be consumed on the premises. They are characterised by a noisy and vibrant atmosphere, similar to a party and usually don't sell food. See also the description of the tags amenity=pub;bar;restaurant for a distinction between these.",node / area
28,amenity,amenity,sustenance,biergarten,"Biergarten or beer garden is an open-air area where alcoholic beverages along with food is prepared and served. See also the description of the tags amenity=pub;bar;restaurant. A biergarten can commonly be found attached to a beer hall, pub, bar, or restaurant. In this case, you can use biergarten=yes additional to amenity=pub;bar;restaurant.",node / area
29,amenity,amenity,sustenance,cafe,"Cafe is generally an informal place that offers casual meals and beverages; typically, the focus is on coffee or tea. Also known as a coffeehouse/shop, bistro or sidewalk cafe. The kind of food served may be mapped with the tags cuisine=* and diet:*=*. See also the tags amenity=restaurant;bar;fast_food.",node / area
30,amenity,amenity,sustenance,fast_food,Fast food restaurant (see also amenity=restaurant). The kind of food served can be tagged with cuisine=* and diet:*=*.,node / area
31,amenity,amenity,sustenance,food_court,"An area with several different restaurant food counters and a shared eating area. Commonly found in malls, airports, etc.",node / area
...,...,...,...,...,...,...
1241,shop,shop,others,travel_agency,Shop focused on selling tickets for travelling. Also known as a tour operator.,
1242,shop,shop,others,vacant,"An unused vacant shop. Can be used for an empty or abandoned retail space that seems to be available for lease or purchase where a retail store (or similar businesses) may be opened. No store is being operated in a vacant shop, and neither goods nor services are being offered. This tag can be used after a store was closed or for a newly built shop before any busisness was operated in the shop.",
1243,shop,shop,others,weapons,"Shop focused on selling weapons like knives, guns etc.",
1244,shop,shop,others,yes,"A shop of unspecified type – it is always better to use another value that gives info about shop type, if possible. Used also as indicator that feature such as fuel station has a shop.",


In [62]:
# Count the rows where the two subcategory columns are identical
(df_osm_clean['Subcategory_before_table'] == df_osm_clean['Subcategory_in_table']).sum()

# Display the rows where the two subcategory columns are identical
df_osm_clean[df_osm_clean['Subcategory_before_table'] == df_osm_clean['Subcategory_in_table']]

Unnamed: 0,Main_category,Subcategory_before_table,Subcategory_in_table,Value,Description,Element
0,aerialway,aerialway,aerialway,cable_car,"A cable car run. Just one or two large cars. The traction cable forms a loop, but the cars do not loop around, they just move up and down on their own side, rolling along static cables over which they are suspended.",way
1,aerialway,aerialway,aerialway,gondola,An aerialway where the cabins go around in a circle.,way
2,aerialway,aerialway,aerialway,mixed_lift,"A mixed lift, containing both gondolas and chairs.",way
3,aerialway,aerialway,aerialway,chair_lift,An open chairlift run. These have one or more seats or benches and are open to the outside air.,way
4,aerialway,aerialway,aerialway,drag_lift,An overhead tow-line for skiers and riders.,way
...,...,...,...,...,...,...
1303,waterway,barriers on waterways,barriers on waterways,lock_gate,A gate of a lock,node / way
1304,waterway,other features on waterways,other features on waterways,soakhole,The point at which a river or stream percolates into or through the soil,node
1305,waterway,other features on waterways,other features on waterways,turning_point,"A place to turn the driving direction for vessels, where the boats are longer than the river/canal is wide.",node
1306,waterway,other features on waterways,other features on waterways,water_point,A place to fill fresh water holding tanks of a boat.,node


In [63]:
# Keep only the relevant columns for the final OSM dataframe
df_osm = df_osm_clean[["Main_category", "Subcategory_in_table", "Value", "Description", "Element"]]

# Rename 'Subcategory_in_table' to 'Sub_category' for clarity
df_osm = df_osm.rename(columns={"Subcategory_in_table":"Sub_category"})

# Optional: export the cleaned OSM dataframe to a CSV file
# df_osm.to_csv('categories_OSM.csv', index=False, sep=";")

In [11]:
df_osm

Unnamed: 0,Main_category,Sub_category,Value,Description,Element
0,aerialway,aerialway,cable_car,A cable car run. Just one or two large cars. T...,way
1,aerialway,aerialway,gondola,An aerialway where the cabins go around in a c...,way
2,aerialway,aerialway,mixed_lift,"A mixed lift, containing both gondolas and cha...",way
3,aerialway,aerialway,chair_lift,An open chairlift run. These have one or more ...,way
4,aerialway,aerialway,drag_lift,An overhead tow-line for skiers and riders.,way
...,...,...,...,...,...
1303,waterway,barriers on waterways,lock_gate,A gate of a lock,node / way
1304,waterway,other features on waterways,soakhole,The point at which a river or stream percolate...,node
1305,waterway,other features on waterways,turning_point,A place to turn the driving direction for vess...,node
1306,waterway,other features on waterways,water_point,A place to fill fresh water holding tanks of a...,node


### Format the data to match the FourSquare categories

In [64]:
# Create a new column 'Tag_depth' that contains the depth of categorization for each tag
df_osm['Tag_depth'] = df_osm.iloc[:, :3].apply(lambda row: len(set(row)), axis=1)

# Rename 'Main_category' to 'Depth_1'
df_osm = df_osm.rename(columns={"Main_category":"Depth_1"})

# Fill 'Depth_2': use 'Sub_category' if the tag depth is 3, otherwise use 'Value'
df_osm['Depth_2'] = df_osm.apply(
    lambda row: row['Sub_category'] if row['Tag_depth'] == 3 else row['Value'],
    axis=1
)

# Fill 'Depth_3': use 'Value' if the tag depth is 3, otherwise None
df_osm['Depth_3'] = df_osm.apply(
    lambda row: row['Value'] if row['Tag_depth'] == 3 else None,
    axis=1
)

# Rename columns and reorder to match the desired format
df_osm_final = df_osm[["Value", "Tag_depth","Depth_1", "Depth_2", "Depth_3", "Description", "Element"]]
df_osm_final = df_osm_final.rename(columns={"Value":"Tag"})

# Remove underscores from names in all relevant columns
df_osm_final['Tag'] = df_osm_final['Tag'].str.replace('_', ' ', regex=False)
df_osm_final['Depth_1'] = df_osm_final['Depth_1'].str.replace('_', ' ', regex=False)
df_osm_final['Depth_2'] = df_osm_final['Depth_2'].str.replace('_', ' ', regex=False)
df_osm_final['Depth_3'] = df_osm_final['Depth_3'].str.replace('_', ' ', regex=False)

In [30]:
df_osm_final

Unnamed: 0,Tag,Tag_depth,Depth_1,Depth_2,Depth_3,Description,Element
0,cable car,2,aerialway,cable car,,A cable car run. Just one or two large cars. T...,way
1,gondola,2,aerialway,gondola,,An aerialway where the cabins go around in a c...,way
2,mixed lift,2,aerialway,mixed lift,,"A mixed lift, containing both gondolas and cha...",way
3,chair lift,2,aerialway,chair lift,,An open chairlift run. These have one or more ...,way
4,drag lift,2,aerialway,drag lift,,An overhead tow-line for skiers and riders.,way
...,...,...,...,...,...,...,...
1303,lock gate,3,waterway,barriers on waterways,lock gate,A gate of a lock,node / way
1304,soakhole,3,waterway,other features on waterways,soakhole,The point at which a river or stream percolate...,node
1305,turning point,3,waterway,other features on waterways,turning point,A place to turn the driving direction for vess...,node
1306,water point,3,waterway,other features on waterways,water point,A place to fill fresh water holding tanks of a...,node


In [65]:
# Correct the three rows that have the wrong Depth_1

# Filter for the row related to 'busbar'
mask_emergency = (
    (df_osm_final['Depth_1'] == 'line') &
    (df_osm_final['Depth_2'] == 'power')
)

# Replace the first two depth columns for this row
df_osm_final.loc[mask_emergency, ['Depth_1', 'Depth_2']] = [
    'power', 'line'
]

# Filter for the row related to 'tower'
mask_emergency = (
    (df_osm_final['Depth_1'] == 'lifeguard') &
    (df_osm_final['Depth_2'] == 'lifeguards') &
    (df_osm_final['Depth_3'].str.startswith('tower'))
)

# Replace the three depth columns for this row
df_osm_final.loc[mask_emergency, ['Depth_1', 'Depth_2', 'Depth_3']] = [
    'emergency', 'lifeguards', 'tower'
]

In [None]:
# Remove tags that are not meaningful or valid
tags_to_remove = ['user defined', 'yes', 'User Defined', "User defined"]
df_osm_final = df_osm_final[~df_osm_final['Tag'].isin(tags_to_remove)]

# Remove tags that do not have an equivalent in FourSquare
tags_to_remove = ['busbar', 'pumping station', 'pump', "retaining wall"]
df_osm_final = df_osm_final[~df_osm_final['Tag'].isin(tags_to_remove)]

# Correct specific tags 'caravan' and 'it', which will later be removed during stopword cleaning
df_osm_final.loc[df_osm_final['Tag'] == 'it', 'Tag'] = 'information technology'
df_osm_final.loc[df_osm_final['Depth_2'] == 'it', 'Depth_2'] = 'information technology'

df_osm_final.loc[df_osm_final['Tag'] == '[[ Too many Data Items entities accessed. | caravan ]]', 'Tag'] = 'caravan'
df_osm_final.loc[df_osm_final['Depth_3'] == '[[ Too many Data Items entities accessed. | caravan ]]', 'Depth_3'] = 'caravan'

In [None]:
# Save as 
#df_osm_final.to_csv('Database/Clean_categories/categories_OSM_clean.csv', index=False, sep=";")