Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update get_location documentation and add caching #171

Merged
merged 1 commit into from
Aug 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pynsee/geodata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
from .get_geodata_list import get_geodata_list
from .GeoFrDataFrame import GeoFrDataFrame


__all__ = ["get_geodata", "get_geodata_list", "GeoFrDataFrame"]
2 changes: 1 addition & 1 deletion pynsee/geodata/get_geodata.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# -*- coding: utf-8 -*-

import warnings
from pynsee.geodata.GeoFrDataFrame import GeoFrDataFrame
from pynsee.geodata import GeoFrDataFrame
from pynsee.geodata._get_geodata import _get_geodata


Expand Down
232 changes: 231 additions & 1 deletion pynsee/sirene/SireneDataFrame.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,44 @@
import logging
import re
import requests
import warnings

from functools import lru_cache
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

import numpy as np
import pandas as pd

from tqdm import trange
from shapely.geometry import Point
from shapely.errors import ShapelyDeprecationWarning

from pynsee.geodata import GeoFrDataFrame
from pynsee.sirene._get_location_openstreetmap import (
_get_location_openstreetmap,
)


logger = logging.getLogger(__name__)


@lru_cache(maxsize=None)
def _warning_get_location():
    """Warn that at least one address fell back to city-level coordinates.

    Memoized with ``lru_cache`` so the warning is emitted at most once per
    process, no matter how many rows trigger the fallback.
    """
    msg = (
        "For at least one point, exact location has not been found, city "
        "location has been given instead"
    )
    logger.warning(msg)


@lru_cache(maxsize=None)
def _warning_OSM():
    """Log the OpenStreetMap attribution / licence notice.

    Memoized with ``lru_cache`` so the notice is shown at most once per
    process, even when geocoding is invoked repeatedly.
    """
    msg = (
        "This function returns data made available by OpenStreetMap and its "
        "contributors.\n"
        "Please comply with Openstreetmap's Copyright and ODbL Licence"
    )
    logger.info(msg)


class SireneDataFrame(pd.DataFrame):
"""Class for handling dataframes built from INSEE SIRENE API's data"""
Expand All @@ -11,4 +50,195 @@ def __init__(self, *args, **kwargs):
def _constructor(self):
    # pandas subclassing hook: operations that build a new frame
    # (slicing, copies, arithmetic, ...) call this to decide the result
    # type, so they return a SireneDataFrame instead of a plain
    # pandas DataFrame.
    return SireneDataFrame

from pynsee.sirene.get_location import get_location
def get_location(self, update=False):
    """
    Get latitude and longitude from OpenStreetMap, add geometry column and
    turn ``SireneDataframe`` into ``GeoFrDataFrame``.

    Args:
        update (bool, optional): data is saved locally, set update=True to
            trigger an update. Defaults to False.

    Returns:
        GeoFrDataFrame with ``latitude``, ``longitude``, ``geometry`` and
        ``exact_location`` columns when the address columns are present;
        otherwise the original data (index reset) unchanged.

    Notes:
        If it fails to find the exact location, by default it returns the
        location of the city. Whether the exact location has been found or
        not is encoded in the `exact_location` column of the new
        ``GeoFrDataFrame``.

    Examples:
        >>> from pynsee.metadata import get_activity_list
        >>> from pynsee.sirene import search_sirene
        >>> #
        >>> # Get activity list
        >>> naf5 = get_activity_list('NAF5')
        >>> #
        >>> # Get alive legal entities belonging to the automotive industry
        >>> df = search_sirene(variable = ["activitePrincipaleEtablissement"],
        >>>                    pattern = ['29.10Z'], kind = 'siret')
        >>> #
        >>> # Keep businesses with more than 100 employees
        >>> df = df.loc[df['effectifsMinEtablissement'] > 100]
        >>> df = df.reset_index(drop=True)
        >>> #
        >>> # Get location
        >>> df = df.get_location()
    """
    _warning_OSM()

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore", category=ShapelyDeprecationWarning)

        df = self.reset_index(drop=True)

        list_col = [
            "siret",
            "numeroVoieEtablissement",
            "typeVoieEtablissementLibelle",
            "libelleVoieEtablissement",
            "codePostalEtablissement",
            "libelleCommuneEtablissement",
        ]

        if not set(list_col).issubset(df.columns):
            # Address columns are missing: nothing to geocode.
            return df

        def clean(string):
            # Treat missing values as empty strings so they can be
            # concatenated into the query safely.
            return "" if pd.isna(string) else string

        def apostrophize(string):
            # Restore elided articles lost in SIRENE's uppercase data,
            # e.g. "RUE DE L EGLISE" -> "RUE DE L'EGLISE".
            return re.sub(" D ", " D'", re.sub(" L ", " L'", string))

        def build_query(parts):
            # Join non-empty address parts with '+' for the Nominatim
            # URL; append the country only when something remains.
            terms = [re.sub(" ", "+", p) for p in parts if p != ""]
            query = "+".join(terms)
            if query != "":
                query += "+FRANCE"
            return query

        list_location = []
        time_sleep = 1
        session = requests.Session()
        retry = Retry(connect=3, backoff_factor=time_sleep)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount("http://", adapter)
        session.mount("https://", adapter)

        for i in trange(len(df.index), desc="Getting location"):
            siret = clean(df.loc[i, "siret"])
            nb = clean(df.loc[i, "numeroVoieEtablissement"])
            street_type = apostrophize(
                clean(df.loc[i, "typeVoieEtablissementLibelle"]))
            street_name = apostrophize(
                clean(df.loc[i, "libelleVoieEtablissement"]))

            postal_code = clean(df.loc[i, "codePostalEtablissement"])
            city = clean(df.loc[i, "libelleCommuneEtablissement"])
            # Drop arrondissement suffixes, e.g. "PARIS 8EME" -> "PARIS".
            city = apostrophize(re.sub("[0-9]|EME", "", city))

            # Full-address query, and a city-level fallback query.
            query = build_query(
                [nb, street_type, street_name, postal_code, city])
            query_backup = build_query([postal_code, city])

            exact_location = True

            try:
                (
                    lat,
                    lon,
                    category,
                    typeLoc,
                    importance,
                ) = _get_location_openstreetmap(
                    query=query, session=session, update=update
                )
            except Exception:
                # Exact address not found: retry with the city only.
                exact_location = False

                try:
                    (
                        lat,
                        lon,
                        category,
                        typeLoc,
                        importance,
                    ) = _get_location_openstreetmap(
                        query=query_backup, session=session,
                        update=update
                    )
                    # City-level importance is meaningless for the
                    # establishment itself.
                    importance = None
                except Exception:
                    lat, lon, category, typeLoc, importance = (
                        None,
                        None,
                        None,
                        None,
                        None,
                    )
                else:
                    _warning_get_location()

            list_location.append(
                pd.DataFrame(
                    {
                        "siret": siret,
                        "latitude": lat,
                        "longitude": lon,
                        "category": category,
                        "crsCoord": "EPSG:4326",
                        "type": typeLoc,
                        "importance": importance,
                        "exact_location": exact_location,
                    },
                    index=[0],
                )
            )

        df_location = pd.concat(list_location).reset_index(drop=True)

        # Merge on the original frame so no caller columns are lost;
        # pd.merge returns a fresh RangeIndex result.
        sirene_df = pd.merge(self, df_location, on="siret", how="left")

        sirene_df["latitude"] = pd.to_numeric(sirene_df["latitude"])
        sirene_df["longitude"] = pd.to_numeric(sirene_df["longitude"])

        # pd.isna handles None, NaN and pd.NA uniformly, unlike the
        # fragile `is None or np.isnan(...)` pair (np.isnan raises
        # TypeError on non-float objects).
        sirene_df["geometry"] = [
            None if pd.isna(lat) else Point(lon, lat)
            for lon, lat in zip(
                sirene_df["longitude"], sirene_df["latitude"]
            )
        ]

        return GeoFrDataFrame(sirene_df)
38 changes: 28 additions & 10 deletions pynsee/sirene/_get_location_openstreetmap.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,20 @@
import json
import os
from pathlib import Path
import requests

from pathlib import Path
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

import pandas as pd

from pynsee.utils._create_insee_folder import _create_insee_folder
from pynsee.utils._hash import _hash
from pynsee.utils._make_dataframe_from_dict import _make_dataframe_from_dict
from pynsee.utils._warning_cached_data import _warning_cached_data


def _get_location_openstreetmap(query, session=None):
def _get_location_openstreetmap(query, session=None, update=False):

if session is None:
session = requests.Session()
Expand All @@ -17,26 +23,38 @@ def _get_location_openstreetmap(query, session=None):
session.mount("http://", adapter)
session.mount("https://", adapter)

api_link = "https://nominatim.openstreetmap.org/search.php?q={}&format=jsonv2&limit=1".format(
query
)
# api_link = 'https://nominatim.openstreetmap.org/search?q=ZONE+INDUSTRIELLE+54980+BATILLY+FRANCE&format=json&limit=1'
api_link = "https://nominatim.openstreetmap.org/search.php?" \
f"q={query}&format=jsonv2&limit=1"

insee_folder = _create_insee_folder()
filename = os.path.join(insee_folder, f"{_hash(api_link)}.json")

try:
home = str(Path.home())
user_agent = os.path.basename(home)
except:
except Exception:
user_agent = ""

headers = {"User-Agent": "python_package_pynsee_" + user_agent.replace("/", "")}

try:
proxies = {"http": os.environ["http_proxy"], "https": os.environ["https_proxy"]}
except:
except Exception:
proxies = {"http": "", "https": ""}

results = session.get(api_link, proxies=proxies, headers=headers)
data = results.json()
data = None

if update or not os.path.isfile(filename):
results = session.get(api_link, proxies=proxies, headers=headers)
data = results.json()

with open(filename, "w") as f:
json.dump(data, f)
else:
with open(filename, "r") as f:
data = json.load(f)

_warning_cached_data(filename)

list_dataframe = []

Expand Down
Loading
Loading