# Data anonymization notebook

## Steps

- GPS coordinates anonymization
    - Read legs coords
    - Read shapefile
    - add geometry to End point

- Coordinate Anonymization Procedure
- Conversion from pkl to csv

In [None]:
# import libraries
## system libraries
import os
import sys
import json
import time
import random
import pathlib
from glob import glob
from datetime import date, datetime

## numerical libraries
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

## plotting libraries
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import rcParams

# geo libraries
import geopandas as gpd

# reports
from docx import Document
from docx.shared import Inches

import warnings

warnings.filterwarnings("ignore")

In [None]:
# notebook options
%matplotlib inline

# show up to 500 columns when a DataFrame is displayed
pd.set_option("display.max_columns", 500)
# matplotlib defaults: padding above axes titles and base font size
rcParams["axes.titlepad"] = 45
rcParams["font.size"] = 16

In [None]:
# global variables
CUTTING_DATE = "2019-05-01"  # remove trips and data published before this date

# root of the campaign data and the sub-directories read by this notebook
# (plain strings ending in "/", so file names can be appended with "+")
data_campaigns_path = os.path.join("../..", "data-campaigns/")
meta_data_path, shape_data, gps_data = (
    os.path.join(data_campaigns_path, subdir)
    for subdir in ("meta-data/", "shapefiles/", "2020-01-15.GPS/")
)

# raw input data (full export and later update)
raw_data_path, raw_data_update_path = (
    os.path.join(data_campaigns_path, subdir)
    for subdir in ("2019-10-30.all/", "2019-12-16.update/")
)

# preprocessed data: same directory as a string (input) and as Path objects (output)
input_path = os.path.join("../..", "2019-12-16.out/")
out_path = pathlib.Path("../../2019-12-16.out/")
out_path_dataset = pathlib.Path("../../2019-12-16.out/dataset/")

# anon dataset (output): named after the input directory, ".out" -> ".anon"
anon_dataset_dir = input_path.rstrip("/").rsplit("/", 1)[-1].replace(".out", ".anon")
anon_dataset_path = os.path.join("../..", "anon-dataset", anon_dataset_dir)

In [None]:
# Create the anonymized-dataset output directory (including missing parents).
# If it already exists, report that on stderr instead of failing.
try:
    os.makedirs(os.path.abspath(anon_dataset_path))
except FileExistsError:
    print("Directory '{}' already exists".format(anon_dataset_path), file=sys.stderr)

In [None]:
# Create the preprocessed-dataset output directory (including missing parents).
# If it already exists, report that on stderr instead of failing.
try:
    os.makedirs(os.path.abspath(out_path_dataset))
except FileExistsError:
    print("Directory '{}' already exists".format(out_path_dataset), file=sys.stderr)

### Utils

In [None]:
def check_value_coords(value, lat, lon):
    """Return ``value`` only when both coordinates are present.

    When either ``lat`` or ``lon`` is missing (as judged by ``pd.isna``),
    ``np.nan`` is returned instead of ``value``.
    """
    has_both_coords = not (pd.isna(lat) or pd.isna(lon))
    return value if has_both_coords else np.nan

## Read input and output

In [None]:
# input files (names relative to input_path)
legs = "all_legs_merged_no_outlier_0.01.pkl"
trips_users = "trips_users_df.pkl"
trips = "trips_df.pkl"

# read datasets
# NOTE: unpickling can execute arbitrary code; only load pickles produced by
# the trusted preprocessing step.
legs_df = pd.read_pickle(input_path + legs)
trips_users_df = pd.read_pickle(input_path + trips_users)
trips_df = pd.read_pickle(input_path + trips)

In [None]:
legs_df.head(3)

In [None]:
trips_users_df.head(3)

In [None]:
trips_df.head(3)

## GPS coordinates anonymization

### Read legs GPS coordinates

In [None]:
# read legs data: one row per leg with its start/end GPS coordinates
gps_data_df = pd.read_csv(gps_data + "allLegs.csv")
# use the same leg-id column name as the preprocessed legs DataFrame
gps_data_df.rename(columns={"legId": "legid"}, inplace=True)

# keep a single row per leg; drop_duplicates returns a new frame, so the
# result has to be assigned back for the de-duplication to take effect
gps_data_df = gps_data_df.drop_duplicates(["legid"], keep="first").reset_index(
    drop=True
)
gps_data_df

In [None]:
print(gps_data_df.shape)
gps_data_df.head(3)

### Read shapefile

In [None]:
shape_data

In [None]:
xxx = gpd.read_file(shape_data + "Italy/Italy.shp")
xxx.head()

In [None]:
xxx.plot()

In [None]:
xxx = gpd.read_file(shape_data + "Italy/Italy_core.shp")
xxx.head()

In [None]:
xxx.plot()

In [None]:
from shapely.geometry import Point


def read_shapes_in_dir_with_ext(directory_str, extension_str=".shp", quiet=False):
    """Load every shapefile of a directory whose name ends with a given suffix.

    Files matching ``<directory_str>/*<extension_str>`` are read with
    ``gpd.read_file`` and stacked row-wise under a fresh integer index.

    :param directory_str: directory to search (sub-directories are not visited).
    :param extension_str: file-name suffix appended to ``*`` to build the glob
        pattern.
    :param quiet: when false, print the name of each file before reading it.
    :return: the concatenation of all frames that were read.
    """

    def _load(path):
        if not quiet:
            # report only the last component of the path
            print("  - reading {}...".format(path.rstrip("/").rsplit("/", 1)[-1]))
        return gpd.read_file(path)

    pattern = os.path.join(directory_str, "*{}".format(extension_str))
    frames = [_load(path) for path in glob(pattern)]
    return pd.concat(frames, axis=0, ignore_index=True)


def read_shapes(extension_str=".shp", quiet=False):
    """Load the shapefiles of every sub-directory of ``shape_data``.

    Each sub-directory is handed to ``read_shapes_in_dir_with_ext`` with the
    same ``extension_str`` and ``quiet`` arguments; the per-directory frames
    are stacked row-wise under a fresh integer index.

    :param extension_str: file-name suffix used to select the shapefiles.
    :param quiet: when false, print the name of each directory before reading it.
    :return: the concatenation of the frames of all sub-directories.
    """

    def _load_country(country_dir):
        if not quiet:
            # the directory name is the last path component
            country = country_dir.rstrip("/").rsplit("/", 1)[-1]
            print("* reading shapefiles for {}...".format(country))
        return read_shapes_in_dir_with_ext(country_dir, extension_str, quiet)

    country_dirs = glob(os.path.join(shape_data, "*/"))
    per_country = [_load_country(country_dir) for country_dir in country_dirs]
    return pd.concat(per_country, axis=0, ignore_index=True)

In [None]:
shapes_df_all = read_shapes(".shp")
print(shapes_df_all.shape)
shapes_df_all.head()

In [None]:
shapes_df_all = read_shapes("*_core.shp")
print(shapes_df_all.shape)
shapes_df_all.head()

In [None]:
from shapely.geometry import Point

# Shapes used for the first (functional-area) classification pass.
# The resulting glob pattern is "*.shp", which also matches "*_core.shp" files.
shapes_df_all = read_shapes(".shp")

In [None]:
# def classify_start_point(gps_data_df, classe, level):
# Build a point (lon, lat order) from the start coordinates of every leg and
# use it as the geometry of the legs frame.
geom = gps_data_df.apply(lambda x: Point([x["StartLon"], x["StartLat"]]), axis=1)
gps_data_df = gpd.GeoDataFrame(gps_data_df, geometry=geom)
# declare the coordinates as EPSG:4326 (lon/lat)
gps_data_df.crs = {"init": "epsg:4326"}
gps_data_df.head()

In [None]:
print("gps_data_df.shape:", gps_data_df.shape)

In [None]:
# If you get the error
#    sjoin 'NoneType' object has no attribute 'intersection'
# check that the rtree package is installed

In [None]:
# Left spatial join: every leg is kept and receives the attributes of the
# shapes its start point intersects.
# NOTE(review): a point intersecting several shapes presumably yields one row
# per match, so the row count can grow — compare the printed shapes.
gps_cities_start = gpd.sjoin(gps_data_df, shapes_df_all, how="left", op="intersects")
print(gps_cities_start.shape)
gps_cities_start.head(3)

In [None]:
# --- pass 1: functional-area shapes -----------------------------------------
# Label the shape attributes attached by the first spatial join as *_start.
gps_cities_start.rename(
    columns={
        "fuaname_en": "fuaname_en_start",
        "iso3": "country_start",
        "class_code": "class_code_start",
    },
    inplace=True,
)
# Keep only the columns needed later. This drops the geometry and the
# "index_right" column added by sjoin; .copy() makes the column assignments
# below act on an independent frame rather than on a slice.
gps_cities_start = gps_cities_start[
    [
        "legid",
        "StartLat",
        "StartLon",
        "EndLat",
        "EndLon",
        "fuaname_en_start",
        "country_start",
        "class_code_start",
    ]
].copy()
# Start points matched by a shape are provisionally "sub-urban"; unmatched
# ones get the placeholder "-" (turned into "rural" in the next cell).
gps_cities_start["start_class"] = gps_cities_start.apply(
    lambda x: "-" if pd.isnull(x["fuaname_en_start"]) else "sub-urban", axis=1
)

# --- pass 2: core shapes ------------------------------------------------------
# The geometry must again be built from the START coordinates: this pass
# decides whether the start point lies in a core ("urban") area. The previous
# version used EndLon/EndLat here, so start_class reflected the end point.
shapes_df_all = read_shapes("_core.shp")
geom = gps_cities_start.apply(lambda x: Point([x["StartLon"], x["StartLat"]]), axis=1)
gps_cities_start = gpd.GeoDataFrame(gps_cities_start, geometry=geom)
gps_cities_start.crs = {"init": "epsg:4326"}

print("gps_data_df.shape:", gps_data_df.shape)
gps_cities_start = gpd.sjoin(
    gps_cities_start, shapes_df_all, how="left", op="intersects"
)
print("gps_cities_start.shape:", gps_cities_start.shape)
gps_cities_start.head(3)

In [None]:
gps_cities_start.groupby("start_class").size()

In [None]:
# Label the attributes attached by the second (core-shape) spatial join as
# *_start2 so they can be compared with the first-pass *_start columns.
gps_cities_start.rename(
    columns={
        "fuaname_en": "fuaname_en_start2",
        "iso3": "country_start2",
        "class_code": "class_code_start2",
    },
    inplace=True,
)
# keep only the columns needed from here on
gps_cities_start = gps_cities_start[
    [
        "legid",
        "StartLat",
        "StartLon",
        "EndLat",
        "EndLon",
        "fuaname_en_start",
        "country_start",
        "fuaname_en_start2",
        "country_start2",
        "class_code_start2",
        "start_class",
    ]
]

# a match in the core shapes upgrades the class to "urban" ...
gps_cities_start["start_class"] = gps_cities_start.apply(
    lambda x: x["start_class"] if pd.isnull(x["fuaname_en_start2"]) else "urban", axis=1
)
# ... and points matched by no shape at all ("-") are "rural"
gps_cities_start["start_class"] = gps_cities_start["start_class"].replace("-", "rural")

# prefer the country / area name of the core shape when there is one
gps_cities_start["country_start"] = gps_cities_start.apply(
    lambda x: x["country_start"]
    if pd.isnull(x["country_start2"])
    else x["country_start2"],
    axis=1,
)
gps_cities_start["fuaname_en_start"] = gps_cities_start.apply(
    lambda x: x["fuaname_en_start"]
    if pd.isnull(x["fuaname_en_start2"])
    else x["fuaname_en_start2"],
    axis=1,
)

gps_cities_start.head(3)


# gps_cities_start['start_class'] = gps_cities_start.apply(lambda x: x['start_class'] if pd.isnull(x['fuaname_en_start']) else classe , axis=1)
# gps_cities_start.head()
# return gps_cities_start

In [None]:
gps_cities_start.head(5)

In [None]:
gps_cities_start.groupby("start_class").size()

### add geometry to End point

In [None]:
from shapely.geometry import Point

# --- pass 1: functional-area shapes ("*.shp") ---------------------------------
shapes_df_all = read_shapes(".shp")

# def classify_end_point(gps_data_df, classe, level):
# The geometry of gps_data_df is rebuilt from the END coordinates, replacing
# the start-point geometry set earlier in the notebook.
geom = gps_data_df.apply(lambda x: Point([x["EndLon"], x["EndLat"]]), axis=1)
gps_data_df = gpd.GeoDataFrame(gps_data_df, geometry=geom)
gps_data_df.crs = {"init": "epsg:4326"}
gps_data_df.head()


print(gps_data_df.shape)
# left join: every leg is kept, with the attributes of intersecting shapes
gps_cities_end = gpd.sjoin(gps_data_df, shapes_df_all, how="left", op="intersects")
print(gps_cities_end.shape)
gps_cities_end.head(3)


# label the first-pass shape attributes as *_end
gps_cities_end.rename(
    columns={
        "fuaname_en": "fuaname_en_end",
        "iso3": "country_end",
        "class_code": "class_code_end",
    },
    inplace=True,
)
# keep only the needed columns (drops the geometry and sjoin's extra columns)
gps_cities_end = gps_cities_end[
    [
        "legid",
        "StartLat",
        "StartLon",
        "EndLat",
        "EndLon",
        "fuaname_en_end",
        "country_end",
        "class_code_end",
    ]
]
gps_cities_end.head(3)
# matched end points are provisionally "sub-urban", unmatched ones "-"
gps_cities_end["end_class"] = gps_cities_end.apply(
    lambda x: "-" if pd.isnull(x["fuaname_en_end"]) else "sub-urban", axis=1
)
gps_cities_end.head()

# ======= ======= ======= ======= ======= =======
# ======= ======= ======= ======= ======= =======

# --- pass 2: core shapes ("*_core.shp") ---------------------------------------
shapes_df_all = read_shapes("_core.shp")
geom = gps_cities_end.apply(lambda x: Point([x["EndLon"], x["EndLat"]]), axis=1)
gps_cities_end = gpd.GeoDataFrame(gps_cities_end, geometry=geom)
gps_cities_end.crs = {"init": "epsg:4326"}
gps_cities_end.head()

print(gps_data_df.shape)
gps_cities_end = gpd.sjoin(gps_cities_end, shapes_df_all, how="left", op="intersects")
print(gps_cities_end.shape)
gps_cities_end.head(3)

# label the second-pass (core) shape attributes as *_end2
gps_cities_end.rename(
    columns={
        "fuaname_en": "fuaname_en_end2",
        "iso3": "country_end2",
        "class_code": "class_code_end2",
    },
    inplace=True,
)
gps_cities_end = gps_cities_end[
    [
        "legid",
        "StartLat",
        "StartLon",
        "EndLat",
        "EndLon",
        "fuaname_en_end",
        "country_end",
        "fuaname_en_end2",
        "country_end2",
        "class_code_end2",
        "end_class",
    ]
]

# a match in the core shapes upgrades the class to "urban" ...
gps_cities_end["end_class"] = gps_cities_end.apply(
    lambda x: x["end_class"] if pd.isnull(x["fuaname_en_end2"]) else "urban", axis=1
)
# ... and end points matched by no shape at all ("-") are "rural"
gps_cities_end["end_class"] = gps_cities_end["end_class"].replace("-", "rural")

# prefer the country / area name of the core shape when there is one
gps_cities_end["country_end"] = gps_cities_end.apply(
    lambda x: x["country_end"] if pd.isnull(x["country_end2"]) else x["country_end2"],
    axis=1,
)
gps_cities_end["fuaname_en_end"] = gps_cities_end.apply(
    lambda x: x["fuaname_en_end"]
    if pd.isnull(x["fuaname_en_end2"])
    else x["fuaname_en_end2"],
    axis=1,
)

gps_cities_end.head(3)


# gps_cities_end['end_class'] = gps_cities_end.apply(lambda x: x['end_class'] if pd.isnull(x['fuaname_en_end']) else classe , axis=1)
# gps_cities_end.head()
# return gps_cities_end

In [None]:
shapes_df_all.head(3)

In [None]:
tmp_shape = shapes_df_all["geometry"].iloc[0]
print("Centroid:", tmp_shape.centroid)
print("(x, y):", "({}, {})".format(tmp_shape.centroid.x, tmp_shape.centroid.y))

In [None]:
# Store the x and y coordinates of each shape's centroid as plain columns.
shapes_df_all["centroid_x"] = shapes_df_all["geometry"].apply(
    lambda shape: shape.centroid.x
)
shapes_df_all["centroid_y"] = shapes_df_all["geometry"].apply(
    lambda shape: shape.centroid.y
)

In [None]:
shapes_df_all.head(3)

In [None]:
gps_cities_end.groupby("end_class").size()

In [None]:
gps_cities_end.head()

In [None]:
# Combine the start and end classifications of each leg (inner join on legid).
# NOTE(review): if either side holds several rows for the same legid, the
# merge produces every combination of them — check the printed shape.
gps_cities = pd.merge(
    gps_cities_start,
    gps_cities_end[
        ["legid", "fuaname_en_end", "country_end", "class_code_end2", "end_class"]
    ],
    on="legid",
)
print(gps_cities.shape)
gps_cities.tail()

#### Save coordinate pickle

In [None]:
# NOTE: this pickle still contains the original, non-anonymized coordinates.
gps_cities.to_pickle(input_path + "gps_cities.pkl")

In [None]:
# NOTE(review): shapes_df_all was last assigned from read_shapes("_core.shp"),
# so this presumably saves the core shapes only — confirm that is intended.
shapes_df_all.to_pickle(input_path + "shapes_df_all.pkl")

#### Add urban/sub-urban/rural classification to legs DataFrame

In [None]:
legs_df.shape

In [None]:
# Attach the raw GPS coordinates to the preprocessed legs and report how many
# legs have no coordinates, split by leg class.
print(legs_df.shape)
all_legs_final_ds_user_info_gps = pd.merge(legs_df, gps_data_df, on="legid", how="left")
print(all_legs_final_ds_user_info_gps.shape)
print(
    "- Total missing gps coordinates: ",
    all_legs_final_ds_user_info_gps[
        all_legs_final_ds_user_info_gps.EndLat.isna()
    ].shape[0],
)
# The message refers to "the above missing coordinates", so the count is
# restricted to rows without coordinates (it used to count every WaitingEvent).
print(
    "- Of the above missing coordinates, the number of transfer leg (waitingEvent) is :",
    all_legs_final_ds_user_info_gps[
        (all_legs_final_ds_user_info_gps.EndLat.isna())
        & (all_legs_final_ds_user_info_gps["class"] == "WaitingEvent")
    ].shape[0],
)

print(
    "- While, the number of leg (class=Leg) is with missing coordinates is :",
    all_legs_final_ds_user_info_gps[
        (all_legs_final_ds_user_info_gps.EndLat.isna())
        & (all_legs_final_ds_user_info_gps["class"] == "Leg")
    ].shape[0],
)

In [None]:
# Add the start/end area name, country and zone class to every leg.
# Left join: legs without an entry in gps_cities keep NaN in the new columns.
all_legs_final_ds_user_info_urban_class = pd.merge(
    legs_df,
    gps_cities[
        [
            "legid",
            "fuaname_en_start",
            "country_start",
            "fuaname_en_end",
            "country_end",
            "start_class",
            "end_class",
        ]
    ],
    on="legid",
    how="left",
)
all_legs_final_ds_user_info_urban_class.tail(2)

In [None]:
# Shape of the subset of legs without a start class; not the last expression
# of the cell, so the value is computed but not displayed.
all_legs_final_ds_user_info_urban_class[
    all_legs_final_ds_user_info_urban_class["start_class"].isna()
].shape
# persist the legs enriched with the zone classification
all_legs_final_ds_user_info_urban_class.to_pickle(
    input_path + "all_legs_final_ds_user_info_urban_class.pkl"
)

In [None]:
# Number of legs for every (start_class, end_class) combination.
zone_dist = (
    all_legs_final_ds_user_info_urban_class.groupby(["start_class", "end_class"])
    .size()
    .reset_index()
)
zone_dist.columns = ["start_class", "end_class", "#legs"]
# total number of legs counted by the groupby
print(zone_dist["#legs"].sum())
zone_dist

In [None]:
# 66432+21378 (missing coordinates , 21228 transfer legs and 150 legs)

In [None]:
len(
    all_legs_final_ds_user_info_urban_class[
        ~all_legs_final_ds_user_info_urban_class["start_class"].isna()
    ]["tripid"].unique()
)

In [None]:
len(all_legs_final_ds_user_info_urban_class["userid"].unique())

In [None]:
legs_df.columns

In [None]:
trips_users_df.columns

In [None]:
trips_df.columns

In [None]:
all_legs_final_ds_user_info_urban_class.head(3)

### Coordinate Anonymization Procedure

* urban (city area): round to 50-100 m (better more than less, i.e. 100 m): round to the 3rd decimal place
* sub-urban (commuting zone): 250 m: round the third decimal to a multiple of 5
* rural: 500 m: round the 2nd decimal to a multiple of 5

In [None]:
def round_to_digit(num, digit=5):
    """Round ``num`` to the nearest multiple of ``digit``.

    The built-in ``round`` is used, so exact halves follow its
    round-half-to-even rule.
    """
    return digit * round(num / digit)


def round_to_decimal_digit(num, ndecimals=0, digit=5):
    """Round ``num`` so that its ``ndecimals``-th decimal is a multiple of ``digit``.

    ``num`` is first rounded to ``ndecimals`` decimals, scaled to an integer
    range, rounded to the nearest multiple of ``digit`` and scaled back; e.g.
    ``ndecimals=3, digit=5`` rounds to the nearest 0.005.
    """
    scale = pow(10, ndecimals)
    rounded = round(num, ndecimals) * scale
    return round_to_digit(rounded, digit) / float(scale)


def anonymize_coord(coord, coord_class):
    """Coarsen a coordinate according to the zone class of its location.

    :param coord: latitude or longitude in decimal degrees.
    :param coord_class: ``"urban"`` rounds to 3 decimals, ``"sub-urban"`` to
        the nearest 0.005; any other value (e.g. ``"rural"``) to the nearest
        0.05.
    :return: the coarsened coordinate, or ``np.nan`` when ``coord`` is missing.
    """
    # A missing coordinate cannot be rounded by the helpers above (round() of
    # NaN to an integer raises ValueError), so it is passed through as NaN.
    if pd.isna(coord):
        return np.nan

    if coord_class == "urban":
        return round(coord, 3)
    if coord_class == "sub-urban":
        return round_to_decimal_digit(coord, 3, 5)
    return round_to_decimal_digit(coord, 2, 5)


def anonymize_coord_nan(coord, coord_class):
    """NaN-tolerant variant of ``anonymize_coord``.

    Returns ``np.nan`` for a missing ``coord`` and the value of
    ``anonymize_coord(coord, coord_class)`` otherwise.
    """
    if pd.isna(coord):
        return np.nan
    return anonymize_coord(coord, coord_class)

In [None]:
all_legs_final_ds_user_info_urban_class.head(3)

In [None]:
gps_cities.head(3)

In [None]:
anonymize_coord(38.635434, "sub-urban")

In [None]:
coords = [0.173257, 0.17713, 0.17813, 0.17432]

In [None]:
for coord in coords:
    print(coord)
    print("urban:", anonymize_coord(coord, "urban"))
    print("sub-urban:", anonymize_coord(coord, "sub-urban"))
    print("rural:", anonymize_coord(coord, "rural"))
    print("---")

In [None]:
all_legs_final_ds_user_info_urban_class.head(3)

In [None]:
gps_cities.head(3)

In [None]:
# Coarsen the start/end coordinates of every leg according to the zone class
# of the corresponding point. The NaN-tolerant helper is used (as for the
# user coordinates) so that a missing coordinate stays NaN instead of making
# the rounding raise.
anon_gps = gps_cities.copy()

anon_gps["StartLat"] = anon_gps.apply(
    lambda row: anonymize_coord_nan(row["StartLat"], row["start_class"]), axis=1
)
anon_gps["StartLon"] = anon_gps.apply(
    lambda row: anonymize_coord_nan(row["StartLon"], row["start_class"]), axis=1
)

anon_gps["EndLat"] = anon_gps.apply(
    lambda row: anonymize_coord_nan(row["EndLat"], row["end_class"]), axis=1
)
anon_gps["EndLon"] = anon_gps.apply(
    lambda row: anonymize_coord_nan(row["EndLon"], row["end_class"]), axis=1
)

In [None]:
anon_gps.head(3)

In [None]:
# helper columns of the second (core-shape) pass are not part of the output
colstodrop = [
    "fuaname_en_start2",
    "country_start2",
    "class_code_start2",
    "class_code_end2",
]
anon_gps.drop(colstodrop, axis=1, inplace=True)

# output column names: snake_case, start_/end_ prefix
colstorename = {
    "StartLat": "start_lat",
    "StartLon": "start_lon",
    "EndLat": "end_lat",
    "EndLon": "end_lon",
    "fuaname_en_start": "start_name",
    "country_start": "start_country",
    "fuaname_en_end": "end_name",
    "country_end": "end_country",
}

anon_gps.rename(columns=colstorename, inplace=True)

In [None]:
anon_gps.head(3)

In [None]:
print("anon coordinates DF shape: ", anon_gps.shape)

In [None]:
# keep a single row per leg (the first one encountered)
anon_gps.drop_duplicates(subset=["legid"], keep="first", inplace=True)
print("anon coordinates DF shape: ", anon_gps.shape)

In [None]:
# write the anonymized leg coordinates to the anon dataset directory
output_file = "legs_coordinates.csv"
output_path = os.path.join(anon_dataset_path, output_file)

# floats are written with 3 decimals, the finest resolution produced by the
# anonymization
anon_gps.to_csv(output_path, index=False, header=True, float_format="%.3f")

In [None]:
print(output_path)

## Preprocessed Data

### all_legs: Conversion from pkl to csv

In [None]:
print("all legs preprocessed: ", legs_df.shape)
legs_df.columns

In [None]:
# address columns (ids and free-text addresses) are removed from the export
cols_to_remove = [
    "workAddress._id",
    "workAddress.address",
    "homeAddress._id",
    "homeAddress.address",
]
legs_df_anon = legs_df.drop(cols_to_remove, axis=1)
print(legs_df_anon.shape)

## save to csv
# NOTE(review): float_format applies to every float column of the frame, not
# only to coordinates — confirm 3 decimals are enough for the other columns.
output_file = "all_legs_merged_no_outlier_0.01_anonymized.csv"
output_path = os.path.join(anon_dataset_path, output_file)
legs_df_anon.to_csv(output_path, index=False, header=True, float_format="%.3f")

## Raw Data

In [None]:
# 1. All trips until 30-10-2019

# Sorted so that the order of the rows (and therefore which duplicate is kept
# below) does not depend on the order in which glob lists the files.
read_files = sorted(glob(raw_data_path + "*_tripsData.json"))
if not read_files:
    raise FileNotFoundError(
        "No '*_tripsData.json' file found in '{}'".format(raw_data_path)
    )

trips_prev_frames = []
tot = 0
for trips_file in read_files:
    print(trips_file)
    # distinct names for the path and the file handle (the handle used to
    # overwrite the loop variable)
    with open(trips_file, encoding="utf-8") as trips_fh:
        trips = json.load(trips_fh)
    # flatten the nested JSON records into columns
    trips_prev_df_temp = json_normalize(trips)
    print(trips_prev_df_temp.shape)
    tot += trips_prev_df_temp.shape[0]
    trips_prev_frames.append(trips_prev_df_temp)

# a single concat instead of growing the frame file by file
trips_prev_df = pd.concat(trips_prev_frames)

print()
print("Tot = ", tot)
print(trips_prev_df.shape)
print()
print("Remove duplicates...")
trips_prev_df_nodup = trips_prev_df.drop_duplicates(["tripid"], keep="first")
print(trips_prev_df_nodup.shape)

In [None]:
## 2. Updated trips

# Sorted so that the order of the rows (and therefore which duplicate is kept
# below) does not depend on the order in which glob lists the files.
read_files = sorted(glob(raw_data_update_path + "*_tripsData.json"))
if not read_files:
    raise FileNotFoundError(
        "No '*_tripsData.json' file found in '{}'".format(raw_data_update_path)
    )

trips_updated_frames = []
tot = 0
for trips_file in read_files:
    print(trips_file)
    # distinct names for the path and the file handle (the handle used to
    # overwrite the loop variable)
    with open(trips_file, encoding="utf-8") as trips_fh:
        trips = json.load(trips_fh)
    # flatten the nested JSON records into columns
    trips_updated_df_temp = json_normalize(trips)
    print(trips_updated_df_temp.shape)
    tot += trips_updated_df_temp.shape[0]
    trips_updated_frames.append(trips_updated_df_temp)

# a single concat instead of growing the frame file by file
trips_updated_df = pd.concat(trips_updated_frames)

print()
print("Tot = ", tot)
print(trips_updated_df.shape)
print()
print("Remove duplicates...")
trips_updated_df_nodup = trips_updated_df.drop_duplicates(["tripid"], keep="first")
print(trips_updated_df_nodup.shape)

In [None]:
## check
print("trips DF shape: {}".format(trips_prev_df_nodup.shape))
print("trips_update DF shape: {}".format(trips_updated_df_nodup.shape))

**3. Merge all + updated**

In [None]:
## 3. Merge all + updated
# Stack the de-duplicated previous and updated trips. Each input was
# de-duplicated on its own; the stacked frame is not de-duplicated again
# (compare the two totals printed below).
trips_df_all = pd.concat([trips_prev_df_nodup, trips_updated_df_nodup])
print("trips DF shape: {}".format(trips_df_all.shape))

# start/end timestamps are interpreted as milliseconds (unit="ms")
trips_df_all["tripStartDate_formated"] = pd.to_datetime(
    trips_df_all["tripStartDate"], unit="ms"
)
trips_df_all["tripEndDate_formated"] = pd.to_datetime(
    trips_df_all["tripEndDate"], unit="ms"
)

# print('Remove duplicates...')
# trips_df_nodup = trips_df.iloc[trips_df.astype(str).drop_duplicates(keep='first').index]
# print('trips DF shape: {}'.format(trips_df.shape))
print()
print("Total trips: ", trips_df_all.shape[0])
print("Total unique trips: ", len(trips_df_all["tripid"].unique()))

# NOTE(review): the result is not assigned, so this only displays a re-indexed
# copy; trips_df_all keeps the index produced by the concat.
trips_df_all.reset_index()

**6. Save**

In [None]:
# remove the free-text start/final address columns before publishing
colstodrop = [
    "finalAddress",
    "startAddress",
]
trips_df_all.drop(colstodrop, axis=1, inplace=True)
# NOTE(review): the result is not assigned, so this only displays a re-indexed
# copy; trips_df_all itself is left unchanged.
trips_df_all.reset_index()

In [None]:
trips_df_all.head(3)

In [None]:
## save to JSON
output_file = "all_tripsData.json"
output_path = os.path.join(anon_dataset_path, output_file)
# "split" layout (separate columns and data entries), written without the index
trips_df_all.to_json(output_path, orient="split", index=False)

## User coordinates

In [None]:
users_df_with_trips = pd.read_pickle(input_path + "users_df_with_trips.pkl")

In [None]:
users_df_with_trips.head(3)

In [None]:
# every column whose name contains "address" (case-insensitive)
address_columns = [
    col for col in users_df_with_trips.columns if "address" in col.lower()
]
print("address_columns:", address_columns)

In [None]:
# working copy holding only the user id, city, address columns and campaigns,
# so that columns added later do not touch users_df_with_trips
user_addresses = users_df_with_trips[
    ["userid", "city"] + address_columns + ["onCampaigns"]
].copy()

In [None]:
user_addresses.head(3)

In [None]:
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim

# Nominatim geocoding client
# NOTE(review): the service's usage policy presumably expects a user agent
# that identifies the application — confirm "Nominatim" is acceptable.
LOCATOR = Nominatim(user_agent="Nominatim")

# 1 - convenient wrapper that waits at least 2 seconds between geocoding
# calls and allows up to 5 retries
GEOCODER = RateLimiter(LOCATOR.geocode, min_delay_seconds=2, max_retries=5)

In [None]:
def geocode(data, in_field, out_suffix):
    """Geocode one address column of ``data`` and append the results as columns.

    Every value of ``data[in_field]`` is passed to the module-level
    ``GEOCODER`` callable. ``data`` is modified in place and also returned.
    The following columns are added (or overwritten):

    * ``location_<out_suffix>``: the object returned by ``GEOCODER``,
    * ``point_<out_suffix>``: ``tuple(location.point)``,
    * ``latitude_<out_suffix>``, ``longitude_<out_suffix>`` and
      ``altitude_<out_suffix>``: the three components of that tuple.

    Rows for which ``GEOCODER`` returned nothing hold NaN in all of them.

    :param data: DataFrame containing the address column.
    :param in_field: name of the column holding the address strings.
    :param out_suffix: suffix used to name the added columns.
    :return: ``data`` itself, with the added columns.
    """
    out_location = "location_{}".format(out_suffix)
    out_point = "point_{}".format(out_suffix)
    out_fields = [
        field.format(out_suffix)
        for field in ["latitude_{}", "longitude_{}", "altitude_{}"]
    ]

    # 1 - geocode every address (None when nothing is found)
    locations = data[in_field].apply(GEOCODER)

    # 2 - (latitude, longitude, altitude) tuple of every location found
    points = locations.apply(lambda loc: tuple(loc.point) if loc else None)

    # 3 - split the tuples into three columns. Rows without a match are padded
    # explicitly, so the split does not depend on whether the first row (or
    # any row at all) was geocoded successfully.
    missing = (np.nan, np.nan, np.nan)
    coords = pd.DataFrame(
        [pt if isinstance(pt, tuple) else missing for pt in points],
        index=data.index,
        columns=out_fields,
    )

    # 4 - store the results, replacing None with NaN. The filled Series are
    # assigned back: calling fillna(inplace=True) on a column selection only
    # changes a temporary copy.
    data[out_location] = locations.fillna(value=np.nan)
    data[out_point] = points.fillna(value=np.nan)
    for field in out_fields:
        # positional assignment: coords has exactly the rows of data, in order
        data[field] = coords[field].values

    return data

In [None]:
# Smoke test: geocode a few randomly chosen, non-empty work addresses.
addresses = [
    adr for adr in user_addresses["workAddress.address"].dropna().unique() if adr
]
# random.sample raises ValueError when asked for more items than available
addresses_sample = random.sample(addresses, min(5, len(addresses)))

for adr in addresses_sample:
    print("Address: {} - ".format(adr), end="")
    # go through the rate-limited wrapper, as the bulk geocoding does
    location = GEOCODER(adr)
    if location is not None:
        print(
            "Latitude = {}, Longitude = {}".format(
                location.latitude, location.longitude
            )
        )
    else:
        print("NULL")

In [None]:
# geocode(data, in_field, out_field_suffix)
# adds the location_work / point_work / latitude_work / longitude_work /
# altitude_work columns; geocode returns the frame it was given, so
# geocoded_addresses and user_addresses are the same object
geocoded_addresses = geocode(user_addresses, "workAddress.address", "work")

In [None]:
geocoded_addresses.head(5)

In [None]:
# geocode(data, in_field, out_field_suffix)
# same as for the work addresses, adding the *_home columns
geocoded_addresses = geocode(geocoded_addresses, "homeAddress.address", "home")

In [None]:
geocoded_addresses.head(10)

In [None]:
# Cache the geocoding results so the (slow, rate-limited) lookups need not be
# repeated. NOTE: this pickle contains the exact, non-anonymized coordinates.
out_filename = out_path / "geocoded_addresses.pkl"
geocoded_addresses.to_pickle(out_filename)

## User coordinates classification

In [None]:
import geopandas
import geopy

In [None]:
geocoded_addresses = pd.read_pickle(input_path + "geocoded_addresses.pkl")

In [None]:
geocoded_addresses.head(5)

In [None]:
geocoded_addresses.columns

In [None]:
from shapely.geometry import Point

shapes_df_all = read_shapes(".shp", quiet=True)

### Work addresses

In [None]:
# independent copy with the user id and the geocoded work coordinates only
geocoded_work_coords = geocoded_addresses[
    [
        "userid",
        "longitude_work",
        "latitude_work",
    ]
].copy()
geocoded_work_coords.head(3)

In [None]:
from shapely.geometry import Point

# --- pass 1: functional-area shapes ("*.shp") ---------------------------------
shapes_df_all = read_shapes(".shp", quiet=True)

# def classify_end_point(geocoded_work_coords, classe, level):
# point geometry (lon, lat order) of every user's work address
geom = geocoded_work_coords.apply(
    lambda x: Point([x["longitude_work"], x["latitude_work"]]), axis=1
)
geocoded_work_coords = gpd.GeoDataFrame(geocoded_work_coords, geometry=geom)
geocoded_work_coords.crs = {"init": "epsg:4326"}
geocoded_work_coords.head()


print(geocoded_work_coords.shape)
# left join: every user is kept, with the attributes of intersecting shapes
geocoded_work_coords = gpd.sjoin(
    geocoded_work_coords, shapes_df_all, how="left", op="intersects"
)
print(geocoded_work_coords.shape)
geocoded_work_coords.head(3)


# label the first-pass shape attributes as *_work
geocoded_work_coords.rename(
    columns={
        "fuaname_en": "fuaname_en_work",
        "iso3": "country_work",
        "class_code": "class_code_work",
    },
    inplace=True,
)
# keep only the needed columns (drops the geometry and sjoin's extra columns)
geocoded_work_coords = geocoded_work_coords[
    [
        "userid",
        "latitude_work",
        "longitude_work",
        "fuaname_en_work",
        "country_work",
        "class_code_work",
    ]
]
geocoded_work_coords.head(3)
# matched addresses are provisionally "sub-urban", unmatched ones "-"
geocoded_work_coords["work_class"] = geocoded_work_coords.apply(
    lambda x: "-" if pd.isnull(x["fuaname_en_work"]) else "sub-urban", axis=1
)
geocoded_work_coords.head()

# ======= ======= ======= ======= ======= =======
# ======= ======= ======= ======= ======= =======

# --- pass 2: core shapes ("*_core.shp") ---------------------------------------
shapes_df_all = read_shapes("_core.shp", quiet=True)
geom = geocoded_work_coords.apply(
    lambda x: Point([x["longitude_work"], x["latitude_work"]]), axis=1
)
geocoded_work_coords = gpd.GeoDataFrame(geocoded_work_coords, geometry=geom)
geocoded_work_coords.crs = {"init": "epsg:4326"}
geocoded_work_coords.head()

print(geocoded_work_coords.shape)
geocoded_work_coords = gpd.sjoin(
    geocoded_work_coords, shapes_df_all, how="left", op="intersects"
)
print(geocoded_work_coords.shape)
geocoded_work_coords.head(3)

# label the second-pass (core) shape attributes as *_work2
geocoded_work_coords.rename(
    columns={
        "fuaname_en": "fuaname_en_work2",
        "iso3": "country_work2",
        "class_code": "class_code_work2",
    },
    inplace=True,
)
geocoded_work_coords = geocoded_work_coords[
    [
        "userid",
        "latitude_work",
        "longitude_work",
        "fuaname_en_work",
        "country_work",
        "fuaname_en_work2",
        "country_work2",
        "class_code_work2",
        "work_class",
    ]
]

# a match in the core shapes upgrades the class to "urban" ...
geocoded_work_coords["work_class"] = geocoded_work_coords.apply(
    lambda x: x["work_class"] if pd.isnull(x["fuaname_en_work2"]) else "urban", axis=1
)
# ... and addresses matched by no shape at all ("-") are "rural"
geocoded_work_coords["work_class"] = geocoded_work_coords["work_class"].replace(
    "-", "rural"
)

# prefer the country / area name of the core shape when there is one
geocoded_work_coords["country_work"] = geocoded_work_coords.apply(
    lambda x: x["country_work"]
    if pd.isnull(x["country_work2"])
    else x["country_work2"],
    axis=1,
)
geocoded_work_coords["fuaname_en_work"] = geocoded_work_coords.apply(
    lambda x: x["fuaname_en_work"]
    if pd.isnull(x["fuaname_en_work2"])
    else x["fuaname_en_work2"],
    axis=1,
)

geocoded_work_coords.head(3)


# geocoded_work_coords['work_class'] = geocoded_work_coords.apply(lambda x: x['work_class'] if pd.isnull(x['fuaname_en_work']) else classe , axis=1)
# geocoded_work_coords.head()
# return geocoded_work_coords

In [None]:
# Users without work coordinates matched no shape and were therefore labelled
# "rural" above; reset their class to NaN.
geocoded_work_coords["work_class"] = geocoded_work_coords.apply(
    lambda x: check_value_coords(
        x["work_class"], x["longitude_work"], x["latitude_work"]
    ),
    axis=1,
)
geocoded_work_coords.head(3)

In [None]:
geocoded_work_coords.loc[
    (geocoded_work_coords["latitude_work"].isna())
    & (geocoded_work_coords["longitude_work"].isna())
].head(3)

In [None]:
geocoded_work_coords.groupby("work_class").size()

### Home addresses

In [None]:
# independent copy with the user id and the geocoded home coordinates only
geocoded_home_coords = geocoded_addresses[
    [
        "userid",
        "longitude_home",
        "latitude_home",
    ]
].copy()
geocoded_home_coords.head(3)

In [None]:
from shapely.geometry import Point

# --- pass 1: functional-area shapes ("*.shp") ---------------------------------
shapes_df_all = read_shapes(".shp", quiet=True)

# def classify_end_point(geocoded_home_coords, classe, level):
# point geometry (lon, lat order) of every user's home address
geom = geocoded_home_coords.apply(
    lambda x: Point([x["longitude_home"], x["latitude_home"]]), axis=1
)
geocoded_home_coords = gpd.GeoDataFrame(geocoded_home_coords, geometry=geom)
geocoded_home_coords.crs = {"init": "epsg:4326"}
geocoded_home_coords.head()


print(geocoded_home_coords.shape)
# left join: every user is kept, with the attributes of intersecting shapes
geocoded_home_coords = gpd.sjoin(
    geocoded_home_coords, shapes_df_all, how="left", op="intersects"
)
print(geocoded_home_coords.shape)
geocoded_home_coords.head(3)


# label the first-pass shape attributes as *_home
geocoded_home_coords.rename(
    columns={
        "fuaname_en": "fuaname_en_home",
        "iso3": "country_home",
        "class_code": "class_code_home",
    },
    inplace=True,
)
# keep only the needed columns (drops the geometry and sjoin's extra columns)
geocoded_home_coords = geocoded_home_coords[
    [
        "userid",
        "latitude_home",
        "longitude_home",
        "fuaname_en_home",
        "country_home",
        "class_code_home",
    ]
]
geocoded_home_coords.head(3)
# matched addresses are provisionally "sub-urban", unmatched ones "-"
geocoded_home_coords["home_class"] = geocoded_home_coords.apply(
    lambda x: "-" if pd.isnull(x["fuaname_en_home"]) else "sub-urban", axis=1
)
geocoded_home_coords.head()

# ======= ======= ======= ======= ======= =======
# ======= ======= ======= ======= ======= =======

# --- pass 2: core shapes ("*_core.shp") ---------------------------------------
shapes_df_all = read_shapes("_core.shp", quiet=True)
geom = geocoded_home_coords.apply(
    lambda x: Point([x["longitude_home"], x["latitude_home"]]), axis=1
)
geocoded_home_coords = gpd.GeoDataFrame(geocoded_home_coords, geometry=geom)
geocoded_home_coords.crs = {"init": "epsg:4326"}
geocoded_home_coords.head()

print(geocoded_home_coords.shape)
geocoded_home_coords = gpd.sjoin(
    geocoded_home_coords, shapes_df_all, how="left", op="intersects"
)
print(geocoded_home_coords.shape)
geocoded_home_coords.head(3)

# label the second-pass (core) shape attributes as *_home2
geocoded_home_coords.rename(
    columns={
        "fuaname_en": "fuaname_en_home2",
        "iso3": "country_home2",
        "class_code": "class_code_home2",
    },
    inplace=True,
)
geocoded_home_coords = geocoded_home_coords[
    [
        "userid",
        "latitude_home",
        "longitude_home",
        "fuaname_en_home",
        "country_home",
        "fuaname_en_home2",
        "country_home2",
        "class_code_home2",
        "home_class",
    ]
]

# a match in the core shapes upgrades the class to "urban" ...
geocoded_home_coords["home_class"] = geocoded_home_coords.apply(
    lambda x: x["home_class"] if pd.isnull(x["fuaname_en_home2"]) else "urban", axis=1
)
# ... and addresses matched by no shape at all ("-") are "rural"
geocoded_home_coords["home_class"] = geocoded_home_coords["home_class"].replace(
    "-", "rural"
)

# prefer the country / area name of the core shape when there is one
geocoded_home_coords["country_home"] = geocoded_home_coords.apply(
    lambda x: x["country_home"]
    if pd.isnull(x["country_home2"])
    else x["country_home2"],
    axis=1,
)
geocoded_home_coords["fuaname_en_home"] = geocoded_home_coords.apply(
    lambda x: x["fuaname_en_home"]
    if pd.isnull(x["fuaname_en_home2"])
    else x["fuaname_en_home2"],
    axis=1,
)

# geocoded_home_coords['home_class'] = geocoded_home_coords.apply(lambda x: x['home_class'] if pd.isnull(x['fuaname_en_home']) else classe , axis=1)
# geocoded_home_coords.head()
# return geocoded_home_coords

In [None]:
geocoded_home_coords.head(3)

In [None]:
# Users without home coordinates matched no shape and were therefore labelled
# "rural" above; reset their class to NaN.
geocoded_home_coords["home_class"] = geocoded_home_coords.apply(
    lambda x: check_value_coords(
        x["home_class"], x["longitude_home"], x["latitude_home"]
    ),
    axis=1,
)
geocoded_home_coords.head(3)

In [None]:
geocoded_home_coords.loc[
    (geocoded_home_coords["latitude_home"].isna())
    & (geocoded_home_coords["longitude_home"].isna())
].head(3)

In [None]:
geocoded_home_coords.groupby("home_class").size()

## User anonymization

In [None]:
geocoded_work_coords.head(3)

In [None]:
geocoded_home_coords.head(3)

In [None]:
# One frame with the work and home coordinates and classes of every user
# (inner join on userid).
# NOTE(review): if either side holds several rows for the same userid, the
# merge produces every combination of them — check the printed shape.
geocoded_user_home_work_coords = pd.merge(
    geocoded_work_coords[["userid", "latitude_work", "longitude_work", "work_class"]],
    geocoded_home_coords[["userid", "latitude_home", "longitude_home", "home_class"]],
    on="userid",
)
print(geocoded_user_home_work_coords.shape)
geocoded_user_home_work_coords.tail()

In [None]:
# NOTE: this pickle still contains the exact, non-anonymized coordinates.
geocoded_user_home_work_coords.to_pickle(
    input_path + "geocoded_user_home_work_coords.pkl"
)
print(input_path + "geocoded_user_home_work_coords.pkl")

In [None]:
# Coarsen the home/work coordinates of every user according to the zone class
# of the corresponding address; missing coordinates stay NaN.
# The source frame is geocoded_user_home_work_coords (the merged home/work
# frame); the name previously used here, geocoded_user_coords, is never
# defined in this notebook.
anon_user_coords = geocoded_user_home_work_coords.copy()

anon_user_coords["latitude_work"] = anon_user_coords.apply(
    lambda row: anonymize_coord_nan(row["latitude_work"], row["work_class"]), axis=1
)
anon_user_coords["longitude_work"] = anon_user_coords.apply(
    lambda row: anonymize_coord_nan(row["longitude_work"], row["work_class"]), axis=1
)

anon_user_coords["latitude_home"] = anon_user_coords.apply(
    lambda row: anonymize_coord_nan(row["latitude_home"], row["home_class"]), axis=1
)
anon_user_coords["longitude_home"] = anon_user_coords.apply(
    lambda row: anonymize_coord_nan(row["longitude_home"], row["home_class"]), axis=1
)

In [None]:
# replace every missing value (coordinates and classes) with an empty string
# NOTE(review): a float column that contained NaN presumably becomes an object
# column here, so the float_format used when writing the CSV may no longer
# apply to it — confirm the written values.
anon_user_coords.fillna("", inplace=True)

anon_user_coords.head(3)

In [None]:
# write the anonymized user coordinates to the anon dataset directory
output_file = "users_coordinates.csv"
output_path = os.path.join(anon_dataset_path, output_file)
print("output_path: ", output_path)

anon_user_coords.to_csv(output_path, index=False, header=True, float_format="%.3f")