In [1]:
import pandas as pd
from countries import countries
import numpy as np
from multiprocessing import  Pool

# Computing which country a tree is in

We want to take the LAT/LONG coordinates of each tree and find out which country it belongs to. There are only services that do this (e.g. one from Google), but they are limiting free access. Since we have more than 200k trees, we need to choose a different way.

We use the [countries package](https://github.com/che0/countries) by che0, which they thankfully released as public domain code. To use it, a package of polygons delineating each country's borders has to be downloaded and extracted first. See the link above for details.

In [2]:
cc = countries.CountryChecker('countries/TM_WORLD_BORDERS-0.3/TM_WORLD_BORDERS-0.3.shp')

def gps_to_country(lat, lon):
    if (lat is None) or (lon is None):
        return "LatLonNone"
    point = countries.Point(lat, lon)
    country_code = cc.getCountry(point)
    if country_code is None:
        return "CountryCodeNone"
    else:
        return country_code.iso

In [2]:
# Contains the columns ["id", "time_created", "lat", "lon"]
df = pd.read_csv("tree_data.csv")

In [6]:
# Short test to see whether it works
for i in range(10):
    print(df.lat[i], df.lon[i])
    print(gps_to_country(df.lat[i], df.lon[i]))

-4.782518333333333 38.29521333333333
TZ
-4.782491666666667 38.29519833333333
TZ
-4.782401666666667 38.29512666666667
TZ
-4.782455000000001 38.29514666666667
TZ
-4.782398333333333 38.29515
TZ
-4.7824300000000015 38.295028333333335
TZ
-4.782426666666667 38.29500166666666
TZ
-4.7824399999999985 38.294975
TZ
-4.782445 38.29500333333333
TZ
-4.782443333333333 38.29497
TZ


In [11]:
# Apply method to all locations in the database
df["country_code"] = df.apply(lambda x: gps_to_country(x.lat, x.lon), axis=1)

In [12]:
df.to_csv("tree_data_cc_single_core.csv", index=False)

## Compute country code distribution

Look at which country codes occur how often. _CountryCodeNone_ and _nan_ indicate an error in looking up the country codes. These should probably be investigated at some point.

In [15]:
country_code_concat = country_codes[0]
for i in range(1,len(country_codes)):
    country_code_concat.extend(country_codes[i])

In [4]:
df = pd.read_csv("tree_data_cc_single_core.csv")
print(df.country_code[:10])
from collections import Counter
counter = Counter(df.country_code)
print(counter)

0    TZ
1    TZ
2    TZ
3    TZ
4    TZ
5    TZ
6    TZ
7    TZ
8    TZ
9    TZ
Name: country_code, dtype: object
Counter({'TZ': 198603, 'UG': 26692, 'KE': 13487, 'IN': 7393, 'NG': 3736, 'GH': 3013, 'MY': 2497, 'US': 1152, 'TG': 613, 'TH': 446, 'ID': 334, 'IE': 106, 'NP': 101, 'PH': 99, 'CountryCodeNone': 91, 'BR': 79, 'CV': 68, 'PK': 60, 'PT': 51, 'CO': 40, nan: 36, 'CA': 35, 'FR': 35, 'JO': 31, 'SD': 30, 'RO': 22, 'HR': 21, 'DE': 21, 'GB': 19, 'HT': 16, 'IT': 15, 'NL': 13, 'ZA': 12, 'AU': 11, 'ES': 10, 'SE': 8, 'SG': 7, 'CZ': 6, 'SR': 5, 'BA': 4, 'AF': 4, 'PR': 4, 'LK': 4, 'NI': 4, 'SI': 3, 'LU': 3, 'PL': 3, 'BE': 2, 'JM': 2, 'MX': 2, 'MW': 2, 'TW': 1, 'GR': 1, 'BD': 1, 'CL': 1, 'TT': 1, 'CN': 1, 'UA': 1, 'UZ': 1, 'NZ': 1})


# Deprecated

I wanted to parallelize computing the country codes from LAT/LONG. Didn't work, there's still an error somewhere. Went with single-core solution instead. Good enough for me. Might not be good enough for production.

In [8]:
def parallelize_dataframe(data, func, n_cores=4):
    n = len(data)
    batch_size = n//n_cores
    print(f"Batch size {batch_size}")
    data_split = [data[(i*batch_size):((i+1)*batch_size)] for i in range(n_cores)]
    data_split[-1] += data[(n_cores*batch_size):]
    #df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    res = pool.map(func, data_split)
    pool.close()
    pool.join()
    return res

def batch_gps_to_country(batch_data):
    return [gps_to_country(lat,lon) for lat,lon in batch_data]
        

In [28]:
data = list(zip(df.lat[:51], df.lon[:51]))
n_cores = 4
            
n = len(data)
batch_size = n//n_cores
data_split = [data[(i*batch_size):((i+1)*batch_size)] for i in range(n_cores)]
data_split[-1] += data[(n_cores*batch_size):]

print([len(split) for split in data_split])

[12, 12, 12, 15]
