In [12]:
from geopy.geocoders import Nominatim
import pandas as pd
import geopandas as gpd
import numpy as np
import folium

In [2]:
geolocator = Nominatim(user_agent="kaggle_learn")
location = geolocator.geocode("Nairobi")

print(location.point)
print(location.address)

1 16m 59.7119s S, 36 49m 2.08164s E
Nairobi, Kenya


In [3]:
point = location.point
print("Latitude:", point.latitude)
print("Longitude:", point.longitude)

Latitude: -1.2832533
Longitude: 36.8172449


In [7]:
universities = pd.read_csv("../geospatial_kaggle/top_universities.csv")
universities.head()

Unnamed: 0,Name
0,University of Oxford
1,University of Cambridge
2,Imperial College London
3,ETH Zurich
4,UCL


In [11]:
def my_geocoder(row):
    """
    a lambda function to apply the geocoder 
    to every row in the DataFrame.
    """
    try:
        point = geolocator.geocode(row).point
        return pd.Series({'Latitude': point.latitude, 'Longitude':point.longitude})
    except:
        return None

universities[['Latitude', 'Longitude']] = universities.apply(lambda x: my_geocoder(x['Name']), axis=1)

print("{}% of addresses were geocoded.".format(
    (1-sum(np.isnan(universities["Latitude"])) / len(universities)) * 100
))

# drop universities that were not successfully geocoded
universities = universities.loc[~np.isnan(universities["Latitude"])]
universities = gpd.GeoDataFrame(
    universities, geometry=gpd.points_from_xy(universities.Longitude, universities.Latitude)
)
universities.crs = {'init': 'epsg:4326'}
universities.head()

95.0% of addresses were geocoded.


  in_crs_string = _prepare_from_proj_string(in_crs_string)


Unnamed: 0,Name,Latitude,Longitude,geometry
0,University of Oxford,51.758708,-1.255668,POINT (-1.25567 51.75871)
1,University of Cambridge,52.199852,0.119739,POINT (0.11974 52.19985)
2,Imperial College London,51.498959,-0.175641,POINT (-0.17564 51.49896)
3,ETH Zurich,47.376504,8.547321,POINT (8.54732 47.37650)
4,UCL,51.523581,-0.132977,POINT (-0.13298 51.52358)


In [13]:
uni_map = folium.Map(location=[54,15], tiles='openstreetmap', zoom_start=2)

# add points to the map
for idx, row in universities.iterrows():
    folium.Marker([row['Latitude'], row['Longitude']], popup=row['Name']).add_to(uni_map)
    
uni_map

## Table joins

In [14]:
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
europe = world.loc[world.continent == 'Europe'].reset_index(drop=True)

europe_stats = europe[["name", "pop_est", "gdp_md_est"]]
europe_boundaries = europe[["name", "geometry"]]

In [15]:
europe_boundaries.head()

Unnamed: 0,name,geometry
0,Russia,"MULTIPOLYGON (((178.72530 71.09880, 180.00000 ..."
1,Norway,"MULTIPOLYGON (((15.14282 79.67431, 15.52255 80..."
2,France,"MULTIPOLYGON (((-51.65780 4.15623, -52.24934 3..."
3,Sweden,"POLYGON ((11.02737 58.85615, 11.46827 59.43239..."
4,Belarus,"POLYGON ((28.17671 56.16913, 29.22951 55.91834..."


join `europe_boundaries` with a DataFrame `europe_stats` containing the estimated population and gross domestic product (GDP) for each country.

In [16]:
europe_stats.head()

Unnamed: 0,name,pop_est,gdp_md_est
0,Russia,142257519,3745000.0
1,Norway,5320045,364700.0
2,France,67106161,2699000.0
3,Sweden,9960487,498100.0
4,Belarus,9549747,165400.0


In [17]:
# use an attribute join to merge data about countries in Europe
europe = europe_boundaries.merge(europe_stats, on="name")
europe.head()

Unnamed: 0,name,geometry,pop_est,gdp_md_est
0,Russia,"MULTIPOLYGON (((178.72530 71.09880, 180.00000 ...",142257519,3745000.0
1,Norway,"MULTIPOLYGON (((15.14282 79.67431, 15.52255 80...",5320045,364700.0
2,France,"MULTIPOLYGON (((-51.65780 4.15623, -52.24934 3...",67106161,2699000.0
3,Sweden,"POLYGON ((11.02737 58.85615, 11.46827 59.43239...",9960487,498100.0
4,Belarus,"POLYGON ((28.17671 56.16913, 29.22951 55.91834...",9549747,165400.0


### Spatial join

With a spatial join, we combine GeoDataFrames based on the spatial relationship between the objects in the "geometry" columns. For instance, we already have a GeoDataFrame `universities` containing geocoded addresses of European universities.

Then we can use a spatial join to match each university to its corresponding country. We do this with `gpd.sjoin()`.

In [18]:
european_universities = gpd.sjoin(universities, europe)

print("locate {} universities.".format(len(universities)))
print("{} of the universities are European (in {} different countries)".format(len(european_universities), len(european_universities.name.unique())))

european_universities.head()

locate 95 universities.
89 of the universities are European (in 15 different countries)


Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: +init=epsg:4326 +type=crs
Right CRS: EPSG:4326

  european_universities = gpd.sjoin(universities, europe)


Unnamed: 0,Name,Latitude,Longitude,geometry,index_right,name,pop_est,gdp_md_est
0,University of Oxford,51.758708,-1.255668,POINT (-1.25567 51.75871),28,United Kingdom,64769452,2788000.0
1,University of Cambridge,52.199852,0.119739,POINT (0.11974 52.19985),28,United Kingdom,64769452,2788000.0
2,Imperial College London,51.498959,-0.175641,POINT (-0.17564 51.49896),28,United Kingdom,64769452,2788000.0
4,UCL,51.523581,-0.132977,POINT (-0.13298 51.52358),28,United Kingdom,64769452,2788000.0
5,London School of Economics and Political Science,51.514591,-0.116431,POINT (-0.11643 51.51459),28,United Kingdom,64769452,2788000.0
