# Geospatial Features
In this notebook we will learn about geographic data and how to work with it.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Set seaborn plotting defaults: figure size (10, 10) and font scale 1.2.
sns.set(rc={'figure.figsize':(10, 10)}, font_scale=1.2)

### 1) Work with Lat and Long

We will use this library: https://github.com/risenW/datasist

**1.1) Measuring Distance**

In [2]:
from datasist.feature_engineering import haversine_distance

# Two sample points, each stored as a [latitude, longitude] pair.
my_home = [30.109919, 31.308797]
cafe = [30.120982, 31.322026]

In [3]:
# Unpack each [lat, long] pair straight into the four positional arguments.
haversine_distance(*my_home, *cafe)

0    1.769848
dtype: float64

**1.2) Get Center Location between 2 points**

In [4]:
from datasist.feature_engineering import get_location_center

# Centre per coordinate: zip pairs the two latitudes, then the two longitudes.
tuple(get_location_center(a, b) for a, b in zip(my_home, cafe))

(0    30.115451
 dtype: float64,
 0    31.315411
 dtype: float64)

***So let's use it with our data***

In [5]:
# Load the Sendy logistics orders and preview the first five rows.
# NOTE(review): the folder is spelled 'dastasets' — confirm it matches the
# directory name on disk.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a saved
# index; consider reading with index_col=0 — confirm before changing.
df = pd.read_csv('../dastasets/sendy_logistics.csv')
df.head()

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Arrival at Destination - Time,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,10:39:55 AM,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,...,12:17:22 PM,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,...,1:00:38 PM,3,,,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,...,10:05:27 AM,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,...,10:25:37 AM,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214


In [6]:
# New feature: row-wise haversine distance from the pickup point to the
# destination point of every order.
df['Dist_Pick_Dest'] = haversine_distance(
    df['Pickup Lat'],
    df['Pickup Long'],
    df['Destination Lat'],
    df['Destination Long'],
)
df.head()

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Distance (KM),Temperature,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival,Dist_Pick_Dest
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,4,20.4,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745,1.930333
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,...,16,26.4,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993,11.339849
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,...,3,,,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455,1.880079
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,...,9,19.2,,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341,4.943458
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,...,9,15.4,,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214,3.724829


In [7]:
# New features: the centre of each trip, computed independently for the
# latitude and the longitude of the pickup/destination pair.
df = df.assign(
    Center_Lat=get_location_center(df['Pickup Lat'], df['Destination Lat']),
    Center_Long=get_location_center(df['Pickup Long'], df['Destination Long']),
)
df.head()

Unnamed: 0,Order No,User Id,Vehicle Type,Platform Type,Personal or Business,Placement - Day of Month,Placement - Weekday (Mo = 1),Placement - Time,Confirmation - Day of Month,Confirmation - Weekday (Mo = 1),...,Precipitation in millimeters,Pickup Lat,Pickup Long,Destination Lat,Destination Long,Rider Id,Time from Pickup to Arrival,Dist_Pick_Dest,Center_Lat,Center_Long
0,Order_No_4211,User_Id_633,Bike,3,Business,9,5,9:35:46 AM,9,5,...,,-1.317755,36.83037,-1.300406,36.829741,Rider_Id_432,745,1.930333,-1.30908,36.830056
1,Order_No_25375,User_Id_2285,Bike,3,Personal,12,5,11:16:16 AM,12,5,...,,-1.351453,36.899315,-1.295004,36.814358,Rider_Id_856,1993,11.339849,-1.323229,36.856837
2,Order_No_1899,User_Id_265,Bike,3,Business,30,2,12:39:25 PM,30,2,...,,-1.308284,36.843419,-1.300921,36.828195,Rider_Id_155,455,1.880079,-1.304603,36.835807
3,Order_No_9336,User_Id_1402,Bike,3,Business,15,5,9:25:34 AM,15,5,...,,-1.281301,36.832396,-1.257147,36.795063,Rider_Id_855,1341,4.943458,-1.269224,36.81373
4,Order_No_27883,User_Id_1737,Bike,1,Personal,13,1,9:55:18 AM,13,1,...,,-1.266597,36.792118,-1.295041,36.809817,Rider_Id_770,1214,3.724829,-1.280819,36.800968


### 2) Geocoding: get coordinates from an address, or an address from coordinates (reverse geocoding)

we will use this library: https://pypi.org/project/geopy/

In [7]:
from geopy.geocoders import Nominatim

# Nominatim client, identified to the service by a custom user agent.
geolocator = Nominatim(user_agent="Singularity")

# Forward geocoding: free-text address -> (latitude, longitude).
# geocode() returns None when the query has no match, so check the result
# before reading its attributes instead of failing with an AttributeError.
# NOTE(review): the recorded output (43.35, -8.41) is nowhere near New York
# City (roughly 40.74, -73.99), so the service resolved this query to a
# different place — confirm, and consider a more specific query string.
location = geolocator.geocode("175 5th Avenue NYC")
if location is None:
    print("No geocoding result found for the given address")
else:
    print((location.latitude, location.longitude))

(43.3545384, -8.4082542)


In [9]:
# Reverse geocoding: a "latitude, longitude" string -> a human-readable address.
# Reuses the `geolocator` client created in the forward-geocoding cell.
location = geolocator.reverse("52.509669, 13.376294")
print(location.address)

Steinecke, Potsdamer Platz, Tiergarten, Mitte, Berlin, 10785, Deutschland


### 3) Extract valuable information about locations from Zip Codes

we will use this library: https://pypi.org/project/zipcodes/

In [8]:
import zipcodes

# Look up one ZIP code; the result is a list of dicts holding fields such as
# 'city', 'state', 'county', 'timezone', 'lat' and 'long'.
zipcodes.matching('77429')

[{'zip_code': '77429',
  'zip_code_type': 'STANDARD',
  'active': True,
  'city': 'Cypress',
  'acceptable_cities': [],
  'unacceptable_cities': [],
  'state': 'TX',
  'county': 'Harris County',
  'timezone': 'America/Chicago',
  'area_codes': ['281', '832'],
  'world_region': 'NA',
  'country': 'US',
  'lat': '29.9857',
  'long': '-95.6548'}]

### So let's use it with our data

In [9]:
# Load the ZIP-code table and display it.
# NOTE(review): this rebinds `df`, replacing the Sendy orders frame loaded
# earlier in the notebook; a distinct name would keep both frames available.
df = pd.read_csv('../dastasets/zip_codes.csv')
df

Unnamed: 0,ZIP,2010 Census Population,FIPS Code,StCtyCode,County Name,Zip Code Designation
0,99546,* 525,2016,2016,ALEUTIANS WEST,Low Income Area/HPSA
1,99551,* 104,2050,2050,BETHEL,Low Income Area/HPSA
2,99553,"* 1,027",2013,2013,ALEUTIANS EAST,Low Income Area/HPSA
3,99554,"* 1,439",2270,2270,WADE HAMPTON,Low Income Area/HPSA
4,99555,* 219,2070,2070,DILLINGHAM,Low Income Area/HPSA
...,...,...,...,...,...,...
995,72081,"* 7,926",5145,4720,WHITE,Low Income Area
996,71749,"* 2,651",5139,4690,UNION,Low Income Area/HPSA
997,72351,* 898,5093,4460,MISSISSIPPI,Low Income Area
998,72082,"* 1,917",5145,4720,WHITE,Low Income Area


In [10]:
def get_city_from_zip_code(zip_code):
    """Return the city name for a US ZIP code, or NaN when it cannot be resolved.

    Parameters
    ----------
    zip_code : int or str
        ZIP code to look up. The value is converted to a string and left-padded
        with zeros to five characters, because ZIP codes parsed as integers
        lose their leading zeros (e.g. 501 instead of 00501).

    Returns
    -------
    str or float
        The 'city' field of the first record returned by ``zipcodes.matching``,
        or ``np.nan`` when the code is malformed or has no matching record.
    """
    try:
        matches = zipcodes.matching(str(zip_code).zfill(5))
    except (TypeError, ValueError):
        # Malformed code (wrong length / invalid characters): treat it as
        # missing. Only these errors are caught, so real failures (a missing
        # import, a kernel interrupt) are no longer silently turned into NaN.
        return np.nan
    if not matches:
        # Well-formed code that is absent from the zipcodes database.
        return np.nan
    return matches[0].get('city', np.nan)

# Resolve every ZIP code to its city name, one element at a time.
df['City'] = df['ZIP'].map(get_city_from_zip_code)
df

Unnamed: 0,ZIP,2010 Census Population,FIPS Code,StCtyCode,County Name,Zip Code Designation,City
0,99546,* 525,2016,2016,ALEUTIANS WEST,Low Income Area/HPSA,Adak
1,99551,* 104,2050,2050,BETHEL,Low Income Area/HPSA,Akiachak
2,99553,"* 1,027",2013,2013,ALEUTIANS EAST,Low Income Area/HPSA,Akutan
3,99554,"* 1,439",2270,2270,WADE HAMPTON,Low Income Area/HPSA,Alakanuk
4,99555,* 219,2070,2070,DILLINGHAM,Low Income Area/HPSA,Aleknagik
...,...,...,...,...,...,...,...
995,72081,"* 7,926",5145,4720,WHITE,Low Income Area,Judsonia
996,71749,"* 2,651",5139,4690,UNION,Low Income Area/HPSA,Junction City
997,72351,* 898,5093,4460,MISSISSIPPI,Low Income Area,Keiser
998,72082,"* 1,917",5145,4720,WHITE,Low Income Area,Kensett


In [11]:
# Number of distinct city values; dropna=False counts NaN as a value too.
df['City'].nunique(dropna=False)

843

In [12]:
# How many ZIP codes map to each city, most frequent city first.
df['City'].value_counts()

Birmingham    21
Mobile        18
Montgomery    14
Huntsville    10
Tuscaloosa     5
              ..
Amagon         1
Earle          1
Stebbins       1
Horton         1
Goshen         1
Name: City, Length: 843, dtype: int64

# Great Work!