# Acquiring the county coordinates data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## we need the geopandas and geopy to get coordinates of the of the County in the data set 
## Geocoding service is built on top of OpenStreetMap data
!pip install geopandas
!pip install geopy


### Importing the data


In [None]:
## location of the stage-1 data files and the two file names used in this notebook
root_URL = 'https://raw.githubusercontent.com/StevenVuong/WDL_2020/master/data/stage1'
Churn_OD_file, Churn_UsersProfile_file = 'Churn_OD.txt', 'Churn_UsersProfile.txt'

## full download URLs: <root>/<file name>
Churn_OD_URL = '/'.join((root_URL, Churn_OD_file))
Churn_UsersProfile_URL = '/'.join((root_URL, Churn_UsersProfile_file))


In [None]:
## load the pipe-delimited origin/destination file straight from its URL
df1 = pd.read_csv(
    Churn_OD_URL,
    sep="|",
    encoding='Latin1',
)


In [None]:
## preview the first 10 rows of the origin/destination data
df1.head(10)

Unnamed: 0,Region_of_Origin,District_of_Origin,County_of_Origin,Region_of_Public_Transportation,District_of_Public_Transportation,County_of_Public_Transportation,Dicofre_ParishCode_of_Public_Transportation,Demand_weight
0,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110608,0.307323
1,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110639,0.069997
2,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110658,0.066059
3,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110654,0.059847
4,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110633,0.052341
5,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110611,0.049032
6,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,AMADORA,111512,0.039779
7,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110660,0.038691
8,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110610,0.037827
9,R1 - AM Lisboa,Lisboa,Amadora,R1 - AM Lisboa,LISBOA,LISBOA,110666,0.0378


In [None]:
## list every column with its non-null count and dtype
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2253 entries, 0 to 2252
Data columns (total 8 columns):
 #   Column                                       Non-Null Count  Dtype  
---  ------                                       --------------  -----  
 0   Region_of_Origin                             2253 non-null   object 
 1   District_of_Origin                           2253 non-null   object 
 2   County_of_Origin                             2253 non-null   object 
 3   Region_of_Public_Transportation              2253 non-null   object 
 4   District_of_Public_Transportation            2253 non-null   object 
 5   County_of_Public_Transportation              2253 non-null   object 
 6   Dicofre_ParishCode_of_Public_Transportation  2253 non-null   int64  
 7   Demand_weight                                2253 non-null   float64
dtypes: float64(1), int64(1), object(6)
memory usage: 140.9+ KB


In [None]:
## lowercase the district and county names on both the origin side and the
## public-transportation side (the raw file mixes 'Lisboa' and 'LISBOA')
for column_name in (
  'District_of_Origin',
  'County_of_Origin',
  'District_of_Public_Transportation',
  'County_of_Public_Transportation',
):
  df1[column_name] = df1[column_name].str.lower()




In [None]:
## set1: distinct origin counties, set2: distinct public-transportation counties
set1, set2 = (
  set(df1[column_name])
  for column_name in ('County_of_Origin', 'County_of_Public_Transportation')
)

## set3: origin counties missing from set2, set4: public-transportation counties missing from set1
set3 = set1 - set2
set4 = set2 - set1


 **set3** shows the origin counties that never appear as a public-transportation county, i.e. counties that don't have a direct route back to them.

In [None]:
## print the counties in sorted order so the listing is identical on every run
## (the iteration order of a set of strings can change between kernel sessions)
for county in sorted(set3):
  print(county)

cascais
palmela
póvoa de varzim
espinho
santa maria da feira
vale de cambra
mafra
moita
são joão da madeira
setúbal
vila franca de xira
alcochete
sesimbra
oliveira de azeméis
montijo
barreiro


 **set4** shows the public-transportation counties that never appear as an origin county,
 but as **set4** is empty we assume that all counties in **set2** have a return bus route (assumption still to be confirmed).

In [None]:
## print the counties in sorted order so the listing is identical on every run
## (the iteration order of a set of strings can change between kernel sessions)
for county in sorted(set4):
  print(county)

## Acquiring the county coordinates


In [None]:
#Importing the Nominatim geocoder class 
from geopy.geocoders import Nominatim

Champ de Mars, Rue Saint-Dominique, Quartier du Gros-Caillou, Paris 7e Arrondissement, Paris, Île-de-France, France métropolitaine, 75007, France
Latitude = 48.85614465, Longitude = 2.297820393322227


In [None]:
## two separate, empty dataframes: `temp` for intermediate values and
## `location_points` for the final result
temp, location_points = pd.DataFrame(), pd.DataFrame()

In [None]:
# build a "county,district,Portugal" address for every row, then keep a single
# instance of each address so its coordinates can be looked up later
temp['address'] = df1['County_of_Origin'].str.cat(df1['District_of_Origin'], sep=',') + ',Portugal'
location_points['address'] = temp['address'].unique()
location_points.head()

Unnamed: 0,address
0,"amadora,lisboa,Portugal"
1,"cascais,lisboa,Portugal"
2,"lisboa,lisboa,Portugal"
3,"loures,lisboa,Portugal"
4,"mafra,lisboa,Portugal"


In [None]:
from geopy.extra.rate_limiter import RateLimiter

## 1 - Create 'locator', the Nominatim geocoding client used to look up the coordinates
locator = Nominatim(user_agent='myGeocoder')
## 2 - Convenience wrapper around locator.geocode that enforces a delay of at least 1 second between geocoding calls
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)


In [None]:
## 3 - create location column (one rate-limited geocoding call per address)
location_points['location'] = location_points['address'].apply(geocode)
## 4 - create (latitude, longitude, altitude) tuple from location column; None when the lookup returned nothing
location_points['point'] = location_points['location'].apply(lambda loc: tuple(loc.point) if loc else None)
# 5 - split point column into latitude, longitude and altitude columns.
#     A failed lookup is expanded to a row of NaN and the column names are given
#     explicitly, so a single missing address (or an empty table) no longer breaks
#     the three-column assignment.
coordinate_columns = ['latitude', 'longitude', 'altitude']
missing_point = (float('nan'),) * len(coordinate_columns)
point_rows = [pt if isinstance(pt, tuple) else missing_point for pt in location_points['point']]
location_points[coordinate_columns] = pd.DataFrame(point_rows, columns=coordinate_columns, index=location_points.index)

In [None]:
## preview the first rows, including the geocoded location and the split coordinate columns
location_points.head()


Unnamed: 0,address,location,point,latitude,longitude,altitude
0,"amadora,lisboa,Portugal","(Amadora, Lisboa, Grande Lisboa, Área Metropol...","(38.7595162, -9.223677009460125, 0.0)",38.759516,-9.223677,0.0
1,"cascais,lisboa,Portugal","(Cascais, Lisboa, Grande Lisboa, Área Metropol...","(38.72240025, -9.396909171649877, 0.0)",38.7224,-9.396909,0.0
2,"lisboa,lisboa,Portugal","(Lisboa, Grande Lisboa, Área Metropolitana de ...","(38.7077507, -9.1365919, 0.0)",38.707751,-9.136592,0.0
3,"loures,lisboa,Portugal","(Loures, Lisboa, Grande Lisboa, Área Metropoli...","(38.8578642, -9.175816056948255, 0.0)",38.857864,-9.175816,0.0
4,"mafra,lisboa,Portugal","(Mafra, Lisboa, Grande Lisboa, Área Metropolit...","(38.9369782, -9.3282374, 0.0)",38.936978,-9.328237,0.0
