In [None]:

"""
Project 5: Railway Accidents
part 3: enriching data 

Objective for this code (extra):
    Data enrichment: we have several train accidents for which we do not have coordinates.
    The code below looks up the coordinates for a given location.
    Not all of them will be found, as we ran into some errors that we could not solve.
"""

In [None]:
import numpy as np
import pandas as pd
# Importing geopy library
from geopy.geocoders import Nominatim
# Import sleep to slow down the processing as we can only use the service within a limited window.
from time import sleep

# Load the cleaned accident records (the CSV written by CleaningTrainDataScript.py)
# into the working dataframe used by every cell below.
df = pd.read_csv("CleanData.csv", encoding="utf-8")

In [None]:
# Geocode every entry of df["Location"] with Nominatim (OpenStreetMap).
#
# Result: three lists (loc, lat, long) with exactly one entry per row of df, in row
# order, so they can be assigned to the dataframe as new columns afterwards.
# Each entry is either the geocoded value or one of two marker strings:
#   "Error 1" = the location is missing or could not be found by geopy
#   "Error 2" = the lookup itself failed (timeout, GeocoderUnavailable /
#               "Max retries exceeded with url", or any other service error)
# The markers let us see which locations gave problems and retry them separately.
#
# An explicit timeout is passed because geopy's default request timeout is short
# (1 second at the time of writing), which is a likely source of "Error 2" timeouts.
geolocator = Nominatim(user_agent="GetLoc", timeout=10)

# The public Nominatim service allows at most one request per second, so we wait a
# little longer than that after every request. This is why the cell is slow to run.
GEOCODE_DELAY_SECONDS = 1.1
NOT_FOUND = "Error 1"
LOOKUP_FAILED = "Error 2"

# The three lists that will become the new location / coordinate columns.
loc = []
lat = []
long = []

# Answers already received from the service, keyed by the location text. Repeated
# locations are answered from here instead of spending another request (and another
# delay) on them. Failed lookups are NOT cached, so a later duplicate gets a retry.
geocode_cache = {}

for place in df["Location"]:
    if pd.isna(place):
        # Nothing to look up; a missing value would otherwise be sent to the
        # service as a query of its own.
        result = (NOT_FOUND, NOT_FOUND, NOT_FOUND)
    elif place in geocode_cache:
        result = geocode_cache[place]
    else:
        try:
            location = geolocator.geocode(place)
            if location is None:
                # geocode() returns None when the service has no match.
                result = (NOT_FOUND, NOT_FOUND, NOT_FOUND)
            else:
                result = (location.address, location.latitude, location.longitude)
            geocode_cache[place] = result
        except Exception:
            # Deliberately broad so one bad lookup cannot abort the whole run, but
            # not a bare "except:", so Ctrl-C / kernel interrupt still stops the loop.
            result = (LOOKUP_FAILED, LOOKUP_FAILED, LOOKUP_FAILED)
        # A request was attempted: respect the rate limit before the next one.
        sleep(GEOCODE_DELAY_SECONDS)

    loc.append(result[0])
    lat.append(result[1])
    long.append(result[2])

In [None]:
# Add the geocoding results to the dataframe as temporary helper columns.
df["Locnew"] = loc
df["Latnew"] = lat
df["Longnew"] = long

# Marker strings written by the geocoding loop for rows it could not resolve.
ERROR_MARKERS = ["Error 1", "Error 2"]

# Combine the coordinates we already had from the webscraping with the geocoded ones.
# For latitude and longitude alike:
#   1. where the scraped column has a value, that value wins over the geocoded one;
#   2. everything is turned into a string, which makes it easy to join into
#      "lat;long" further down;
#   3. the error markers are replaced by NaN, so unresolved rows end up empty;
#   4. the combined column replaces the original Latitude / Longitude column.
for new_col, old_col in (("Latnew", "Latitude"), ("Longnew", "Longitude")):
    scraped = df[old_col].notnull()
    df.loc[scraped, new_col] = df[old_col]
    df[new_col] = df[new_col].apply(str)
    df.loc[df[new_col].isin(ERROR_MARKERS), new_col] = np.nan
    df[old_col] = df[new_col]

# Build the new "lat;long" coordinates, while keeping the existing content of
# Coordinates (the location info) for rows we still have no coordinates for.
# Both halves must be present: checking only the longitude would overwrite
# Coordinates with NaN on rows that have a longitude but no latitude.
has_both = df["Latnew"].notnull() & df["Longnew"].notnull()
df.loc[has_both, "Coordinates"] = df["Latnew"] + ";" + df["Longnew"]

# Keep the location names returned by geopy, since they are clearer and more precise.
# First blank out the error markers, then overwrite Location only on rows where
# geopy actually returned an address.
df.loc[df["Locnew"].isin(ERROR_MARKERS), "Locnew"] = np.nan
df.loc[df["Locnew"].notnull(), "Location"] = df["Locnew"]

In [None]:
# The three helper columns (Latnew, Longnew, Locnew) have been folded into
# Latitude, Longitude and Location, so remove them in one go.
helper_columns = ["Latnew", "Longnew", "Locnew"]
df = df.drop(columns=helper_columns)

# Write the enriched data set: this is the last CSV before visualising.
df.to_csv("DataForVisualising.csv", encoding="utf-8", index=False)