# Population <-> Cities

In [1]:
# To hide the warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
import networkx as nx
import numpy as np
import pickle
import matplotlib.pyplot as plt

In [7]:
with open('swiss_population_pickle.pkl', 'rb') as f:
    population = pickle.load(f)

First we will check how many cities are considered large, where a large city has more than `POP_LIMIT_LARGE_CITY` number of inhabitants

In [10]:
POP_LIMIT_LARGE_CITY = 24000
large_cities = [city for city, pop in population.items() if pop > POP_LIMIT_LARGE_CITY]
print("There are {} large cities in Switzerland".format(len(large_cities)))

There are 31 large cities in Switzerland


The following small function returns a set of possible names for a city, as they might be found in the stop dataset. For example, when given Zürich, it will return `[Zürich HB, Zürich Hbf, ...]`

In [12]:
def alt_city_names(city_orig_name):
    stop_extensions = ['HB', 'Hbf', 'SBB', 'CFF']
    return ["{} {}".format(city_orig_name, ext) for ext in stop_extensions]

This function goes through the given names of stops, and tries to find which stop matches to the provided cities.

In [12]:
def real_stop_names(cities, stop_names):
    city_stop_names = []
    
    for city in cities:
        found = False
        if city in stop_names.values:
            found = True
            city_stop_names.append(city)

        if not found:
            for alt_name in alt_city_names(city):
                if alt_name in stop_names.values:
                    found = True
                    city_stop_names.append(alt_name)
                    break
                    
        if not found:
            if city == "Fribourg":
                city_stop_names.append("Fribourg/Freiburg")
            if city == "Rapperswil-Jona":
                city_stop_names.append("Rapperswil")

    return(city_stop_names)

In [37]:
df = pd.read_csv('stop_data_complete.csv', sep=',', encoding='latin-1')
del df['Unnamed: 0']

large_city_stop_names = real_stop_names(large_cities, df['stop_name'])

In [65]:
large_cities_centers = {}
for stop in large_city_stop_names:
    matching_lines = df.loc[df['stop_name'] == stop]
    # Multiple lines can match for one stop. 
    # This is usually because each platform counts as a stop in bigger stations
    # We arbitrarily take the first, this makes very little difference.
    stop_line = matching_lines.iloc[0]
    large_cities_centers[stop] = (stop_line['stop_lat'], stop_line['stop_lon'])
    
print("We now have the following:")
print("{} : {}, etc.".format('Basel SBB', large_cities_centers['Basel SBB']))

We now have the following:
Basel SBB : (47.547649085507601, 7.5895514262328696), etc.


### Distance calculation

In [62]:
!pip install geopy

Collecting geopy
  Downloading geopy-1.11.0-py2.py3-none-any.whl (66kB)
[K    100% |████████████████████████████████| 71kB 1.6MB/s ta 0:00:01
[?25hInstalling collected packages: geopy
Successfully installed geopy-1.11.0


In [64]:
from geopy.distance import vincenty as geo_dist

Using the [GeoPy](http://geopy.readthedocs.io/en/1.10.0/) library and the vincenty algorithm, we can now use the geo_dist function to calculate distances between two stops. Here is an example for Lausanne and Zürich:

In [70]:
print("Distance between Lausanne and Zürich as the crow flies:")
print('{:.2f} km'.format(geo_dist(large_cities_centers['Lausanne'], large_cities_centers['Zürich HB']).km))

Distance between Lausanne and Zürich as the crow flies:
174.17 km


With this function we now construct a function that returns the set of stops (train or bus, not yet filtered) that is within XX km of a given city's train station.

In [140]:
def stops_in_range(df, city_stop_name, city_stop_coords, max_dist):
    in_range = []
    for index, row in df.iterrows():
        if not row['stop_name'] == city_stop_name:
            if geo_dist(city_stop_coords, (row['stop_lat'], row['stop_lon'])).km < max_dist:
                in_range.append(row['stop_name'])
    return set(in_range)

Now, for every large city, we find all stops that are closer than 5km from its train station.

In [145]:
stops_to_close = {}
print("Processing all stops for:")
print()
for city, coords in large_cities_centers.items():
    print("{}".format(city), end='')
    stops_to_close[city] = stops_in_range(df, city, coords, 5)
    print(" .... done")

Processing all stops for:

Dübendorf .... done
Uster .... done
Winterthur .... done
Dietikon .... done
Zürich HB .... done
Biel/Bienne .... done
Bern .... done
Köniz .... done
Thun .... done
Luzern .... done
Zug .... done
Fribourg/Freiburg .... done
Basel SBB .... done
Schaffhausen .... done
St. Gallen .... done
Rapperswil .... done
Chur .... done
Frauenfeld .... done
Lugano .... done
Yverdon-les-Bains .... done
Lausanne .... done
Montreux .... done
Sion .... done
La Chaux-de-Fonds .... done
Neuchâtel .... done
Genève .... done
Vernier .... done


Here is the example for Lausanne:

In [147]:
print(stops_to_close['Lausanne'])

{'Lausanne, Rovéréaz', 'Pully, Trois-Chasseurs', 'Cery-Fleur-de-Lys', 'St-Sulpice VD, Pâqueret', 'Epalinges, Croisettes', 'Lausanne, St-Etienne', 'Lausanne, Valmont', 'Lausanne-Chauderon', 'Prilly-Malley', 'Pully-Nord', 'Ecublens VD, Champagne', 'Ecublens VD, EPFL (bus)', 'Renens VD', 'Ecublens VD, Blévallaire', 'Lausanne, Sallaz', 'Union-Prilly', 'Prilly-Chasseur', 'Lausanne, Le Foyer', 'Pully', 'La Conversion', 'Lausanne, Bourdonnette', 'Jouxtens-Mézery', 'Lausanne-Flon', 'Montblesson, Centenaire', 'Ecublens VD, Dorigny', 'Lutry', 'Ecublens VD, EPFL Piccard', 'Le Lussex', 'Montétan'}


And now we save the result to file, so it can be used else where:

In [149]:
with open('stops_too_close.pkl', 'wb') as f:
    pickle.dump(stops_to_close, f)

Here is an example of how to open and use the file:

In [150]:
with open('stops_too_close.pkl', 'rb') as f:
    stops_too_close = pickle.load(f)
    print(stops_too_close['Lausanne'])

{'Lausanne, Rovéréaz', 'Pully, Trois-Chasseurs', 'Cery-Fleur-de-Lys', 'St-Sulpice VD, Pâqueret', 'Epalinges, Croisettes', 'Lausanne, St-Etienne', 'Lausanne, Valmont', 'Lausanne-Chauderon', 'Prilly-Malley', 'Pully-Nord', 'Ecublens VD, Champagne', 'Ecublens VD, EPFL (bus)', 'Renens VD', 'Ecublens VD, Blévallaire', 'Lausanne, Sallaz', 'Union-Prilly', 'Prilly-Chasseur', 'Lausanne, Le Foyer', 'Pully', 'La Conversion', 'Lausanne, Bourdonnette', 'Jouxtens-Mézery', 'Lausanne-Flon', 'Montblesson, Centenaire', 'Ecublens VD, Dorigny', 'Lutry', 'Ecublens VD, EPFL Piccard', 'Le Lussex', 'Montétan'}


# Finding population data for all stops

We will now take the list of stops, take only the train stops (`type = Train`), and try to find population data for the associated city.

In [46]:
df = pd.read_csv('stop_with_type.csv', sep=',', encoding='latin-1')
del df['Unnamed: 0']

In [53]:
df_only_train = df[df['type'] == 'Train']

In [68]:
swiss_cantons = ['SH', 'BS', 'TG', 'AG', 'JU', 'BL', 'ZH', 'AI', 'VD', 'NE', 'SO', 'LU', 'ZG', 'SG', 'AR', 'FR', 'BE', 'NW', 'GE', 'SZ', 'GL', 'GR', 'VS', 'OW', 'UR', 'TI']

The following function may look not very clean, but this is mostly for optimization reasons.

If we find the population for the current stop it goes directly into the map, and the stop name is saved to make sure we don't run it again, and if we don't find it at all, it goes into the `not_found` list.

In order we try to match the following:
1. The full stop name directly (~500 occurences)
2. The stop name without HB, CFF, SBB, Hbf attached (~20 occurences)
3. Stop names with the canton in the name (~130 occurences)
4. Stop names for stops in between two villages such as Puidoux-Chexbres (~30 occurences)
5. Stop names in two languages such as Fribourg/Freiburg (~few)
6. Stop names of multiple words where the first word is a city (~30 occurences)

In [102]:
population_keys = population.keys()
pop_city_match = {}
treated_stops = []
not_found = []

for index, row in df_only_train.iterrows():

    stop_name = row['stop']
    if stop_name not in treated_stops:
        if stop_name in population_keys:
            treated_stops.append(stop_name)
            pop_city_match[stop_name] = population[stop_name]
        else:
#             not_found.append(stop_name)
            if stop_name[-2:] == 'HB':
                if stop_name[:-3] in population_keys:
                    treated_stops.append(stop_name)
                    pop_city_match[stop_name] = population[stop_name[:-3]]
            elif stop_name[-3:] in ['SBB', 'CFF', 'Hbf']:
                if stop_name[:-4] in population_keys:
                    treated_stops.append(stop_name)
                    pop_city_match[stop_name] = population[stop_name[:-4]]
            
            # Check for stops with canton name: 'Buchs ZH' is 'Buchs (ZH)' in the data
            elif stop_name[-2:] in swiss_cantons:
                canton_format = '{} ({})'.format(stop_name[:-3], stop_name[-2:])
                if canton_format in population.keys():
                    treated_stops.append(stop_name)
                    pop_city_match[stop_name] = population[canton_format]
            
            # Now check for names with a dash in them: Puidoux-Chexbres
            elif len(stop_name.split('-')) > 1:
                name_parts = stop_name.split('-')
                population_combined = 0
                if name_parts[0] not in treated_stops and name_parts[0] in population_keys:
                    population_combined += population[name_parts[0]]
                if name_parts[1] not in treated_stops and name_parts[1] in population_keys:
                    population_combined += population[name_parts[1]]
                
                # Observation: there are few stops that have more than 1 dash in the name (Morges-st-Jean)
                # but most of these are not very relevant because of how small they are.
                
                # If one of the parts of the name matched (or both parts),
                # then add the total population to the table.
                # Example: Puidoux-Chexbres
                if population_combined != 0:
                    treated_stops.append(stop_name)
                    pop_city_match[stop_name] = population_combined
                else:
                    not_found.append(stop_name)
             
            elif len(stop_name.split('/')) > 1:
                name_parts = stop_name.split('/')
                if name_parts[0] in population.keys() and name_parts[0] not in treated_stops:
                    treated_stops.append(stop_name)
                    pop_city_match[stop_name] = population[name_parts[0]]   
                elif name_parts[1] in population.keys() and name_parts[1] not in treated_stops:
                    treated_stops.append(stop_name)
                    pop_city_match[stop_name] = population[name_parts[1]]
                else:
                    not_found.append(stop_name)
            
            elif len(stop_name.split(' ')) > 1:
                first_part = stop_name.split(' ')[0]
                if first_part not in treated_stops and first_part in population_keys:
                    treated_stops.append(stop_name)
                    pop_city_match[stop_name] = population[first_part]
                else:
                    not_found.append(stop_name)
                
            else:
                not_found.append(stop_name)


In [106]:
print("Stops processed, population data found for:")
print("{} stops/cities".format(len(treated_stops)))
print("Not found for:")
print("{} stops/cities".format(len(not_found)))

Stops processed, population data found for:
720 stops/cities
Not found for:
1135 stops/cities


#### The other stops....

The stops that we didn't find population data for are a mix of various options such as:
- Cities outside of Switzerland
- Cities that are not the same name as a commune
- Cities that are not cities, but merely train stops on the outskirts of some other commune
- Mountain train stops
- etc

We will now set the population data for all these cities to `-1` to indicate they have no data, and then the model can decide how to handle that problem.

In [107]:
for stop in not_found:
    pop_city_match[stop] = -1

In [116]:
def add_pop_to_row(row):
    if row['type'] == 'Train':
        try:
            return pop_city_match[row['stop']]
        except:
            return 0
    else:
        return 0

In [117]:
new_df = df.apply(add_pop_to_row, axis=1)

In [120]:
df['population'] = new_df

In [122]:
df.to_csv('stops_with_population.csv', index=False)