# Second data cleaning - CFF Railway Network

In [1]:
# To hide the warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the three tables exported by the first data-cleaning notebook.
# Each file carries a leftover "Unnamed: 0" column, which is discarded right after reading.
data_stop_relevant = pd.read_csv('stop_id_relevant.csv', sep=',').drop("Unnamed: 0", axis=1)

data_trips_relevant = pd.read_csv('trips_relevant.csv', sep=',').drop("Unnamed: 0", axis=1)

data_routes_relevant = pd.read_csv('routes_relevant.csv', sep=',').drop("Unnamed: 0", axis=1)

In [3]:
# Read the stops table, discard the columns that are not needed for the cleaning,
# and index it by stop_id so that later cells can look a stop up by its id.
stops = pd.read_csv('stops.txt', sep=',').drop(
    ['stop_lat', 'stop_lon', 'location_type', 'parent_station'], axis=1
)

reindexed_stops = stops.set_index('stop_id')

In [4]:
# Add the stop name to data_stop_relevant.
# The first column of the frame holds the stop ids; it is labelled 'stop_id' and the
# matching name is looked up in reindexed_stops (indexed by stop_id).
# A single vectorised `map` replaces the former row-by-row loop, whose chained
# assignment (df['stop_name'][i] = ...) is not guaranteed to write into the frame.
data_stop_relevant = data_stop_relevant.rename(columns={data_stop_relevant.columns[0]: 'stop_id'})

# Keep failing loudly when a stop id has no entry in stops.txt
# (the per-row lookup used before raised a KeyError in that case).
unknown_stops = ~data_stop_relevant['stop_id'].isin(reindexed_stops.index)
if unknown_stops.any():
    raise KeyError('stop ids missing from stops.txt: {}'.format(
        sorted(data_stop_relevant.loc[unknown_stops, 'stop_id'].unique())))

data_stop_relevant['stop_name'] = data_stop_relevant['stop_id'].map(reindexed_stops['stop_name'])

In [5]:
# Give the two columns of the routes table explicit names
route_column_labels = ['route_id', 'route_type']
data_routes_relevant.columns = route_column_labels

In [6]:
# Give the two columns of the trips table explicit names
trip_column_labels = ['trip_id', 'trip_short_name']
data_trips_relevant.columns = trip_column_labels

In [8]:
# Read the stop_times table and discard the time and pickup/drop-off columns,
# which the cells below do not use.
data_stop = pd.read_csv('stop_times.txt', sep=",").drop(
    ['arrival_time', 'departure_time', 'pickup_type', 'drop_off_type'], axis=1
)


In [11]:
# Build data_stop_detail: one row per stop of every relevant trip, in the same row
# order as stop_times.txt, with the columns trip_id, stop_id, stop_name, stop_seq.
# The consecutive rows of a trip are used later to create the edges of the graph.
#
# The selection is vectorised: the former loop rebuilt the list of relevant trip ids
# and grew four numpy arrays on every row, which made the cell quadratic in the size
# of stop_times.txt.  Values also keep their own dtype instead of being turned into
# strings by np.matrix, and the columns are labelled from the start.
relevant_rows = data_stop['trip_id'].isin(data_trips_relevant['trip_id'])
data_stop_detail = (
    data_stop.loc[relevant_rows, ['trip_id', 'stop_id', 'stop_sequence']]
    .reset_index(drop=True)
)

# Keep failing loudly when a stop id has no entry in stops.txt
# (the per-row lookup used before raised a KeyError in that case).
unknown_stops = ~data_stop_detail['stop_id'].isin(reindexed_stops.index)
if unknown_stops.any():
    raise KeyError('stop ids missing from stops.txt: {}'.format(
        sorted(data_stop_detail.loc[unknown_stops, 'stop_id'].unique())))

data_stop_detail['stop_name'] = data_stop_detail['stop_id'].map(reindexed_stops['stop_name'])

# Same column order as before: trip, stop id, stop name, position in the trip
data_stop_detail = data_stop_detail[['trip_id', 'stop_id', 'stop_name', 'stop_sequence']]
data_stop_detail.columns = ['trip_id', 'stop_id', 'stop_name', 'stop_seq']

In [17]:
# Show the structure of the resulting DataFrame (first rows only; the full frame is large)
data_stop_detail.head(10)

Unnamed: 0,trip_id,stop_id,stop_name,stop_seq
0,1.TA.1-1-A-j18-1.1.R,8500010:0:3,Basel SBB,1.0
1,1.TA.1-1-A-j18-1.1.R,8500020:0:3,Muttenz,2.0
2,1.TA.1-1-A-j18-1.1.R,8500021:0:5,Pratteln,3.0
3,1.TA.1-1-A-j18-1.1.R,8517131:0:2,Pratteln Salina Raurica,4.0
4,1.TA.1-1-A-j18-1.1.R,8500300:0:5,Kaiseraugst,5.0
5,1.TA.1-1-A-j18-1.1.R,8500313:0:2,Rheinfelden Augarten,6.0
6,1.TA.1-1-A-j18-1.1.R,8500301:0:3,Rheinfelden,7.0
7,1.TA.1-1-A-j18-1.1.R,8500302:0:3,Möhlin,8.0
8,1.TA.1-1-A-j18-1.1.R,8500303:0:2,Mumpf,9.0
9,1.TA.1-1-A-j18-1.1.R,8500320:0:3,Stein-Säckingen,10.0


In [14]:
# Persist data_stop_detail as a .csv file so that the graph can be created
# without running the cell that builds the frame again.
data_stop_detail.to_csv(path_or_buf='stops_details.csv', sep=',')

# Graph generation

Because creating the `data_stop_detail` DataFrame is very time-consuming, the following code only imports the previously saved DataFrame and uses it.

In [19]:
# Reload the data_stop_detail DataFrame in order to create the graph.
# The file read here is the one written by the save cell above ('stops_details.csv');
# its first column is the row index written by to_csv, so it is used as the index again.
# (The previous call read a hand-made 'stop_details.xlsx' that this notebook never writes,
# and passed `sep`, which pd.read_excel does not accept.)
data_stop_detail = pd.read_csv('stops_details.csv', sep=',', index_col=0)

In [20]:
# Label the columns and show the structure of the DataFrame
# (first rows only; the full frame is large)
data_stop_detail.columns = ['trip_id', 'stop_id', 'stop_name', 'stop_seq']
data_stop_detail.head(10)

Unnamed: 0,trip_id,stop_id,stop_name,stop_seq
0,1.TA.1-1-A-j18-1.1.R,8500010:0:3,Basel SBB,1.0
1,1.TA.1-1-A-j18-1.1.R,8500020:0:3,Muttenz,2.0
2,1.TA.1-1-A-j18-1.1.R,8500021:0:5,Pratteln,3.0
3,1.TA.1-1-A-j18-1.1.R,8517131:0:2,Pratteln Salina Raurica,4.0
4,1.TA.1-1-A-j18-1.1.R,8500300:0:5,Kaiseraugst,5.0
5,1.TA.1-1-A-j18-1.1.R,8500313:0:2,Rheinfelden Augarten,6.0
6,1.TA.1-1-A-j18-1.1.R,8500301:0:3,Rheinfelden,7.0
7,1.TA.1-1-A-j18-1.1.R,8500302:0:3,Möhlin,8.0
8,1.TA.1-1-A-j18-1.1.R,8500303:0:2,Mumpf,9.0
9,1.TA.1-1-A-j18-1.1.R,8500320:0:3,Stein-Säckingen,10.0


In [37]:
# Create the graph
G = nx.Graph()

# One node per stop name.  Several stop ids can share the same name, so the ids are
# first grouped by name and each node is then added once with the full list in its
# 'ids' attribute.  Grouping up front avoids `G.node[...]`, which no longer exists
# in recent networkx releases, and keeps the nodes in order of first appearance.
ids_by_stop_name = {}
for name, num_id in zip(data_stop_relevant['stop_name'], data_stop_relevant['stop_id']):
    ids_by_stop_name.setdefault(name, []).append(num_id)

for name, stop_ids in ids_by_stop_name.items():
    G.add_node(name, ids=stop_ids)

In [49]:
# Re-index by trip for convenient edge creation
reindexed_data_stop_detail = data_stop_detail.set_index(['trip_id'])

# Ordered list of stop names for every trip (row order of the frame is kept).
# Grouping always yields a list, even for a trip with a single stop; a plain
# label lookup returns a bare string in that case, which list() would split
# into characters.
stop_names_by_trip = {
    trip_id: list(names)
    for trip_id, names in reindexed_data_stop_detail.groupby(level='trip_id', sort=False)['stop_name']
}

# Create one edge per pair of consecutive stops of a trip.  The 'ids' attribute of an
# edge collects every trip that uses it.
# NOTE: the former test `(u, v) in G.edges() == True` is a chained comparison that is
# always False, so 'ids' used to be overwritten and only kept the last trip.
for trip_id in data_trips_relevant['trip_id']:
    # A relevant trip without any stop row simply contributes no edge
    stop_names = stop_names_by_trip.get(trip_id, [])
    for origin, destination in zip(stop_names, stop_names[1:]):
        if G.has_edge(origin, destination):
            G[origin][destination]['ids'].append(trip_id)
        else:
            G.add_edge(origin, destination, ids=[trip_id])

In [53]:
# Print the basic information about the network generated.
# The summary is built from the graph counters directly because `nx.info` is not
# available in recent networkx releases; the layout matches what it used to print.
n_nodes = G.number_of_nodes()
n_edges = G.number_of_edges()
# Every edge of an undirected graph contributes to the degree of two nodes
average_degree = 2.0 * n_edges / n_nodes if n_nodes else 0.0

print('\n'.join([
    'Name: {}'.format(G.name),
    'Type: {}'.format(type(G).__name__),
    'Number of nodes: {}'.format(n_nodes),
    'Number of edges: {}'.format(n_edges),
    'Average degree: {:8.4f}'.format(average_degree),
]))

Name: 
Type: Graph
Number of nodes: 2269
Number of edges: 2996
Average degree:   2.6408


In [52]:
# Export the generated network to a file in the standard .gml format
nx.write_gml(G, path='CFF_Graph.gml')