Let's start exploring the station data.

In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the station table; the two timestamp columns are parsed as datetimes on read.
df = pd.read_csv('station_data.csv', parse_dates=['datetime_start', 'datetime_end'])



In [3]:
# Preview the first few rows to check the columns and value formats.
df.head()

Unnamed: 0,place_id,place_name,lat,lng,num_of_rack,datetime_start,datetime_end
0,102,Szilágyi Dezső tér,47.503424,19.039714,15.0,2015-01-01 00:00:01,2015-05-31 23:55:03
1,103,Clark Ádám tér,47.497586,19.040916,15.0,2015-01-01 00:00:01,2015-05-22 15:35:03
2,103,Clark Ádám tér,47.497586,19.040916,16.0,2015-05-22 15:40:02,2015-05-31 23:55:03
3,104,Döbrentei tér,47.491279,19.045116,22.0,2015-01-01 00:00:01,2015-05-31 23:55:03
4,105,Déli pályaudvar,47.499858,19.025488,28.0,2015-01-01 00:00:01,2015-05-31 23:55:03


In [4]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   place_id        87 non-null     int64         
 1   place_name      87 non-null     object        
 2   lat             87 non-null     float64       
 3   lng             87 non-null     float64       
 4   num_of_rack     87 non-null     float64       
 5   datetime_start  87 non-null     datetime64[ns]
 6   datetime_end    87 non-null     datetime64[ns]
dtypes: datetime64[ns](2), float64(3), int64(1), object(1)
memory usage: 4.9+ KB


In [5]:
# Summary statistics for every column; include='all' also covers the non-numeric ones.
df.describe(include='all')

Unnamed: 0,place_id,place_name,lat,lng,num_of_rack,datetime_start,datetime_end
count,87.0,87,87.0,87.0,87.0,87,87
unique,,75,,,,,
top,,Keleti pályaudvar,,,,,
freq,,3,,,,,
mean,695.816092,,47.499185,19.058434,20.356322,2015-01-17 00:41:27.333332992,2015-05-27 04:05:23.551724800
min,102.0,,47.473243,19.025488,14.0,2015-01-01 00:00:01,2015-01-24 22:10:01
25%,512.5,,47.492646,19.051503,15.0,2015-01-01 00:00:01,2015-05-31 23:55:03
50%,612.0,,47.499101,19.057213,22.0,2015-01-01 00:00:01,2015-05-31 23:55:03
75%,808.0,,47.507767,19.065611,22.0,2015-01-01 00:00:01,2015-05-31 23:55:03
max,1401.0,,47.518845,19.085408,36.0,2015-05-27 16:30:02,2015-05-31 23:55:03


In [6]:
# List the column labels.
df.columns

Index(['place_id', 'place_name', 'lat', 'lng', 'num_of_rack', 'datetime_start',
       'datetime_end'],
      dtype='object')

In [7]:
# Count distinct station IDs; compare with the row count to see whether stations repeat.
df['place_id'].nunique()

75

In [8]:
# Length of each row's validity window in whole minutes:
# end - start -> seconds -> minutes, with the fractional part dropped by the int cast.
df['duration'] = (
    df['datetime_end']
    .sub(df['datetime_start'])
    .dt.total_seconds()
    .div(60.0)
    .astype(int)
)

In [9]:
# Inspect one station (place_id 611) to see how its rows are laid out.
df.loc[df['place_id'].eq(611)]

Unnamed: 0,place_id,place_name,lat,lng,num_of_rack,datetime_start,datetime_end,duration
40,611,Nyugati tér,47.510114,19.055566,30.0,2015-01-01 00:00:01,2015-03-31 08:50:02,128690
41,611,Nyugati tér,47.509759,19.055352,30.0,2015-04-02 18:00:01,2015-04-04 22:25:02,3145
42,611,Nyugati tér,47.509675,19.055309,30.0,2015-04-04 22:30:02,2015-05-31 23:55:03,82165


In [10]:
# Distribution of rack counts across all rows.
df['num_of_rack'].describe()

count    87.000000
mean     20.356322
std       5.565628
min      14.000000
25%      15.000000
50%      22.000000
75%      22.000000
max      36.000000
Name: num_of_rack, dtype: float64

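The goal is to treat each station as a single entity. When a station appears in several rows — for example because its recorded location changed only slightly, so the “new” location is effectively the same place or very close by — those rows can be consolidated into a single row per station ID.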

In [11]:
# Make sure both timestamp columns are datetimes before taking min/max on them.
for ts_col in ('datetime_start', 'datetime_end'):
    df[ts_col] = pd.to_datetime(df[ts_col])

# Collapse all rows that share a place_id into one row per station.
# NOTE(review): 'mean' is an unweighted average over a station's rows, so
# num_of_rack can come out fractional when the rack count changed between
# rows — confirm that this is the intended summary.
agg_dict = dict(
    place_name='first',       # name taken from the station's first row
    lat='mean',
    lng='mean',
    num_of_rack='mean',
    datetime_start='min',     # earliest start
    datetime_end='max',       # latest end
)

unique_df = df.groupby('place_id', as_index=False).agg(agg_dict)

# Inspect the result
unique_df

Unnamed: 0,place_id,place_name,lat,lng,num_of_rack,datetime_start,datetime_end
0,102,Szilágyi Dezső tér,47.503424,19.039714,15.0,2015-01-01 00:00:01,2015-05-31 23:55:03
1,103,Clark Ádám tér,47.497586,19.040916,15.5,2015-01-01 00:00:01,2015-05-31 23:55:03
2,104,Döbrentei tér,47.491279,19.045116,22.0,2015-01-01 00:00:01,2015-05-31 23:55:03
3,105,Déli pályaudvar,47.499858,19.025488,28.0,2015-01-01 00:00:01,2015-05-31 23:55:03
4,201,Margit híd - Buda,47.515002,19.039806,15.0,2015-01-01 00:00:01,2015-05-31 23:55:03
...,...,...,...,...,...,...,...
70,1302,Szent István park,47.518183,19.051610,22.0,2015-01-01 00:00:01,2015-05-31 23:55:03
71,1303,Váci út - Victor Hugo utca,47.517498,19.060008,16.0,2015-01-01 00:00:01,2015-05-31 23:55:03
72,1304,Margitsziget,47.518349,19.044821,30.0,2015-01-01 00:00:01,2015-05-31 23:55:03
73,1305,Pannónia utca - Raoul Wallenberg utca,47.514491,19.052535,15.0,2015-01-01 00:00:01,2015-05-31 23:55:03


In [12]:
# Save the consolidated per-station table; index=False leaves the row index out of the file.
unique_df.to_csv('unique_station_data.csv', index=False)