In [2]:

from datetime import datetime,timedelta
import os
import pandas as pd
import psycopg2
import pickle
from exceptions.databasealredyfilled import DatabaseAlreadyFilledException
from getrootdirectory import getRootDirectory
from configuration.parameters import ConfigurationParameters

class GTFSCurrentDayAnalysis:
    def __init__(self,filePath:str) -> None:
        self.parentDirectory=os.getcwd()
        self.configurationParameters=ConfigurationParameters()
    def getTodaysServices(self):
        date_parser = lambda x: pd.to_datetime(x, format='%Y%m%d')
        dayOfWeek=datetime.now().strftime('%A')
        dayOfWeek=dayOfWeek.lower()
        calendar_formatted = pd.read_csv(os.path.join(os.getcwd(),"data","vvs","gtfs","calendar.txt"), parse_dates=['start_date','end_date'], date_parser=date_parser)
        todaysServices=calendar_formatted.loc[((calendar_formatted["start_date"]<date_parser(datetime.today().strftime('%Y%m%d'))) & (calendar_formatted["end_date"]>date_parser(datetime.today().strftime('%Y%m%d'))) & (calendar_formatted[dayOfWeek]==1))]
        calendar_dates=pd.read_csv(os.path.join(os.getcwd(),"data","vvs","gtfs","calendar_dates.txt"),parse_dates=['date'], date_parser=date_parser)
        
        includedsServicesServices=calendar_dates.loc[(calendar_dates["date"]==date_parser(datetime.today().strftime('%Y%m%d'))) & (calendar_dates["exception_type"]==1)]
    

        excludedServvices=calendar_dates.loc[(calendar_dates["date"]==date_parser(datetime.today().strftime('%Y%m%d'))) & (calendar_dates["exception_type"]==2)]
        originalServices=set(list(todaysServices["service_id"]))
        removedServices=set(list(excludedServvices["service_id"]))
        addedServices=set(list(includedsServicesServices["service_id"]))
        activeServices=originalServices.difference(removedServices)
        activeServices=activeServices.union(addedServices)
        todaysServices=pd.DataFrame(columns=["service_id","date"])
        todaysServices["service_id"]=list(activeServices)
        todaysServices["date"]=[datetime.today().strftime("%Y%m%d")]*len(activeServices)
        
        return todaysServices
    
    def getTodaysTrips(self):
        todaysServices=self.getTodaysServices()
        trips=pd.read_csv(os.path.join(self.parentDirectory,"data","vvs","gtfs","trips.txt"))
        todayTrips=pd.merge(trips, todaysServices, on='service_id', how='inner')
        todayUniqueTripds=todayTrips.drop_duplicates(subset=["trip_id"])
        
        #fetch the agency id's from routs.txt
        
        routes=pd.read_csv(os.path.join(getRootDirectory(),"data","vvs","gtfs","routes.txt"))
        todaysTripsWithRoutes=pd.merge(todayUniqueTripds,routes,on="route_id",how="inner")
        todaysTripsWithRoutesSubset=todaysTripsWithRoutes[['route_id', 'trip_id', 'service_id','agency_id','route_long_name', 'trip_headsign', 'direction_id',
            'shape_id',  'date']]
        return todaysTripsWithRoutesSubset
    
    def generateTimeStamp(self,day,time:str):
        try:
            timestampStr=day+" "+time
            correct_timestamp=datetime.strptime(timestampStr,"%Y%m%d %H:%M:%S")
        except:
            
            timeparts=time.split(":")
            hours=(timeparts[0])
            minutes=int(timeparts[1])
            correctedHours=correctedHours=int(hours)-24
            current_datetime = datetime.today()

                # Set the time to midnight (00:00:00)
            beginning_of_tomorrow = current_datetime.replace(hour=0, minute=0, second=0, microsecond=0)

            correct_timestamp=beginning_of_tomorrow+timedelta(days=1,hours=correctedHours,minutes=minutes)
                
            
        return correct_timestamp
    
    def getDeparturesAndArrivalsToday(self):
        todaysTrips=self.getTodaysTrips()
        dtype = {"date": str}
        stopTimes=pd.read_csv(os.path.join(self.parentDirectory,"data","vvs","gtfs","stop_times.txt"),dtype=dtype)
        first_stop = stopTimes['stop_sequence'] == 1
        last_stop = stopTimes.groupby('trip_id')['stop_sequence'].transform('max') == stopTimes['stop_sequence']
        subset = stopTimes[first_stop | last_stop]
        subset
        
        todaysDepartureArrivals=pd.merge(subset,todaysTrips,on="trip_id",how="inner")
        todaysDepartureArrivals.sort_values(by=['trip_id', 'stop_sequence'])
        todaysDepartureArrivals.to_csv(os.path.join(self.parentDirectory,"data",datetime.now().strftime("%Y-%m-%d")+".txt"))
        
        timestamps=[]
        agencies=[]
        agenciesFlat=[]
        trip_ids=[]
        beginTimes=[]
        endTimes=[]
        
        longNames=[]
        
        for i in range(len(todaysDepartureArrivals)):
            row=todaysDepartureArrivals.iloc[i]
            day=row["date"]
            currentTripId=row["trip_id"]
            
            
            if(row["stop_sequence"]==1):
                time=row["departure_time"]
                timestamp=self.generateTimeStamp(day,time)
                beginTimes.append(timestamp)
                timestamps.append(timestamp)
                trip_ids.append(currentTripId)
                agency=row["agency_id"]
                
                agencies.append(agency)
                agenciesFlat.append(agency)
                longname=row["route_long_name"]
                longNames.append(longname)
                
                
                
                
                        
                
            else:
                    
                
                time=row["arrival_time"]
                agency=row["agency_id"]
                agencies.append(agency)
                timestamp=self.generateTimeStamp(day,time)
                endTimes.append(timestamp)
                timestamps.append(timestamp)
                
                
                
              
                
            
            
            
           
                    
            
        todaysDepartureArrivals["timestamp"]=timestamps
        todaysDepartureArrivals["agency"]=agencies
       
        
        flatDataset=pd.DataFrame(columns=["trip_id","begin_time","end_time","agency"])
        flatDataset["trip_id"]=trip_ids
        flatDataset["begin_time"]=beginTimes
        flatDataset["end_time"]=endTimes
        flatDataset["agency"]=agenciesFlat
        flatDataset["route_long_name"]=longNames
        flatDataset["number of updates"]=[0]*len(trip_ids)
        flatDataset.to_pickle(os.path.join(self.parentDirectory,"data","flat"+datetime.now().strftime("%Y-%m-%d")+".pkl"))
        #flatDataset.to_pickle("today_flat"+".pkl")
        flatDataset.to_csv(os.path.join(self.parentDirectory,"data","flat"+datetime.now().strftime("%Y-%m-%d")+".txt"))
        
        todaySubset= todaysDepartureArrivals[["trip_id","arrival_time", "departure_time", "stop_id","stop_sequence",
                                        "stop_headsign","route_id", "service_id","trip_headsign","direction_id",
                                        "date", "timestamp", "agency"]]
        todaySubset["minute of day"]=todaySubset['timestamp'].dt.hour * 60 + todaySubset['timestamp'].dt.minute
        todaySubset.to_pickle(os.path.join(self.parentDirectory,"data",datetime.now().strftime("%Y-%m-%d")+".pkl"))
        return flatDataset

ModuleNotFoundError: No module named 'exceptions'

In [15]:
import os
print(os.getcwd())
mycurrentTag=GTFSCurrentDayAnalysis("somestfu")

todayTrips=mycurrentTag.getTodaysTrips()

c:\Users\CHO\Documents\GTFSRTAnalysis\StadtnaviHerrenberg\comparator


  calendar_formatted = pd.read_csv(os.path.join(os.getcwd(),"data","vvs","gtfs","calendar.txt"), parse_dates=['start_date','end_date'], date_parser=date_parser)
  calendar_dates=pd.read_csv(os.path.join(os.getcwd(),"data","vvs","gtfs","calendar_dates.txt"),parse_dates=['date'], date_parser=date_parser)


c:\Users\CHO\Documents\GTFSRTAnalysis\StadtnaviHerrenberg\comparator


In [16]:
len(todayTrips)

31017

In [18]:

todaysTrips=mycurrentTag.getTodaysTrips()
dtype = {"date": str}
stopTimes=pd.read_csv(os.path.join(os.getcwd(),"data","vvs","gtfs","stop_times.txt"),dtype=dtype)
first_stop = stopTimes['stop_sequence'] == 1
last_stop = stopTimes.groupby('trip_id')['stop_sequence'].transform('max') == stopTimes['stop_sequence']
subset = stopTimes[first_stop | last_stop]
subset

todaysDepartureArrivals=pd.merge(subset,todaysTrips,on="trip_id",how="inner")
todaysDepartureArrivals.sort_values(by=['trip_id', 'stop_sequence'])

  calendar_formatted = pd.read_csv(os.path.join(os.getcwd(),"data","vvs","gtfs","calendar.txt"), parse_dates=['start_date','end_date'], date_parser=date_parser)
  calendar_dates=pd.read_csv(os.path.join(os.getcwd(),"data","vvs","gtfs","calendar_dates.txt"),parse_dates=['date'], date_parser=date_parser)


c:\Users\CHO\Documents\GTFSRTAnalysis\StadtnaviHerrenberg\comparator


  stopTimes=pd.read_csv(os.path.join(os.getcwd(),"data","vvs","gtfs","stop_times.txt"),dtype=dtype)


Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,route_id,service_id,agency_id,route_long_name,trip_headsign,direction_id,shape_id,date
60413,de:ding.eu:20035e::1-T0-1-j24,05:15:00,05:15:00,de:08135:650:8:8,1,,0,0,0.00,de:ding.eu:20035e:,T0#27,VVS,Heidenheim - Söhnstetten - Böhmenkirch,Böhmenkirch Rathaus,0,73-35-j24-1.28.H,20241021
60414,de:ding.eu:20035e::1-T0-1-j24,05:47:00,05:47:00,de:08117:203:0:R1,17,,0,0,21397.51,de:ding.eu:20035e:,T0#27,VVS,Heidenheim - Söhnstetten - Böhmenkirch,Böhmenkirch Rathaus,0,73-35-j24-1.28.H,20241021
60395,de:ding.eu:20035e::1-T0-10-j24,15:35:00,15:35:00,de:08135:650:8:8,1,,0,0,0.00,de:ding.eu:20035e:,T0#27,VVS,Heidenheim - Söhnstetten - Böhmenkirch,Böhmenkirch Rathaus,0,73-35-j24-1.31.H,20241021
60396,de:ding.eu:20035e::1-T0-10-j24,16:00:00,16:00:00,de:08117:203:0:R1,11,,0,0,17899.00,de:ding.eu:20035e:,T0#27,VVS,Heidenheim - Söhnstetten - Böhmenkirch,Böhmenkirch Rathaus,0,73-35-j24-1.31.H,20241021
60397,de:ding.eu:20035e::1-T0-11-j24,16:15:00,16:15:00,de:08135:650:8:8,1,,0,0,0.00,de:ding.eu:20035e:,T0#27,VVS,Heidenheim - Söhnstetten - Böhmenkirch,Böhmenkirch Rathaus,0,73-35-j24-1.33.H,20241021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60390,de:vvs:51644_::1-T0-8-j24,10:28:00,10:28:00,de:08115:2336:0:3,4,,0,0,3922.72,de:vvs:51644_:,T0#27,VVS,Leonberg (Bf -) Eltingen - Waldfriedhof,Leonberg Waldfriedhof,0,51-644-j24-1.2.H,20241021
60391,de:vvs:51644_::1-T0-9-j24,11:20:00,11:20:00,de:08115:2341:0:3,1,,0,0,0.00,de:vvs:51644_:,T0#27,VVS,Leonberg (Bf -) Eltingen - Waldfriedhof,Leonberg Waldfriedhof,0,51-644-j24-1.2.H,20241021
60392,de:vvs:51644_::1-T0-9-j24,11:28:00,11:28:00,de:08115:2336:0:3,4,,0,0,3922.72,de:vvs:51644_:,T0#27,VVS,Leonberg (Bf -) Eltingen - Waldfriedhof,Leonberg Waldfriedhof,0,51-644-j24-1.2.H,20241021
60393,de:vvs:52967_::1-T0-6-j24,06:29:00,06:29:00,de:08117:64:0:H1,1,,0,0,0.00,de:vvs:52967_:,T0#27,VVS,Wiesensteig - Gosbach - Merklingen,Merklingen,0,52-967-j24-1.1.H,20241021


In [23]:
timestamps=[]
agencies=[]
agenciesFlat=[]
trip_ids=[]
beginTimes=[]
endTimes=[]

longNames=[]

def generateTimeStamp(day,time:str):
    try:
        timestampStr=day+" "+time
        correct_timestamp=datetime.strptime(timestampStr,"%Y%m%d %H:%M:%S")
    except:
        
        timeparts=time.split(":")
        hours=(timeparts[0])
        minutes=int(timeparts[1])
        correctedHours=correctedHours=int(hours)-24
        current_datetime = datetime.today()

            # Set the time to midnight (00:00:00)
        beginning_of_tomorrow = current_datetime.replace(hour=0, minute=0, second=0, microsecond=0)

        correct_timestamp=beginning_of_tomorrow+timedelta(days=1,hours=correctedHours,minutes=minutes)
            
        
    return correct_timestamp

for i in range(len(todaysDepartureArrivals)):
    row=todaysDepartureArrivals.iloc[i]
    day=row["date"]
    currentTripId=row["trip_id"]
    
    
    if(row["stop_sequence"]==1):
        time=row["departure_time"]
        timestamp=generateTimeStamp(day,time)
        beginTimes.append(timestamp)
        timestamps.append(timestamp)
        trip_ids.append(currentTripId)
        agency=row["agency_id"]
        
        agencies.append(agency)
        agenciesFlat.append(agency)
        longname=row["route_long_name"]
        longNames.append(longname)
        
        
        
        
                
        
    else:
            
        
        time=row["arrival_time"]
        agency=row["agency_id"]
        agencies.append(agency)
        timestamp=generateTimeStamp(day,time)
        endTimes.append(timestamp)
        timestamps.append(timestamp)

In [35]:
trip_counts = todaysDepartureArrivals.groupby('trip_id').size()

# Step 2: Filter trip_ids that do not appear exactly twice
trip_ids_not_twice = trip_counts[trip_counts != 2].index

# Step 3: Output the result
print(trip_ids_not_twice)

Index(['de:vvs:50934_::1-T0-1-j24', 'de:vvs:50934_::1-T0-2-j24',
       'de:vvs:50934_::1-T0-4-j24'],
      dtype='object', name='trip_id')


In [37]:
stopTimes.loc[stopTimes["trip_id"]=="de:vvs:50934_::1-T0-1-j24"]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled
2402349,de:vvs:50934_::1-T0-1-j24,23:55:00,23:55:00,de:08117:4016:0:H1,1,,0,0,0.0
2402350,de:vvs:50934_::1-T0-1-j24,23:42:00,23:42:00,de:08117:1106:0:H1,1,,0,0,
2402351,de:vvs:50934_::1-T0-1-j24,23:59:00,23:59:00,de:08117:4030:0:H1,2,,0,0,2641.6
2402352,de:vvs:50934_::1-T0-1-j24,23:49:00,23:49:00,de:08117:1305:0:H1,2,,0,0,
2402353,de:vvs:50934_::1-T0-1-j24,24:00:00,24:00:00,de:08117:4031:0:H1,3,,1,0,4006.4
2402354,de:vvs:50934_::1-T0-1-j24,23:50:00,23:50:00,de:08117:1306:0:R1,3,,0,0,
2402355,de:vvs:50934_::1-T0-1-j24,24:01:00,24:01:00,de:08117:4033:0:H1,4,,0,0,4649.01
2402356,de:vvs:50934_::1-T0-1-j24,23:51:00,23:51:00,de:08117:1304:0:R1,4,,0,0,0.0
2402357,de:vvs:50934_::1-T0-1-j24,24:01:00,24:01:00,de:08117:4034:0:H1,5,,1,0,4925.71
2402358,de:vvs:50934_::1-T0-1-j24,24:02:00,24:02:00,de:08117:4036:0:H1,6,,1,0,5385.95
