In [1]:
import pandas as pd
import numpy as np

from datetime import datetime
from TimeIntervals import TimeIntervals 

In [2]:
eskom_df = pd.read_csv('load-shedding2023v5.csv',parse_dates=['Date'])
eskom_df.tail()

Unnamed: 0,Date,Text
21,2023-05-26,City customers\n26 May\nStage 3: 05:00 - 22:00...
22,2023-05-28,Load-shedding update 28 May\n\nCity customers\...
23,2023-05-29,Load-shedding update 29 May\n\nCity customers ...
24,2023-05-30,Load-shedding update 30 May\n\nCity customers\...
25,2023-05-31,Load-shedding update 31 May\n\nCity customers\...


In [3]:
from dateutil.parser import parse

def isDateTime(string, fuzzy=False):
    """
    Function that returns true if the string can be converted to a datetime 
    """
    try: 
        parse(string, fuzzy=fuzzy)
        return True

    except ValueError:
        return False  
    
#############################################################################################################################

class DayStages:
    """
    A df that will contain the relevent load-shedding stages as TimeInterval for everyday extracted from tweets of a particular
    format
    """

    def __init__(self, start_date, end_date):
        """
        Create df over date range with empty time interval coloumn and columns Error/Tweet_num for debugging of faulty input
        All paramaters must be strings 
        """
        dates = pd.date_range(start=start_date, end=end_date) #input is of form mm/dd/yyyy
        col_zeros = np.zeros(dates.shape[0],dtype=int)
        col_false = np.full(col_zeros.shape, False)
        
        self.stage_df = pd.DataFrame({'Date': dates,
                    'Time_intervals' : '',
                    'Is_Full' : col_false, #Added as can have roll over e.g. 00:00-05:00 with no actual slots for the day
                    'Error':col_false,
                    'Tweet_num' : col_zeros})
        
    def isNextDay(self,start_time,end_time):
        """
        Test if time interval extends to the following day
        """    
        
        start_time = datetime.strptime(start_time, '%H:%M')
        end_time = datetime.strptime(end_time, '%H:%M')
        
        if(end_time == datetime.strptime("00:00", '%H:%M')):
            return False
        
        if(end_time <= start_time):
            return True
        else:
            return False
        
        
    def insertStagesDate(self,query,stage_list,start_list,end_list):
        """
        If no TimeIntervals class create one
        else we insert each time interval entry into the existing one
        """                     
        
        if (not self.stage_df.loc[self.stage_df.eval(query), 'Time_intervals'].item()):
            temp_day = TimeIntervals(start_list,end_list,stage_list)
            self.stage_df.loc[self.stage_df.eval(query), 'Time_intervals'] = temp_day
            
        else:
            temp_day = self.stage_df.loc[self.stage_df.eval(query), 'Time_intervals'].item()
            
            for i in range(len(stage_list)):
                temp_day.fitNewInterval(stage_list[i],start_list[i],end_list[i])
                

        
    def setStages(self, lines, tweet_num, tweet_date):
        """
        For any date sets load-shedding stages for appropriate time intervals in the df from a tweet text as array of strings(lines)
        Tweet text must contain section with format:
        
        Date
        Stage X: Time1 - Time2
        
        to be inserted into the df
        
        """
        print(f"Tweet Number: {tweet_num}")
        is_stage = False #if in possible stage section, i.e. wheather previous line was a date
        skip_date = False
        year = " " + tweet_date.strftime('%Y')
        
        #empty lists initilaisation
        start_list = []
        end_list = []
        stage_list = []
        
        for line in lines:
            print(line)
            if(is_stage == True):
                words = line.split(' ')# words = ["Stage", "X:", "Time1", "-", "Time2"] 
                
                if(words[0]=='Stage'): #Confirm is a stage section
                    
                    if(words[1] == '0'):

                        if(isDateTime(words[len(words)-3]) and isDateTime(words[len(words)-1])):
                            stage_list += [0]
                            start_list += [words[len(words)-3]]
                            end_list += [words[len(words)-1]]  
                            
                        else:
                            start_list.clear()
                            end_list.clear()
                            stage_list.clear()
                            is_stage = False
                            skip_date = True
                        
                            query = f"Date==\042{date}\042"
                            self.stage_df.loc[self.stage_df.eval(query), 'Tweet_num'] = tweet_num
                            self.stage_df.loc[self.stage_df.eval(query), 'Error'] = True
                    
                    elif(isDateTime(words[2]) and isDateTime(words[4])):                    
                        stage_list += [int(words[1][0])]
                        start_list += [words[2]]
                        end_list += [words[4]]  
                        
                    else:                        
                        start_list.clear()
                        end_list.clear()
                        stage_list.clear()
                        is_stage = False
                        skip_date = True
                        
                        query = f"Date==\042{date}\042"
                        self.stage_df.loc[self.stage_df.eval(query), 'Tweet_num'] = tweet_num
                        self.stage_df.loc[self.stage_df.eval(query), 'Error'] = True
                        

                elif(len(stage_list) > 0): #Exited stage section and must input lists into TimeIntervals object
                    is_stage == False #Reset
                    n = len(start_list)-1 #last entry's index
                    
                    if(self.isNextDay(start_list[n],end_list[n])): #If last entry extends to next day we need also include it
                        query = f"Date==\042{date}\042"
                        self.stage_df.loc[self.stage_df.eval(query), 'Is_Full'] = True
                        temp = end_list[n]
                        end_list[n] = "00:00"
                        self.insertStagesDate(query,stage_list,start_list,end_list)
                       
                        date = date + pd.to_timedelta(1,unit='d')#Next day only 1 entry
                        query = f"Date==\042{date}\042"
                        stage_list = [stage_list[n]]
                        start_list = ['00:00']
                        end_list = [temp]
                        self.insertStagesDate(query,stage_list,start_list,end_list)
                    
                    else:
                        query = f"Date==\042{date}\042"
                        self.stage_df.loc[self.stage_df.eval(query), 'Is_Full'] = True
                        self.insertStagesDate(query,stage_list,start_list,end_list)
                    
                    #Clear lists for next day 
                    start_list.clear()
                    end_list.clear()
                    stage_list.clear()
                
                else: #Catch case when in stage section but no input list was obtained
                    is_stage == False
                 
            #prevents case when only one Time is present in incorrect stage line being incorrectly identified as a date     
            if (skip_date == False):  

                if(isDateTime(line,True)): # Test if line has date and if we are entering a possible stage section
                    date = parse(line+year,fuzzy=True)
                
                    if date < tweet_date:
                        date = parse(line+str(int(year)+1),fuzzy=True)
                    
                    is_stage = True
                    
            else: 
                skip_date = False
                
    def FillEmptySlots(self):
        
        """Currently a quick fix, where empty TimeSlot is replaced with adjacent time slot. In future will use some ML algorithm 
        to better fill based on additional variables."""
        
        
        for index, row in self.stage_df.iterrows():
            if row['Is_Full']==False:
                
                if index == 0:
                    self.stage_df.loc[index,'Time_intervals'] = self.stage_df.loc[index+1,'Time_intervals']
                    self.stage_df.loc[index,'Is_Full'] = True
                    
                elif index == self.stage_df.tail(1).index[0]:
                    self.stage_df.loc[index,'Time_intervals'] = self.stage_df.loc[index-1,'Time_intervals']
                    self.stage_df.loc[index,'Is_Full'] = True
                
                elif self.stage_df.loc[index-1]['Is_Full'] == True:
                    self.stage_df.loc[index,'Time_intervals'] = self.stage_df.loc[index-1,'Time_intervals']
                    self.stage_df.loc[index,'Is_Full'] = True
                
                else:
                    self.stage_df.loc[index,'Time_intervals'] = self.stage_df.loc[index+1,'Time_intervals']
                    self.stage_df.loc[index,'Is_Full'] = True
                    
                    
        
                

In [4]:
start_date =  '05/01/2023'#'04/01/2023'#'03/02/2023'#'02/03/2023'#'12/31/2022''06/01/2023'#
end_date = '06/03/2023'#'05/03/2023'#'04/03/2023'#'03/04/2023'#'02/03/2023' '07/03/2023'#
myStages = DayStages(start_date,end_date)

for index, row in eskom_df.iterrows():
    tweet = row['Text']
    tweet = tweet.replace('\r', '')
    tweet_date = row['Date']
    lines = tweet.split('\n')
    lines = list(filter(None, lines))
    myStages.setStages(lines,index,tweet_date)   

print("Finished")
#print(myStages.stage_df.iloc[8]['Time_intervals'])

#print(repr(tweet))
##MUST FIX PROPERLY
#myStages.FillEmptySlots() #Only to ensure that everyday has a time interval 
##Currently needed in Feb

Tweet Number: 0
Load-shedding update
City customers  1 May
Stage 1: 16:00 - 20:00
Stage 3: 20:00 - 05:00
2 May
Stage 3: 05:00 - 16:00
Stage 4: 16:00 - 20:00
Stage 6: 20:00 - 05:00
3 May
Stage 3: 05:00 - 16:00
Stage 4: 16:00 - 20:00
Stage 6: 20:00 - 05:00
??See more in the thread??
Tweet Number: 1
Load-shedding update 3 May
City customers 
3 May
Stage 4: 16:00 - 20:00
Stage 6: 20:00 - 05:00
Updates to follow. Subject to rapid change. 
Download the City's load-shedding app:
? https://apple.co/3iPP07o
? https://bit.ly/CCTApp-Android
Tweet Number: 2
Load-shedding update 3 May
City customers 
3 May
Stage 4: 16:00 - 20:00
Stage 6: 20:00 - 05:00
4 May
Stage 3: 05:00 - 16:00
Stage 4: 16:00 - 22:00
Stage 6:  22:00 - 05:00
5 May
Stage 3: 05:00 - 22:00
Stage 5: 22:00 - 05:00
Updates to follow. Subject to rapid change.
Tweet Number: 3
Load-shedding update 5 May
City customers 
5 May
Stage 3: 05:00 - 22:00
Stage 5: 22:00 - 05:00
6 May
Stage 3: 05:00 - 16:00
Stage 4: 16:00 - 05:00
7 May 
Stage 3: 05

In [5]:
for index, row in myStages.stage_df.iterrows():
    if row['Error'] == True:
        print(row['Tweet_num'])

2
5
6
13


In [6]:
"""Print out tweets to see format"""
for i in range(eskom_df.shape[0]):
    print(f"Tweet {i}:\n" )
    print(eskom_df.iloc[i]['Text'], "\n")

Tweet 0:

Load-shedding update

City customers  1 May
Stage 1: 16:00 - 20:00
Stage 3: 20:00 - 05:00

2 May
Stage 3: 05:00 - 16:00
Stage 4: 16:00 - 20:00
Stage 6: 20:00 - 05:00

3 May
Stage 3: 05:00 - 16:00
Stage 4: 16:00 - 20:00
Stage 6: 20:00 - 05:00

??See more in the thread?? 

Tweet 1:

Load-shedding update 3 May

City customers 
3 May
Stage 4: 16:00 - 20:00
Stage 6: 20:00 - 05:00

Updates to follow. Subject to rapid change. 

Download the City's load-shedding app:
? https://apple.co/3iPP07o

? https://bit.ly/CCTApp-Android
 

Tweet 2:

Load-shedding update 3 May

City customers 
3 May
Stage 4: 16:00 - 20:00
Stage 6: 20:00 - 05:00

4 May
Stage 3: 05:00 - 16:00
Stage 4: 16:00 - 22:00
Stage 6:  22:00 - 05:00

5 May
Stage 3: 05:00 - 22:00
Stage 5: 22:00 - 05:00

Updates to follow. Subject to rapid change. 

Tweet 3:

Load-shedding update 5 May

City customers 
5 May
Stage 3: 05:00 - 22:00
Stage 5: 22:00 - 05:00

6 May
Stage 3: 05:00 - 16:00
Stage 4: 16:00 - 05:00

7 May 
Stage 3: 05:0

In [7]:
"""Print out tweets to see format"""
for index, row in myStages.stage_df.iterrows():
    print( row['Date'] )
    #row['Time_intervals'].removeDuplicate()
    print(row['Time_intervals'])

2023-05-01 00:00:00
stage: 0 from 00:00 to 16:00
stage: 1 from 16:00 to 20:00
stage: 3 from 20:00 to 00:00

2023-05-02 00:00:00
stage: 3 from 00:00 to 16:00
stage: 4 from 16:00 to 20:00
stage: 6 from 20:00 to 00:00

2023-05-03 00:00:00
stage: 6 from 00:00 to 05:00
stage: 3 from 05:00 to 16:00
stage: 4 from 16:00 to 20:00
stage: 6 from 20:00 to 00:00

2023-05-04 00:00:00
stage: 6 from 00:00 to 05:00
stage: 0 from 05:00 to 00:00

2023-05-05 00:00:00
stage: 0 from 00:00 to 05:00
stage: 3 from 05:00 to 22:00
stage: 5 from 22:00 to 00:00

2023-05-06 00:00:00
stage: 5 from 00:00 to 05:00
stage: 4 from 05:00 to 00:00

2023-05-07 00:00:00
stage: 4 from 00:00 to 05:00
stage: 5 from 05:00 to 00:00

2023-05-08 00:00:00
stage: 5 from 00:00 to 16:00
stage: 4 from 16:00 to 20:00
stage: 6 from 20:00 to 00:00

2023-05-09 00:00:00
stage: 6 from 00:00 to 05:00
stage: 5 from 05:00 to 16:00
stage: 4 from 16:00 to 22:00
stage: 6 from 22:00 to 00:00

2023-05-10 00:00:00
stage: 6 from 00:00 to 05:00
stage: 5

In [19]:
past_stages = {} 

for index, row in myStages.stage_df.iterrows():
    day_date = row['Date'].date()
    day_slots = row['Time_intervals'].outputDayDict()
    past_stages[day_date.strftime('%d/%m/%Y')] = day_slots 

In [20]:
import json

with open('day_slots.json', 'w', encoding='utf-8') as f:
    json.dump(past_stages, f, ensure_ascii=False, indent=4)

In [21]:
#Test read in correctly
with open('day_slots.json') as json_file:
    data_dict = json.load(json_file)

count = 0
for day,day_slots in data_dict.items():
    for slot_num, slot in day_slots.items():
        count += 1
        #print(f"{day}: stage{slot['stage']} from {slot['start']} to {slot['end']}")
        
print(count)

16
