In [1]:
import snscrape.modules.twitter as sntwitter #scrapper
import pandas as pd
import numpy as np

from datetime import datetime
from TimeIntervals import TimeIntervals 

#Change Default of empty in TimeInterval class to either be 0 or specical empty case(-1)
#If -1 apply appropriate filling method

In [2]:
"""Twitter Scraper"""

tweets_list = []
num_tweets = 1000

# Scrape tweets from City of Cape Town account with hashtag #CTInfo
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('#CTInfo from:CityofCT').get_items()): 
    if i>num_tweets:
        break
    tweets_list.append([tweet.date, tweet.id, tweet.rawContent, tweet.user.username, tweet.hashtags]) 
    
# Creating a dataframe from the tweets list above 
eskom_df = pd.DataFrame(tweets_list, columns=['Datetime', 'Tweet Id', 'Text', 'Username', 'Hashtags'])

Skipping unrecognised entry ID: 'promoted-tweet-1666018391876853764-29754bfcebf8a0e9'
Skipping unrecognised entry ID: 'promoted-tweet-1667174962250039296-2975529aa57c9f01'
Skipping unrecognised entry ID: 'promoted-tweet-1656539633977634816-29755d7ff317696c'
Skipping unrecognised entry ID: 'promoted-tweet-1666018391876853764-2975e1293496da42'
Skipping unrecognised entry ID: 'promoted-tweet-1665713534787780608-29752c57946add8a'
Skipping unrecognised entry ID: 'promoted-tweet-1656539633977634816-29751ea0270f84bc'
Skipping unrecognised entry ID: 'promoted-tweet-1666018391876853764-2975d8032902e0b3'
Skipping unrecognised entry ID: 'promoted-tweet-1667174962250039296-2975e95f0e9690e7'
Skipping unrecognised entry ID: 'promoted-tweet-1656539633977634816-297554ade9ebccb8'
Skipping unrecognised entry ID: 'promoted-tweet-1666018391876853764-29753fd22ada6b4b'
Skipping unrecognised entry ID: 'promoted-tweet-1667174962250039296-29754caa7db2321b'
Skipping unrecognised entry ID: 'promoted-tweet-165653

KeyboardInterrupt: 

In [35]:
"""Saved as pickle file to preserve dtypes and as it will only be used in python"""
eskom_df.to_pickle('outCityofCT.pkl')

In [2]:
"""Read pickle file so as not to scrape twitter everytime"""
eskom_df = pd.read_pickle('outCityofCT.pkl')
print(eskom_df.dtypes)

Datetime    datetime64[ns, UTC]
Tweet Id                  int64
Text                     object
Username                 object
Hashtags                 object
dtype: object


In [3]:
print(f"Number of entries in eskom_df: {len(eskom_df.index)}")
eskom_df.tail(5)

Number of entries in eskom_df: 1001


Unnamed: 0,Datetime,Tweet Id,Text,Username,Hashtags
996,2021-02-09 13:08:03+00:00,1359126917631406083,The City will implement Stage 2 load-shedding ...,CityofCT,[CTInfo]
997,2021-02-08 09:47:04+00:00,1358713950838341632,Our Public Emergency Contact Centre received a...,CityofCT,[CTInfo]
998,2021-02-07 07:12:35+00:00,1358312687168536576,UPDATE:\nEskom has suspended its load-shedding...,CityofCT,[CTInfo]
999,2021-02-06 11:16:04+00:00,1358011573697142785,City-supplied customers will be on Stage 1 bet...,CityofCT,[CTInfo]
1000,2021-02-05 11:00:00+00:00,1357645142363103232,The City’s website (https://t.co/9QeJVOTDcN) w...,CityofCT,[CTInfo]


In [4]:
"""Seperate out tweets that discuss load-shedding into stages_df"""
ctinfo_df = eskom_df[eskom_df['Text'].str.contains("Stage")]

#filter out tweets before earliest date
earliest_date = np.datetime64('2022-01-01T00:00:00') 
ctinfo_df['Datetime'] = ctinfo_df['Datetime'].dt.tz_convert(None) #Fix
mask = (ctinfo_df['Datetime'] > earliest_date) 

ctinfo_df = ctinfo_df.loc[mask]
N = len(ctinfo_df.index)

print(f"Number of entries with Stage present: {N}")
ctinfo_df.tail()

Number of entries with Stage present: 275


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctinfo_df['Datetime'] = ctinfo_df['Datetime'].dt.tz_convert(None) #Fix


Unnamed: 0,Datetime,Tweet Id,Text,Username,Hashtags
639,2022-03-07 14:49:40,1500846146947796992,Load-shedding update:\n\nEskom’s load-shedding...,CityofCT,[CTInfo]
657,2022-02-07 14:30:49,1490694542529601543,Load-shedding update #5\n\nEskom’s load-sheddi...,CityofCT,[CTInfo]
658,2022-02-04 16:28:36,1489637019194167311,Load-shedding update #4\n\nEskom’s load-sheddi...,CityofCT,[CTInfo]
660,2022-02-03 15:38:22,1489261990044262400,Load-shedding update #3\n\nEskom’s Stage 2 loa...,CityofCT,[CTInfo]
662,2022-02-02 20:08:59,1488967707571720202,Load-shedding update #2\n\nEskom’s Stage 2 loa...,CityofCT,[CTInfo]


In [5]:
from dateutil.parser import parse

def isDateTime(string, fuzzy=False):
    """
    Function that returns true if the string can be converted to a datetime 
    """
    try: 
        parse(string, fuzzy=fuzzy)
        return True

    except ValueError:
        return False  
    
#############################################################################################################################

class DayStages:
    """
    A df that will contain the relevent load-shedding stages as TimeInterval for everyday extracted from tweets of a particular
    format
    """

    def __init__(self, start_date, end_date):
        """
        Create df over date range with empty time interval coloumn and columns Error/Tweet_num for debugging of faulty input
        """
        dates = pd.date_range(start=start_date, end=end_date) #input is of form mm/dd/yyyy
        col_zeros = np.zeros(dates.shape[0],dtype=int)
        col_false = np.full(col_zeros.shape, False)
        
        self.stage_df = pd.DataFrame({'Date': dates,
                    'Time_intervals' : '',
                    'Is_Full' : col_false, #Added as can have roll over e.g. 00:00-05:00 with no actual slots for the day
                    'Error':col_false,
                    'Tweet_num' : col_zeros})
        
    def isNextDay(self,start_time,end_time):
        """
        Test if time interval extends to the following day
        """
        start_time = datetime.strptime(start_time, '%H:%M')
        end_time = datetime.strptime(end_time, '%H:%M')
        
        if(end_time <= start_time):
            return True
        else:
            return False
        
    def insertStagesDate(self,query,stage_list,start_list,end_list):
        """
        If no TimeIntervals class create one
        else we insert each time interval entry into the existing one
        """                               
        if (not self.stage_df.loc[self.stage_df.eval(query), 'Time_intervals'].item()):
            
            temp_day = TimeIntervals(start_list,end_list,stage_list)
            self.stage_df.loc[self.stage_df.eval(query), 'Time_intervals'] = temp_day
            
        else:
            
            temp_day = self.stage_df.loc[self.stage_df.eval(query), 'Time_intervals'].item()
            
            for i in range(len(stage_list)):
                temp_day.fitNewInterval(stage_list[i],start_list[i],end_list[i])
                
        
        
    def setStages(self, lines, tweet_num):
        """
        For any date sets load-shedding stages for appropriate time intervals in the df from a tweet text as array of strings(lines)
        Tweet text must contain section with format:
        
        Date
        Stage X: Time1 - Time2
        
        to be inserted into the df
        
        """
        is_stage = False #if in possible stage section, i.e. wheather previous line was a date
        #empty lists initilaisation
        start_list = []
        end_list = []
        stage_list = []
        
        for line in lines: 
            if(is_stage == True):
            
                words = line.split(' ')# words = ["Stage", "X:", "Time1", "-", "Time2"] 
                
                if(words[0]=='Stage'): #Confirm is a stage section
                    
                    if(isDateTime(words[2]) and isDateTime(words[2])):
                    
                        stage_list += [words[1][0]]
                        start_list += [words[2]]
                        end_list += [words[4]]  
                        
                    else:
                        
                        start_list.clear()
                        end_list.clear()
                        stage_list.clear()
                        is_stage == False
                        
                        query = f"Date==\042{date}\042"
                        self.stage_df.loc[self.stage_df.eval(query), 'Tweet_num'] = tweet_num
                        self.stage_df.loc[self.stage_df.eval(query), 'Error'] = True
                        


                elif(len(start_list) > 0): #Exited stage section and must input lists into TimeIntervals object
                    
                    is_stage == False #Reset
                    n = len(start_list)-1 #last entry's index
                    
                    if(self.isNextDay(start_list[n],end_list[n])): #If last entry extends to next day we need also include it
                       
                        query = f"Date==\042{date}\042"
                        self.stage_df.loc[self.stage_df.eval(query), 'Is_Full'] = True
                        temp = end_list[n]
                        end_list[n] = "00:00"
                        self.insertStagesDate(query,stage_list,start_list,end_list)
                       
                        date = date + pd.to_timedelta(1,unit='d')#Next day only 1 entry
                        query = f"Date==\042{date}\042"
                        stage_list = [stage_list[n]]
                        start_list = ['00:00']
                        end_list = [temp]
                        self.insertStagesDate(query,stage_list,start_list,end_list)
                    
                    else:
                        
                        query = f"Date==\042{date}\042"
                        self.stage_df.loc[self.stage_df.eval(query), 'Is_Full'] = True
                        self.insertStagesDate(query,stage_list,start_list,end_list)
                    
                    #Clear lists for next day        
                    start_list.clear()
                    end_list.clear()
                    stage_list.clear()
                
                else: #Catch case when in stage section but no input list was obtained
                    is_stage == False
                    
                
            if(isDateTime(line,True)): # Test if line has date and if we are entering a possible stage section
                date = parse(line,fuzzy=True)
                is_stage = True
                
    def FillEmptySlots(self):
        """
        Currently a quick fix, where empty TimeSlot is replaced with adjacent time slot. In future will use some ML algorithm 
        to better fill based on additional variables.
        """
        
        for index, row in self.stage_df.iterrows():
            if row['Is_Full']==False:
                
                if index == 0:
                    self.stage_df.loc[index,'Time_intervals'] = self.stage_df.loc[index+1,'Time_intervals']
                    self.stage_df.loc[index,'Is_Full'] = True
                    
                elif index == self.stage_df.tail(1).index[0]:
                    self.stage_df.loc[index,'Time_intervals'] = self.stage_df.loc[index-1,'Time_intervals']
                    self.stage_df.loc[index,'Is_Full'] = True
                
                elif self.stage_df.loc[index-1]['Is_Full'] == True:
                    self.stage_df.loc[index,'Time_intervals'] = self.stage_df.loc[index-1,'Time_intervals']
                    self.stage_df.loc[index,'Is_Full'] = True
                
                else:
                    self.stage_df.loc[index,'Time_intervals'] = self.stage_df.loc[index+1,'Time_intervals']
                    self.stage_df.loc[index,'Is_Full'] = True
                    
        
                

In [6]:
start_date = '5/01/2023' 
end_date = '5/25/2023'
myStages = DayStages(start_date,end_date)

#Extract information from tweets
#for i in range(14): #Currently testing for 5 tweets
for i in range(13,-1,-1):
    tweet = ctinfo_df.iloc[i]['Text']
    lines = tweet.split('\n')
    myStages.setStages(lines,i) 
    
    
#print(myStages.stage_df['Time_intervals'][22])

#MUST FIX PROPERLY
myStages.FillEmptySlots() #Only to ensure that everyday has a time interval 

myStages.stage_df.tail(25)

Unnamed: 0,Date,Time_intervals,Is_Full,Error,Tweet_num
0,2023-05-01,stage: 0 from 00:00 to 16:00\nstage: 1 from 16...,True,False,0
1,2023-05-02,stage: 3 from 00:00 to 16:00\nstage: 4 from 16...,True,False,0
2,2023-05-03,stage: 6 from 00:00 to 05:00\nstage: 3 from 05...,True,False,0
3,2023-05-04,stage: 6 from 00:00 to 05:00\nstage: 3 from 05...,True,False,0
4,2023-05-05,stage: 6 from 00:00 to 05:00\nstage: 3 from 05...,True,False,0
5,2023-05-06,stage: 0 from 00:00 to 16:00\nstage: 5 from 16...,True,True,12
6,2023-05-07,stage: 5 from 00:00 to 16:00\nstage: 6 from 16...,True,True,11
7,2023-05-08,stage: 6 from 00:00 to 05:00\nstage: 5 from 05...,True,False,0
8,2023-05-09,stage: 6 from 00:00 to 05:00\nstage: 5 from 05...,True,False,0
9,2023-05-10,stage: 6 from 00:00 to 05:00\nstage: 5 from 05...,True,False,0


In [7]:
"""Print out tweets to see format"""
for i in range(14):
    print(f"Tweet {i}:\n" )
    print(ctinfo_df.iloc[i]['Text'], "\n")

#Error list
#Tweet 5: 14 May stage 3 under way - 16:00
#Tweet 6: 14 May Stage 3 until further notice
#Tweet 11: 7 May Stage 5: until 16:00
#Tweet 12: 6 May Stage 4: until 16:00

Tweet 0:

Load-shedding update 22 May

City customers
22 May
Stage 5: 20:00 - 05:00

23 May
Stage 3: 05:00 - 20:00
Stage 5: 20:00 - 05:00

24 May
Stage 2: 05:00 - 16:00
Stage 3: 16:00 - 20:00
Stage 5: 20:00 - 05:00

*Updates likely. 
*Subject to rapid change by Eskom. 

#CTInfo https://t.co/d4AceUnfQY 

Tweet 1:

24 May
Stage 2: 05:00 - 16:00
Stage 3: 16:00 - 20:00
Stage 5: 20:00 - 05:00

*Updates likely.
*Subject to rapid change by Eskom.

Download the City's load-shedding app:
🍎 https://t.co/195WUqpJaV
🤖 https://t.co/STgO6WHJxc

#CTInfo 

Tweet 2:

[Thread] Load-shedding update 21 May

City customers
21 May
Stage 4: 16:00 - 05:00

22 May
Stage 2: 05:00 - 16:00
Stage 3: 16:00 - 20:00
Stage 5: 20:00 - 05:00

23 May
Stage 2: 05:00 - 16:00
Stage 3: 16:00 - 20:00
Stage 5: 20:00 - 05:00

#CTInfo https://t.co/X9Z9ok074N 

Tweet 3:

Load-shedding update 18 May

City customers
18 May 
Stage 3: 05:00 - 22:00
Stage 5: 22:00 - 05:00

19 May
Stage 3: 05:00 - 22:00
Stage 5: 22:00 - 05:00

20 May
S

In [8]:
"""Print out tweets to see format"""
for index, row in myStages.stage_df.iterrows():
    print( row['Date'] )
    print(row['Time_intervals'])

2023-05-01 00:00:00
stage: 0 from 00:00 to 16:00
stage: 1 from 16:00 to 20:00
stage: 3 from 20:00 to 00:00

2023-05-02 00:00:00
stage: 3 from 00:00 to 16:00
stage: 4 from 16:00 to 20:00
stage: 6 from 20:00 to 00:00

2023-05-03 00:00:00
stage: 6 from 00:00 to 05:00
stage: 3 from 05:00 to 16:00
stage: 4 from 16:00 to 20:00
stage: 6 from 20:00 to 00:00

2023-05-04 00:00:00
stage: 6 from 00:00 to 05:00
stage: 3 from 05:00 to 16:00
stage: 4 from 16:00 to 20:00
stage: 6 from 20:00 to 00:00

2023-05-05 00:00:00
stage: 6 from 00:00 to 05:00
stage: 3 from 05:00 to 16:00
stage: 4 from 16:00 to 20:00
stage: 6 from 20:00 to 00:00

2023-05-06 00:00:00
stage: 0 from 00:00 to 16:00
stage: 5 from 16:00 to 00:00

2023-05-07 00:00:00
stage: 5 from 00:00 to 16:00
stage: 6 from 16:00 to 00:00

2023-05-08 00:00:00
stage: 6 from 00:00 to 05:00
stage: 5 from 05:00 to 16:00
stage: 4 from 16:00 to 20:00
stage: 6 from 20:00 to 00:00

2023-05-09 00:00:00
stage: 6 from 00:00 to 05:00
stage: 5 from 05:00 to 16:00


In [18]:
past_stages = {} 

for index, row in myStages.stage_df.iterrows():
    day_date = row['Date'].date()
    day_slots = row['Time_intervals'].outputDayDict()
    past_stages[day_date.strftime('%Y-%m-%d')] = day_slots

#day_date = myStages.stage_df.iloc[3]['Date'].date()
#day_slots = myStages.stage_df.iloc[3]['Time_intervals'].outputDayDict()
#past_stages[day_date.strftime('%d/%m/%Y')] = day_slots
#print(past_stages)


In [19]:
import json

with open('../cape_town_slots/cape_town_day_stages.json', 'w', encoding='utf-8') as f:
    json.dump(past_stages, f, ensure_ascii=False, indent=4)

In [20]:
with open('../cape_town_slots/cape_town_day_stages.json') as json_file:
    data_dict = json.load(json_file)

count = 0
for day,day_slots in data_dict.items():
    for slot_num, slot in day_slots.items():
        count += 1
        print(f"{day}: stage{slot['stage']} from {slot['start']} to {slot['end']}")
        
print(count)

2023-05-01: stage0 from 00:00 to 16:00
2023-05-01: stage1 from 16:00 to 20:00
2023-05-01: stage3 from 20:00 to 00:00
2023-05-02: stage3 from 00:00 to 16:00
2023-05-02: stage4 from 16:00 to 20:00
2023-05-02: stage6 from 20:00 to 00:00
2023-05-03: stage6 from 00:00 to 05:00
2023-05-03: stage3 from 05:00 to 16:00
2023-05-03: stage4 from 16:00 to 20:00
2023-05-03: stage6 from 20:00 to 00:00
2023-05-04: stage6 from 00:00 to 05:00
2023-05-04: stage3 from 05:00 to 16:00
2023-05-04: stage4 from 16:00 to 20:00
2023-05-04: stage6 from 20:00 to 00:00
2023-05-05: stage6 from 00:00 to 05:00
2023-05-05: stage3 from 05:00 to 16:00
2023-05-05: stage4 from 16:00 to 20:00
2023-05-05: stage6 from 20:00 to 00:00
2023-05-06: stage0 from 00:00 to 16:00
2023-05-06: stage5 from 16:00 to 00:00
2023-05-07: stage5 from 00:00 to 16:00
2023-05-07: stage6 from 16:00 to 00:00
2023-05-08: stage6 from 00:00 to 05:00
2023-05-08: stage5 from 05:00 to 16:00
2023-05-08: stage4 from 16:00 to 20:00
2023-05-08: stage6 from 2