In [23]:
import snscrape.modules.twitter as sntwitter
import pandas as pd

# Creating list to append tweet data to
tweets_list1 = []

# Using TwitterSearchScraper to scrape data and append tweets to list
for i,tweet in enumerate(sntwitter.TwitterSearchScraper('from:BloreVaccine' +  ' lang:en' + ' -filter:links -filter:replies').get_items()):
    tweets_list1.append([tweet.date, tweet.id, tweet.content, tweet.username])
    
# Creating a dataframe from the tweets list above 
tweets_df1 = pd.DataFrame(tweets_list1, columns=['Datetime', 'Tweet Id', 'Text', 'Username'])

In [25]:
tweets_df1.to_csv('vaccine_analysis.csv')

In [26]:
tweets_df1

Unnamed: 0,Datetime,Tweet Id,Text,Username
0,2021-05-28 15:27:01+00:00,1398299778547519495,560011 on 29-05-2021 COVISHIELD available for ...,BloreVaccine
1,2021-05-28 14:45:22+00:00,1398289301104824329,560066 on 29-05-2021 COVAXIN available for 18 ...,BloreVaccine
2,2021-05-28 14:42:05+00:00,1398288471152726018,560066 on 30-05-2021 COVAXIN available for 18 ...,BloreVaccine
3,2021-05-28 14:39:09+00:00,1398287735333396486,560066 on 29-05-2021 COVISHIELD available for ...,BloreVaccine
4,2021-05-28 14:13:18+00:00,1398281229389099008,560098 on 29-05-2021 COVISHIELD available for ...,BloreVaccine
...,...,...,...,...
646,2021-05-01 05:29:17+00:00,1388364883226288129,#Vaccine appointment available for ages 18 and...,BloreVaccine
647,2021-05-01 05:29:16+00:00,1388364881066266624,#Vaccine appointment available for ages 18 and...,BloreVaccine
648,2021-05-01 05:29:12+00:00,1388364863219453953,#Vaccine appointment available for ages 18 and...,BloreVaccine
649,2021-05-01 05:21:27+00:00,1388362914378117120,#Vaccine appointment available for ages 18 and...,BloreVaccine


In [27]:
tweets_df1.iloc[0,2]

'560011 on 29-05-2021 COVISHIELD available for 18 to 44 in #Karnataka, #BBMP at Cloudnine Fertility- Jayanagar(Capacity: 40, Dose 1: 40 slots left, Dose 2: 0 slots left)'

# Cleaning

In [28]:
import re
from datetime import datetime
from dateutil import tz
import pytz

In [29]:
def get_IST(date):
    
    gmt = pytz.timezone('GMT')
    eastern = tz.tzlocal()
    date = datetime.strptime(date, '%Y-%m-%d %H:%M:%S+00:00')

    dategmt = gmt.localize(date)
    date_ist = dategmt.astimezone(eastern)
    return str(date_ist)

In [97]:
def preprocess(df):
    
    # Filter only tweets containing vaccine info by using keywords like "available for 18 to 44" as the user only tweeets for that
    df = df[df.Text.str.contains("available for 18 to 44")]
    
    # Date
    df['Datetime']  =  df['Datetime'].apply(lambda x : get_IST(str(x)))
    df['Datetime'] = pd.to_datetime(df['Datetime'], format='%Y-%m-%d %H:%M:%S')
    
    # Vaccine (looks like this field started after sometime)
    df['Vaccine'] = df.Text.apply(lambda x : 'COVAXIN' if 'COVAXIN' in x else ('COVISHIELD' if 'COVISHIELD' in x else ''))
    df = df[df.Vaccine.notna()]
    
    # Zip Code
    df['zip_code'] = df.Text.apply(lambda x : re.match('\d{6}', x).group(0))
    
    # Hospital Name
    df['Hospital_Name'] = df.Text.apply(lambda x :  re.search('at (.+?)\(', x).group(0).replace('at ', '').replace('(', ''))
    df['Hospital_Name'] = df.Hospital_Name.apply(lambda x : x.split('on')[0][:-1])
    
     # Capacity(WIP)
    # df['Capacity'] = df.Text.apply(lambda x :   re.search('Capacity: (.+?),', x).group(0).replace(',', '').replace('Capacity: ',''))
    df['Capacity'] = df.apply(lambda x: re.findall(r'\d+', (x.Text.split(x['Hospital_Name'])[1]).split('(')[1]) if x['Hospital_Name'] != '' else x, axis = 1)
    df = df[df.Hospital_Name != '']
    df['Capacity'] = df.Capacity.apply(lambda x: x[0] if len(x) > 0 else -999)
    df = df[df.Capacity != -999]
    
    # Get time attributes
    df['Hour'] = df['Datetime'].dt.hour
    df['Day'] = df['Datetime'].dt.day
    df['dayofweek'] = df['Datetime'].dt.dayofweek
    
    df = df.reset_index(drop = True)

    return df

In [98]:
tweets_df2 = preprocess(tweets_df1)
tweets_df2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Datetime']  =  df['Datetime'].apply(lambda x : get_IST(str(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Datetime'] = pd.to_datetime(df['Datetime'], format='%Y-%m-%d %H:%M:%S')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Vaccine'] = df.Text.apply(lambda x : 'COVAXIN' if 'COVAXI

Unnamed: 0,Datetime,Tweet Id,Text,Username,Vaccine,zip_code,Hospital_Name,Capacity,Hour,Day,dayofweek
0,2021-05-28 20:57:01+05:30,1398299778547519495,560011 on 29-05-2021 COVISHIELD available for ...,BloreVaccine,COVISHIELD,560011,Cloudnine Fertility- Jayanaga,40,20,28,4
1,2021-05-28 20:15:22+05:30,1398289301104824329,560066 on 29-05-2021 COVAXIN available for 18 ...,BloreVaccine,COVAXIN,560066,RXDX WHITE FIELD S1 P,50,20,28,4
2,2021-05-28 20:12:05+05:30,1398288471152726018,560066 on 30-05-2021 COVAXIN available for 18 ...,BloreVaccine,COVAXIN,560066,RXDX WHITE FIELD S1 P,50,20,28,4
3,2021-05-28 20:09:09+05:30,1398287735333396486,560066 on 29-05-2021 COVISHIELD available for ...,BloreVaccine,COVISHIELD,560066,CLOUDNINE WHITEFIELD P,50,20,28,4
4,2021-05-28 19:43:18+05:30,1398281229389099008,560098 on 29-05-2021 COVISHIELD available for ...,BloreVaccine,COVISHIELD,560098,FORTIS HOSPITAL NAGARABHAV,50,19,28,4
...,...,...,...,...,...,...,...,...,...,...,...
405,2021-05-07 09:08:29+05:30,1390511325697118208,561203 #Vaccine available for 18 to 44 in #Kar...,BloreVaccine,,561203,Doddballapura UPHC,40,9,7,4
406,2021-05-07 09:08:28+05:30,1390511320932384768,562110 #Vaccine available for 18 to 44 in #Kar...,BloreVaccine,,562110,Godlumuddenahalli PHC,47,9,7,4
407,2021-05-07 09:08:27+05:30,1390511319388856328,562110 #Vaccine available for 18 to 44 in #Kar...,BloreVaccine,,562110,Godlumuddenahalli PHC,48,9,7,4
408,2021-05-07 09:08:26+05:30,1390511313722351619,561203 #Vaccine available for 18 to 44 in #Kar...,BloreVaccine,,561203,Doddaballapura TH,31,9,7,4


In [104]:
tweets_df2.to_csv('app_data.csv', index = False)