In [1]:
import pandas as pd
import numpy as np

# Transform Raw Vaccine Tweets

In [2]:
raw_vaccine_tweets = pd.read_json("../data/raw/vaccine_tweets_hydrated.jsonl", lines=True, encoding="iso-8859-1")

- annotated dataset is needed later on for model training (tweet sentiment)
- clean vaccine dataset, keep:
    - tweet id
    - date
    - text
    - geolocation if possible, else set null
    - (username for network analysis)
    - is retweet: boolean (to filter out afterwards)
 

In [3]:
raw_vaccine_tweets.head()

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,...,retweeted,possibly_sensitive,lang,extended_entities,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status,withheld_in_countries,withheld_scope
0,2020-12-13 16:27:13+00:00,1338158543359250400,1338158543359250432,While the world has been on the wrong side of ...,False,"[0, 275]","{'hashtags': [{'text': 'covid19', 'indices': [...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,...,False,0.0,en,,,,,,,
1,2020-12-12 19:22:45+00:00,1337840331522453500,1337840331522453504,@cnnbrk #COVID19 #CovidVaccine #vaccine #Coron...,False,"[8, 173]","{'hashtags': [{'text': 'COVID19', 'indices': [...","<a href=""https://mobile.twitter.com"" rel=""nofo...",1.337811e+18,1.337811e+18,...,False,,en,,,,,,,
2,2020-12-14 18:00:29+00:00,1338544403795882000,1338544403795881984,The FDA Authorizes Emergency Use Of The Pfizer...,False,"[0, 263]","{'hashtags': [{'text': 'PFE', 'indices': [79, ...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,...,False,0.0,en,"{'media': [{'id': 1338544352956719000, 'id_str...",,,,,,
3,2020-12-12 12:26:34+00:00,1337735595704115200,1337735595704115200,The #FDA finally issues #EUA now comes the pro...,False,"[0, 224]","{'hashtags': [{'text': 'FDA', 'indices': [4, 8...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,...,False,,en,,,,,,,
4,2020-12-12 20:04:29+00:00,1337850832256176000,1337850832256176128,There have not been many bright days in 2020 b...,False,"[0, 276]","{'hashtags': [{'text': 'BidenHarris', 'indices...","<a href=""http://twitter.com/download/iphone"" r...",,,...,False,,en,,,,,,,


# Data preparation

- Remove duplicate tweets
    - Drop if retweeted == true
    - Either remove duplicate text
- Relevant columns = id, created_at, username, full_text, retweet, hashtags

## Remove duplicate tweets

In [4]:
raw_vaccine_tweets[raw_vaccine_tweets.retweeted == True]

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,...,retweeted,possibly_sensitive,lang,extended_entities,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status,withheld_in_countries,withheld_scope


In [5]:
raw_vaccine_tweets.id.count()

101947

In [6]:
raw_vaccine_tweets = raw_vaccine_tweets.drop_duplicates(subset=["full_text"], keep='first').reset_index(drop=True)

In [7]:
raw_vaccine_tweets.id.count()

101023

- 924 Tweets were dropped

## Select relevant columns

In [8]:
raw_vaccine_tweets.columns

Index(['created_at', 'id', 'id_str', 'full_text', 'truncated',
       'display_text_range', 'entities', 'source', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo',
       'coordinates', 'place', 'contributors', 'is_quote_status',
       'retweet_count', 'favorite_count', 'favorited', 'retweeted',
       'possibly_sensitive', 'lang', 'extended_entities', 'quoted_status_id',
       'quoted_status_id_str', 'quoted_status_permalink', 'quoted_status',
       'withheld_in_countries', 'withheld_scope'],
      dtype='object')

In [9]:
raw_vaccine_tweets = raw_vaccine_tweets[["id_str","created_at","user","geo","full_text", "entities"]]

## Selecting user id

In [10]:
raw_vaccine_tweets["user_id"] = int

for i in range(len(raw_vaccine_tweets)):
    raw_vaccine_tweets.user_id[i] = raw_vaccine_tweets.user[i]["id"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets.user_id[i] = raw_vaccine_tweets.user[i]["id"]


In [11]:
raw_vaccine_tweets.user

0         {'id': 76052772, 'id_str': '76052772', 'name':...
1         {'id': 1300382181605494800, 'id_str': '1300382...
2         {'id': 1164717209253552000, 'id_str': '1164717...
3         {'id': 1316036067754205200, 'id_str': '1316036...
4         {'id': 1110032180237852700, 'id_str': '1110032...
                                ...                        
101018    {'id': 1504133713, 'id_str': '1504133713', 'na...
101019    {'id': 3029283761, 'id_str': '3029283761', 'na...
101020    {'id': 1072356934148251600, 'id_str': '1072356...
101021    {'id': 1204276725372215300, 'id_str': '1204276...
101022    {'id': 1050804894133874700, 'id_str': '1050804...
Name: user, Length: 101023, dtype: object

## Hashtags

renaming entities column to hashtags:

In [12]:
raw_vaccine_tweets = raw_vaccine_tweets.rename(columns={'entities': 'hashtags', 'id_str':'id'})

filtering out hashtags only which are stored in entities>hashtags>text:

In [13]:
for i in range(len(raw_vaccine_tweets)):
    try:
        raw_vaccine_tweets.hashtags[i] = [value["text"] for value in raw_vaccine_tweets.iloc[i]["hashtags"]["hashtags"]]
    except:
        print("failed: ",i)
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets.hashtags[i] = [value["text"] for value in raw_vaccine_tweets.iloc[i]["hashtags"]["hashtags"]]


In [14]:
hashtag_dict = {}
for i in range(len(raw_vaccine_tweets)):
    for hashtag in raw_vaccine_tweets["hashtags"][i]:
        if hashtag not in hashtag_dict:
            hashtag_dict[hashtag] = 1
        else:
            hashtag_dict[hashtag] += 1

Identifying relevant hashtags for each vaccine manufacturer:

In [15]:
sorted(hashtag_dict.items(), key=lambda x: x[1], reverse=True)

[('Moderna', 25268),
 ('Covaxin', 21670),
 ('COVID19', 17271),
 ('SputnikV', 16547),
 ('vaccine', 11479),
 ('Pfizer', 8513),
 ('moderna', 6784),
 ('PfizerBioNTech', 6480),
 ('COVAXIN', 6325),
 ('Sinovac', 6244),
 ('CovidVaccine', 6070),
 ('Sinopharm', 5578),
 ('Covishield', 4971),
 ('AstraZeneca', 4573),
 ('coronavirus', 3582),
 ('vaccinated', 3413),
 ('COVID19Vaccine', 3331),
 ('covaxin', 2963),
 ('vaccination', 2822),
 ('vaccines', 2488),
 ('OxfordAstraZeneca', 2488),
 ('Vaccine', 2442),
 ('lockdown', 2277),
 ('China', 2102),
 ('Covid19', 2018),
 ('India', 2001),
 ('COVID', 1937),
 ('Russia', 1900),
 ('BharatBiotech', 1896),
 ('BBMP', 1853),
 ('covid19', 1686),
 ('GetVaccinated', 1602),
 ('PfizerVaccine', 1580),
 ('oxfordastrazeneca', 1527),
 ('pfizer', 1418),
 ('PfizerBiontech', 1399),
 ('COVIDVaccination', 1368),
 ('COVID19Vaccination', 1360),
 ('CoronaVaccine', 1355),
 ('covid', 1340),
 ('Covid', 1059),
 ('Covid_19', 1057),
 ('Coronavirus', 1045),
 ('CovishieldVaccine', 1003),
 ('

In [16]:
pfizer_biontech_vax = ["Pfizer", "PFIZER", "PfizerBioNTech", "PfizerVaccine", "pfizer", "PfizerBiontech", "BioNTech", "pfizerbiontech", "PfizerBioNtech", "Biontech", "biontech", "PFIZERBIONTECH", "BioNTechpfizer"]
sputnik_vax = ["SputnikV", "Sputnik", "Sputnikv", "sputnikv", "SputnikUpdates", "Sputnikvaccine", "sputnikV", "sputnik", "SPUTNIKV", "SputnikVaccinated", "SputnikLight", "SputnikVaccineInKenya", "SputnikVaccine"]
sinopharm_vax = ["Sinopharm", "sinopharm", "SinoPharm", "BoycottSinopharm", "SINOPHARM"]
sinovac_vax = ["Sinovac", "sinovac", "SinoVac", "SINOVAC", "BoycottSinovac"]
moderna_vax = ["Moderna", "moderna", "modernavaccine", "MODERNA", "ModernaVaccine", "modernagang", "teammoderna", "modeRNA", "ModernaGang"]
oxford_az_vax = ["OxfordAstraZeneca", "oxfordastrazeneca", "Oxford", "oxfordvaccine", "OxfordVaccine", "OxfordAstrazeneca", "AstraZeneca", "astrazeneca", "AstraZenaca", "astrazenecavaccine", "Astrazeneca", "AstraZeneka", "Astrazenaca", "ASTRAZENECA", "AstraZenecaVaccine"]
covaxin_vax = ["Covaxin", "COVAXIN", "covaxin", "GurgaonCOVAXIN", "MumbaiCOVAXIN", "covaxine", "BBMPCOVAXIN", "PuneCOVAXIN", "CoVaxin", "ThaneCOVAXIN", "covaxinated", "covaxinvaccine", "BharatBiotech", "AatmanirbharBharat", "AtmaNirbharBharat", "bharatbiotech", "bharatBiotech", "AtmanirbharBharat", "Bharat", "congressmuktbharat", "atmanirbharbharat"]
jandj_vax = ["johnsonandjohnson", "JohnsonandJohnson", "JohnsonAndJohnson", "JohnsonAndJohnsonVaccine", "Johnson", "JandJ", "JohnsonJohnson", "johnson", "JJ"]

normalizing hashtags to all lowercase:

In [17]:
for i in range(0, len(raw_vaccine_tweets)):
    for j in range(len(raw_vaccine_tweets["hashtags"][i])):
        review = raw_vaccine_tweets["hashtags"][i][j]
        review = review.lower()
        raw_vaccine_tweets["hashtags"][i][j] = review

adding columns for each vaccine manufacturer, based on the hashtags of a tweet:

In [18]:
raw_vaccine_tweets["PfizerBiontech"] = 0
raw_vaccine_tweets["SputnikV"] = 0
raw_vaccine_tweets["Sinopharm"] = 0
raw_vaccine_tweets["Sinovac"] = 0
raw_vaccine_tweets["Moderna"] = 0
raw_vaccine_tweets["AstraZeneca"] = 0
raw_vaccine_tweets["Covaxin"] = 0
raw_vaccine_tweets["JandJ"] = 0

In [19]:
for i in range(len(raw_vaccine_tweets)):
    for hashtag in raw_vaccine_tweets["hashtags"][i]:
        if hashtag in pfizer_biontech_vax:
            raw_vaccine_tweets["PfizerBiontech"][i] = 1
        if hashtag in sputnik_vax:
            raw_vaccine_tweets["SputnikV"][i] = 1
        if hashtag in sinopharm_vax:
            raw_vaccine_tweets["Sinopharm"][i] = 1
        if hashtag in sinovac_vax:
            raw_vaccine_tweets["Sinovac"][i] = 1
        if hashtag in moderna_vax:
            raw_vaccine_tweets["Moderna"][i] = 1
        if hashtag in oxford_az_vax:
            raw_vaccine_tweets["AstraZeneca"][i] = 1
        if hashtag in covaxin_vax:
            raw_vaccine_tweets["Covaxin"][i] = 1
        if hashtag in jandj_vax:
            raw_vaccine_tweets["JandJ"][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets["PfizerBiontech"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets["Moderna"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets["AstraZeneca"][i] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets["Sinopharm"][i] = 1
A value

---

In [20]:
from geopy import Nominatim
import reverse_geocode

## Geo location

- Filtering out all tweets that had a geolocation when tweeted
- storing geo location as coordinates and as countryname

- [x] country from user location
- [x] transform the country to coordinates (whole df) (store in "coordinates")
- [x] overwrite coordinates that are in geo originally, and in same loop add country name to "country"
- [ ] that way we have all coordinates and all country names

adding geolocation from user location:

- get geolocation from user info, for that:
    - check if one of the countries in countries list is in the user_location, if so, assign that country to him
    - in second  step, use that country in geopy to get lat&long
- apply geopy to turn geolocation to coordinates
- if geo is empty then fill it with other value otherwise leave it as is

Fetching location specified in user profile:

In [21]:
raw_vaccine_tweets["user_location"] = None
for i in range(len(raw_vaccine_tweets["user"])):
    raw_vaccine_tweets["user_location"][i] = raw_vaccine_tweets["user"][i]["location"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets["user_location"][i] = raw_vaccine_tweets["user"][i]["location"]


In [22]:
countries = ['argentina', 'australia', 'austria', 'belgium', 'brazil', 'canada', 'france', 'germany', 'india', 'israel', 'italy', 'japan', 'mexico', 'pakistan', 'russia', 'spain', 'uae', 'uk', 'usa']


set the country in user profile to the normalized country name:

In [23]:
#set country for locations that contain that country
for country in countries :
    raw_vaccine_tweets["user_location"][raw_vaccine_tweets["user_location"].str.lower().str.contains(country)] = country

#remove any location that isn't in country-list
for i in range(len(raw_vaccine_tweets)):
    if raw_vaccine_tweets["user_location"][i] not in countries:
        raw_vaccine_tweets["user_location"][i] = None

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets["user_location"][raw_vaccine_tweets["user_location"].str.lower().str.contains(country)] = country
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets["user_location"][i] = None


create new dataframe with locations only:

get coordinates for every country:

In [24]:
geolocator = Nominatim(user_agent="CovaxAnalytica")
location_coordinates = {}

for country in countries:
    location = geolocator.geocode(country)
    try:
        location_coordinates[country] = [location.latitude, location.longitude]
    except:
        location_coordinates[country] = None

In [25]:
location_coordinates

{'argentina': [-34.9964963, -64.9672817],
 'australia': [-24.7761086, 134.755],
 'austria': [47.2, 13.2],
 'belgium': [50.6402809, 4.6667145],
 'brazil': [-10.3333333, -53.2],
 'canada': [61.0666922, -107.991707],
 'france': [46.603354, 1.8883335],
 'germany': [51.0834196, 10.4234469],
 'india': [22.3511148, 78.6677428],
 'israel': [31.5313113, 34.8667654],
 'italy': [42.6384261, 12.674297],
 'japan': [36.5748441, 139.2394179],
 'mexico': [22.5000485, -100.0000375],
 'pakistan': [30.3308401, 71.247499],
 'russia': [64.6863136, 97.7453061],
 'spain': [39.3260685, -4.8379791],
 'uae': [49.4871968, 31.2718321],
 'uk': [54.7023545, -3.2765753],
 'usa': [39.7837304, -100.4458825]}

map country name to coordinates:

In [26]:
raw_vaccine_tweets["coordinates"] = None
for i in range(len(raw_vaccine_tweets)):
     if raw_vaccine_tweets["user_location"][i] in countries:
        raw_vaccine_tweets["coordinates"][i] = location_coordinates[raw_vaccine_tweets["user_location"][i].lower()]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets["coordinates"][i] = location_coordinates[raw_vaccine_tweets["user_location"][i].lower()]


In [27]:
raw_vaccine_tweets

Unnamed: 0,id,created_at,user,geo,full_text,hashtags,user_id,PfizerBiontech,SputnikV,Sinopharm,Sinovac,Moderna,AstraZeneca,Covaxin,JandJ,user_location,coordinates
0,1338158543359250432,2020-12-13 16:27:13+00:00,"{'id': 76052772, 'id_str': '76052772', 'name':...",,While the world has been on the wrong side of ...,"[covid19, supplychain, logistics, vaccine, uni...",76052772,1,0,0,0,0,0,0,0,,
1,1337840331522453504,2020-12-12 19:22:45+00:00,"{'id': 1300382181605494800, 'id_str': '1300382...",,@cnnbrk #COVID19 #CovidVaccine #vaccine #Coron...,"[covid19, covidvaccine, vaccine, corona, pfize...",1300382181605494800,1,0,0,0,0,0,0,0,,
2,1338544403795881984,2020-12-14 18:00:29+00:00,"{'id': 1164717209253552000, 'id_str': '1164717...",,The FDA Authorizes Emergency Use Of The Pfizer...,"[pfe, pfizer, pfizervaccine, pfizerbiontech, f...",1164717209253552000,1,0,0,0,0,0,0,0,,
3,1337735595704115200,2020-12-12 12:26:34+00:00,"{'id': 1316036067754205200, 'id_str': '1316036...",,The #FDA finally issues #EUA now comes the pro...,"[fda, eua, pfizerbiontech, vaccinated]",1316036067754205200,1,0,0,0,0,0,0,0,,
4,1337850832256176128,2020-12-12 20:04:29+00:00,"{'id': 1110032180237852700, 'id_str': '1110032...",,There have not been many bright days in 2020 b...,"[bidenharris, election2020, pfizerbiontech, co...",1110032180237852700,1,0,0,0,0,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101018,1403362726521438208,2021-06-11 14:45:21+00:00,"{'id': 1504133713, 'id_str': '1504133713', 'na...",,Those making issue of #covaxine should know th...,"[covaxine, sputnikv]",1504133713,0,1,0,0,0,0,1,0,india,"[22.3511148, 78.6677428]"
101019,1403638668418420736,2021-06-12 09:01:51+00:00,"{'id': 3029283761, 'id_str': '3029283761', 'na...",,How much more time will #Sputnik take to be av...,"[sputnik, sputnikv]",3029283761,0,1,0,0,0,0,0,0,,
101020,1403728228548628480,2021-06-12 14:57:44+00:00,"{'id': 1072356934148251600, 'id_str': '1072356...",,Intl renown broadcaster @mattfrei discussed Va...,"[sinopharm, sinovac, sputnikv, sad]",1072356934148251600,0,1,1,1,0,0,0,0,,
101021,1403606811299631104,2021-06-12 06:55:16+00:00,"{'id': 1204276725372215300, 'id_str': '1204276...",,"#Getvaccinated, Be informed!\n\n#breakthechain...","[getvaccinated, breakthechain, getyourfactsrig...",1204276725372215300,0,1,0,0,0,0,1,0,,


add coordinates from geo:

In [28]:
for i in range(len(raw_vaccine_tweets)):
    if raw_vaccine_tweets.geo[i] != None:
        raw_vaccine_tweets.coordinates[i] = raw_vaccine_tweets.geo[i]["coordinates"]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets.coordinates[i] = raw_vaccine_tweets.geo[i]["coordinates"]


renaming all countries in standardized way

In [29]:
for i in range(len(raw_vaccine_tweets)):
    if raw_vaccine_tweets.coordinates[i] != None:
        raw_vaccine_tweets.user_location[i] = reverse_geocode.search(tuple([raw_vaccine_tweets.coordinates[i],(1,1)]))[0]["country"]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  raw_vaccine_tweets.user_location[i] = reverse_geocode.search(tuple([raw_vaccine_tweets.coordinates[i],(1,1)]))[0]["country"]


In [30]:
raw_vaccine_tweets["user_location"].unique()

array([None, 'Canada', 'Palestinian Territory', 'India', 'Germany',
       'United States', 'Italy', 'United Kingdom', 'France',
       'Russian Federation', 'Mexico', 'Belgium', 'Spain', 'Australia',
       'Pakistan', 'Ukraine', 'Argentina', 'Austria',
       'Virgin Islands, U.S.', 'Malaysia', 'Japan', 'Brazil',
       'United Arab Emirates', 'Jersey', 'Philippines', 'Chile',
       'Indonesia', 'Hong Kong', 'Qatar', 'Netherlands', 'China',
       'Saudi Arabia', 'Guyana', 'Thailand', 'Singapore', 'Croatia',
       'Switzerland', 'Trinidad and Tobago', 'Sweden', 'Greece'],
      dtype=object)

# Exporting dataset

In [31]:
raw_vaccine_tweets.to_csv("../data/interim/cleaned_vaccine_tweets.csv")