# This is a basic sanity check for our data set

In [1]:
import json
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

import pickle
from itertools import combinations

import pprint
import json
import glob
from random import random, randrange
import logging
from collections import defaultdict
import sys

import subprocess
import shlex
from scipy.stats import pearsonr

from datetime import datetime, timedelta
from dateutil.parser import parse
import datetime
from collections import Counter
from scipy.signal import savgol_filter
import itertools
import time
import csv
import pandas as pd
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

from tqdm import trange, tqdm



In [2]:
# Reference timestamp: later cells print time.time() - start_time as "Elapsed time".
start_time = time.time()

In [3]:
# Using the regular dictionary
# NOTE(review): pickle.load can run arbitrary code while unpickling, so this
# must only ever be pointed at a file we produced ourselves.
# The `with` block guarantees the handle is closed even if unpickling fails.
with open('retweet_dict_and_counts','rb') as f:
    retweet_dict = pickle.load(f)
elapsed_time = time.time() - start_time
print("Elapsed time: " + str(elapsed_time))

Elapsed time: 17.064847946166992


## First Check: Time Frame

In [4]:
# Count retweets before and after the declaration of pandemic
cutoff = datetime.datetime.strptime('11/03/20',"%d/%m/%y")
before = 0
after = 0
# Start with no extremes instead of seeding them with the cutoff: seeding would
# report the cutoff itself as earliest/latest whenever every retweet falls on
# one side of it, hiding exactly the kind of problem this check is meant to catch.
latest_retweet = None
earliest_retweet = None
for tweet in tqdm(retweet_dict):
    # Each rt_list entry unpacks into two items; only the second one
    # (the timestamp string) is used here.
    for a,d in retweet_dict[tweet]['rt_list']:
        rt_date = datetime.datetime.strptime(d,'%y-%m-%d-%H:%M:%S')
        # A retweet stamped exactly at the cutoff counts as "after", so that
        # before + after always equals the number of retweets visited.
        if rt_date < cutoff:
            before += 1
        else:
            after += 1
        if latest_retweet is None or latest_retweet < rt_date:
            latest_retweet = rt_date
        if earliest_retweet is None or earliest_retweet > rt_date:
            earliest_retweet = rt_date

print("Retweets before:",before)
print("Retweets after:",after)
print("Total retweets:",before+after)
# after*100/before is the "after" count expressed as a percentage of the
# "before" count (a ratio, not an increase), and is undefined when before == 0.
if before:
    print('Retweets after as % of before:', after*100/before)
else:
    print('Retweets after as % of before: undefined (no retweets before the cutoff)')
print("The latest retweet on record is:", latest_retweet)
print("The earliest retweet on record is:", earliest_retweet)
elapsed_time = time.time() - start_time
print("Elapsed time: " + str(elapsed_time))

# NOTE(review): this line is printed unconditionally; the window is judged by
# eye from the earliest/latest dates printed above.
print("Window for retweets is correct.")

100%|██████████| 1271451/1271451 [02:29<00:00, 8508.07it/s] 

Retweets before: 3413021
Retweets after: 14501140
Total retweets: 17914161
Total increase: 424.87696383936697
The latest retweet on record is: 2020-05-26 23:11:59
The earliest retweet on record is: 2020-01-01 00:12:00
Elapsed time: 166.51526999473572
Window for retweets is correct.





In [5]:
# Count tweets before and after the declaration of pandemic
cutoff = datetime.datetime.strptime('11/03/20',"%d/%m/%y")
before = 0
after = 0
# Start with no extremes instead of seeding them with the cutoff: seeding would
# report the cutoff itself as earliest/latest whenever every tweet falls on
# one side of it, hiding exactly the kind of problem this check is meant to catch.
latest_tweet = None
earliest_tweet = None
for tweet in tqdm(retweet_dict):
    t_date = datetime.datetime.strptime(retweet_dict[tweet]['date'],'%y-%m-%d-%H:%M:%S')
    # A tweet stamped exactly at the cutoff counts as "after", so that
    # before + after always equals the number of tweets visited.
    if t_date < cutoff:
        before += 1
    else:
        after += 1
    if latest_tweet is None or latest_tweet < t_date:
        latest_tweet = t_date
    if earliest_tweet is None or earliest_tweet > t_date:
        earliest_tweet = t_date

print("Tweets before:",before)
print("Tweets after:",after)
print("Total tweets:",before+after)
# after*100/before is the "after" count expressed as a percentage of the
# "before" count (a ratio, not an increase), and is undefined when before == 0.
if before:
    print('Tweets after as % of before:', after*100/before)
else:
    print('Tweets after as % of before: undefined (no tweets before the cutoff)')
print("The latest tweet on record is:", latest_tweet)
print("The earliest tweet on record is:", earliest_tweet)
elapsed_time = time.time() - start_time
print("Elapsed time: " + str(elapsed_time))

# NOTE(review): this line is printed unconditionally; the window is judged by
# eye from the earliest/latest dates printed above.
print("Window for tweets is correct")

100%|██████████| 1271451/1271451 [00:11<00:00, 111849.81it/s]

Tweets before: 249140
Tweets after: 1022311
Total tweets: 1271451
Total increase: 410.3359556875652
The latest tweet on record is: 2020-05-22 00:12:00
The earliest tweet on record is: 2019-12-27 00:12:02
Elapsed time: 177.8896987438202
Window for tweets is correct





In [6]:
# Gather the distinct 'author' values found across all entries of retweet_dict.
rt_author_set = {entry['author'] for entry in retweet_dict.values()}

print('Total amount of authors is:',len(rt_author_set))

Total amount of authors is: 534403


## Second Check: Misspelling

mininglist1 = ['vax', 'vaxxed', 'vaccine', 'vaccination', 'vaccinations', 
'vaxsafety', 'vax saftey', 'vaccineswork', 'vaccines work', 'vaccinesaftey',
'vaccine saftey', 'vaccines revealed', 'vaccinesrevealed', 
'novax', 'no vax', 'no-vax', 'antivax', 'anti-vax', 'anti vax', 'immunisation', 
'Vaccin', 'Vaccinaties', 'vaccinatiezorg', 'vaccine injury', 'vax injury',
'vaccinatieschade']

mininglist2 = ['\#vax', '\#vaxxed', '\#vaccine', '\#vaccination', '\#vaccinations', '\#vaxsafety', '\#vaccineswork', '\#vaccinesaftey', '\#vaccinesrevealed', '\#novax', '\#antivax', '\#immunisation', '\#Vaccin', '\#Vaccinaties', '\#vaccinatiezorg', 
'\#vaccinatieschade', '\#nvkp', '\#rvp', '\#rijksvaccinatieprogramma', '\#vaccineinjury', 
'\#vaxinjury', '\#anti-vax']

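Several keywords in the mining lists above contain the misspelling 'saftey' (instead of 'safety'), so we need to look for tweets containing: 'saftey', 'vaccinesaftey', and '\#vaccinesaftey'.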

In [7]:
def light_detector(s):
    """Locate the misspelled keywords in a piece of text.

    The text is normalised first: it is stripped, lower-cased, newlines become
    spaces, and every character that is not alphanumeric, a space, '#' or '@'
    is dropped. Tokens are then discarded when they are English stop words,
    contain 'http', start with '@', or are exactly 'amp'.

    Returns the largest of the first-occurrence indices of 'saftey',
    'vaccinesaftey' and '#vaccinesaftey' in the normalised text, or -1 when
    none of them occurs.
    """
    # Normalise case and whitespace before filtering characters.
    text = s.strip().lower().replace('\n', " ")
    text = "".join(ch for ch in text if ch.isalnum() or ch in (" ", "#", "@"))

    # Drop tokens that should not take part in the search.
    kept_tokens = []
    for token in text.split():
        if token in ENGLISH_STOP_WORDS:
            continue
        if 'http' in token:
            continue
        # Hashtags are deliberately kept; only @-mentions are removed.
        if token[0] == '@':
            continue
        if token == 'amp':
            continue
        kept_tokens.append(token)
    text = " ".join(kept_tokens)

    # str.find yields -1 for a missing pattern, so the maximum is -1 only
    # when no pattern is present at all.
    patterns = ('saftey', 'vaccinesaftey', '#vaccinesaftey')
    return max(text.find(pattern) for pattern in patterns)

In [8]:
# Collect the keys of every tweet whose text triggers light_detector
# (i.e. the detector returns an index greater than -1).
misspelled_tweets = [
    tweet_id
    for tweet_id in tqdm(retweet_dict)
    if light_detector(retweet_dict[tweet_id]['text']) > -1
]

print("Count of misspelled tweets: " + str(len(misspelled_tweets)))

elapsed_time = time.time() - start_time
print("Elapsed time: " + str(elapsed_time))

100%|██████████| 1271451/1271451 [00:27<00:00, 45419.27it/s]

Count of misspelled tweets: 26
Elapsed time: 206.67106747627258





In [10]:
# Delete misspelled tweets
# pop() with a default (instead of del) keeps this cell safe to re-run after an
# interrupted run, when some of the keys have already been removed.
for t in misspelled_tweets:
    retweet_dict.pop(t, None)

# Check things are good
# Find misspelled tweets
misspelled_tweets = [
    t for t in tqdm(retweet_dict)
    if light_detector(retweet_dict[t]['text']) > -1
]

print("Count of misspelled tweets: " + str(len(misspelled_tweets)))

# The pruned dictionary must be clean before it is written to disk.
assert not misspelled_tweets, "misspelled tweets remain after pruning"

# Save the new dictionary
save = True
if save:
    # The `with` block closes the file even if pickling fails part-way.
    with open("pruned_retweet_dict","wb") as f:
        pickle.dump(retweet_dict,f)

elapsed_time = time.time() - start_time
print("Elapsed time: " + str(elapsed_time))

100%|██████████| 1271425/1271425 [00:30<00:00, 42314.24it/s]


Count of misspelled tweets: 0
Elapsed time: 294.566365480423


In [None]:
# Last time it took 232 seconds, so not bad at all.