In [1]:
#handling data
import pandas as pd
import numpy as np
from scipy import stats
from operator import itemgetter


In [3]:
# Read json into a pandas dataframe
tweets_df = pd.read_json("plandemic_tweets.txt", lines=True)[:2000]

In [4]:
tweets_df.columns

Index(['created_at', 'id', 'id_str', 'text', 'source', 'truncated',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'retweeted_status', 'quoted_status_id',
       'quoted_status_id_str', 'quoted_status', 'quoted_status_permalink',
       'is_quote_status', 'quote_count', 'reply_count', 'retweet_count',
       'favorite_count', 'entities', 'favorited', 'retweeted', 'filter_level',
       'lang', 'timestamp_ms', 'extended_tweet', 'possibly_sensitive',
       'extended_entities', 'display_text_range', 'withheld_in_countries'],
      dtype='object')

In [5]:
all_hashtags = []
for e in tweets_df.entities:
    hashtags = [t['text'] for t in e['hashtags']]
    all_hashtags = all_hashtags + hashtags

from collections import Counter
c = Counter(all_hashtags)
most_common_tuples = c.most_common()
sorted_keys = sorted(c, key=c.get, reverse=True)
most_common_tuples

[('Plandemic', 478),
 ('GatesVirus', 262),
 ('virus', 261),
 ('infected', 259),
 ('COVID19', 63),
 ('plandemic', 52),
 ('ObamaGate', 50),
 ('PlandemicDocumentary', 41),
 ('ArrestBillGates', 38),
 ('CoronaHoax', 38),
 ('BillGatesIsEvil', 36),
 ('endthelockdown', 34),
 ('PlanDemic', 28),
 ('coronavirus', 27),
 ('PLANDEMIC', 26),
 ('GreatAwakening', 20),
 ('Plandemic2020', 18),
 ('QAnon', 18),
 ('obamagate', 17),
 ('WHO', 16),
 ('scamdemic', 15),
 ('BillGates', 15),
 ('WWG1WGA', 15),
 ('NWO', 14),
 ('Event201', 14),
 ('FakeNews', 13),
 ('Covid19', 13),
 ('Democrats', 12),
 ('Obamagate', 12),
 ('wwg1wga', 12),
 ('COVID1984', 12),
 ('Covid_19', 11),
 ('COVID', 11),
 ('NoVaccineForMe', 10),
 ('Scamdemic', 10),
 ('BillGatesVirus', 9),
 ('COVIDー19', 9),
 ('ByTheBook', 9),
 ('FactsMatter', 9),
 ('redpilled', 9),
 ('endlockdown', 8),
 ('Trump2020', 8),
 ('NewWorldOrderVirus', 7),
 ('pandemic', 7),
 ('lockdown', 7),
 ('ObaMAGAte', 7),
 ('CDC', 7),
 ('Lockdown', 7),
 ('ID2020', 7),
 ('Agenda2030',

In [6]:
hashtag_dict = {}
for t in most_common_tuples:
    hashtag_dict[t[0]] = t[1]

In [7]:
hashtag_wordclouds = []
for t in most_common_tuples:
    hashtag_wordclouds.append({'text': t[0], 'value': t[1]})

In [8]:
hashtag_dict

{'Plandemic': 478,
 'GatesVirus': 262,
 'virus': 261,
 'infected': 259,
 'COVID19': 63,
 'plandemic': 52,
 'ObamaGate': 50,
 'PlandemicDocumentary': 41,
 'ArrestBillGates': 38,
 'CoronaHoax': 38,
 'BillGatesIsEvil': 36,
 'endthelockdown': 34,
 'PlanDemic': 28,
 'coronavirus': 27,
 'PLANDEMIC': 26,
 'GreatAwakening': 20,
 'Plandemic2020': 18,
 'QAnon': 18,
 'obamagate': 17,
 'WHO': 16,
 'scamdemic': 15,
 'BillGates': 15,
 'WWG1WGA': 15,
 'NWO': 14,
 'Event201': 14,
 'FakeNews': 13,
 'Covid19': 13,
 'Democrats': 12,
 'Obamagate': 12,
 'wwg1wga': 12,
 'COVID1984': 12,
 'Covid_19': 11,
 'COVID': 11,
 'NoVaccineForMe': 10,
 'Scamdemic': 10,
 'BillGatesVirus': 9,
 'COVIDー19': 9,
 'ByTheBook': 9,
 'FactsMatter': 9,
 'redpilled': 9,
 'endlockdown': 8,
 'Trump2020': 8,
 'NewWorldOrderVirus': 7,
 'pandemic': 7,
 'lockdown': 7,
 'ObaMAGAte': 7,
 'CDC': 7,
 'Lockdown': 7,
 'ID2020': 7,
 'Agenda2030': 7,
 'EndTheShutdown': 7,
 'IDoNotConsent': 7,
 'PLANdemic': 6,
 'vaccines': 6,
 'ArrestFauci': 6,


In [9]:
import json

with open('hashtags.json', 'w') as outfile:
    json.dump(hashtag_dict, outfile)
    

with open('hashtags_wordcloud.json', 'w') as outfile:
    json.dump(hashtag_wordclouds, outfile)

In [10]:
!head hashtags.json

{"Plandemic": 478, "GatesVirus": 262, "virus": 261, "infected": 259, "COVID19": 63, "plandemic": 52, "ObamaGate": 50, "PlandemicDocumentary": 41, "ArrestBillGates": 38, "CoronaHoax": 38, "BillGatesIsEvil": 36, "endthelockdown": 34, "PlanDemic": 28, "coronavirus": 27, "PLANDEMIC": 26, "GreatAwakening": 20, "Plandemic2020": 18, "QAnon": 18, "obamagate": 17, "WHO": 16, "scamdemic": 15, "BillGates": 15, "WWG1WGA": 15, "NWO": 14, "Event201": 14, "FakeNews": 13, "Covid19": 13, "Democrats": 12, "Obamagate": 12, "wwg1wga": 12, "COVID1984": 12, "Covid_19": 11, "COVID": 11, "NoVaccineForMe": 10, "Scamdemic": 10, "BillGatesVirus": 9, "COVID\u30fc19": 9, "ByTheBook": 9, "FactsMatter": 9, "redpilled": 9, "endlockdown": 8, "Trump2020": 8, "NewWorldOrderVirus": 7, "pandemic": 7, "lockdown": 7, "ObaMAGAte": 7, "CDC": 7, "Lockdown": 7, "ID2020": 7, "Agenda2030": 7, "EndTheShutdown": 7, "IDoNotConsent": 7, "PLANdemic": 6, "vaccines": 6, "ArrestFauci": 6, "FakePandemic": 6, "coronahoax": 6, "propaganda":

In [11]:
for t in most_common_tuples:
    tweets_df['#'+t[0]] = False

In [12]:
# tweets_df.at[0, '#AppleMusic'] = False

In [13]:
for i, e in enumerate(tweets_df.entities):
    hashtags = [t['text'] for t in e['hashtags']]
    for hashtag in hashtags:
        tweets_df.at[i, '#' + hashtag] = True
    

In [14]:
for t in tweets_df[tweets_df['#insurance'] == True]['text'].values:
    print(t)

KeyError: '#insurance'

# Getting media

In [23]:
media_list = list(filter(lambda x: isinstance(x, dict), tweets_df['extended_entities'].values.tolist()))
for media in media_list:
    for m in media['media']:
        print(m['type'], m['media_url'])

photo http://pbs.twimg.com/media/EYAfLwdX0AYvcOG.jpg
photo http://pbs.twimg.com/media/EYAfXCsXgAg_SZr.jpg
photo http://pbs.twimg.com/media/EYAmZrxXsAUV-2e.jpg
photo http://pbs.twimg.com/media/EYAma8wWsAAQQP1.jpg
photo http://pbs.twimg.com/media/EYAohJsWkAAuR3c.jpg
photo http://pbs.twimg.com/media/EYAo0FIX0AMQkNT.jpg
photo http://pbs.twimg.com/media/EX-wCr4WkAgl-8D.jpg
photo http://pbs.twimg.com/media/EYAtL5iXQAA1TOQ.jpg
photo http://pbs.twimg.com/media/EYAuVjQWsAIKmjd.jpg
video http://pbs.twimg.com/ext_tw_video_thumb/1251037581908860931/pu/img/OsaUJKWsK6Glo1Qx.jpg
photo http://pbs.twimg.com/media/EYAwsLMWAAEQwJ8.jpg
video http://pbs.twimg.com/ext_tw_video_thumb/1260946764548489220/pu/img/cXocYxCzWBzes3vX.jpg
photo http://pbs.twimg.com/media/EX-60FDVcA0aSGz.jpg
photo http://pbs.twimg.com/media/EYAypKLWsAE-kxE.jpg
photo http://pbs.twimg.com/media/EYA3zsxU0AAOBGm.jpg
photo http://pbs.twimg.com/media/EYA5NWjVcAcjVPP.jpg
photo http://pbs.twimg.com/media/EYA-PXiXYAE8pX8.jpg
photo http://pbs.

# Full text

In [15]:
import math
def apply_func(x):
    if not isinstance(x, float):
        if 'full_text' in x: 
            return x['full_text']
        else:
            return float('nan')
    else:
        return float('nan')
    

tweets_df['full_text'] = tweets_df['extended_tweet'].apply(lambda x: apply_func(x))
tweets_df.full_text.fillna(tweets_df.text, inplace=True)

In [26]:
!pip install favicon

Collecting favicon
  Downloading https://files.pythonhosted.org/packages/93/4c/8baf94bb789972634d933152d27529f2bad4e5d2397b8da9c30f6f5342ce/favicon-0.7.0-py2.py3-none-any.whl
Installing collected packages: favicon
Successfully installed favicon-0.7.0


# Links

In [30]:
import re 
from collections import Counter
import requests

import extraction
import requests

import favicon



url_list = []
for text in tweets_df['text'].values.tolist():
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    url_list += urls

    

url_list = list(filter(lambda url: len(url)>13, url_list))

c = Counter(url_list)
most_common_urls = c.most_common()
sorted_urls = sorted(c, key=c.get, reverse=True)

news_articles = []
twitter_domain_url = 'https://twitter.com'

for i, url in enumerate(sorted_urls):
    print('count value:', i)
    if i == 20:
        break
    resp = requests.head(url)
    try:
        news_article_dict = {}
        
        url = resp.headers["Location"]
        html = requests.get(url).text
        extracted = extraction.Extractor().extract(html, source_url=url)
        icon_url = favicon.get(url)[0][0]
        
        
        
        if url[:19] != twitter_domain_url:
            print('title:', extracted.title)
            print('description:', extracted.description)
            print(extracted.image, url, icon_url)



            news_article_dict['title'] = extracted.title
            news_article_dict['description'] = extracted.description
            news_article_dict['favicon'] = icon_url
            news_article_dict['image'] = extracted.image
            news_article_dict['url'] = url
            news_article_dict['share_count'] = most_common_urls[i][1]

            news_articles.append(news_article_dict)
        
        
        
        
        
        print('\n')
    except:
        print(url)
        print('\n')



count value: 0
title: Infowars
description: YouTube CENSORED: DOCTORS IN BLACK / #PlanDemic
https://static-3.bitchute.com/live/cover_images/9c7qJvwx7YQT/bJ1KTxyB0Beq02gqEpl62fAm_640x360.jpg https://www.bitchute.com/video/Cxk4baYujWJs/ https://static-1.bitchute.com/live/cover_images/9c7qJvwx7YQT/bJ1KTxyB0Beq02gqEpl62fAm_640x360.jpg


count value: 1
title: The Global Health Mafia Protection Racket
description: BOOM revelations get bigger as we go along. Stay with me! Fauci, Event201, Foundations ...and something called the Global Preparedness Monitoring Board. I expose the front organizations covering for the Global Health Maf
https://d2servp9jyqzxd.cloudfront.net/thumb/1Z5VYqJqrtI_7slMF3R6evNLNIP.jpg https://videos.utahgunexchange.com/watch/the-global-health-mafia-protection-racket_7slMF3R6evNLNIP.html https://videos.utahgunexchange.com/themes/default/img/icons/android-chrome-192x192.png?v=00QnQ0dwEm


count value: 2


count value: 3


count value: 4


count value: 5


count value: 6
ht

In [31]:
most_common_urls

[('https://t.co/2yfjdDhcEh', 22),
 ('https://t.co/iyvQcjlSzk', 10),
 ('https://t.co/omUDE517ad', 8),
 ('https://t.co/LixNpGV9hd', 8),
 ('https://t.co/3IXtJsES3x', 8),
 ('https://t.co/v5EwvaVNCv', 8),
 ('https://t.co/I6NJexr7c', 6),
 ('https://t.co/29xrLXF9wL', 6),
 ('https://t.co/eAh8ZCwdeD', 5),
 ('https://t.co/40XhhuSZqs', 5),
 ('https://t.co/v6Nh9lYeHZ', 4),
 ('https://t.co/q8Q7gMECQN', 4),
 ('https://t.co/0AjiMOapCR', 4),
 ('https://t.co/lfbtGhnrAV', 4),
 ('https://t.co/6L9pQgQd4r', 4),
 ('https://t.co/eOeKDCkSee', 3),
 ('https://t.co/E2iuzc9EX4', 3),
 ('https://t.co/FePJmu04MU', 3),
 ('https://t.co/BsAuEZNC2n', 3),
 ('https://t.co/BKTrNe69lp', 3),
 ('https://t.co/lfEmLgH7fk', 3),
 ('https://t.co/5oREyPEPyd', 3),
 ('https://t.co/zoFrNX7Wla', 3),
 ('https://t.co/mGCPGd8', 3),
 ('https://t.co/eA3sqh2PTj', 3),
 ('https://t.co/6WZX87aHGN', 2),
 ('https://t.co/UBEK2hDZmv', 2),
 ('https://t.co/6q5qbBbPRz', 2),
 ('https://t.co/s05gHPHaQT', 2),
 ('https://t.co/h5GD4Ah3fE', 2),
 ('https://t

In [32]:
import json

with open('articles.json', 'w') as outfile:
    json.dump(news_articles, outfile)

In [25]:
main_url[:19] != twitter_domain_url

False

In [26]:
twitter_domain_url

'https://twitter.com'

Apache Kafka Installation on Mac using Homebrew
Hi all,
https://miro.medium.com/max/612/1*tsXkwb5Q57qUzUhC07ESFQ.png


In [22]:
main_url = 'https://twitter.com/something'
twitter_domain_url = 'https://twitter.com'

twitter_domain_url == main_url[:19]

True

In [21]:
main_url[:19]

'https://twitter.com'