# Covid-19 Twitter Data Descriptive Analysis - Data Processing & Preparation




# Part 1 - PreProcessing the Data

In [0]:
pip uninstall contractions

Uninstalling contractions-0.0.24:
  Would remove:
    /usr/local/lib/python3.6/dist-packages/contractions-0.0.24.dist-info/*
    /usr/local/lib/python3.6/dist-packages/contractions/*
Proceed (y/n)? y
  Successfully uninstalled contractions-0.0.24


In [0]:
pip install gensim spacy emoji langdetect polyglot pycld2 PyICU contractions demoji

Collecting contractions
  Using cached https://files.pythonhosted.org/packages/85/41/c3dfd5feb91a8d587ed1a59f553f07c05f95ad4e5d00ab78702fbf8fe48a/contractions-0.0.24-py2.py3-none-any.whl
Installing collected packages: contractions
Successfully installed contractions-0.0.24


In [0]:
import pandas as pd
import numpy as np
import os
import nltk
import spacy
import demoji
demoji.download_codes()
nltk.download('stopwords')
spacy.cli.download("en_core_web_sm")

Downloading emoji data ...
... OK (Got response in 1.35 seconds)
Writing emoji data to /root/.demoji/codes.json ...
... OK
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
✔ Download and installation successful
You can now load the model via spacy.load('en_core_web_sm')


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
os.chdir('/content/drive/My Drive/LING4813 - COVID19 Project/')


## Inital Exploratory Data Analysis

## (Utility Functions) Data Cleaning - removing stopwords an unnecessary characters, transitioning to all lowercase


In [0]:
from nltk.corpus import stopwords
from polyglot.detect import Detector
from langdetect import detect
from nltk.tokenize import TweetTokenizer
import contractions
import emoji
tt = TweetTokenizer()

from nltk.tokenize import WordPunctTokenizer
import re
from bs4 import BeautifulSoup

tok = WordPunctTokenizer()

cols_to_remove = [
                   'id','truncated', 'display_text_range', 'entities', 
                  'extended_entities', 'source', 'in_reply_to_status_id', 
                  'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 
                  'in_reply_to_screen_name', 'geo', 'coordinates', 'place', 'contributors', 
                  'retweeted_status', 'is_quote_status', 'retweet_count', 'favorite_count', 'favorited', 
                  'retweeted', 'possibly_sensitive', 'lang', 'quoted_status_id', 'quoted_status_id_str', 
                  'quoted_status_permalink', 'quoted_status', 'withheld_in_countries','withheld_scope', 'withheld_copyright'
                  ]
contractions.add("ain't", 'is not')
contractions.add("aint", 'is not')
contractions.add("aight", 'is not')
contractions.add("lesgo", "let us go")
contractions.add(' ofc ', ' of course ')
contractions.add(' bc ', ' because ')
contractions.add(' nah ', ' no ')
contractions.add(' pls ', ' please ')
contractions.add(' wtf ', ' what the fuck ')
contractions.add(' lmao ', ' laughing my ass off ')
contractions.add(' lmfao ', ' laughing my fucking ass off ')
contractions.add(' rofl ', ' rolling on the floor laughing ')
contractions.add(' yolo ', ' you only live once ')
contractions.add(' stfu ', ' shut the fuck up ')
contractions.add(' lmk ', ' let me know ')
contractions.add(' lemme ', ' let me ')
contractions.add(' smh ', ' shake my head ')
contractions.add(' smfh ', ' shake my fucking head ')
contractions.add(' smdh ', ' shake my damn head ')
contractions.add(' ikr ', ' i know right ')
contractions.add(' nvm ', ' never mind ')
contractions.add(' thx ', ' thanks ')
contractions.add(' thnx ', ' thanks ')
contractions.add(' pto ', ' paid time off ')
contractions.add(' wfh ', ' work from home ')
contractions.add(' rly ', ' really ')
contractions.add(' prolly ', ' probably ')
contractions.add(' ootd ', ' outfit of the day ')
contractions.add(' nsfw ', ' not safe for work ')
contractions.add(' lol ', ' laugh out loud ')
contractions.add(' omg ', ' oh my god ')
contractions.add(' btw ', ' by the way ')
contractions.add(' irl ', ' in real life ')
contractions.add(' ily ', ' i love you ')
contractions.add(' idgaf ', ' i do not give a fuck ')
contractions.add(' dgaf ', ' do not give a fuck ')
contractions.add(' tfw ', ' that feeling when ')
contractions.add(' tbh ', ' to be honest ')
contractions.add(' imho ', ' in my humble opinion ')
contractions.add(' imo ', ' in my opinion ')
contractions.add(' srsly ', ' seriously ')
contractions.add(' forreal ', ' for real ')
contractions.add(' gtfo ', ' get the fuck out ')
contractions.add(' w/ ', ' with ')
contractions.add(' w ', ' with ')
contractions.add(' ppl ', ' people ')

pat1 = r"@[A-Za-z0-9]+"
pat2 = r'https?://[A-Za-z0-9./]+'
pat3 = r'#+'
pat4 = r'[!#%&()*+,-./:;<=>@[\]^_`{|}~“?"-]'

combined_pat = r'|'.join((pat1, pat2))
combined_pat = r'|'.join((combined_pat, pat3))
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    clean = clean.replace('U.S', 'United States')
    clean = clean.replace('U.S.', 'United States')
    clean = clean.lower()
    clean = contractions.fix(clean)
    clean = re.sub(pat4, '', clean)
    clean = emoji.demojize(clean)
    clean = clean.replace ("'", "")
    clean = clean.replace ('"', "")
    clean = clean.replace("”", "")
    # clean = clean.strip('"')
    clean = clean.lower()
    return clean

def format_df(df_twitter):
  df_twitter.drop(cols_to_remove, axis=1, inplace=True)
  df_twitter['user_id'] = df_twitter['user'].apply(lambda x: x.get('id'))
  # df_twitter.drop_duplicates(subset=['user_id', 'full_text'], keep='first', inplace=True)
  df_twitter.drop(['user'], axis=1, inplace=True)

def remove_non_english_tweets(row):
  try:
    detector = Detector(row)
    if detector.languages[0].name != 'English' or detect(row) != 'en':
      row = None
  except:
    row = None
  return row


def remove_RT(row):
  if 'RT' in row[:2]:
    return row.replace('RT', '')
  return row

def remove_mention(row):
  return " ".join(filter(lambda x:x[0] != '@', row.split()))

def remove_short_tweets(row):
  if len(row) < 30:
    return None
  else:
    return row


# OVERALL FUNCTION FOR PROCESSING
def process_data(df, sentiment_flag):
  if sentiment_flag is False:
    format_df(df)
  df['cleaned_text'] = df['full_text'].apply(remove_RT)
  df['cleaned_text'] = df['cleaned_text'].apply(remove_mention)
  df['cleaned_text'] = df['cleaned_text'].apply(remove_non_english_tweets)
  df.dropna(inplace=True)
  df['cleaned_text'] = df['cleaned_text'].apply(tweet_cleaner)
  # tokenize_tweets(df)
  # df.dropna(inplace=True)
  # df['token_text'] = df['token_text'].apply(clean_tweets)
  df['cleaned_text'] = df['cleaned_text'].apply(remove_short_tweets)
  df.dropna(inplace=True)



In [0]:

pd.options.display.max_colwidth = 100
df_test = pd.read_json('./sample_tweets/04_2020/coronavirus-tweet-id-2020-04-01-00.jsonl', lines=True)
print(df_test.shape)
col_names = list(df_test.columns)
df_test.tail(50)

In [0]:

pd.options.display.max_colwidth = 500
print(df_test.shape)
df_test[['full_text', 'cleaned_text']].sample(50)

In [0]:
import demoji
from emoji.unicode_codes import UNICODE_EMOJI
duni = f'U+{ord(s):X}'
y = uni.encode('utf-8')
z = y.decode('utf-8')
print(z)
print(UNICODE_EMOJI[s])
print (emoji.demojize(s))

U+1F923
:rolling_on_the_floor_laughing:
:rolling_on_the_floor_laughing:


## 1) Cleaning and Tokenizing COVID-19 Tweets

In [0]:
# JANUARY MASKS EXPORT CSV - PART 1
files = [
"coronavirus-tweet-id-2020-01-21-23.jsonl",
"coronavirus-tweet-id-2020-01-22-00.jsonl",
"coronavirus-tweet-id-2020-01-22-01.jsonl",
"coronavirus-tweet-id-2020-01-22-02.jsonl",
"coronavirus-tweet-id-2020-01-22-03.jsonl",
"coronavirus-tweet-id-2020-01-22-04.jsonl",
"coronavirus-tweet-id-2020-01-22-05.jsonl",
"coronavirus-tweet-id-2020-01-22-06.jsonl",
"coronavirus-tweet-id-2020-01-22-07.jsonl",
"coronavirus-tweet-id-2020-01-22-08.jsonl",
"coronavirus-tweet-id-2020-01-22-09.jsonl",
"coronavirus-tweet-id-2020-01-22-10.jsonl",
"coronavirus-tweet-id-2020-01-22-11.jsonl",
"coronavirus-tweet-id-2020-01-22-12.jsonl",
"coronavirus-tweet-id-2020-01-22-13.jsonl",
"coronavirus-tweet-id-2020-01-22-14.jsonl",
"coronavirus-tweet-id-2020-01-22-15.jsonl",
"coronavirus-tweet-id-2020-01-22-16.jsonl",
"coronavirus-tweet-id-2020-01-22-17.jsonl",
"coronavirus-tweet-id-2020-01-22-18.jsonl",
"coronavirus-tweet-id-2020-01-22-19.jsonl",
"coronavirus-tweet-id-2020-01-22-20.jsonl",
"coronavirus-tweet-id-2020-01-22-21.jsonl",
"coronavirus-tweet-id-2020-01-22-22.jsonl",
"coronavirus-tweet-id-2020-01-22-23.jsonl",
"coronavirus-tweet-id-2020-01-23-00.jsonl",
"coronavirus-tweet-id-2020-01-23-01.jsonl",
"coronavirus-tweet-id-2020-01-23-02.jsonl",
"coronavirus-tweet-id-2020-01-23-03.jsonl",
"coronavirus-tweet-id-2020-01-23-04.jsonl",
"coronavirus-tweet-id-2020-01-23-05.jsonl",
"coronavirus-tweet-id-2020-01-23-06.jsonl",
"coronavirus-tweet-id-2020-01-23-07.jsonl",
"coronavirus-tweet-id-2020-01-23-08.jsonl",
"coronavirus-tweet-id-2020-01-23-09.jsonl",
"coronavirus-tweet-id-2020-01-23-10.jsonl",
"coronavirus-tweet-id-2020-01-23-11.jsonl",
"coronavirus-tweet-id-2020-01-23-12.jsonl",
"coronavirus-tweet-id-2020-01-23-13.jsonl",
"coronavirus-tweet-id-2020-01-23-14.jsonl",
"coronavirus-tweet-id-2020-01-23-15.jsonl",
"coronavirus-tweet-id-2020-01-23-16.jsonl",
"coronavirus-tweet-id-2020-01-23-17.jsonl",
"coronavirus-tweet-id-2020-01-23-18.jsonl",
"coronavirus-tweet-id-2020-01-23-19.jsonl",
"coronavirus-tweet-id-2020-01-23-20.jsonl",
"coronavirus-tweet-id-2020-01-23-21.jsonl",
"coronavirus-tweet-id-2020-01-23-22.jsonl",
"coronavirus-tweet-id-2020-01-23-23.jsonl",
"coronavirus-tweet-id-2020-01-24-00.jsonl",
"coronavirus-tweet-id-2020-01-24-01.jsonl",
"coronavirus-tweet-id-2020-01-24-02.jsonl",
"coronavirus-tweet-id-2020-01-24-03.jsonl",
"coronavirus-tweet-id-2020-01-24-04.jsonl",
"coronavirus-tweet-id-2020-01-24-05.jsonl",
"coronavirus-tweet-id-2020-01-24-06.jsonl",
"coronavirus-tweet-id-2020-01-24-07.jsonl",
"coronavirus-tweet-id-2020-01-24-08.jsonl",
"coronavirus-tweet-id-2020-01-24-09.jsonl",
"coronavirus-tweet-id-2020-01-24-10.jsonl",
"coronavirus-tweet-id-2020-01-24-11.jsonl",
"coronavirus-tweet-id-2020-01-24-12.jsonl",
"coronavirus-tweet-id-2020-01-24-13.jsonl",
"coronavirus-tweet-id-2020-01-24-14.jsonl",
"coronavirus-tweet-id-2020-01-24-15.jsonl",
"coronavirus-tweet-id-2020-01-24-16.jsonl",
"coronavirus-tweet-id-2020-01-24-17.jsonl",
"coronavirus-tweet-id-2020-01-24-18.jsonl",
"coronavirus-tweet-id-2020-01-24-19.jsonl",
"coronavirus-tweet-id-2020-01-24-20.jsonl",
"coronavirus-tweet-id-2020-01-24-21.jsonl",
"coronavirus-tweet-id-2020-01-24-22.jsonl",
"coronavirus-tweet-id-2020-01-24-23.jsonl",
"coronavirus-tweet-id-2020-01-25-00.jsonl",
"coronavirus-tweet-id-2020-01-25-01.jsonl",
"coronavirus-tweet-id-2020-01-25-02.jsonl",
"coronavirus-tweet-id-2020-01-25-03.jsonl",
"coronavirus-tweet-id-2020-01-25-04.jsonl",
"coronavirus-tweet-id-2020-01-25-05.jsonl",
"coronavirus-tweet-id-2020-01-25-06.jsonl",
"coronavirus-tweet-id-2020-01-25-07.jsonl",
"coronavirus-tweet-id-2020-01-25-08.jsonl",
"coronavirus-tweet-id-2020-01-25-09.jsonl",
"coronavirus-tweet-id-2020-01-25-10.jsonl",
"coronavirus-tweet-id-2020-01-25-11.jsonl",
"coronavirus-tweet-id-2020-01-25-12.jsonl",
"coronavirus-tweet-id-2020-01-25-13.jsonl",
"coronavirus-tweet-id-2020-01-25-14.jsonl",
"coronavirus-tweet-id-2020-01-25-15.jsonl",
"coronavirus-tweet-id-2020-01-25-16.jsonl",
"coronavirus-tweet-id-2020-01-25-17.jsonl",
"coronavirus-tweet-id-2020-01-25-18.jsonl",
"coronavirus-tweet-id-2020-01-25-19.jsonl",
"coronavirus-tweet-id-2020-01-25-20.jsonl",
"coronavirus-tweet-id-2020-01-25-21.jsonl",
"coronavirus-tweet-id-2020-01-25-22.jsonl",
"coronavirus-tweet-id-2020-01-25-23.jsonl",
"coronavirus-tweet-id-2020-01-26-00.jsonl",
"coronavirus-tweet-id-2020-01-26-01.jsonl",
"coronavirus-tweet-id-2020-01-26-02.jsonl",
"coronavirus-tweet-id-2020-01-26-03.jsonl",
"coronavirus-tweet-id-2020-01-26-04.jsonl",
"coronavirus-tweet-id-2020-01-26-05.jsonl",
"coronavirus-tweet-id-2020-01-26-06.jsonl",
"coronavirus-tweet-id-2020-01-26-07.jsonl",
"coronavirus-tweet-id-2020-01-26-08.jsonl",
"coronavirus-tweet-id-2020-01-26-09.jsonl",
"coronavirus-tweet-id-2020-01-26-10.jsonl",
"coronavirus-tweet-id-2020-01-26-11.jsonl",
"coronavirus-tweet-id-2020-01-26-12.jsonl",
"coronavirus-tweet-id-2020-01-26-13.jsonl",
"coronavirus-tweet-id-2020-01-26-14.jsonl",
"coronavirus-tweet-id-2020-01-26-15.jsonl",
"coronavirus-tweet-id-2020-01-26-16.jsonl",
"coronavirus-tweet-id-2020-01-26-17.jsonl",
"coronavirus-tweet-id-2020-01-26-18.jsonl",
"coronavirus-tweet-id-2020-01-26-19.jsonl",
"coronavirus-tweet-id-2020-01-26-20.jsonl",
"coronavirus-tweet-id-2020-01-26-21.jsonl",
"coronavirus-tweet-id-2020-01-26-22.jsonl",
"coronavirus-tweet-id-2020-01-26-23.jsonl",
]

df_covid = pd.read_json('./sample_tweets/01_2020/coronavirus-tweet-id-2020-01-21-22.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/01_2020/%s'%i,lines=True)
  df_covid = pd.concat([df_covid, df_part])

df_covid.drop(['withheld_scope', 'withheld_copyright'], axis=1, inplace=True)

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head()
df_masks.to_csv('jan_masks_1.csv',index=False)

In [0]:
# JANUARY MASKS EXPORT CSV - PART 2
files = [
"coronavirus-tweet-id-2020-01-27-01.jsonl",
"coronavirus-tweet-id-2020-01-27-02.jsonl",
"coronavirus-tweet-id-2020-01-27-03.jsonl",
"coronavirus-tweet-id-2020-01-27-04.jsonl",
"coronavirus-tweet-id-2020-01-27-05.jsonl",
"coronavirus-tweet-id-2020-01-27-06.jsonl",
"coronavirus-tweet-id-2020-01-27-07.jsonl",
"coronavirus-tweet-id-2020-01-27-08.jsonl",
"coronavirus-tweet-id-2020-01-27-09.jsonl",
"coronavirus-tweet-id-2020-01-27-10.jsonl",
"coronavirus-tweet-id-2020-01-27-11.jsonl",
"coronavirus-tweet-id-2020-01-27-12.jsonl",
"coronavirus-tweet-id-2020-01-27-13.jsonl",
"coronavirus-tweet-id-2020-01-27-14.jsonl",
"coronavirus-tweet-id-2020-01-27-15.jsonl",
"coronavirus-tweet-id-2020-01-27-16.jsonl",
"coronavirus-tweet-id-2020-01-27-17.jsonl",
"coronavirus-tweet-id-2020-01-27-18.jsonl",
"coronavirus-tweet-id-2020-01-27-19.jsonl",
"coronavirus-tweet-id-2020-01-27-20.jsonl",
"coronavirus-tweet-id-2020-01-27-21.jsonl",
"coronavirus-tweet-id-2020-01-27-22.jsonl",
"coronavirus-tweet-id-2020-01-27-23.jsonl",
"coronavirus-tweet-id-2020-01-28-00.jsonl",
"coronavirus-tweet-id-2020-01-28-01.jsonl",
"coronavirus-tweet-id-2020-01-28-02.jsonl",
"coronavirus-tweet-id-2020-01-28-03.jsonl",
"coronavirus-tweet-id-2020-01-28-04.jsonl",
"coronavirus-tweet-id-2020-01-28-05.jsonl",
"coronavirus-tweet-id-2020-01-28-06.jsonl",
"coronavirus-tweet-id-2020-01-28-07.jsonl",
"coronavirus-tweet-id-2020-01-28-08.jsonl",
"coronavirus-tweet-id-2020-01-28-09.jsonl",
"coronavirus-tweet-id-2020-01-28-10.jsonl",
"coronavirus-tweet-id-2020-01-28-11.jsonl",
"coronavirus-tweet-id-2020-01-28-12.jsonl",
"coronavirus-tweet-id-2020-01-28-13.jsonl",
"coronavirus-tweet-id-2020-01-28-14.jsonl",
"coronavirus-tweet-id-2020-01-28-15.jsonl",
"coronavirus-tweet-id-2020-01-28-16.jsonl",
"coronavirus-tweet-id-2020-01-28-17.jsonl",
"coronavirus-tweet-id-2020-01-28-18.jsonl",
"coronavirus-tweet-id-2020-01-28-19.jsonl",
"coronavirus-tweet-id-2020-01-28-20.jsonl",
"coronavirus-tweet-id-2020-01-28-21.jsonl",
"coronavirus-tweet-id-2020-01-28-22.jsonl",
"coronavirus-tweet-id-2020-01-28-23.jsonl",
"coronavirus-tweet-id-2020-01-29-00.jsonl",
"coronavirus-tweet-id-2020-01-29-01.jsonl",
"coronavirus-tweet-id-2020-01-29-02.jsonl",
"coronavirus-tweet-id-2020-01-29-03.jsonl",
"coronavirus-tweet-id-2020-01-29-04.jsonl",
"coronavirus-tweet-id-2020-01-29-05.jsonl",
"coronavirus-tweet-id-2020-01-29-06.jsonl",
"coronavirus-tweet-id-2020-01-29-07.jsonl",
"coronavirus-tweet-id-2020-01-29-08.jsonl",
"coronavirus-tweet-id-2020-01-29-09.jsonl",
"coronavirus-tweet-id-2020-01-29-10.jsonl",
"coronavirus-tweet-id-2020-01-29-11.jsonl",
"coronavirus-tweet-id-2020-01-29-12.jsonl",
"coronavirus-tweet-id-2020-01-29-13.jsonl",
"coronavirus-tweet-id-2020-01-29-14.jsonl",
"coronavirus-tweet-id-2020-01-29-15.jsonl",
"coronavirus-tweet-id-2020-01-29-16.jsonl",
"coronavirus-tweet-id-2020-01-29-17.jsonl",
"coronavirus-tweet-id-2020-01-29-18.jsonl",
"coronavirus-tweet-id-2020-01-29-19.jsonl",
"coronavirus-tweet-id-2020-01-29-20.jsonl",
"coronavirus-tweet-id-2020-01-29-21.jsonl",
"coronavirus-tweet-id-2020-01-29-22.jsonl",
"coronavirus-tweet-id-2020-01-29-23.jsonl",
"coronavirus-tweet-id-2020-01-30-00.jsonl",
"coronavirus-tweet-id-2020-01-30-01.jsonl",
"coronavirus-tweet-id-2020-01-30-02.jsonl",
"coronavirus-tweet-id-2020-01-30-03.jsonl",
"coronavirus-tweet-id-2020-01-30-04.jsonl",
"coronavirus-tweet-id-2020-01-30-05.jsonl",
"coronavirus-tweet-id-2020-01-30-06.jsonl",
"coronavirus-tweet-id-2020-01-30-07.jsonl",
"coronavirus-tweet-id-2020-01-30-08.jsonl",
"coronavirus-tweet-id-2020-01-30-09.jsonl",
"coronavirus-tweet-id-2020-01-30-10.jsonl",
"coronavirus-tweet-id-2020-01-30-11.jsonl",
"coronavirus-tweet-id-2020-01-30-12.jsonl",
"coronavirus-tweet-id-2020-01-30-13.jsonl",
"coronavirus-tweet-id-2020-01-30-14.jsonl",
"coronavirus-tweet-id-2020-01-30-15.jsonl",
"coronavirus-tweet-id-2020-01-30-16.jsonl",
"coronavirus-tweet-id-2020-01-30-17.jsonl",
"coronavirus-tweet-id-2020-01-30-18.jsonl",
"coronavirus-tweet-id-2020-01-30-19.jsonl",
"coronavirus-tweet-id-2020-01-30-20.jsonl",
"coronavirus-tweet-id-2020-01-30-21.jsonl",
"coronavirus-tweet-id-2020-01-30-22.jsonl",
"coronavirus-tweet-id-2020-01-30-23.jsonl",
"coronavirus-tweet-id-2020-01-31-00.jsonl",
"coronavirus-tweet-id-2020-01-31-01.jsonl",
"coronavirus-tweet-id-2020-01-31-02.jsonl",
"coronavirus-tweet-id-2020-01-31-03.jsonl",
"coronavirus-tweet-id-2020-01-31-04.jsonl",
"coronavirus-tweet-id-2020-01-31-05.jsonl",
"coronavirus-tweet-id-2020-01-31-06.jsonl",
"coronavirus-tweet-id-2020-01-31-07.jsonl",
"coronavirus-tweet-id-2020-01-31-08.jsonl",
"coronavirus-tweet-id-2020-01-31-09.jsonl",
"coronavirus-tweet-id-2020-01-31-10.jsonl",
"coronavirus-tweet-id-2020-01-31-11.jsonl",
"coronavirus-tweet-id-2020-01-31-12.jsonl",
"coronavirus-tweet-id-2020-01-31-13.jsonl",
"coronavirus-tweet-id-2020-01-31-14.jsonl",
"coronavirus-tweet-id-2020-01-31-15.jsonl",
"coronavirus-tweet-id-2020-01-31-16.jsonl",
"coronavirus-tweet-id-2020-01-31-17.jsonl",
"coronavirus-tweet-id-2020-01-31-18.jsonl",
"coronavirus-tweet-id-2020-01-31-19.jsonl",
"coronavirus-tweet-id-2020-01-31-20.jsonl",
"coronavirus-tweet-id-2020-01-31-21.jsonl",
"coronavirus-tweet-id-2020-01-31-22.jsonl",
"coronavirus-tweet-id-2020-01-31-23.jsonl"]
df_covid = pd.read_json('./sample_tweets/01_2020/coronavirus-tweet-id-2020-01-27-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/01_2020/%s'%i,lines=True)
  df_covid = pd.concat([df_covid, df_part])

# df_covid.drop(['withheld_scope', 'withheld_copyright'], axis=1, inplace=True)
df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_masks.shape)
# process_data(df_masks, False)
# print(df_covid.shape, df_masks.shape)
# df_masks.to_csv('jan_masks_2.csv',index=False)

(28375, 34)


In [0]:
df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
df_masks.head()

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,user,geo,coordinates,place,contributors,retweeted_status,is_quote_status,retweet_count,favorite_count,favorited,retweeted,lang,extended_entities,possibly_sensitive,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status,withheld_in_countries
34,2020-01-27 00:24:06+00:00,1221589663330131973,1221589663330131968,@sanzhao41 the Hubei (province where Wuhan cit...,False,"[11, 121]","{'hashtags': [{'text': 'wuhan', 'indices': [10...","<a href=""https://mobile.twitter.com"" rel=""nofo...",1.221483e+18,1.221483e+18,1.162532e+18,1.162532e+18,sanzhao41,"{'id': 55166691, 'id_str': '55166691', 'name':...",,,,,,False,0,1,False,False,en,,,,,,,
35,2020-01-27 00:26:22+00:00,1221590234791600130,1221590234791600128,"RT @jjkmaryy: How to wear mask , pls share thi...",False,"[0, 140]","{'hashtags': [{'text': 'coronavirus', 'indices...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 261227145, 'id_str': '261227145', 'name...",,,,,{'created_at': 'Sun Jan 26 01:46:01 +0000 2020...,False,17883,0,False,False,en,,,,,,,
71,2020-01-27 00:46:09+00:00,1221595211832446980,1221595211832446976,@Eric10145366 @BoomChickaNow2 @StephenMcDonell...,False,"[47, 300]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...",1.221593e+18,1.221593e+18,8.923485e+17,8.923485e+17,Eric10145366,"{'id': 806144231081472000, 'id_str': '80614423...",,,,,,False,1,1,False,False,en,,,,,,,
72,2020-01-27 00:47:20+00:00,1221595511620112384,1221595511620112384,RT @QuickTake: Wearing a face mask does help i...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 300232687, 'id_str': '300232687', 'name...",,,,,{'created_at': 'Fri Jan 24 16:22:02 +0000 2020...,False,19539,0,False,False,en,,,,,,,
121,2020-01-27 00:20:51+00:00,1221588845616984067,1221588845616984064,RT @QuickTake: Wearing a face mask does help i...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 527221337, 'id_str': '527221337', 'name...",,,,,{'created_at': 'Fri Jan 24 16:22:02 +0000 2020...,False,19539,0,False,False,en,,,,,,,


In [0]:

process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.to_csv('jan_masks_2.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(717554, 32) (26532, 5)


In [0]:
# FEBRUARY MASKS EXPORT CSV - PART 1

files = [
"coronavirus-tweet-id-2020-02-01-01.jsonl",
"coronavirus-tweet-id-2020-02-01-02.jsonl",
"coronavirus-tweet-id-2020-02-01-03.jsonl",
"coronavirus-tweet-id-2020-02-01-09.jsonl",
"coronavirus-tweet-id-2020-02-01-10.jsonl",
"coronavirus-tweet-id-2020-02-01-11.jsonl",
"coronavirus-tweet-id-2020-02-01-12.jsonl",
"coronavirus-tweet-id-2020-02-01-13.jsonl",
"coronavirus-tweet-id-2020-02-01-14.jsonl",
"coronavirus-tweet-id-2020-02-01-15.jsonl",
"coronavirus-tweet-id-2020-02-01-16.jsonl",
"coronavirus-tweet-id-2020-02-01-17.jsonl",
"coronavirus-tweet-id-2020-02-01-18.jsonl",
"coronavirus-tweet-id-2020-02-01-19.jsonl",
"coronavirus-tweet-id-2020-02-01-20.jsonl",
"coronavirus-tweet-id-2020-02-01-21.jsonl",
"coronavirus-tweet-id-2020-02-01-22.jsonl",
"coronavirus-tweet-id-2020-02-01-23.jsonl",
"coronavirus-tweet-id-2020-02-02-00.jsonl",
"coronavirus-tweet-id-2020-02-02-01.jsonl",
"coronavirus-tweet-id-2020-02-02-02.jsonl",
"coronavirus-tweet-id-2020-02-02-03.jsonl",
"coronavirus-tweet-id-2020-02-02-04.jsonl",
"coronavirus-tweet-id-2020-02-02-05.jsonl",
"coronavirus-tweet-id-2020-02-02-06.jsonl",
"coronavirus-tweet-id-2020-02-02-07.jsonl",
"coronavirus-tweet-id-2020-02-02-08.jsonl",
"coronavirus-tweet-id-2020-02-02-09.jsonl",
"coronavirus-tweet-id-2020-02-02-10.jsonl",
"coronavirus-tweet-id-2020-02-02-11.jsonl",
"coronavirus-tweet-id-2020-02-02-12.jsonl",
"coronavirus-tweet-id-2020-02-02-13.jsonl",
"coronavirus-tweet-id-2020-02-02-14.jsonl",
"coronavirus-tweet-id-2020-02-02-15.jsonl",
"coronavirus-tweet-id-2020-02-02-16.jsonl",
"coronavirus-tweet-id-2020-02-02-17.jsonl",
"coronavirus-tweet-id-2020-02-02-18.jsonl",
"coronavirus-tweet-id-2020-02-02-19.jsonl",
"coronavirus-tweet-id-2020-02-02-20.jsonl",
"coronavirus-tweet-id-2020-02-02-21.jsonl",
"coronavirus-tweet-id-2020-02-02-22.jsonl",
"coronavirus-tweet-id-2020-02-02-23.jsonl",
"coronavirus-tweet-id-2020-02-03-00.jsonl",
"coronavirus-tweet-id-2020-02-03-01.jsonl",
"coronavirus-tweet-id-2020-02-03-02.jsonl",
"coronavirus-tweet-id-2020-02-03-03.jsonl",
"coronavirus-tweet-id-2020-02-03-04.jsonl",
"coronavirus-tweet-id-2020-02-03-05.jsonl",
"coronavirus-tweet-id-2020-02-03-06.jsonl",
"coronavirus-tweet-id-2020-02-03-07.jsonl",
"coronavirus-tweet-id-2020-02-03-08.jsonl",
"coronavirus-tweet-id-2020-02-03-09.jsonl",
"coronavirus-tweet-id-2020-02-03-10.jsonl",
"coronavirus-tweet-id-2020-02-03-11.jsonl",
"coronavirus-tweet-id-2020-02-03-12.jsonl",
"coronavirus-tweet-id-2020-02-03-13.jsonl",
"coronavirus-tweet-id-2020-02-03-14.jsonl",
"coronavirus-tweet-id-2020-02-03-15.jsonl",
"coronavirus-tweet-id-2020-02-03-16.jsonl",
"coronavirus-tweet-id-2020-02-03-17.jsonl",
"coronavirus-tweet-id-2020-02-03-18.jsonl",
"coronavirus-tweet-id-2020-02-03-19.jsonl",
"coronavirus-tweet-id-2020-02-03-20.jsonl",
"coronavirus-tweet-id-2020-02-03-21.jsonl",
"coronavirus-tweet-id-2020-02-03-22.jsonl",
"coronavirus-tweet-id-2020-02-03-23.jsonl",
"coronavirus-tweet-id-2020-02-04-00.jsonl",
"coronavirus-tweet-id-2020-02-04-01.jsonl",
"coronavirus-tweet-id-2020-02-04-02.jsonl",
"coronavirus-tweet-id-2020-02-04-03.jsonl",
"coronavirus-tweet-id-2020-02-04-04.jsonl",
"coronavirus-tweet-id-2020-02-04-05.jsonl",
"coronavirus-tweet-id-2020-02-04-06.jsonl",
"coronavirus-tweet-id-2020-02-04-07.jsonl",
"coronavirus-tweet-id-2020-02-04-08.jsonl",
"coronavirus-tweet-id-2020-02-04-09.jsonl",
"coronavirus-tweet-id-2020-02-04-10.jsonl",
"coronavirus-tweet-id-2020-02-04-11.jsonl",
"coronavirus-tweet-id-2020-02-04-12.jsonl",
"coronavirus-tweet-id-2020-02-04-13.jsonl",
"coronavirus-tweet-id-2020-02-04-14.jsonl",
"coronavirus-tweet-id-2020-02-04-15.jsonl",
"coronavirus-tweet-id-2020-02-04-16.jsonl",
"coronavirus-tweet-id-2020-02-04-17.jsonl",
"coronavirus-tweet-id-2020-02-04-18.jsonl",
"coronavirus-tweet-id-2020-02-04-19.jsonl",
"coronavirus-tweet-id-2020-02-04-20.jsonl",
"coronavirus-tweet-id-2020-02-04-21.jsonl",
"coronavirus-tweet-id-2020-02-04-22.jsonl",
"coronavirus-tweet-id-2020-02-04-23.jsonl",
"coronavirus-tweet-id-2020-02-05-00.jsonl",
"coronavirus-tweet-id-2020-02-05-01.jsonl",
"coronavirus-tweet-id-2020-02-05-02.jsonl",
"coronavirus-tweet-id-2020-02-05-03.jsonl",
"coronavirus-tweet-id-2020-02-05-04.jsonl",
"coronavirus-tweet-id-2020-02-05-05.jsonl",
"coronavirus-tweet-id-2020-02-05-06.jsonl",
"coronavirus-tweet-id-2020-02-05-07.jsonl",
"coronavirus-tweet-id-2020-02-05-08.jsonl",
"coronavirus-tweet-id-2020-02-05-09.jsonl",
"coronavirus-tweet-id-2020-02-05-10.jsonl",
"coronavirus-tweet-id-2020-02-05-11.jsonl",
"coronavirus-tweet-id-2020-02-05-12.jsonl",
"coronavirus-tweet-id-2020-02-05-13.jsonl",
"coronavirus-tweet-id-2020-02-05-14.jsonl",
"coronavirus-tweet-id-2020-02-05-15.jsonl",
"coronavirus-tweet-id-2020-02-05-16.jsonl",
"coronavirus-tweet-id-2020-02-05-17.jsonl",
"coronavirus-tweet-id-2020-02-05-18.jsonl",
"coronavirus-tweet-id-2020-02-05-19.jsonl",
"coronavirus-tweet-id-2020-02-05-20.jsonl",
"coronavirus-tweet-id-2020-02-05-21.jsonl",
"coronavirus-tweet-id-2020-02-05-22.jsonl",
"coronavirus-tweet-id-2020-02-05-23.jsonl",
"coronavirus-tweet-id-2020-02-06-00.jsonl",
"coronavirus-tweet-id-2020-02-06-01.jsonl",
"coronavirus-tweet-id-2020-02-06-02.jsonl",
"coronavirus-tweet-id-2020-02-06-03.jsonl",
"coronavirus-tweet-id-2020-02-06-04.jsonl",
"coronavirus-tweet-id-2020-02-06-05.jsonl",
"coronavirus-tweet-id-2020-02-06-06.jsonl",
"coronavirus-tweet-id-2020-02-06-07.jsonl",
"coronavirus-tweet-id-2020-02-06-08.jsonl",
"coronavirus-tweet-id-2020-02-06-09.jsonl",
"coronavirus-tweet-id-2020-02-06-10.jsonl",
"coronavirus-tweet-id-2020-02-06-11.jsonl",
"coronavirus-tweet-id-2020-02-06-12.jsonl",
"coronavirus-tweet-id-2020-02-06-13.jsonl",
"coronavirus-tweet-id-2020-02-06-14.jsonl",
"coronavirus-tweet-id-2020-02-06-15.jsonl",
"coronavirus-tweet-id-2020-02-06-16.jsonl",
"coronavirus-tweet-id-2020-02-06-17.jsonl",
"coronavirus-tweet-id-2020-02-06-18.jsonl",
"coronavirus-tweet-id-2020-02-06-19.jsonl",
"coronavirus-tweet-id-2020-02-06-20.jsonl",
"coronavirus-tweet-id-2020-02-06-21.jsonl",
"coronavirus-tweet-id-2020-02-06-22.jsonl",
"coronavirus-tweet-id-2020-02-06-23.jsonl",
"coronavirus-tweet-id-2020-02-07-00.jsonl",
"coronavirus-tweet-id-2020-02-07-01.jsonl",
"coronavirus-tweet-id-2020-02-07-02.jsonl",
"coronavirus-tweet-id-2020-02-07-03.jsonl",
"coronavirus-tweet-id-2020-02-07-04.jsonl",
"coronavirus-tweet-id-2020-02-07-05.jsonl",
"coronavirus-tweet-id-2020-02-07-06.jsonl",
"coronavirus-tweet-id-2020-02-07-07.jsonl",
"coronavirus-tweet-id-2020-02-07-08.jsonl",
"coronavirus-tweet-id-2020-02-07-09.jsonl",
"coronavirus-tweet-id-2020-02-07-10.jsonl",
"coronavirus-tweet-id-2020-02-07-11.jsonl",
"coronavirus-tweet-id-2020-02-07-12.jsonl",
"coronavirus-tweet-id-2020-02-07-13.jsonl",
"coronavirus-tweet-id-2020-02-07-14.jsonl",
"coronavirus-tweet-id-2020-02-07-15.jsonl",
"coronavirus-tweet-id-2020-02-07-16.jsonl",
"coronavirus-tweet-id-2020-02-07-17.jsonl",
"coronavirus-tweet-id-2020-02-07-18.jsonl",
"coronavirus-tweet-id-2020-02-07-19.jsonl",
"coronavirus-tweet-id-2020-02-07-20.jsonl",
"coronavirus-tweet-id-2020-02-07-21.jsonl",
"coronavirus-tweet-id-2020-02-07-22.jsonl",
"coronavirus-tweet-id-2020-02-07-23.jsonl",

]
df_covid = pd.read_json('./sample_tweets/02_2020/coronavirus-tweet-id-2020-02-01-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/02_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
df_masks.head()

finished reading:  coronavirus-tweet-id-2020-02-01-01.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-02.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-03.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-09.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-10.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-11.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-12.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-13.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-14.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-15.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-16.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-17.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-18.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-19.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-20.jsonl
finished reading:  coronavirus-tweet-id-2020-02-01-21.jsonl
finished reading:  coronavirus-tweet-id-

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,extended_entities,source,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,in_reply_to_screen_name,user,geo,coordinates,place,contributors,retweeted_status,is_quote_status,retweet_count,favorite_count,favorited,retweeted,possibly_sensitive,lang,quoted_status_id,quoted_status_id_str,quoted_status_permalink,quoted_status,withheld_in_countries,withheld_scope,withheld_copyright
37,2020-02-01 00:20:03+00:00,1223400581798084608,1223400581798084608,RT @nicolasubi: ur rave mask won’t protect u a...,False,"[0, 72]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 2189072592, 'id_str': '2189072592', 'na...",,,,,{'created_at': 'Thu Jan 30 18:18:52 +0000 2020...,False,962,0,False,False,,en,,,,,,,
38,2020-02-01 00:20:09+00:00,1223400607995891714,1223400607995891712,RT @globaltimesnews: Walking around without a ...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,"{'id': 3434333991, 'id_str': '3434333991', 'na...",,,,,{'created_at': 'Fri Jan 31 12:18:25 +0000 2020...,False,6756,0,False,False,,en,,,,,,,
100,2020-02-01 00:11:46+00:00,1223398499628339200,1223398499628339200,This coronavirus has got me paro I’m seeing ba...,False,"[0, 92]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""http://twitter.com/download/iphone"" r...",,,,,,"{'id': 260409127, 'id_str': '260409127', 'name...",,,,,,False,1,0,False,False,,en,,,,,,,
134,2020-02-01 00:42:30+00:00,1223406233513689095,1223406233513689088,RT @anderscorr: Protective gear like goggles a...,False,"[0, 140]","{'hashtags': [{'text': 'nCoV2019', 'indices': ...",,"<a href=""https://mobile.twitter.com"" rel=""nofo...",,,,,,"{'id': 1170636124236042240, 'id_str': '1170636...",,,,,{'created_at': 'Sat Feb 01 00:16:12 +0000 2020...,False,15,0,False,False,,en,,,,,,,
214,2020-02-01 00:43:24+00:00,1223406461876760576,1223406461876760576,RT @marvicleonen: Please do not hoard n95 mask...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...",,"<a href=""http://twitter.com/download/android"" ...",,,,,,"{'id': 517684327, 'id_str': '517684327', 'name...",,,,,{'created_at': 'Fri Jan 31 09:23:44 +0000 2020...,False,767,0,False,False,,en,,,,,,,


In [0]:
print(df_masks.shape)
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('feb_masks_1.csv',index=False)

(8730, 34)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(624724, 34) (7612, 5)


In [0]:
# FEBRUARY MASKS EXPORT CSV - PART 2

files = [
"coronavirus-tweet-id-2020-02-08-01.jsonl",
"coronavirus-tweet-id-2020-02-08-02.jsonl",
"coronavirus-tweet-id-2020-02-08-03.jsonl",
"coronavirus-tweet-id-2020-02-08-04.jsonl",
"coronavirus-tweet-id-2020-02-08-05.jsonl",
"coronavirus-tweet-id-2020-02-08-07.jsonl",
"coronavirus-tweet-id-2020-02-08-08.jsonl",
"coronavirus-tweet-id-2020-02-08-09.jsonl",
"coronavirus-tweet-id-2020-02-08-10.jsonl",
"coronavirus-tweet-id-2020-02-08-11.jsonl",
"coronavirus-tweet-id-2020-02-08-12.jsonl",
"coronavirus-tweet-id-2020-02-08-13.jsonl",
"coronavirus-tweet-id-2020-02-08-14.jsonl",
"coronavirus-tweet-id-2020-02-08-15.jsonl",
"coronavirus-tweet-id-2020-02-08-16.jsonl",
"coronavirus-tweet-id-2020-02-08-17.jsonl",
"coronavirus-tweet-id-2020-02-08-18.jsonl",
"coronavirus-tweet-id-2020-02-08-19.jsonl",
"coronavirus-tweet-id-2020-02-08-20.jsonl",
"coronavirus-tweet-id-2020-02-08-21.jsonl",
"coronavirus-tweet-id-2020-02-08-22.jsonl",
"coronavirus-tweet-id-2020-02-08-23.jsonl",
"coronavirus-tweet-id-2020-02-09-00.jsonl",
"coronavirus-tweet-id-2020-02-09-01.jsonl",
"coronavirus-tweet-id-2020-02-09-02.jsonl",
"coronavirus-tweet-id-2020-02-09-03.jsonl",
"coronavirus-tweet-id-2020-02-09-04.jsonl",
"coronavirus-tweet-id-2020-02-09-05.jsonl",
"coronavirus-tweet-id-2020-02-09-06.jsonl",
"coronavirus-tweet-id-2020-02-09-07.jsonl",
"coronavirus-tweet-id-2020-02-09-08.jsonl",
"coronavirus-tweet-id-2020-02-09-09.jsonl",
"coronavirus-tweet-id-2020-02-09-10.jsonl",
"coronavirus-tweet-id-2020-02-09-11.jsonl",
"coronavirus-tweet-id-2020-02-09-12.jsonl",
"coronavirus-tweet-id-2020-02-09-13.jsonl",
"coronavirus-tweet-id-2020-02-09-14.jsonl",
"coronavirus-tweet-id-2020-02-09-15.jsonl",
"coronavirus-tweet-id-2020-02-09-16.jsonl",
"coronavirus-tweet-id-2020-02-09-17.jsonl",
"coronavirus-tweet-id-2020-02-09-18.jsonl",
"coronavirus-tweet-id-2020-02-09-19.jsonl",
"coronavirus-tweet-id-2020-02-09-20.jsonl",
"coronavirus-tweet-id-2020-02-09-21.jsonl",
"coronavirus-tweet-id-2020-02-09-22.jsonl",
"coronavirus-tweet-id-2020-02-09-23.jsonl",
"coronavirus-tweet-id-2020-02-10-00.jsonl",
"coronavirus-tweet-id-2020-02-10-01.jsonl",
"coronavirus-tweet-id-2020-02-10-02.jsonl",
"coronavirus-tweet-id-2020-02-10-03.jsonl",
"coronavirus-tweet-id-2020-02-10-04.jsonl",
"coronavirus-tweet-id-2020-02-10-05.jsonl",
"coronavirus-tweet-id-2020-02-10-06.jsonl",
"coronavirus-tweet-id-2020-02-10-07.jsonl",
"coronavirus-tweet-id-2020-02-10-08.jsonl",
"coronavirus-tweet-id-2020-02-10-09.jsonl",
"coronavirus-tweet-id-2020-02-10-10.jsonl",
"coronavirus-tweet-id-2020-02-10-11.jsonl",
"coronavirus-tweet-id-2020-02-10-12.jsonl",
"coronavirus-tweet-id-2020-02-10-13.jsonl",
"coronavirus-tweet-id-2020-02-10-14.jsonl",
"coronavirus-tweet-id-2020-02-10-15.jsonl",
"coronavirus-tweet-id-2020-02-10-16.jsonl",
"coronavirus-tweet-id-2020-02-10-17.jsonl",
"coronavirus-tweet-id-2020-02-10-18.jsonl",
"coronavirus-tweet-id-2020-02-10-19.jsonl",
"coronavirus-tweet-id-2020-02-10-20.jsonl",
"coronavirus-tweet-id-2020-02-10-21.jsonl",
"coronavirus-tweet-id-2020-02-10-22.jsonl",
"coronavirus-tweet-id-2020-02-10-23.jsonl",
"coronavirus-tweet-id-2020-02-11-00.jsonl",
"coronavirus-tweet-id-2020-02-11-01.jsonl",
"coronavirus-tweet-id-2020-02-11-02.jsonl",
"coronavirus-tweet-id-2020-02-11-03.jsonl",
"coronavirus-tweet-id-2020-02-11-04.jsonl",
"coronavirus-tweet-id-2020-02-11-05.jsonl",
"coronavirus-tweet-id-2020-02-11-06.jsonl",
"coronavirus-tweet-id-2020-02-11-07.jsonl",
"coronavirus-tweet-id-2020-02-11-08.jsonl",
"coronavirus-tweet-id-2020-02-11-09.jsonl",
"coronavirus-tweet-id-2020-02-11-10.jsonl",
"coronavirus-tweet-id-2020-02-11-11.jsonl",
"coronavirus-tweet-id-2020-02-11-12.jsonl",
"coronavirus-tweet-id-2020-02-11-13.jsonl",
"coronavirus-tweet-id-2020-02-11-14.jsonl",
"coronavirus-tweet-id-2020-02-11-15.jsonl",
"coronavirus-tweet-id-2020-02-11-16.jsonl",
"coronavirus-tweet-id-2020-02-11-17.jsonl",
"coronavirus-tweet-id-2020-02-11-18.jsonl",
"coronavirus-tweet-id-2020-02-11-19.jsonl",
"coronavirus-tweet-id-2020-02-11-20.jsonl",
"coronavirus-tweet-id-2020-02-11-21.jsonl",
"coronavirus-tweet-id-2020-02-11-22.jsonl",
"coronavirus-tweet-id-2020-02-11-23.jsonl",
"coronavirus-tweet-id-2020-02-12-00.jsonl",
"coronavirus-tweet-id-2020-02-12-01.jsonl",
"coronavirus-tweet-id-2020-02-12-02.jsonl",
"coronavirus-tweet-id-2020-02-12-03.jsonl",
"coronavirus-tweet-id-2020-02-12-04.jsonl",
"coronavirus-tweet-id-2020-02-12-05.jsonl",
"coronavirus-tweet-id-2020-02-12-06.jsonl",
"coronavirus-tweet-id-2020-02-12-07.jsonl",
"coronavirus-tweet-id-2020-02-12-08.jsonl",
"coronavirus-tweet-id-2020-02-12-09.jsonl",
"coronavirus-tweet-id-2020-02-12-10.jsonl",
"coronavirus-tweet-id-2020-02-12-11.jsonl",
"coronavirus-tweet-id-2020-02-12-12.jsonl",
"coronavirus-tweet-id-2020-02-12-13.jsonl",
"coronavirus-tweet-id-2020-02-12-14.jsonl",
"coronavirus-tweet-id-2020-02-12-15.jsonl",
"coronavirus-tweet-id-2020-02-12-16.jsonl",
"coronavirus-tweet-id-2020-02-12-17.jsonl",
"coronavirus-tweet-id-2020-02-12-18.jsonl",
"coronavirus-tweet-id-2020-02-12-19.jsonl",
"coronavirus-tweet-id-2020-02-12-20.jsonl",
"coronavirus-tweet-id-2020-02-12-21.jsonl",
"coronavirus-tweet-id-2020-02-12-22.jsonl",
"coronavirus-tweet-id-2020-02-12-23.jsonl",
"coronavirus-tweet-id-2020-02-13-00.jsonl",
"coronavirus-tweet-id-2020-02-13-01.jsonl",
"coronavirus-tweet-id-2020-02-13-02.jsonl",
"coronavirus-tweet-id-2020-02-13-03.jsonl",
"coronavirus-tweet-id-2020-02-13-04.jsonl",
"coronavirus-tweet-id-2020-02-13-05.jsonl",
"coronavirus-tweet-id-2020-02-13-06.jsonl",
"coronavirus-tweet-id-2020-02-13-07.jsonl",
"coronavirus-tweet-id-2020-02-13-08.jsonl",
"coronavirus-tweet-id-2020-02-13-09.jsonl",
"coronavirus-tweet-id-2020-02-13-10.jsonl",
"coronavirus-tweet-id-2020-02-13-11.jsonl",
"coronavirus-tweet-id-2020-02-13-12.jsonl",
"coronavirus-tweet-id-2020-02-13-13.jsonl",
"coronavirus-tweet-id-2020-02-13-14.jsonl",
"coronavirus-tweet-id-2020-02-13-15.jsonl",
"coronavirus-tweet-id-2020-02-13-16.jsonl",
"coronavirus-tweet-id-2020-02-13-17.jsonl",
"coronavirus-tweet-id-2020-02-13-18.jsonl",
"coronavirus-tweet-id-2020-02-13-19.jsonl",
"coronavirus-tweet-id-2020-02-13-20.jsonl",
"coronavirus-tweet-id-2020-02-13-21.jsonl",
"coronavirus-tweet-id-2020-02-13-22.jsonl",
"coronavirus-tweet-id-2020-02-13-23.jsonl",
"coronavirus-tweet-id-2020-02-14-00.jsonl",
"coronavirus-tweet-id-2020-02-14-01.jsonl",
"coronavirus-tweet-id-2020-02-14-02.jsonl",
"coronavirus-tweet-id-2020-02-14-03.jsonl",
"coronavirus-tweet-id-2020-02-14-04.jsonl",
"coronavirus-tweet-id-2020-02-14-05.jsonl",
"coronavirus-tweet-id-2020-02-14-06.jsonl",
"coronavirus-tweet-id-2020-02-14-07.jsonl",
"coronavirus-tweet-id-2020-02-14-08.jsonl",
"coronavirus-tweet-id-2020-02-14-09.jsonl",
"coronavirus-tweet-id-2020-02-14-10.jsonl",
"coronavirus-tweet-id-2020-02-14-11.jsonl",
"coronavirus-tweet-id-2020-02-14-12.jsonl",
"coronavirus-tweet-id-2020-02-14-13.jsonl",
"coronavirus-tweet-id-2020-02-14-14.jsonl",
"coronavirus-tweet-id-2020-02-14-15.jsonl",
"coronavirus-tweet-id-2020-02-14-16.jsonl",
"coronavirus-tweet-id-2020-02-14-17.jsonl",
"coronavirus-tweet-id-2020-02-14-18.jsonl",
"coronavirus-tweet-id-2020-02-14-19.jsonl",
"coronavirus-tweet-id-2020-02-14-20.jsonl",
"coronavirus-tweet-id-2020-02-14-21.jsonl",
"coronavirus-tweet-id-2020-02-14-22.jsonl",
"coronavirus-tweet-id-2020-02-14-23.jsonl",
"coronavirus-tweet-id-2020-02-15-00.jsonl",
"coronavirus-tweet-id-2020-02-15-01.jsonl",
"coronavirus-tweet-id-2020-02-15-02.jsonl",
"coronavirus-tweet-id-2020-02-15-03.jsonl",
"coronavirus-tweet-id-2020-02-15-04.jsonl",
"coronavirus-tweet-id-2020-02-15-05.jsonl",
"coronavirus-tweet-id-2020-02-15-06.jsonl",
"coronavirus-tweet-id-2020-02-15-07.jsonl",
"coronavirus-tweet-id-2020-02-15-08.jsonl",
"coronavirus-tweet-id-2020-02-15-09.jsonl",
"coronavirus-tweet-id-2020-02-15-10.jsonl",
"coronavirus-tweet-id-2020-02-15-11.jsonl",
"coronavirus-tweet-id-2020-02-15-12.jsonl",
"coronavirus-tweet-id-2020-02-15-13.jsonl",
"coronavirus-tweet-id-2020-02-15-14.jsonl",
"coronavirus-tweet-id-2020-02-15-15.jsonl",
"coronavirus-tweet-id-2020-02-15-16.jsonl",
"coronavirus-tweet-id-2020-02-15-17.jsonl",
"coronavirus-tweet-id-2020-02-15-18.jsonl",
"coronavirus-tweet-id-2020-02-15-19.jsonl",
"coronavirus-tweet-id-2020-02-15-20.jsonl",
"coronavirus-tweet-id-2020-02-15-21.jsonl",
"coronavirus-tweet-id-2020-02-15-22.jsonl",
"coronavirus-tweet-id-2020-02-15-23.jsonl",
]
df_covid = pd.read_json('./sample_tweets/02_2020/coronavirus-tweet-id-2020-02-08-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/02_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_masks.shape)
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('feb_masks_2.csv',index=False)

finished reading:  coronavirus-tweet-id-2020-02-08-01.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-02.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-03.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-04.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-05.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-07.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-08.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-09.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-10.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-11.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-12.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-13.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-14.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-15.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-16.jsonl
finished reading:  coronavirus-tweet-id-2020-02-08-17.jsonl
finished reading:  coronavirus-tweet-id-

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(703055, 32) (6496, 5)


In [0]:
# FEBRUARY MASKS EXPORT CSV - PART 3

files = [
"coronavirus-tweet-id-2020-02-16-01.jsonl",
"coronavirus-tweet-id-2020-02-16-02.jsonl",
"coronavirus-tweet-id-2020-02-16-03.jsonl",
"coronavirus-tweet-id-2020-02-16-04.jsonl",
"coronavirus-tweet-id-2020-02-16-05.jsonl",
"coronavirus-tweet-id-2020-02-16-06.jsonl",
"coronavirus-tweet-id-2020-02-16-07.jsonl",
"coronavirus-tweet-id-2020-02-16-08.jsonl",
"coronavirus-tweet-id-2020-02-16-09.jsonl",
"coronavirus-tweet-id-2020-02-16-10.jsonl",
"coronavirus-tweet-id-2020-02-16-11.jsonl",
"coronavirus-tweet-id-2020-02-16-12.jsonl",
"coronavirus-tweet-id-2020-02-16-13.jsonl",
"coronavirus-tweet-id-2020-02-16-14.jsonl",
"coronavirus-tweet-id-2020-02-16-15.jsonl",
"coronavirus-tweet-id-2020-02-16-16.jsonl",
"coronavirus-tweet-id-2020-02-16-17.jsonl",
"coronavirus-tweet-id-2020-02-16-18.jsonl",
"coronavirus-tweet-id-2020-02-16-19.jsonl",
"coronavirus-tweet-id-2020-02-16-20.jsonl",
"coronavirus-tweet-id-2020-02-16-21.jsonl",
"coronavirus-tweet-id-2020-02-16-22.jsonl",
"coronavirus-tweet-id-2020-02-16-23.jsonl",
"coronavirus-tweet-id-2020-02-17-00.jsonl",
"coronavirus-tweet-id-2020-02-17-01.jsonl",
"coronavirus-tweet-id-2020-02-17-02.jsonl",
"coronavirus-tweet-id-2020-02-17-03.jsonl",
"coronavirus-tweet-id-2020-02-17-04.jsonl",
"coronavirus-tweet-id-2020-02-17-05.jsonl",
"coronavirus-tweet-id-2020-02-17-06.jsonl",
"coronavirus-tweet-id-2020-02-17-07.jsonl",
"coronavirus-tweet-id-2020-02-17-08.jsonl",
"coronavirus-tweet-id-2020-02-17-09.jsonl",
"coronavirus-tweet-id-2020-02-17-10.jsonl",
"coronavirus-tweet-id-2020-02-17-11.jsonl",
"coronavirus-tweet-id-2020-02-17-12.jsonl",
"coronavirus-tweet-id-2020-02-17-13.jsonl",
"coronavirus-tweet-id-2020-02-17-14.jsonl",
"coronavirus-tweet-id-2020-02-17-15.jsonl",
"coronavirus-tweet-id-2020-02-17-16.jsonl",
"coronavirus-tweet-id-2020-02-17-17.jsonl",
"coronavirus-tweet-id-2020-02-17-18.jsonl",
"coronavirus-tweet-id-2020-02-17-19.jsonl",
"coronavirus-tweet-id-2020-02-17-20.jsonl",
"coronavirus-tweet-id-2020-02-17-21.jsonl",
"coronavirus-tweet-id-2020-02-17-22.jsonl",
"coronavirus-tweet-id-2020-02-17-23.jsonl",
"coronavirus-tweet-id-2020-02-18-00.jsonl",
"coronavirus-tweet-id-2020-02-18-01.jsonl",
"coronavirus-tweet-id-2020-02-18-02.jsonl",
"coronavirus-tweet-id-2020-02-18-03.jsonl",
"coronavirus-tweet-id-2020-02-18-04.jsonl",
"coronavirus-tweet-id-2020-02-18-05.jsonl",
"coronavirus-tweet-id-2020-02-18-06.jsonl",
"coronavirus-tweet-id-2020-02-18-07.jsonl",
"coronavirus-tweet-id-2020-02-18-08.jsonl",
"coronavirus-tweet-id-2020-02-18-09.jsonl",
"coronavirus-tweet-id-2020-02-18-10.jsonl",
"coronavirus-tweet-id-2020-02-18-11.jsonl",
"coronavirus-tweet-id-2020-02-18-12.jsonl",
"coronavirus-tweet-id-2020-02-18-13.jsonl",
"coronavirus-tweet-id-2020-02-18-14.jsonl",
"coronavirus-tweet-id-2020-02-18-15.jsonl",
"coronavirus-tweet-id-2020-02-18-16.jsonl",
"coronavirus-tweet-id-2020-02-18-17.jsonl",
"coronavirus-tweet-id-2020-02-18-18.jsonl",
"coronavirus-tweet-id-2020-02-18-19.jsonl",
"coronavirus-tweet-id-2020-02-18-20.jsonl",
"coronavirus-tweet-id-2020-02-18-21.jsonl",
"coronavirus-tweet-id-2020-02-18-22.jsonl",
"coronavirus-tweet-id-2020-02-18-23.jsonl",
"coronavirus-tweet-id-2020-02-19-00.jsonl",
"coronavirus-tweet-id-2020-02-19-01.jsonl",
"coronavirus-tweet-id-2020-02-19-02.jsonl",
"coronavirus-tweet-id-2020-02-19-03.jsonl",
"coronavirus-tweet-id-2020-02-19-04.jsonl",
"coronavirus-tweet-id-2020-02-19-05.jsonl",
"coronavirus-tweet-id-2020-02-19-06.jsonl",
"coronavirus-tweet-id-2020-02-19-07.jsonl",
"coronavirus-tweet-id-2020-02-19-08.jsonl",
"coronavirus-tweet-id-2020-02-19-09.jsonl",
"coronavirus-tweet-id-2020-02-19-10.jsonl",
"coronavirus-tweet-id-2020-02-19-11.jsonl",
"coronavirus-tweet-id-2020-02-19-12.jsonl",
"coronavirus-tweet-id-2020-02-19-13.jsonl",
"coronavirus-tweet-id-2020-02-19-14.jsonl",
"coronavirus-tweet-id-2020-02-19-15.jsonl",
"coronavirus-tweet-id-2020-02-19-16.jsonl",
"coronavirus-tweet-id-2020-02-19-17.jsonl",
"coronavirus-tweet-id-2020-02-19-18.jsonl",
"coronavirus-tweet-id-2020-02-19-19.jsonl",
"coronavirus-tweet-id-2020-02-19-20.jsonl",
"coronavirus-tweet-id-2020-02-19-21.jsonl",
"coronavirus-tweet-id-2020-02-19-22.jsonl",
"coronavirus-tweet-id-2020-02-19-23.jsonl",
"coronavirus-tweet-id-2020-02-20-00.jsonl",
"coronavirus-tweet-id-2020-02-20-01.jsonl",
"coronavirus-tweet-id-2020-02-20-02.jsonl",
"coronavirus-tweet-id-2020-02-20-03.jsonl",
"coronavirus-tweet-id-2020-02-20-04.jsonl",
"coronavirus-tweet-id-2020-02-20-05.jsonl",
"coronavirus-tweet-id-2020-02-20-06.jsonl",
"coronavirus-tweet-id-2020-02-20-07.jsonl",
"coronavirus-tweet-id-2020-02-20-08.jsonl",
"coronavirus-tweet-id-2020-02-20-09.jsonl",
"coronavirus-tweet-id-2020-02-20-10.jsonl",
"coronavirus-tweet-id-2020-02-20-11.jsonl",
"coronavirus-tweet-id-2020-02-20-12.jsonl",
"coronavirus-tweet-id-2020-02-20-13.jsonl",
"coronavirus-tweet-id-2020-02-20-14.jsonl",
"coronavirus-tweet-id-2020-02-20-15.jsonl",
"coronavirus-tweet-id-2020-02-20-16.jsonl",
"coronavirus-tweet-id-2020-02-20-17.jsonl",
"coronavirus-tweet-id-2020-02-20-18.jsonl",
"coronavirus-tweet-id-2020-02-20-19.jsonl",
"coronavirus-tweet-id-2020-02-20-20.jsonl",
"coronavirus-tweet-id-2020-02-20-21.jsonl",
"coronavirus-tweet-id-2020-02-20-22.jsonl",
"coronavirus-tweet-id-2020-02-20-23.jsonl",
"coronavirus-tweet-id-2020-02-21-00.jsonl",
"coronavirus-tweet-id-2020-02-21-01.jsonl",
"coronavirus-tweet-id-2020-02-21-02.jsonl",
"coronavirus-tweet-id-2020-02-21-03.jsonl",
"coronavirus-tweet-id-2020-02-21-04.jsonl",
"coronavirus-tweet-id-2020-02-21-05.jsonl",
"coronavirus-tweet-id-2020-02-21-06.jsonl",
"coronavirus-tweet-id-2020-02-21-07.jsonl",
"coronavirus-tweet-id-2020-02-21-08.jsonl",
"coronavirus-tweet-id-2020-02-21-09.jsonl",
"coronavirus-tweet-id-2020-02-21-10.jsonl",
"coronavirus-tweet-id-2020-02-21-11.jsonl",
"coronavirus-tweet-id-2020-02-21-12.jsonl",
"coronavirus-tweet-id-2020-02-21-13.jsonl",
"coronavirus-tweet-id-2020-02-21-14.jsonl",
"coronavirus-tweet-id-2020-02-21-15.jsonl",
"coronavirus-tweet-id-2020-02-21-16.jsonl",
"coronavirus-tweet-id-2020-02-21-17.jsonl",
"coronavirus-tweet-id-2020-02-21-18.jsonl",
"coronavirus-tweet-id-2020-02-21-19.jsonl",
"coronavirus-tweet-id-2020-02-21-20.jsonl",
"coronavirus-tweet-id-2020-02-21-21.jsonl",
"coronavirus-tweet-id-2020-02-21-22.jsonl",
"coronavirus-tweet-id-2020-02-21-23.jsonl",
"coronavirus-tweet-id-2020-02-22-00.jsonl",
"coronavirus-tweet-id-2020-02-22-01.jsonl",
"coronavirus-tweet-id-2020-02-22-02.jsonl",
"coronavirus-tweet-id-2020-02-22-03.jsonl",
"coronavirus-tweet-id-2020-02-22-04.jsonl",
"coronavirus-tweet-id-2020-02-22-05.jsonl",
"coronavirus-tweet-id-2020-02-22-06.jsonl",
"coronavirus-tweet-id-2020-02-22-07.jsonl",
"coronavirus-tweet-id-2020-02-22-08.jsonl",
"coronavirus-tweet-id-2020-02-22-09.jsonl",
"coronavirus-tweet-id-2020-02-22-10.jsonl",
"coronavirus-tweet-id-2020-02-22-11.jsonl",
"coronavirus-tweet-id-2020-02-22-12.jsonl",
"coronavirus-tweet-id-2020-02-22-13.jsonl",
"coronavirus-tweet-id-2020-02-22-14.jsonl",
"coronavirus-tweet-id-2020-02-22-15.jsonl",
"coronavirus-tweet-id-2020-02-22-16.jsonl",
"coronavirus-tweet-id-2020-02-22-17.jsonl",
"coronavirus-tweet-id-2020-02-22-18.jsonl",
"coronavirus-tweet-id-2020-02-22-19.jsonl",
"coronavirus-tweet-id-2020-02-22-20.jsonl"]
df_covid = pd.read_json('./sample_tweets/02_2020/coronavirus-tweet-id-2020-02-16-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/02_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('feb_masks_3.csv',index=False)

finished reading:  coronavirus-tweet-id-2020-02-16-01.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-02.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-03.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-04.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-05.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-06.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-07.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-08.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-09.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-10.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-11.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-12.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-13.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-14.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-15.jsonl
finished reading:  coronavirus-tweet-id-2020-02-16-16.jsonl
finished reading:  coronavirus-tweet-id-

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(611157, 34) (5003, 5)


In [0]:
# FEBRUARY MASKS EXPORT CSV - PART 4

files = [
"coronavirus-tweet-id-2020-02-24-05.jsonl",
"coronavirus-tweet-id-2020-02-24-06.jsonl",
"coronavirus-tweet-id-2020-02-24-07.jsonl",
"coronavirus-tweet-id-2020-02-24-08.jsonl",
"coronavirus-tweet-id-2020-02-24-09.jsonl",
"coronavirus-tweet-id-2020-02-24-10.jsonl",
"coronavirus-tweet-id-2020-02-24-11.jsonl",
"coronavirus-tweet-id-2020-02-24-12.jsonl",
"coronavirus-tweet-id-2020-02-24-13.jsonl",
"coronavirus-tweet-id-2020-02-24-14.jsonl",
"coronavirus-tweet-id-2020-02-24-15.jsonl",
"coronavirus-tweet-id-2020-02-24-16.jsonl",
"coronavirus-tweet-id-2020-02-24-17.jsonl",
"coronavirus-tweet-id-2020-02-24-18.jsonl",
"coronavirus-tweet-id-2020-02-24-19.jsonl",
"coronavirus-tweet-id-2020-02-24-20.jsonl",
"coronavirus-tweet-id-2020-02-24-21.jsonl",
"coronavirus-tweet-id-2020-02-24-22.jsonl",
"coronavirus-tweet-id-2020-02-24-23.jsonl",
"coronavirus-tweet-id-2020-02-25-03.jsonl",
"coronavirus-tweet-id-2020-02-25-04.jsonl",
"coronavirus-tweet-id-2020-02-25-05.jsonl",
"coronavirus-tweet-id-2020-02-25-06.jsonl",
"coronavirus-tweet-id-2020-02-25-07.jsonl",
"coronavirus-tweet-id-2020-02-25-08.jsonl",
"coronavirus-tweet-id-2020-02-25-09.jsonl",
"coronavirus-tweet-id-2020-02-25-10.jsonl",
"coronavirus-tweet-id-2020-02-25-11.jsonl",
"coronavirus-tweet-id-2020-02-25-12.jsonl",
"coronavirus-tweet-id-2020-02-25-13.jsonl",
"coronavirus-tweet-id-2020-02-25-14.jsonl",
"coronavirus-tweet-id-2020-02-25-15.jsonl",
"coronavirus-tweet-id-2020-02-25-16.jsonl",
"coronavirus-tweet-id-2020-02-25-17.jsonl",
"coronavirus-tweet-id-2020-02-25-18.jsonl",
"coronavirus-tweet-id-2020-02-25-19.jsonl",
"coronavirus-tweet-id-2020-02-25-20.jsonl",
"coronavirus-tweet-id-2020-02-25-21.jsonl",
"coronavirus-tweet-id-2020-02-25-22.jsonl",
"coronavirus-tweet-id-2020-02-25-23.jsonl",
"coronavirus-tweet-id-2020-02-26-00.jsonl",
"coronavirus-tweet-id-2020-02-26-01.jsonl",
"coronavirus-tweet-id-2020-02-26-02.jsonl",
"coronavirus-tweet-id-2020-02-26-03.jsonl",
"coronavirus-tweet-id-2020-02-26-04.jsonl",
"coronavirus-tweet-id-2020-02-26-05.jsonl",
"coronavirus-tweet-id-2020-02-26-06.jsonl",
"coronavirus-tweet-id-2020-02-26-07.jsonl",
"coronavirus-tweet-id-2020-02-26-08.jsonl",
"coronavirus-tweet-id-2020-02-26-09.jsonl",
"coronavirus-tweet-id-2020-02-26-10.jsonl",
"coronavirus-tweet-id-2020-02-26-11.jsonl",
"coronavirus-tweet-id-2020-02-26-12.jsonl",
"coronavirus-tweet-id-2020-02-26-13.jsonl",
"coronavirus-tweet-id-2020-02-26-14.jsonl",
"coronavirus-tweet-id-2020-02-26-15.jsonl",
"coronavirus-tweet-id-2020-02-26-16.jsonl",
"coronavirus-tweet-id-2020-02-26-17.jsonl",
"coronavirus-tweet-id-2020-02-26-18.jsonl",
"coronavirus-tweet-id-2020-02-26-19.jsonl",
"coronavirus-tweet-id-2020-02-26-20.jsonl",
"coronavirus-tweet-id-2020-02-26-21.jsonl",
"coronavirus-tweet-id-2020-02-26-22.jsonl",
"coronavirus-tweet-id-2020-02-26-23.jsonl",
]
df_covid = pd.read_json('./sample_tweets/02_2020/coronavirus-tweet-id-2020-02-24-04.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/02_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-02-24-05.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-06.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-07.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-08.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-09.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-10.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-11.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-12.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-13.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-14.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-15.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-16.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-17.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-18.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-19.jsonl
finished reading:  coronavirus-tweet-id-2020-02-24-20.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('feb_masks_4.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(265232, 32) (1709, 5)


In [0]:
# FEBRUARY MASKS EXPORT CSV - PART 5

files = [
"coronavirus-tweet-id-2020-02-27-01.jsonl",
"coronavirus-tweet-id-2020-02-27-02.jsonl",
"coronavirus-tweet-id-2020-02-27-03.jsonl",
"coronavirus-tweet-id-2020-02-27-04.jsonl",
"coronavirus-tweet-id-2020-02-27-05.jsonl",
"coronavirus-tweet-id-2020-02-27-06.jsonl",
"coronavirus-tweet-id-2020-02-27-07.jsonl",
"coronavirus-tweet-id-2020-02-27-08.jsonl",
"coronavirus-tweet-id-2020-02-27-09.jsonl",
"coronavirus-tweet-id-2020-02-27-10.jsonl",
"coronavirus-tweet-id-2020-02-27-11.jsonl",
"coronavirus-tweet-id-2020-02-27-12.jsonl",
"coronavirus-tweet-id-2020-02-27-13.jsonl",
"coronavirus-tweet-id-2020-02-27-14.jsonl",
"coronavirus-tweet-id-2020-02-27-15.jsonl",
"coronavirus-tweet-id-2020-02-27-16.jsonl",
"coronavirus-tweet-id-2020-02-27-17.jsonl",
"coronavirus-tweet-id-2020-02-27-18.jsonl",
"coronavirus-tweet-id-2020-02-27-19.jsonl",
"coronavirus-tweet-id-2020-02-27-20.jsonl",
"coronavirus-tweet-id-2020-02-27-21.jsonl",
"coronavirus-tweet-id-2020-02-27-22.jsonl",
"coronavirus-tweet-id-2020-02-27-23.jsonl",
"coronavirus-tweet-id-2020-02-28-00.jsonl",
"coronavirus-tweet-id-2020-02-28-01.jsonl",
"coronavirus-tweet-id-2020-02-28-02.jsonl",
"coronavirus-tweet-id-2020-02-28-03.jsonl",
"coronavirus-tweet-id-2020-02-28-04.jsonl",
"coronavirus-tweet-id-2020-02-28-05.jsonl",
"coronavirus-tweet-id-2020-02-28-06.jsonl",
"coronavirus-tweet-id-2020-02-28-07.jsonl",
"coronavirus-tweet-id-2020-02-28-08.jsonl",
"coronavirus-tweet-id-2020-02-28-09.jsonl",
"coronavirus-tweet-id-2020-02-28-10.jsonl",
"coronavirus-tweet-id-2020-02-28-11.jsonl",
"coronavirus-tweet-id-2020-02-28-12.jsonl",
"coronavirus-tweet-id-2020-02-28-13.jsonl",
"coronavirus-tweet-id-2020-02-28-14.jsonl",
"coronavirus-tweet-id-2020-02-28-15.jsonl",
"coronavirus-tweet-id-2020-02-28-16.jsonl",
"coronavirus-tweet-id-2020-02-28-17.jsonl",
"coronavirus-tweet-id-2020-02-28-18.jsonl",
"coronavirus-tweet-id-2020-02-28-19.jsonl",
"coronavirus-tweet-id-2020-02-28-20.jsonl",
"coronavirus-tweet-id-2020-02-28-21.jsonl",
"coronavirus-tweet-id-2020-02-28-22.jsonl",
"coronavirus-tweet-id-2020-02-28-23.jsonl",
"coronavirus-tweet-id-2020-02-29-00.jsonl",
"coronavirus-tweet-id-2020-02-29-01.jsonl",
"coronavirus-tweet-id-2020-02-29-02.jsonl",
"coronavirus-tweet-id-2020-02-29-03.jsonl",
"coronavirus-tweet-id-2020-02-29-04.jsonl",
"coronavirus-tweet-id-2020-02-29-05.jsonl",
"coronavirus-tweet-id-2020-02-29-06.jsonl",
"coronavirus-tweet-id-2020-02-29-07.jsonl",
"coronavirus-tweet-id-2020-02-29-08.jsonl",
"coronavirus-tweet-id-2020-02-29-09.jsonl",
"coronavirus-tweet-id-2020-02-29-10.jsonl",
"coronavirus-tweet-id-2020-02-29-11.jsonl",
"coronavirus-tweet-id-2020-02-29-12.jsonl",
"coronavirus-tweet-id-2020-02-29-13.jsonl",
"coronavirus-tweet-id-2020-02-29-14.jsonl",
"coronavirus-tweet-id-2020-02-29-15.jsonl",
"coronavirus-tweet-id-2020-02-29-16.jsonl",
"coronavirus-tweet-id-2020-02-29-17.jsonl",
"coronavirus-tweet-id-2020-02-29-18.jsonl",
"coronavirus-tweet-id-2020-02-29-19.jsonl",
"coronavirus-tweet-id-2020-02-29-20.jsonl",
"coronavirus-tweet-id-2020-02-29-21.jsonl",
"coronavirus-tweet-id-2020-02-29-22.jsonl",
"coronavirus-tweet-id-2020-02-29-23.jsonl"]
df_covid = pd.read_json('./sample_tweets/02_2020/coronavirus-tweet-id-2020-02-27-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/02_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_covid.shape, df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-02-27-01.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-02.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-03.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-04.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-05.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-06.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-07.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-08.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-09.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-10.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-11.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-12.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-13.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-14.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-15.jsonl
finished reading:  coronavirus-tweet-id-2020-02-27-16.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('feb_masks_5.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(635981, 34) (5022, 5)


In [0]:
# MARCH MASKS EXPORT CSV - PART 1

files = [
"coronavirus-tweet-id-2020-03-01-01.jsonl",
"coronavirus-tweet-id-2020-03-01-02.jsonl",
"coronavirus-tweet-id-2020-03-01-03.jsonl",
"coronavirus-tweet-id-2020-03-01-04.jsonl",
"coronavirus-tweet-id-2020-03-01-05.jsonl",
"coronavirus-tweet-id-2020-03-01-06.jsonl",
"coronavirus-tweet-id-2020-03-01-07.jsonl",
"coronavirus-tweet-id-2020-03-01-08.jsonl",
"coronavirus-tweet-id-2020-03-01-09.jsonl",
"coronavirus-tweet-id-2020-03-01-10.jsonl",
"coronavirus-tweet-id-2020-03-01-11.jsonl",
"coronavirus-tweet-id-2020-03-01-12.jsonl",
"coronavirus-tweet-id-2020-03-01-13.jsonl",
"coronavirus-tweet-id-2020-03-01-14.jsonl",
"coronavirus-tweet-id-2020-03-01-15.jsonl",
"coronavirus-tweet-id-2020-03-01-16.jsonl",
"coronavirus-tweet-id-2020-03-01-17.jsonl",
"coronavirus-tweet-id-2020-03-01-18.jsonl",
"coronavirus-tweet-id-2020-03-01-19.jsonl",
"coronavirus-tweet-id-2020-03-01-20.jsonl",
"coronavirus-tweet-id-2020-03-01-21.jsonl",
"coronavirus-tweet-id-2020-03-01-22.jsonl",
"coronavirus-tweet-id-2020-03-01-23.jsonl",
"coronavirus-tweet-id-2020-03-02-00.jsonl",
"coronavirus-tweet-id-2020-03-02-01.jsonl",
"coronavirus-tweet-id-2020-03-02-02.jsonl",
"coronavirus-tweet-id-2020-03-02-03.jsonl",
"coronavirus-tweet-id-2020-03-02-04.jsonl",
"coronavirus-tweet-id-2020-03-02-05.jsonl",
"coronavirus-tweet-id-2020-03-02-06.jsonl",
"coronavirus-tweet-id-2020-03-02-07.jsonl",
"coronavirus-tweet-id-2020-03-02-08.jsonl",
"coronavirus-tweet-id-2020-03-02-09.jsonl",
"coronavirus-tweet-id-2020-03-02-10.jsonl",
"coronavirus-tweet-id-2020-03-02-11.jsonl",
"coronavirus-tweet-id-2020-03-02-12.jsonl",
"coronavirus-tweet-id-2020-03-02-13.jsonl",
"coronavirus-tweet-id-2020-03-02-14.jsonl",
"coronavirus-tweet-id-2020-03-02-15.jsonl",
"coronavirus-tweet-id-2020-03-02-16.jsonl",
"coronavirus-tweet-id-2020-03-02-17.jsonl",
"coronavirus-tweet-id-2020-03-02-18.jsonl",
"coronavirus-tweet-id-2020-03-02-19.jsonl",
"coronavirus-tweet-id-2020-03-02-20.jsonl",
"coronavirus-tweet-id-2020-03-02-21.jsonl",
"coronavirus-tweet-id-2020-03-02-22.jsonl",
"coronavirus-tweet-id-2020-03-02-23.jsonl",
"coronavirus-tweet-id-2020-03-03-00.jsonl",
"coronavirus-tweet-id-2020-03-03-01.jsonl",
"coronavirus-tweet-id-2020-03-03-02.jsonl",
"coronavirus-tweet-id-2020-03-03-03.jsonl",
"coronavirus-tweet-id-2020-03-03-04.jsonl",
"coronavirus-tweet-id-2020-03-03-05.jsonl",
"coronavirus-tweet-id-2020-03-03-06.jsonl",
"coronavirus-tweet-id-2020-03-03-07.jsonl",
"coronavirus-tweet-id-2020-03-03-08.jsonl",
"coronavirus-tweet-id-2020-03-03-09.jsonl",
"coronavirus-tweet-id-2020-03-03-10.jsonl",
"coronavirus-tweet-id-2020-03-03-11.jsonl",
"coronavirus-tweet-id-2020-03-03-12.jsonl",
"coronavirus-tweet-id-2020-03-03-13.jsonl",
"coronavirus-tweet-id-2020-03-03-14.jsonl",
"coronavirus-tweet-id-2020-03-03-15.jsonl",
"coronavirus-tweet-id-2020-03-03-16.jsonl",
"coronavirus-tweet-id-2020-03-03-17.jsonl",
"coronavirus-tweet-id-2020-03-03-18.jsonl",
"coronavirus-tweet-id-2020-03-03-19.jsonl",
"coronavirus-tweet-id-2020-03-03-20.jsonl",
"coronavirus-tweet-id-2020-03-03-21.jsonl",
"coronavirus-tweet-id-2020-03-03-22.jsonl",
"coronavirus-tweet-id-2020-03-03-23.jsonl",
"coronavirus-tweet-id-2020-03-04-00.jsonl",
"coronavirus-tweet-id-2020-03-04-01.jsonl",
"coronavirus-tweet-id-2020-03-04-02.jsonl",
"coronavirus-tweet-id-2020-03-04-03.jsonl",
"coronavirus-tweet-id-2020-03-04-04.jsonl",
"coronavirus-tweet-id-2020-03-04-05.jsonl",
"coronavirus-tweet-id-2020-03-04-06.jsonl",
"coronavirus-tweet-id-2020-03-04-07.jsonl",
"coronavirus-tweet-id-2020-03-04-08.jsonl",
"coronavirus-tweet-id-2020-03-04-09.jsonl"
]
df_covid = pd.read_json('./sample_tweets/03_2020/coronavirus-tweet-id-2020-03-01-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/03_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_covid.shape, df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-03-01-01.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-02.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-03.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-04.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-05.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-06.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-07.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-08.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-09.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-10.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-11.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-12.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-13.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-14.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-15.jsonl
finished reading:  coronavirus-tweet-id-2020-03-01-16.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head()
df_masks.to_csv('march_masks_1.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(764219, 34) (11787, 5)


In [0]:
# MARCH MASKS EXPORT CSV - PART 2

files = [
"coronavirus-tweet-id-2020-03-04-11.jsonl",
"coronavirus-tweet-id-2020-03-04-12.jsonl",
"coronavirus-tweet-id-2020-03-04-13.jsonl",
"coronavirus-tweet-id-2020-03-04-14.jsonl",
"coronavirus-tweet-id-2020-03-04-15.jsonl",
"coronavirus-tweet-id-2020-03-04-16.jsonl",
"coronavirus-tweet-id-2020-03-04-17.jsonl",
"coronavirus-tweet-id-2020-03-04-18.jsonl",
"coronavirus-tweet-id-2020-03-04-19.jsonl",
"coronavirus-tweet-id-2020-03-04-20.jsonl",
"coronavirus-tweet-id-2020-03-04-21.jsonl",
"coronavirus-tweet-id-2020-03-04-22.jsonl",
"coronavirus-tweet-id-2020-03-04-23.jsonl",
"coronavirus-tweet-id-2020-03-05-00.jsonl",
"coronavirus-tweet-id-2020-03-05-01.jsonl",
"coronavirus-tweet-id-2020-03-05-02.jsonl",
"coronavirus-tweet-id-2020-03-05-03.jsonl",
"coronavirus-tweet-id-2020-03-05-04.jsonl",
"coronavirus-tweet-id-2020-03-05-05.jsonl",
"coronavirus-tweet-id-2020-03-05-06.jsonl",
"coronavirus-tweet-id-2020-03-05-07.jsonl",
"coronavirus-tweet-id-2020-03-05-08.jsonl",
"coronavirus-tweet-id-2020-03-05-09.jsonl",
"coronavirus-tweet-id-2020-03-05-10.jsonl",
"coronavirus-tweet-id-2020-03-05-11.jsonl",
"coronavirus-tweet-id-2020-03-05-12.jsonl",
"coronavirus-tweet-id-2020-03-05-13.jsonl",
"coronavirus-tweet-id-2020-03-05-14.jsonl",
"coronavirus-tweet-id-2020-03-05-15.jsonl",
"coronavirus-tweet-id-2020-03-05-16.jsonl",
"coronavirus-tweet-id-2020-03-05-17.jsonl",
"coronavirus-tweet-id-2020-03-05-18.jsonl",
"coronavirus-tweet-id-2020-03-05-19.jsonl",
"coronavirus-tweet-id-2020-03-05-20.jsonl",
"coronavirus-tweet-id-2020-03-05-21.jsonl",
"coronavirus-tweet-id-2020-03-05-22.jsonl",
"coronavirus-tweet-id-2020-03-05-23.jsonl",
"coronavirus-tweet-id-2020-03-06-00.jsonl",
"coronavirus-tweet-id-2020-03-06-01.jsonl",
"coronavirus-tweet-id-2020-03-06-02.jsonl",
"coronavirus-tweet-id-2020-03-06-03.jsonl",
"coronavirus-tweet-id-2020-03-06-04.jsonl",
"coronavirus-tweet-id-2020-03-06-05.jsonl",
"coronavirus-tweet-id-2020-03-06-06.jsonl",
"coronavirus-tweet-id-2020-03-06-07.jsonl",
"coronavirus-tweet-id-2020-03-06-08.jsonl",
"coronavirus-tweet-id-2020-03-06-09.jsonl",
"coronavirus-tweet-id-2020-03-06-10.jsonl",
"coronavirus-tweet-id-2020-03-06-11.jsonl",
"coronavirus-tweet-id-2020-03-06-12.jsonl",
"coronavirus-tweet-id-2020-03-06-13.jsonl",
"coronavirus-tweet-id-2020-03-06-14.jsonl",
"coronavirus-tweet-id-2020-03-06-15.jsonl",
"coronavirus-tweet-id-2020-03-06-16.jsonl",
"coronavirus-tweet-id-2020-03-06-17.jsonl",
"coronavirus-tweet-id-2020-03-06-18.jsonl",
"coronavirus-tweet-id-2020-03-06-19.jsonl",
"coronavirus-tweet-id-2020-03-06-20.jsonl",
"coronavirus-tweet-id-2020-03-06-21.jsonl",
"coronavirus-tweet-id-2020-03-06-22.jsonl",
"coronavirus-tweet-id-2020-03-06-23.jsonl",
"coronavirus-tweet-id-2020-03-07-00.jsonl",
"coronavirus-tweet-id-2020-03-07-01.jsonl",
"coronavirus-tweet-id-2020-03-07-02.jsonl",
"coronavirus-tweet-id-2020-03-07-03.jsonl",
"coronavirus-tweet-id-2020-03-07-04.jsonl",
"coronavirus-tweet-id-2020-03-07-05.jsonl",
"coronavirus-tweet-id-2020-03-07-06.jsonl",
"coronavirus-tweet-id-2020-03-07-07.jsonl",
"coronavirus-tweet-id-2020-03-07-08.jsonl",
"coronavirus-tweet-id-2020-03-07-09.jsonl",
"coronavirus-tweet-id-2020-03-07-10.jsonl",
"coronavirus-tweet-id-2020-03-07-11.jsonl",
"coronavirus-tweet-id-2020-03-07-12.jsonl",
"coronavirus-tweet-id-2020-03-07-13.jsonl",
"coronavirus-tweet-id-2020-03-07-14.jsonl",
"coronavirus-tweet-id-2020-03-07-15.jsonl",
"coronavirus-tweet-id-2020-03-07-16.jsonl",
"coronavirus-tweet-id-2020-03-07-17.jsonl",
"coronavirus-tweet-id-2020-03-07-18.jsonl",
"coronavirus-tweet-id-2020-03-07-19.jsonl",
"coronavirus-tweet-id-2020-03-07-20.jsonl",
"coronavirus-tweet-id-2020-03-07-21.jsonl",
"coronavirus-tweet-id-2020-03-07-22.jsonl",
"coronavirus-tweet-id-2020-03-07-23.jsonl",
"coronavirus-tweet-id-2020-03-08-00.jsonl",
"coronavirus-tweet-id-2020-03-08-01.jsonl",
"coronavirus-tweet-id-2020-03-08-02.jsonl",
"coronavirus-tweet-id-2020-03-08-03.jsonl",
"coronavirus-tweet-id-2020-03-08-04.jsonl",
"coronavirus-tweet-id-2020-03-08-05.jsonl",
"coronavirus-tweet-id-2020-03-08-06.jsonl",
"coronavirus-tweet-id-2020-03-08-07.jsonl",
"coronavirus-tweet-id-2020-03-08-08.jsonl",
"coronavirus-tweet-id-2020-03-08-09.jsonl",
"coronavirus-tweet-id-2020-03-08-10.jsonl",
"coronavirus-tweet-id-2020-03-08-11.jsonl",
"coronavirus-tweet-id-2020-03-08-12.jsonl",
"coronavirus-tweet-id-2020-03-08-13.jsonl",
"coronavirus-tweet-id-2020-03-08-14.jsonl",
"coronavirus-tweet-id-2020-03-08-15.jsonl",
"coronavirus-tweet-id-2020-03-08-16.jsonl",
"coronavirus-tweet-id-2020-03-08-17.jsonl",
"coronavirus-tweet-id-2020-03-08-18.jsonl",
"coronavirus-tweet-id-2020-03-08-19.jsonl",
"coronavirus-tweet-id-2020-03-08-20.jsonl",
"coronavirus-tweet-id-2020-03-08-21.jsonl",
"coronavirus-tweet-id-2020-03-08-22.jsonl",
"coronavirus-tweet-id-2020-03-08-23.jsonl",
]
df_covid = pd.read_json('./sample_tweets/03_2020/coronavirus-tweet-id-2020-03-04-10.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/03_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_covid.shape, df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-03-04-11.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-12.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-13.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-14.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-15.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-16.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-17.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-18.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-19.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-20.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-21.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-22.jsonl
finished reading:  coronavirus-tweet-id-2020-03-04-23.jsonl
finished reading:  coronavirus-tweet-id-2020-03-05-00.jsonl
finished reading:  coronavirus-tweet-id-2020-03-05-01.jsonl
finished reading:  coronavirus-tweet-id-2020-03-05-02.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('march_masks_2.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(604569, 34) (8227, 5)


In [0]:
# MARCH MASKS EXPORT CSV - PART 3

files = [
"coronavirus-tweet-id-2020-03-09-01.jsonl",
"coronavirus-tweet-id-2020-03-09-02.jsonl",
"coronavirus-tweet-id-2020-03-09-03.jsonl",
"coronavirus-tweet-id-2020-03-09-04.jsonl",
"coronavirus-tweet-id-2020-03-09-05.jsonl",
"coronavirus-tweet-id-2020-03-09-06.jsonl",
"coronavirus-tweet-id-2020-03-09-07.jsonl",
"coronavirus-tweet-id-2020-03-09-08.jsonl",
"coronavirus-tweet-id-2020-03-09-09.jsonl",
"coronavirus-tweet-id-2020-03-09-10.jsonl",
"coronavirus-tweet-id-2020-03-09-11.jsonl",
"coronavirus-tweet-id-2020-03-09-12.jsonl",
"coronavirus-tweet-id-2020-03-09-13.jsonl",
"coronavirus-tweet-id-2020-03-09-14.jsonl",
"coronavirus-tweet-id-2020-03-09-15.jsonl",
"coronavirus-tweet-id-2020-03-09-16.jsonl",
"coronavirus-tweet-id-2020-03-09-17.jsonl",
"coronavirus-tweet-id-2020-03-09-18.jsonl",
"coronavirus-tweet-id-2020-03-09-19.jsonl",
"coronavirus-tweet-id-2020-03-09-20.jsonl",
"coronavirus-tweet-id-2020-03-09-21.jsonl",
"coronavirus-tweet-id-2020-03-09-22.jsonl",
"coronavirus-tweet-id-2020-03-09-23.jsonl",
"coronavirus-tweet-id-2020-03-10-00.jsonl",
"coronavirus-tweet-id-2020-03-10-01.jsonl",
"coronavirus-tweet-id-2020-03-10-02.jsonl",
"coronavirus-tweet-id-2020-03-10-03.jsonl",
"coronavirus-tweet-id-2020-03-10-04.jsonl",
"coronavirus-tweet-id-2020-03-10-05.jsonl",
"coronavirus-tweet-id-2020-03-10-06.jsonl",
"coronavirus-tweet-id-2020-03-10-07.jsonl",
"coronavirus-tweet-id-2020-03-10-08.jsonl",
"coronavirus-tweet-id-2020-03-10-09.jsonl",
"coronavirus-tweet-id-2020-03-10-10.jsonl",
"coronavirus-tweet-id-2020-03-10-11.jsonl",
"coronavirus-tweet-id-2020-03-10-12.jsonl",
"coronavirus-tweet-id-2020-03-10-13.jsonl",
"coronavirus-tweet-id-2020-03-10-14.jsonl",
"coronavirus-tweet-id-2020-03-10-15.jsonl",
"coronavirus-tweet-id-2020-03-10-16.jsonl",
"coronavirus-tweet-id-2020-03-10-17.jsonl",
"coronavirus-tweet-id-2020-03-10-18.jsonl",
"coronavirus-tweet-id-2020-03-10-19.jsonl",
"coronavirus-tweet-id-2020-03-10-20.jsonl",
"coronavirus-tweet-id-2020-03-10-21.jsonl",
"coronavirus-tweet-id-2020-03-10-22.jsonl",
"coronavirus-tweet-id-2020-03-10-23.jsonl",
"coronavirus-tweet-id-2020-03-11-00.jsonl",
"coronavirus-tweet-id-2020-03-11-01.jsonl",
"coronavirus-tweet-id-2020-03-11-02.jsonl",
"coronavirus-tweet-id-2020-03-11-03.jsonl",
"coronavirus-tweet-id-2020-03-11-04.jsonl",
"coronavirus-tweet-id-2020-03-11-05.jsonl",
"coronavirus-tweet-id-2020-03-11-06.jsonl",
"coronavirus-tweet-id-2020-03-11-07.jsonl",
"coronavirus-tweet-id-2020-03-11-08.jsonl",
"coronavirus-tweet-id-2020-03-11-09.jsonl",
"coronavirus-tweet-id-2020-03-11-10.jsonl",
"coronavirus-tweet-id-2020-03-11-11.jsonl",
"coronavirus-tweet-id-2020-03-11-12.jsonl",
"coronavirus-tweet-id-2020-03-11-13.jsonl",
"coronavirus-tweet-id-2020-03-11-14.jsonl",
"coronavirus-tweet-id-2020-03-11-15.jsonl",
"coronavirus-tweet-id-2020-03-11-16.jsonl",
"coronavirus-tweet-id-2020-03-11-17.jsonl",
"coronavirus-tweet-id-2020-03-11-18.jsonl",
"coronavirus-tweet-id-2020-03-11-19.jsonl",
"coronavirus-tweet-id-2020-03-11-20.jsonl",
"coronavirus-tweet-id-2020-03-11-21.jsonl",
"coronavirus-tweet-id-2020-03-11-22.jsonl",
"coronavirus-tweet-id-2020-03-11-23.jsonl",
"coronavirus-tweet-id-2020-03-12-00.jsonl",
"coronavirus-tweet-id-2020-03-12-01.jsonl",
"coronavirus-tweet-id-2020-03-12-02.jsonl",
"coronavirus-tweet-id-2020-03-12-03.jsonl",
"coronavirus-tweet-id-2020-03-12-04.jsonl",
"coronavirus-tweet-id-2020-03-12-05.jsonl",
"coronavirus-tweet-id-2020-03-12-06.jsonl",
"coronavirus-tweet-id-2020-03-12-07.jsonl",
"coronavirus-tweet-id-2020-03-12-08.jsonl",
"coronavirus-tweet-id-2020-03-12-09.jsonl",
"coronavirus-tweet-id-2020-03-12-10.jsonl",
"coronavirus-tweet-id-2020-03-12-11.jsonl",
"coronavirus-tweet-id-2020-03-12-12.jsonl",
"coronavirus-tweet-id-2020-03-12-13.jsonl",
"coronavirus-tweet-id-2020-03-12-14.jsonl",
"coronavirus-tweet-id-2020-03-12-15.jsonl",
"coronavirus-tweet-id-2020-03-12-16.jsonl",
"coronavirus-tweet-id-2020-03-12-17.jsonl",
"coronavirus-tweet-id-2020-03-12-18.jsonl",
"coronavirus-tweet-id-2020-03-12-19.jsonl",
"coronavirus-tweet-id-2020-03-12-20.jsonl",
"coronavirus-tweet-id-2020-03-12-21.jsonl",
"coronavirus-tweet-id-2020-03-12-22.jsonl",
"coronavirus-tweet-id-2020-03-12-23.jsonl",
]
df_covid = pd.read_json('./sample_tweets/03_2020/coronavirus-tweet-id-2020-03-09-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/03_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_covid.shape, df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-03-09-01.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-02.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-03.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-04.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-05.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-06.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-07.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-08.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-09.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-10.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-11.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-12.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-13.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-14.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-15.jsonl
finished reading:  coronavirus-tweet-id-2020-03-09-16.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('march_masks_3.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(395315, 34) (1550, 5)


In [0]:
# MARCH MASKS EXPORT CSV - PART 4
files = [
"coronavirus-tweet-id-2020-03-13-01.jsonl",
"coronavirus-tweet-id-2020-03-13-02.jsonl",
"coronavirus-tweet-id-2020-03-13-03.jsonl",
"coronavirus-tweet-id-2020-03-13-04.jsonl",
"coronavirus-tweet-id-2020-03-13-05.jsonl",
"coronavirus-tweet-id-2020-03-13-06.jsonl",
"coronavirus-tweet-id-2020-03-13-07.jsonl",
"coronavirus-tweet-id-2020-03-13-08.jsonl",
"coronavirus-tweet-id-2020-03-13-09.jsonl",
"coronavirus-tweet-id-2020-03-13-10.jsonl",
"coronavirus-tweet-id-2020-03-13-11.jsonl",
"coronavirus-tweet-id-2020-03-13-12.jsonl",
"coronavirus-tweet-id-2020-03-13-13.jsonl",
"coronavirus-tweet-id-2020-03-13-14.jsonl",
"coronavirus-tweet-id-2020-03-13-15.jsonl",
"coronavirus-tweet-id-2020-03-13-16.jsonl",
"coronavirus-tweet-id-2020-03-13-17.jsonl",
"coronavirus-tweet-id-2020-03-13-18.jsonl",
"coronavirus-tweet-id-2020-03-13-19.jsonl",
"coronavirus-tweet-id-2020-03-13-20.jsonl",
"coronavirus-tweet-id-2020-03-13-21.jsonl",
"coronavirus-tweet-id-2020-03-13-22.jsonl",
"coronavirus-tweet-id-2020-03-13-23.jsonl",
"coronavirus-tweet-id-2020-03-14-00.jsonl",
"coronavirus-tweet-id-2020-03-14-01.jsonl",
"coronavirus-tweet-id-2020-03-14-02.jsonl",
"coronavirus-tweet-id-2020-03-14-03.jsonl",
"coronavirus-tweet-id-2020-03-14-04.jsonl",
"coronavirus-tweet-id-2020-03-14-05.jsonl",
"coronavirus-tweet-id-2020-03-14-06.jsonl",
"coronavirus-tweet-id-2020-03-14-07.jsonl",
"coronavirus-tweet-id-2020-03-14-08.jsonl",
"coronavirus-tweet-id-2020-03-14-09.jsonl",
"coronavirus-tweet-id-2020-03-14-10.jsonl",
"coronavirus-tweet-id-2020-03-14-11.jsonl",
"coronavirus-tweet-id-2020-03-14-12.jsonl",
"coronavirus-tweet-id-2020-03-14-13.jsonl",
"coronavirus-tweet-id-2020-03-14-14.jsonl",
"coronavirus-tweet-id-2020-03-14-15.jsonl",
"coronavirus-tweet-id-2020-03-14-16.jsonl",
"coronavirus-tweet-id-2020-03-14-17.jsonl",
"coronavirus-tweet-id-2020-03-14-18.jsonl",
"coronavirus-tweet-id-2020-03-14-19.jsonl",
"coronavirus-tweet-id-2020-03-14-20.jsonl",
"coronavirus-tweet-id-2020-03-14-21.jsonl",
"coronavirus-tweet-id-2020-03-14-22.jsonl",
"coronavirus-tweet-id-2020-03-14-23.jsonl",
"coronavirus-tweet-id-2020-03-15-00.jsonl",
"coronavirus-tweet-id-2020-03-15-01.jsonl",
"coronavirus-tweet-id-2020-03-15-02.jsonl",
"coronavirus-tweet-id-2020-03-15-03.jsonl",
"coronavirus-tweet-id-2020-03-15-04.jsonl",
"coronavirus-tweet-id-2020-03-15-05.jsonl",
"coronavirus-tweet-id-2020-03-15-06.jsonl",
"coronavirus-tweet-id-2020-03-15-07.jsonl",
"coronavirus-tweet-id-2020-03-15-08.jsonl",
"coronavirus-tweet-id-2020-03-15-09.jsonl",
"coronavirus-tweet-id-2020-03-15-10.jsonl",
"coronavirus-tweet-id-2020-03-15-11.jsonl",
"coronavirus-tweet-id-2020-03-15-12.jsonl",
"coronavirus-tweet-id-2020-03-15-13.jsonl",
"coronavirus-tweet-id-2020-03-15-14.jsonl",
"coronavirus-tweet-id-2020-03-15-15.jsonl",
"coronavirus-tweet-id-2020-03-15-16.jsonl",
"coronavirus-tweet-id-2020-03-15-17.jsonl",
"coronavirus-tweet-id-2020-03-15-18.jsonl",
"coronavirus-tweet-id-2020-03-15-19.jsonl",
"coronavirus-tweet-id-2020-03-15-20.jsonl",
"coronavirus-tweet-id-2020-03-15-21.jsonl",
"coronavirus-tweet-id-2020-03-15-22.jsonl",
"coronavirus-tweet-id-2020-03-15-23.jsonl",
"coronavirus-tweet-id-2020-03-16-00.jsonl",
"coronavirus-tweet-id-2020-03-16-01.jsonl",
"coronavirus-tweet-id-2020-03-16-02.jsonl",
"coronavirus-tweet-id-2020-03-16-03.jsonl",
"coronavirus-tweet-id-2020-03-16-04.jsonl",
"coronavirus-tweet-id-2020-03-16-05.jsonl",
"coronavirus-tweet-id-2020-03-16-06.jsonl",
"coronavirus-tweet-id-2020-03-16-07.jsonl",
"coronavirus-tweet-id-2020-03-16-08.jsonl",
"coronavirus-tweet-id-2020-03-16-09.jsonl",
"coronavirus-tweet-id-2020-03-16-10.jsonl",
"coronavirus-tweet-id-2020-03-16-11.jsonl",
"coronavirus-tweet-id-2020-03-16-12.jsonl",
"coronavirus-tweet-id-2020-03-16-13.jsonl",
"coronavirus-tweet-id-2020-03-16-14.jsonl",
"coronavirus-tweet-id-2020-03-16-15.jsonl",
"coronavirus-tweet-id-2020-03-16-16.jsonl",
"coronavirus-tweet-id-2020-03-16-17.jsonl",
"coronavirus-tweet-id-2020-03-16-18.jsonl",
"coronavirus-tweet-id-2020-03-16-19.jsonl",
"coronavirus-tweet-id-2020-03-16-20.jsonl",
"coronavirus-tweet-id-2020-03-16-21.jsonl",
"coronavirus-tweet-id-2020-03-16-22.jsonl",
"coronavirus-tweet-id-2020-03-16-23.jsonl"
]

df_covid = pd.read_json('./sample_tweets/03_2020/coronavirus-tweet-id-2020-03-13-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/03_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_covid.shape, df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-03-13-01.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-02.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-03.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-04.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-05.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-06.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-07.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-08.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-09.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-10.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-11.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-12.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-13.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-14.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-15.jsonl
finished reading:  coronavirus-tweet-id-2020-03-13-16.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('march_masks_4.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(380663, 34) (1165, 5)


In [0]:
# MARCH MASKS EXPORT CSV - PART 5
files = [
"coronavirus-tweet-id-2020-03-17-01.jsonl",
"coronavirus-tweet-id-2020-03-17-02.jsonl",
"coronavirus-tweet-id-2020-03-17-03.jsonl",
"coronavirus-tweet-id-2020-03-17-04.jsonl",
"coronavirus-tweet-id-2020-03-17-05.jsonl",
"coronavirus-tweet-id-2020-03-17-06.jsonl",
"coronavirus-tweet-id-2020-03-17-07.jsonl",
"coronavirus-tweet-id-2020-03-17-08.jsonl",
"coronavirus-tweet-id-2020-03-17-09.jsonl",
"coronavirus-tweet-id-2020-03-17-10.jsonl",
"coronavirus-tweet-id-2020-03-17-11.jsonl",
"coronavirus-tweet-id-2020-03-17-12.jsonl",
"coronavirus-tweet-id-2020-03-17-13.jsonl",
"coronavirus-tweet-id-2020-03-17-14.jsonl",
"coronavirus-tweet-id-2020-03-17-15.jsonl",
"coronavirus-tweet-id-2020-03-17-16.jsonl",
"coronavirus-tweet-id-2020-03-17-17.jsonl",
"coronavirus-tweet-id-2020-03-17-18.jsonl",
"coronavirus-tweet-id-2020-03-17-19.jsonl",
"coronavirus-tweet-id-2020-03-17-20.jsonl",
"coronavirus-tweet-id-2020-03-17-21.jsonl",
"coronavirus-tweet-id-2020-03-17-22.jsonl",
"coronavirus-tweet-id-2020-03-17-23.jsonl",
"coronavirus-tweet-id-2020-03-18-00.jsonl",
"coronavirus-tweet-id-2020-03-18-01.jsonl",
"coronavirus-tweet-id-2020-03-18-02.jsonl",
"coronavirus-tweet-id-2020-03-18-03.jsonl",
"coronavirus-tweet-id-2020-03-18-04.jsonl",
"coronavirus-tweet-id-2020-03-18-05.jsonl",
"coronavirus-tweet-id-2020-03-18-06.jsonl",
"coronavirus-tweet-id-2020-03-18-07.jsonl",
"coronavirus-tweet-id-2020-03-18-08.jsonl",
"coronavirus-tweet-id-2020-03-18-09.jsonl",
"coronavirus-tweet-id-2020-03-18-10.jsonl",
"coronavirus-tweet-id-2020-03-18-11.jsonl",
"coronavirus-tweet-id-2020-03-18-12.jsonl",
"coronavirus-tweet-id-2020-03-18-13.jsonl",
"coronavirus-tweet-id-2020-03-18-14.jsonl",
"coronavirus-tweet-id-2020-03-18-15.jsonl",
"coronavirus-tweet-id-2020-03-18-16.jsonl",
"coronavirus-tweet-id-2020-03-18-17.jsonl",
"coronavirus-tweet-id-2020-03-18-18.jsonl",
"coronavirus-tweet-id-2020-03-18-19.jsonl",
"coronavirus-tweet-id-2020-03-18-20.jsonl",
"coronavirus-tweet-id-2020-03-18-21.jsonl",
"coronavirus-tweet-id-2020-03-18-22.jsonl",
"coronavirus-tweet-id-2020-03-18-23.jsonl",
"coronavirus-tweet-id-2020-03-19-00.jsonl",
"coronavirus-tweet-id-2020-03-19-01.jsonl",
"coronavirus-tweet-id-2020-03-19-02.jsonl",
"coronavirus-tweet-id-2020-03-19-03.jsonl",
"coronavirus-tweet-id-2020-03-19-04.jsonl",
"coronavirus-tweet-id-2020-03-19-05.jsonl",
"coronavirus-tweet-id-2020-03-19-06.jsonl",
"coronavirus-tweet-id-2020-03-19-07.jsonl",
"coronavirus-tweet-id-2020-03-19-08.jsonl",
"coronavirus-tweet-id-2020-03-19-09.jsonl",
"coronavirus-tweet-id-2020-03-19-10.jsonl",
"coronavirus-tweet-id-2020-03-19-11.jsonl",
"coronavirus-tweet-id-2020-03-19-12.jsonl",
"coronavirus-tweet-id-2020-03-19-13.jsonl",
"coronavirus-tweet-id-2020-03-19-14.jsonl",
"coronavirus-tweet-id-2020-03-19-15.jsonl",
"coronavirus-tweet-id-2020-03-19-16.jsonl",
"coronavirus-tweet-id-2020-03-19-17.jsonl",
"coronavirus-tweet-id-2020-03-19-18.jsonl",
"coronavirus-tweet-id-2020-03-19-19.jsonl",
"coronavirus-tweet-id-2020-03-19-20.jsonl",
"coronavirus-tweet-id-2020-03-19-21.jsonl",
"coronavirus-tweet-id-2020-03-19-22.jsonl",
"coronavirus-tweet-id-2020-03-19-23.jsonl",
"coronavirus-tweet-id-2020-03-20-00.jsonl",
"coronavirus-tweet-id-2020-03-20-01.jsonl",
"coronavirus-tweet-id-2020-03-20-02.jsonl",
"coronavirus-tweet-id-2020-03-20-03.jsonl",
"coronavirus-tweet-id-2020-03-20-04.jsonl",
"coronavirus-tweet-id-2020-03-20-05.jsonl",
"coronavirus-tweet-id-2020-03-20-06.jsonl",
"coronavirus-tweet-id-2020-03-20-07.jsonl",
"coronavirus-tweet-id-2020-03-20-08.jsonl",
"coronavirus-tweet-id-2020-03-20-09.jsonl",
"coronavirus-tweet-id-2020-03-20-10.jsonl",
"coronavirus-tweet-id-2020-03-20-11.jsonl",
"coronavirus-tweet-id-2020-03-20-12.jsonl",
"coronavirus-tweet-id-2020-03-20-13.jsonl",
"coronavirus-tweet-id-2020-03-20-14.jsonl",
"coronavirus-tweet-id-2020-03-20-15.jsonl",
"coronavirus-tweet-id-2020-03-20-16.jsonl",
"coronavirus-tweet-id-2020-03-20-17.jsonl",
"coronavirus-tweet-id-2020-03-20-18.jsonl",
"coronavirus-tweet-id-2020-03-20-19.jsonl",
"coronavirus-tweet-id-2020-03-20-20.jsonl",
"coronavirus-tweet-id-2020-03-20-21.jsonl",
"coronavirus-tweet-id-2020-03-20-22.jsonl",
"coronavirus-tweet-id-2020-03-20-23.jsonl",
]
df_covid = pd.read_json('./sample_tweets/03_2020/coronavirus-tweet-id-2020-03-17-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/03_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_covid.shape, df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-03-17-01.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-02.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-03.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-04.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-05.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-06.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-07.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-08.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-09.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-10.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-11.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-12.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-13.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-14.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-15.jsonl
finished reading:  coronavirus-tweet-id-2020-03-17-16.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('march_masks_5.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(362054, 34) (1452, 5)


In [0]:
# MARCH MASKS EXPORT CSV - PART 6
files = [
"coronavirus-tweet-id-2020-03-21-01.jsonl",
"coronavirus-tweet-id-2020-03-21-02.jsonl",
"coronavirus-tweet-id-2020-03-21-03.jsonl",
"coronavirus-tweet-id-2020-03-21-04.jsonl",
"coronavirus-tweet-id-2020-03-21-05.jsonl",
"coronavirus-tweet-id-2020-03-21-06.jsonl",
"coronavirus-tweet-id-2020-03-21-07.jsonl",
"coronavirus-tweet-id-2020-03-21-08.jsonl",
"coronavirus-tweet-id-2020-03-21-09.jsonl",
"coronavirus-tweet-id-2020-03-21-10.jsonl",
"coronavirus-tweet-id-2020-03-21-11.jsonl",
"coronavirus-tweet-id-2020-03-21-12.jsonl",
"coronavirus-tweet-id-2020-03-21-13.jsonl",
"coronavirus-tweet-id-2020-03-21-14.jsonl",
"coronavirus-tweet-id-2020-03-21-15.jsonl",
"coronavirus-tweet-id-2020-03-21-16.jsonl",
"coronavirus-tweet-id-2020-03-21-17.jsonl",
"coronavirus-tweet-id-2020-03-21-18.jsonl",
"coronavirus-tweet-id-2020-03-21-19.jsonl",
"coronavirus-tweet-id-2020-03-21-20.jsonl",
"coronavirus-tweet-id-2020-03-21-21.jsonl",
"coronavirus-tweet-id-2020-03-21-22.jsonl",
"coronavirus-tweet-id-2020-03-21-23.jsonl",
"coronavirus-tweet-id-2020-03-22-00.jsonl",
"coronavirus-tweet-id-2020-03-22-01.jsonl",
"coronavirus-tweet-id-2020-03-22-02.jsonl",
"coronavirus-tweet-id-2020-03-22-03.jsonl",
"coronavirus-tweet-id-2020-03-22-04.jsonl",
"coronavirus-tweet-id-2020-03-22-05.jsonl",
"coronavirus-tweet-id-2020-03-22-06.jsonl",
"coronavirus-tweet-id-2020-03-22-07.jsonl",
"coronavirus-tweet-id-2020-03-22-08.jsonl",
"coronavirus-tweet-id-2020-03-22-09.jsonl",
"coronavirus-tweet-id-2020-03-22-10.jsonl",
"coronavirus-tweet-id-2020-03-22-11.jsonl",
"coronavirus-tweet-id-2020-03-22-12.jsonl",
"coronavirus-tweet-id-2020-03-22-13.jsonl",
"coronavirus-tweet-id-2020-03-22-14.jsonl",
"coronavirus-tweet-id-2020-03-22-15.jsonl",
"coronavirus-tweet-id-2020-03-22-16.jsonl",
"coronavirus-tweet-id-2020-03-22-17.jsonl",
"coronavirus-tweet-id-2020-03-22-18.jsonl",
"coronavirus-tweet-id-2020-03-22-19.jsonl",
"coronavirus-tweet-id-2020-03-22-20.jsonl",
"coronavirus-tweet-id-2020-03-22-21.jsonl",
"coronavirus-tweet-id-2020-03-22-22.jsonl",
"coronavirus-tweet-id-2020-03-22-23.jsonl",
"coronavirus-tweet-id-2020-03-23-00.jsonl",
"coronavirus-tweet-id-2020-03-23-01.jsonl",
"coronavirus-tweet-id-2020-03-23-02.jsonl",
"coronavirus-tweet-id-2020-03-23-03.jsonl",
"coronavirus-tweet-id-2020-03-23-04.jsonl",
"coronavirus-tweet-id-2020-03-23-05.jsonl",
"coronavirus-tweet-id-2020-03-23-06.jsonl",
"coronavirus-tweet-id-2020-03-23-07.jsonl",
"coronavirus-tweet-id-2020-03-23-08.jsonl",
"coronavirus-tweet-id-2020-03-23-09.jsonl",
"coronavirus-tweet-id-2020-03-23-10.jsonl",
"coronavirus-tweet-id-2020-03-23-11.jsonl",
"coronavirus-tweet-id-2020-03-23-12.jsonl",
"coronavirus-tweet-id-2020-03-23-13.jsonl",
"coronavirus-tweet-id-2020-03-23-14.jsonl",
"coronavirus-tweet-id-2020-03-23-15.jsonl",
"coronavirus-tweet-id-2020-03-23-16.jsonl",
"coronavirus-tweet-id-2020-03-23-17.jsonl",
"coronavirus-tweet-id-2020-03-23-18.jsonl",
"coronavirus-tweet-id-2020-03-23-19.jsonl",
"coronavirus-tweet-id-2020-03-23-20.jsonl",
"coronavirus-tweet-id-2020-03-23-21.jsonl",
"coronavirus-tweet-id-2020-03-23-22.jsonl",
"coronavirus-tweet-id-2020-03-23-23.jsonl",
"coronavirus-tweet-id-2020-03-24-00.jsonl",
"coronavirus-tweet-id-2020-03-24-01.jsonl",
"coronavirus-tweet-id-2020-03-24-02.jsonl",
"coronavirus-tweet-id-2020-03-24-03.jsonl",
"coronavirus-tweet-id-2020-03-24-04.jsonl",
"coronavirus-tweet-id-2020-03-24-05.jsonl",
"coronavirus-tweet-id-2020-03-24-06.jsonl",
"coronavirus-tweet-id-2020-03-24-07.jsonl",
"coronavirus-tweet-id-2020-03-24-08.jsonl",
"coronavirus-tweet-id-2020-03-24-09.jsonl",
"coronavirus-tweet-id-2020-03-24-10.jsonl",
"coronavirus-tweet-id-2020-03-24-11.jsonl",
"coronavirus-tweet-id-2020-03-24-12.jsonl",
"coronavirus-tweet-id-2020-03-24-13.jsonl",
"coronavirus-tweet-id-2020-03-24-14.jsonl",
"coronavirus-tweet-id-2020-03-24-15.jsonl",
"coronavirus-tweet-id-2020-03-24-16.jsonl",
"coronavirus-tweet-id-2020-03-24-17.jsonl",
"coronavirus-tweet-id-2020-03-24-18.jsonl",
"coronavirus-tweet-id-2020-03-24-19.jsonl",
"coronavirus-tweet-id-2020-03-24-20.jsonl",
"coronavirus-tweet-id-2020-03-24-21.jsonl",
"coronavirus-tweet-id-2020-03-24-22.jsonl",
"coronavirus-tweet-id-2020-03-24-23.jsonl",
"coronavirus-tweet-id-2020-03-25-00.jsonl",
"coronavirus-tweet-id-2020-03-25-01.jsonl",
"coronavirus-tweet-id-2020-03-25-02.jsonl",
"coronavirus-tweet-id-2020-03-25-03.jsonl",
"coronavirus-tweet-id-2020-03-25-04.jsonl",
"coronavirus-tweet-id-2020-03-25-05.jsonl",
"coronavirus-tweet-id-2020-03-25-06.jsonl",
"coronavirus-tweet-id-2020-03-25-07.jsonl",
"coronavirus-tweet-id-2020-03-25-08.jsonl",
"coronavirus-tweet-id-2020-03-25-09.jsonl",
"coronavirus-tweet-id-2020-03-25-10.jsonl",
"coronavirus-tweet-id-2020-03-25-11.jsonl",
"coronavirus-tweet-id-2020-03-25-12.jsonl",
"coronavirus-tweet-id-2020-03-25-13.jsonl",
"coronavirus-tweet-id-2020-03-25-14.jsonl",
"coronavirus-tweet-id-2020-03-25-15.jsonl",
"coronavirus-tweet-id-2020-03-25-16.jsonl",
"coronavirus-tweet-id-2020-03-25-17.jsonl",
"coronavirus-tweet-id-2020-03-25-18.jsonl",
"coronavirus-tweet-id-2020-03-25-19.jsonl",
"coronavirus-tweet-id-2020-03-25-20.jsonl",
"coronavirus-tweet-id-2020-03-25-21.jsonl",
"coronavirus-tweet-id-2020-03-25-22.jsonl",
"coronavirus-tweet-id-2020-03-25-23.jsonl",
"coronavirus-tweet-id-2020-03-26-00.jsonl",
"coronavirus-tweet-id-2020-03-26-01.jsonl",
"coronavirus-tweet-id-2020-03-26-02.jsonl",
"coronavirus-tweet-id-2020-03-26-03.jsonl",
"coronavirus-tweet-id-2020-03-26-04.jsonl",
"coronavirus-tweet-id-2020-03-26-05.jsonl",
"coronavirus-tweet-id-2020-03-26-06.jsonl",
"coronavirus-tweet-id-2020-03-26-07.jsonl",
"coronavirus-tweet-id-2020-03-26-08.jsonl",
"coronavirus-tweet-id-2020-03-26-09.jsonl",
"coronavirus-tweet-id-2020-03-26-10.jsonl",
"coronavirus-tweet-id-2020-03-26-11.jsonl",
"coronavirus-tweet-id-2020-03-26-12.jsonl",
"coronavirus-tweet-id-2020-03-26-13.jsonl",
"coronavirus-tweet-id-2020-03-26-14.jsonl",
"coronavirus-tweet-id-2020-03-26-15.jsonl",
"coronavirus-tweet-id-2020-03-26-16.jsonl",
"coronavirus-tweet-id-2020-03-26-17.jsonl",
"coronavirus-tweet-id-2020-03-26-18.jsonl",
"coronavirus-tweet-id-2020-03-26-19.jsonl",
"coronavirus-tweet-id-2020-03-26-20.jsonl",
"coronavirus-tweet-id-2020-03-26-21.jsonl",
"coronavirus-tweet-id-2020-03-26-22.jsonl",
"coronavirus-tweet-id-2020-03-26-23.jsonl"
]
df_covid = pd.read_json('./sample_tweets/03_2020/coronavirus-tweet-id-2020-03-21-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/03_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_covid.shape, df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-03-21-01.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-02.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-03.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-04.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-05.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-06.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-07.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-08.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-09.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-10.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-11.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-12.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-13.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-14.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-15.jsonl
finished reading:  coronavirus-tweet-id-2020-03-21-16.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('march_masks_6.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(572977, 34) (4127, 5)


In [0]:
# MARCH MASKS EXPORT CSV - PART 7
files = [
"coronavirus-tweet-id-2020-03-27-01.jsonl",
"coronavirus-tweet-id-2020-03-27-02.jsonl",
"coronavirus-tweet-id-2020-03-27-03.jsonl",
"coronavirus-tweet-id-2020-03-27-04.jsonl",
"coronavirus-tweet-id-2020-03-27-05.jsonl",
"coronavirus-tweet-id-2020-03-27-06.jsonl",
"coronavirus-tweet-id-2020-03-27-07.jsonl",
"coronavirus-tweet-id-2020-03-27-08.jsonl",
"coronavirus-tweet-id-2020-03-27-09.jsonl",
"coronavirus-tweet-id-2020-03-27-10.jsonl",
"coronavirus-tweet-id-2020-03-27-11.jsonl",
"coronavirus-tweet-id-2020-03-27-12.jsonl",
"coronavirus-tweet-id-2020-03-27-13.jsonl",
"coronavirus-tweet-id-2020-03-27-14.jsonl",
"coronavirus-tweet-id-2020-03-27-15.jsonl",
"coronavirus-tweet-id-2020-03-27-16.jsonl",
"coronavirus-tweet-id-2020-03-27-17.jsonl",
"coronavirus-tweet-id-2020-03-27-18.jsonl",
"coronavirus-tweet-id-2020-03-27-19.jsonl",
"coronavirus-tweet-id-2020-03-27-20.jsonl",
"coronavirus-tweet-id-2020-03-27-21.jsonl",
"coronavirus-tweet-id-2020-03-27-22.jsonl",
"coronavirus-tweet-id-2020-03-27-23.jsonl",
"coronavirus-tweet-id-2020-03-28-00.jsonl",
"coronavirus-tweet-id-2020-03-28-01.jsonl",
"coronavirus-tweet-id-2020-03-28-02.jsonl",
"coronavirus-tweet-id-2020-03-28-03.jsonl",
"coronavirus-tweet-id-2020-03-28-04.jsonl",
"coronavirus-tweet-id-2020-03-28-05.jsonl",
"coronavirus-tweet-id-2020-03-28-06.jsonl",
"coronavirus-tweet-id-2020-03-28-07.jsonl",
"coronavirus-tweet-id-2020-03-28-08.jsonl",
"coronavirus-tweet-id-2020-03-28-09.jsonl",
"coronavirus-tweet-id-2020-03-28-10.jsonl",
"coronavirus-tweet-id-2020-03-28-11.jsonl",
"coronavirus-tweet-id-2020-03-28-12.jsonl",
"coronavirus-tweet-id-2020-03-28-13.jsonl",
"coronavirus-tweet-id-2020-03-28-14.jsonl",
"coronavirus-tweet-id-2020-03-28-15.jsonl",
"coronavirus-tweet-id-2020-03-28-16.jsonl",
"coronavirus-tweet-id-2020-03-28-17.jsonl",
"coronavirus-tweet-id-2020-03-28-18.jsonl",
"coronavirus-tweet-id-2020-03-28-19.jsonl",
"coronavirus-tweet-id-2020-03-28-20.jsonl",
"coronavirus-tweet-id-2020-03-28-21.jsonl",
"coronavirus-tweet-id-2020-03-28-22.jsonl",
"coronavirus-tweet-id-2020-03-28-23.jsonl",
"coronavirus-tweet-id-2020-03-29-00.jsonl",
"coronavirus-tweet-id-2020-03-29-01.jsonl",
"coronavirus-tweet-id-2020-03-29-02.jsonl",
"coronavirus-tweet-id-2020-03-29-03.jsonl",
"coronavirus-tweet-id-2020-03-29-04.jsonl",
"coronavirus-tweet-id-2020-03-29-05.jsonl",
"coronavirus-tweet-id-2020-03-29-06.jsonl",
"coronavirus-tweet-id-2020-03-29-07.jsonl",
"coronavirus-tweet-id-2020-03-29-08.jsonl",
"coronavirus-tweet-id-2020-03-29-09.jsonl",
"coronavirus-tweet-id-2020-03-29-10.jsonl",
"coronavirus-tweet-id-2020-03-29-11.jsonl",
"coronavirus-tweet-id-2020-03-29-12.jsonl",
"coronavirus-tweet-id-2020-03-29-13.jsonl",
"coronavirus-tweet-id-2020-03-29-14.jsonl",
"coronavirus-tweet-id-2020-03-29-15.jsonl",
"coronavirus-tweet-id-2020-03-29-16.jsonl",
"coronavirus-tweet-id-2020-03-29-17.jsonl",
"coronavirus-tweet-id-2020-03-29-18.jsonl",
"coronavirus-tweet-id-2020-03-29-19.jsonl",
"coronavirus-tweet-id-2020-03-29-20.jsonl",
"coronavirus-tweet-id-2020-03-29-21.jsonl",
"coronavirus-tweet-id-2020-03-29-22.jsonl",
"coronavirus-tweet-id-2020-03-29-23.jsonl",
"coronavirus-tweet-id-2020-03-30-00.jsonl",
"coronavirus-tweet-id-2020-03-30-01.jsonl",
"coronavirus-tweet-id-2020-03-30-02.jsonl",
"coronavirus-tweet-id-2020-03-30-03.jsonl",
"coronavirus-tweet-id-2020-03-30-04.jsonl",
"coronavirus-tweet-id-2020-03-30-05.jsonl",
"coronavirus-tweet-id-2020-03-30-06.jsonl",
"coronavirus-tweet-id-2020-03-30-07.jsonl",
"coronavirus-tweet-id-2020-03-30-08.jsonl",
"coronavirus-tweet-id-2020-03-30-09.jsonl",
"coronavirus-tweet-id-2020-03-30-10.jsonl",
"coronavirus-tweet-id-2020-03-30-11.jsonl",
"coronavirus-tweet-id-2020-03-30-12.jsonl",
"coronavirus-tweet-id-2020-03-30-13.jsonl",
"coronavirus-tweet-id-2020-03-30-14.jsonl",
"coronavirus-tweet-id-2020-03-30-15.jsonl",
"coronavirus-tweet-id-2020-03-30-16.jsonl",
"coronavirus-tweet-id-2020-03-30-17.jsonl",
"coronavirus-tweet-id-2020-03-30-18.jsonl",
"coronavirus-tweet-id-2020-03-30-19.jsonl",
"coronavirus-tweet-id-2020-03-30-20.jsonl",
"coronavirus-tweet-id-2020-03-30-21.jsonl",
"coronavirus-tweet-id-2020-03-30-22.jsonl",
"coronavirus-tweet-id-2020-03-30-23.jsonl",
"coronavirus-tweet-id-2020-03-31-00.jsonl",
"coronavirus-tweet-id-2020-03-31-01.jsonl",
"coronavirus-tweet-id-2020-03-31-02.jsonl",
"coronavirus-tweet-id-2020-03-31-03.jsonl",
"coronavirus-tweet-id-2020-03-31-04.jsonl",
"coronavirus-tweet-id-2020-03-31-05.jsonl",
"coronavirus-tweet-id-2020-03-31-06.jsonl",
"coronavirus-tweet-id-2020-03-31-07.jsonl",
"coronavirus-tweet-id-2020-03-31-08.jsonl",
"coronavirus-tweet-id-2020-03-31-09.jsonl",
"coronavirus-tweet-id-2020-03-31-10.jsonl",
"coronavirus-tweet-id-2020-03-31-11.jsonl",
"coronavirus-tweet-id-2020-03-31-12.jsonl",
"coronavirus-tweet-id-2020-03-31-13.jsonl",
"coronavirus-tweet-id-2020-03-31-14.jsonl",
"coronavirus-tweet-id-2020-03-31-15.jsonl",
"coronavirus-tweet-id-2020-03-31-16.jsonl",
"coronavirus-tweet-id-2020-03-31-17.jsonl",
"coronavirus-tweet-id-2020-03-31-18.jsonl",
"coronavirus-tweet-id-2020-03-31-19.jsonl",
"coronavirus-tweet-id-2020-03-31-20.jsonl",
"coronavirus-tweet-id-2020-03-31-21.jsonl",
"coronavirus-tweet-id-2020-03-31-22.jsonl",
"coronavirus-tweet-id-2020-03-31-23.jsonl"]
df_covid = pd.read_json('./sample_tweets/03_2020/coronavirus-tweet-id-2020-03-27-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/03_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_covid.shape, df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-03-27-01.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-02.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-03.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-04.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-05.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-06.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-07.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-08.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-09.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-10.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-11.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-12.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-13.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-14.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-15.jsonl
finished reading:  coronavirus-tweet-id-2020-03-27-16.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('march_masks_7.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(502856, 34) (5320, 5)


In [0]:
# APRIL MASKS EXPORT CSV - PART 1
files = [
"coronavirus-tweet-id-2020-04-01-01.jsonl",
"coronavirus-tweet-id-2020-04-01-02.jsonl",
"coronavirus-tweet-id-2020-04-01-03.jsonl",
"coronavirus-tweet-id-2020-04-01-04.jsonl",
"coronavirus-tweet-id-2020-04-01-05.jsonl",
"coronavirus-tweet-id-2020-04-01-06.jsonl",
"coronavirus-tweet-id-2020-04-01-07.jsonl",
"coronavirus-tweet-id-2020-04-01-08.jsonl",
"coronavirus-tweet-id-2020-04-01-09.jsonl",
"coronavirus-tweet-id-2020-04-01-10.jsonl",
"coronavirus-tweet-id-2020-04-01-11.jsonl",
"coronavirus-tweet-id-2020-04-01-12.jsonl",
"coronavirus-tweet-id-2020-04-01-13.jsonl",
"coronavirus-tweet-id-2020-04-01-14.jsonl",
"coronavirus-tweet-id-2020-04-01-15.jsonl",
"coronavirus-tweet-id-2020-04-01-16.jsonl",
"coronavirus-tweet-id-2020-04-01-17.jsonl",
"coronavirus-tweet-id-2020-04-01-18.jsonl",
"coronavirus-tweet-id-2020-04-01-19.jsonl",
"coronavirus-tweet-id-2020-04-01-20.jsonl",
"coronavirus-tweet-id-2020-04-01-21.jsonl",
"coronavirus-tweet-id-2020-04-01-22.jsonl",
"coronavirus-tweet-id-2020-04-01-23.jsonl",
"coronavirus-tweet-id-2020-04-02-00.jsonl",
"coronavirus-tweet-id-2020-04-02-01.jsonl",
"coronavirus-tweet-id-2020-04-02-02.jsonl",
"coronavirus-tweet-id-2020-04-02-03.jsonl",
"coronavirus-tweet-id-2020-04-02-04.jsonl",
"coronavirus-tweet-id-2020-04-02-05.jsonl",
"coronavirus-tweet-id-2020-04-02-06.jsonl",
"coronavirus-tweet-id-2020-04-02-07.jsonl",
"coronavirus-tweet-id-2020-04-02-08.jsonl",
"coronavirus-tweet-id-2020-04-02-09.jsonl",
"coronavirus-tweet-id-2020-04-02-10.jsonl",
"coronavirus-tweet-id-2020-04-02-11.jsonl",
"coronavirus-tweet-id-2020-04-02-12.jsonl",
"coronavirus-tweet-id-2020-04-02-13.jsonl",
"coronavirus-tweet-id-2020-04-02-14.jsonl",
"coronavirus-tweet-id-2020-04-02-15.jsonl",
"coronavirus-tweet-id-2020-04-02-16.jsonl",
"coronavirus-tweet-id-2020-04-02-17.jsonl",
"coronavirus-tweet-id-2020-04-02-18.jsonl",
"coronavirus-tweet-id-2020-04-02-19.jsonl",
"coronavirus-tweet-id-2020-04-02-20.jsonl",
"coronavirus-tweet-id-2020-04-02-21.jsonl",
"coronavirus-tweet-id-2020-04-02-22.jsonl",
"coronavirus-tweet-id-2020-04-02-23.jsonl",
"coronavirus-tweet-id-2020-04-03-00.jsonl",
"coronavirus-tweet-id-2020-04-03-01.jsonl",
"coronavirus-tweet-id-2020-04-03-02.jsonl",
"coronavirus-tweet-id-2020-04-03-03.jsonl",
"coronavirus-tweet-id-2020-04-03-04.jsonl",
"coronavirus-tweet-id-2020-04-03-05.jsonl",
"coronavirus-tweet-id-2020-04-03-06.jsonl",
"coronavirus-tweet-id-2020-04-03-07.jsonl",
"coronavirus-tweet-id-2020-04-03-08.jsonl",
"coronavirus-tweet-id-2020-04-03-09.jsonl",
"coronavirus-tweet-id-2020-04-03-10.jsonl",
"coronavirus-tweet-id-2020-04-03-11.jsonl",
"coronavirus-tweet-id-2020-04-03-12.jsonl",
"coronavirus-tweet-id-2020-04-03-13.jsonl",
"coronavirus-tweet-id-2020-04-03-14.jsonl",
"coronavirus-tweet-id-2020-04-03-15.jsonl",
"coronavirus-tweet-id-2020-04-03-16.jsonl",
"coronavirus-tweet-id-2020-04-03-17.jsonl",
"coronavirus-tweet-id-2020-04-03-18.jsonl",
"coronavirus-tweet-id-2020-04-03-19.jsonl",
"coronavirus-tweet-id-2020-04-03-20.jsonl",
"coronavirus-tweet-id-2020-04-03-21.jsonl",
"coronavirus-tweet-id-2020-04-03-22.jsonl",
"coronavirus-tweet-id-2020-04-03-23.jsonl",
"coronavirus-tweet-id-2020-04-04-00.jsonl",
"coronavirus-tweet-id-2020-04-04-01.jsonl",
"coronavirus-tweet-id-2020-04-04-02.jsonl",
"coronavirus-tweet-id-2020-04-04-03.jsonl",
"coronavirus-tweet-id-2020-04-04-04.jsonl",
"coronavirus-tweet-id-2020-04-04-05.jsonl",
"coronavirus-tweet-id-2020-04-04-06.jsonl",
"coronavirus-tweet-id-2020-04-04-07.jsonl",
"coronavirus-tweet-id-2020-04-04-08.jsonl",
"coronavirus-tweet-id-2020-04-04-09.jsonl",
"coronavirus-tweet-id-2020-04-04-10.jsonl",
"coronavirus-tweet-id-2020-04-04-11.jsonl",
"coronavirus-tweet-id-2020-04-04-12.jsonl",
"coronavirus-tweet-id-2020-04-04-13.jsonl",
"coronavirus-tweet-id-2020-04-04-14.jsonl",
"coronavirus-tweet-id-2020-04-04-15.jsonl",
"coronavirus-tweet-id-2020-04-04-16.jsonl",
"coronavirus-tweet-id-2020-04-04-17.jsonl",
"coronavirus-tweet-id-2020-04-04-18.jsonl",
"coronavirus-tweet-id-2020-04-04-19.jsonl",
"coronavirus-tweet-id-2020-04-04-20.jsonl",
"coronavirus-tweet-id-2020-04-04-21.jsonl",
"coronavirus-tweet-id-2020-04-04-22.jsonl",
"coronavirus-tweet-id-2020-04-04-23.jsonl",
"coronavirus-tweet-id-2020-04-05-00.jsonl",
"coronavirus-tweet-id-2020-04-05-01.jsonl",
"coronavirus-tweet-id-2020-04-05-02.jsonl",
"coronavirus-tweet-id-2020-04-05-03.jsonl",
"coronavirus-tweet-id-2020-04-05-04.jsonl",
"coronavirus-tweet-id-2020-04-05-05.jsonl",
"coronavirus-tweet-id-2020-04-05-06.jsonl",
"coronavirus-tweet-id-2020-04-05-07.jsonl",
"coronavirus-tweet-id-2020-04-05-08.jsonl",
"coronavirus-tweet-id-2020-04-05-09.jsonl",
"coronavirus-tweet-id-2020-04-05-10.jsonl",
"coronavirus-tweet-id-2020-04-05-11.jsonl",
"coronavirus-tweet-id-2020-04-05-12.jsonl",
"coronavirus-tweet-id-2020-04-05-13.jsonl",
"coronavirus-tweet-id-2020-04-05-14.jsonl",
"coronavirus-tweet-id-2020-04-05-15.jsonl",
"coronavirus-tweet-id-2020-04-05-16.jsonl",
"coronavirus-tweet-id-2020-04-05-17.jsonl",
"coronavirus-tweet-id-2020-04-05-18.jsonl",
"coronavirus-tweet-id-2020-04-05-19.jsonl",
"coronavirus-tweet-id-2020-04-05-20.jsonl",
"coronavirus-tweet-id-2020-04-05-21.jsonl",
"coronavirus-tweet-id-2020-04-05-22.jsonl",
"coronavirus-tweet-id-2020-04-05-23.jsonl",
]
df_covid = pd.read_json('./sample_tweets/04_2020/coronavirus-tweet-id-2020-04-01-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/04_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_covid.shape, df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-04-01-01.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-02.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-03.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-04.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-05.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-06.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-07.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-08.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-09.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-10.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-11.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-12.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-13.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-14.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-15.jsonl
finished reading:  coronavirus-tweet-id-2020-04-01-16.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('april_masks_1.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(504235, 34) (6521, 5)


In [0]:
# APRIL MASKS EXPORT CSV - PART 2
files = [
"coronavirus-tweet-id-2020-04-06-01.jsonl",
"coronavirus-tweet-id-2020-04-06-02.jsonl",
"coronavirus-tweet-id-2020-04-06-03.jsonl",
"coronavirus-tweet-id-2020-04-06-04.jsonl",
"coronavirus-tweet-id-2020-04-06-05.jsonl",
"coronavirus-tweet-id-2020-04-06-06.jsonl",
"coronavirus-tweet-id-2020-04-06-07.jsonl",
"coronavirus-tweet-id-2020-04-06-08.jsonl",
"coronavirus-tweet-id-2020-04-06-09.jsonl",
"coronavirus-tweet-id-2020-04-06-10.jsonl",
"coronavirus-tweet-id-2020-04-06-11.jsonl",
"coronavirus-tweet-id-2020-04-06-12.jsonl",
"coronavirus-tweet-id-2020-04-06-13.jsonl",
"coronavirus-tweet-id-2020-04-06-14.jsonl",
"coronavirus-tweet-id-2020-04-06-15.jsonl",
"coronavirus-tweet-id-2020-04-06-16.jsonl",
"coronavirus-tweet-id-2020-04-06-17.jsonl",
"coronavirus-tweet-id-2020-04-06-18.jsonl",
"coronavirus-tweet-id-2020-04-06-19.jsonl",
"coronavirus-tweet-id-2020-04-06-20.jsonl",
"coronavirus-tweet-id-2020-04-06-21.jsonl",
"coronavirus-tweet-id-2020-04-06-22.jsonl",
"coronavirus-tweet-id-2020-04-06-23.jsonl",
"coronavirus-tweet-id-2020-04-07-00.jsonl",
"coronavirus-tweet-id-2020-04-07-01.jsonl",
"coronavirus-tweet-id-2020-04-07-02.jsonl",
"coronavirus-tweet-id-2020-04-07-03.jsonl",
"coronavirus-tweet-id-2020-04-07-04.jsonl",
"coronavirus-tweet-id-2020-04-07-05.jsonl",
"coronavirus-tweet-id-2020-04-07-06.jsonl",
"coronavirus-tweet-id-2020-04-07-07.jsonl",
"coronavirus-tweet-id-2020-04-07-08.jsonl",
"coronavirus-tweet-id-2020-04-07-09.jsonl",
"coronavirus-tweet-id-2020-04-07-10.jsonl",
"coronavirus-tweet-id-2020-04-07-11.jsonl",
"coronavirus-tweet-id-2020-04-07-12.jsonl",
"coronavirus-tweet-id-2020-04-07-13.jsonl",
"coronavirus-tweet-id-2020-04-07-14.jsonl",
"coronavirus-tweet-id-2020-04-07-15.jsonl",
"coronavirus-tweet-id-2020-04-07-16.jsonl",
"coronavirus-tweet-id-2020-04-07-17.jsonl",
"coronavirus-tweet-id-2020-04-07-18.jsonl",
"coronavirus-tweet-id-2020-04-07-19.jsonl",
"coronavirus-tweet-id-2020-04-07-20.jsonl",
"coronavirus-tweet-id-2020-04-07-21.jsonl",
"coronavirus-tweet-id-2020-04-07-22.jsonl",
"coronavirus-tweet-id-2020-04-07-23.jsonl",
"coronavirus-tweet-id-2020-04-08-00.jsonl",
"coronavirus-tweet-id-2020-04-08-01.jsonl",
"coronavirus-tweet-id-2020-04-08-02.jsonl",
"coronavirus-tweet-id-2020-04-08-03.jsonl",
"coronavirus-tweet-id-2020-04-08-04.jsonl",
"coronavirus-tweet-id-2020-04-08-05.jsonl",
"coronavirus-tweet-id-2020-04-08-06.jsonl",
"coronavirus-tweet-id-2020-04-08-07.jsonl",
"coronavirus-tweet-id-2020-04-08-08.jsonl",
"coronavirus-tweet-id-2020-04-08-09.jsonl",
"coronavirus-tweet-id-2020-04-08-10.jsonl",
"coronavirus-tweet-id-2020-04-08-11.jsonl",
"coronavirus-tweet-id-2020-04-08-12.jsonl",
"coronavirus-tweet-id-2020-04-08-13.jsonl",
"coronavirus-tweet-id-2020-04-08-14.jsonl",
"coronavirus-tweet-id-2020-04-08-15.jsonl",
"coronavirus-tweet-id-2020-04-08-16.jsonl",
"coronavirus-tweet-id-2020-04-08-17.jsonl",
"coronavirus-tweet-id-2020-04-08-18.jsonl",
"coronavirus-tweet-id-2020-04-08-19.jsonl",
"coronavirus-tweet-id-2020-04-08-20.jsonl",
"coronavirus-tweet-id-2020-04-08-21.jsonl",
"coronavirus-tweet-id-2020-04-08-22.jsonl",
"coronavirus-tweet-id-2020-04-08-23.jsonl",
"coronavirus-tweet-id-2020-04-09-00.jsonl",
"coronavirus-tweet-id-2020-04-09-01.jsonl",
"coronavirus-tweet-id-2020-04-09-02.jsonl",
"coronavirus-tweet-id-2020-04-09-03.jsonl",
"coronavirus-tweet-id-2020-04-09-04.jsonl",
"coronavirus-tweet-id-2020-04-09-05.jsonl",
"coronavirus-tweet-id-2020-04-09-06.jsonl",
"coronavirus-tweet-id-2020-04-09-07.jsonl",
"coronavirus-tweet-id-2020-04-09-08.jsonl",
"coronavirus-tweet-id-2020-04-09-09.jsonl",
"coronavirus-tweet-id-2020-04-09-10.jsonl",
"coronavirus-tweet-id-2020-04-09-11.jsonl",
"coronavirus-tweet-id-2020-04-09-12.jsonl",
"coronavirus-tweet-id-2020-04-09-13.jsonl",
"coronavirus-tweet-id-2020-04-09-14.jsonl",
"coronavirus-tweet-id-2020-04-09-15.jsonl",
"coronavirus-tweet-id-2020-04-09-16.jsonl",
"coronavirus-tweet-id-2020-04-09-17.jsonl",
"coronavirus-tweet-id-2020-04-09-18.jsonl",
"coronavirus-tweet-id-2020-04-09-19.jsonl",
"coronavirus-tweet-id-2020-04-09-20.jsonl",
"coronavirus-tweet-id-2020-04-09-21.jsonl",
"coronavirus-tweet-id-2020-04-09-22.jsonl",
"coronavirus-tweet-id-2020-04-09-23.jsonl",
"coronavirus-tweet-id-2020-04-10-00.jsonl",
"coronavirus-tweet-id-2020-04-10-01.jsonl",
"coronavirus-tweet-id-2020-04-10-02.jsonl",
"coronavirus-tweet-id-2020-04-10-03.jsonl",
"coronavirus-tweet-id-2020-04-10-04.jsonl",
"coronavirus-tweet-id-2020-04-10-05.jsonl",
"coronavirus-tweet-id-2020-04-10-06.jsonl",
"coronavirus-tweet-id-2020-04-10-07.jsonl",
"coronavirus-tweet-id-2020-04-10-08.jsonl",
"coronavirus-tweet-id-2020-04-10-09.jsonl",
"coronavirus-tweet-id-2020-04-10-10.jsonl",
"coronavirus-tweet-id-2020-04-10-11.jsonl",
"coronavirus-tweet-id-2020-04-10-12.jsonl",
"coronavirus-tweet-id-2020-04-10-13.jsonl",
"coronavirus-tweet-id-2020-04-10-14.jsonl",
"coronavirus-tweet-id-2020-04-10-15.jsonl",
"coronavirus-tweet-id-2020-04-10-16.jsonl",
"coronavirus-tweet-id-2020-04-10-17.jsonl",
"coronavirus-tweet-id-2020-04-10-18.jsonl",
"coronavirus-tweet-id-2020-04-10-19.jsonl",
"coronavirus-tweet-id-2020-04-10-20.jsonl",
"coronavirus-tweet-id-2020-04-10-21.jsonl"]
df_covid = pd.read_json('./sample_tweets/04_2020/coronavirus-tweet-id-2020-04-06-00.jsonl',lines=True)
for i in files:
  df_part = pd.read_json('./sample_tweets/04_2020/%s'%i,lines=True)
  print("finished reading: ", i)
  df_covid = pd.concat([df_covid, df_part])

df_masks = df_covid[df_covid['full_text'].str.contains('mask')]
print(df_covid.shape, df_masks.shape)

finished reading:  coronavirus-tweet-id-2020-04-06-01.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-02.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-03.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-04.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-05.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-06.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-07.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-08.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-09.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-10.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-11.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-12.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-13.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-14.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-15.jsonl
finished reading:  coronavirus-tweet-id-2020-04-06-16.jsonl
finished reading:  coronavirus-tweet-id-

In [0]:
process_data(df_masks, False)
print(df_covid.shape, df_masks.shape)
df_masks.head(50)
df_masks.to_csv('april_masks_2.csv',index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-ve

(489234, 34) (4840, 5)


## 2) Cleaning the Twitter Sentiment dataset Sent140

In [0]:
cols = ['sentiment','id_str','created_at','query_string','user','full_text']
df_sentiment = pd.read_csv("./sentiment_data/training_1600000_processed_noemoticon.csv",header=None, names=cols, engine='python')
df_sentiment.drop(['query_string'], axis=1, inplace=True)
df_sentiment.drop_duplicates(subset= ['user','full_text'], keep='first', inplace=True)
df_sentiment.drop(['user'], axis=1, inplace=True)
print(df_sentiment.shape)
df_sentiment.head()

In [0]:
process_data(df_sentiment, True)
print(df_sentiment.shape)
df_sentiment.to_csv('./sentiment_data/cleaned_sentiment_training_data_v4.csv', index=False)

## Cleaning the Twitter GOP Debate Sentiment Dataset

In [0]:
df_gop = pd.read_csv('./sentiment_data/sentiment.csv')
cols = ['id', 'candidate', 'candidate_confidence', 'relevant_yn', 'relevant_yn_confidence', 'sentiment', 'sentiment_confidence', 'subject_matter', 'subject_matter_confidence', 'candidate_gold', 'name', 'relevant_yn_gold', 'retweet_count', 'sentiment_gold', 'subject_matter_gold', 'text',
        'tweet_coord', 'tweet_created', 'tweet_id', 'tweet_location', 'user_timezone']
df_gop = df_gop[['tweet_created', 'tweet_id', 'text', 'sentiment', 'sentiment_confidence']]
df_gop.rename(columns = {'text': 'full_text'}, inplace=True)
process_data(df_gop, True)

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [0]:
def convert_labels_numeric(row):
  if row == 'Negative':
    return 0
  elif row == 'Neutral':
    return 2
  elif row == 'Positive':
    return 4
df_gop = df_gop[['tweet_created', 'tweet_id', 'full_text','cleaned_text', 'sentiment_confidence', 'sentiment']]
df_gop['sentiment_encoded'] = df_gop['sentiment'].apply(convert_labels_numeric)
df_gop.head(50)

Unnamed: 0,tweet_created,tweet_id,full_text,cleaned_text,sentiment_confidence,sentiment,sentiment_encoded
0,2015-08-07 09:54:46 -0700,6.296972e+17,RT @NancyLeeGrahn: How did everyone feel about...,how did everyone feel about the climate change...,0.6578,Neutral,2
1,2015-08-07 09:54:46 -0700,6.296972e+17,RT @ScottWalker: Didn't catch the full #GOPdeb...,did not catch the full gopdebate last night he...,0.6333,Positive,4
2,2015-08-07 09:54:46 -0700,6.296972e+17,RT @TJMShow: No mention of Tamir Rice and the ...,no mention of tamir rice and the gopdebate was...,0.6629,Neutral,2
3,2015-08-07 09:54:45 -0700,6.296972e+17,RT @RobGeorge: That Carly Fiorina is trending ...,that carly fiorina is trending hours after he...,1.0,Positive,4
4,2015-08-07 09:54:45 -0700,6.296972e+17,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,gopdebate w delivered the highest ratings in t...,0.7045,Positive,4
5,2015-08-07 09:54:44 -0700,6.296972e+17,"RT @GregAbbott_TX: @TedCruz: ""On my first day ...",on my first day i will rescind every illegal e...,0.6332,Positive,4
6,2015-08-07 09:54:44 -0700,6.296972e+17,RT @warriorwoman91: I liked her and was happy ...,i liked her and was happy when i heard she was...,0.6761,Negative,0
7,2015-08-07 09:54:44 -0700,6.296972e+17,Going on #MSNBC Live with @ThomasARoberts arou...,going on msnbc live with around 2 pm et gopdebate,1.0,Neutral,2
8,2015-08-07 09:54:44 -0700,6.296972e+17,Deer in the headlights RT @lizzwinstead: Ben C...,deer in the headlights rt ben carson may be th...,0.6889,Negative,0
9,2015-08-07 09:54:42 -0700,6.296972e+17,RT @NancyOsborne180: Last night's debate prove...,last nights debate proved it gopdebate batsask...,0.6778,Negative,0


In [0]:
df_gop.to_csv('cleaned_sentiment_data.csv')

# Airline Sentiment Tweets

In [0]:
df_air = pd.read_csv('./sentiment_data/Tweets.csv')

In [0]:
df_air.head(10)

cols = ['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence', 'negativereason', 'negativereason_confidence', 'airline', 'airline_sentiment_gold', 
        'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord', 'tweet_created', 'tweet_location', 'user_timezone']

In [0]:
df_air = df_air[['tweet_id','tweet_created', 'airline_sentiment', 'text']]
df_air.rename(columns = {'text': 'full_text', 'airline_sentiment':'sentiment'}, inplace=True)
df_air.head()

Unnamed: 0,tweet_id,tweet_created,sentiment,full_text
0,570306133677760513,2015-02-24 11:35:52 -0800,neutral,@VirginAmerica What @dhepburn said.
1,570301130888122368,2015-02-24 11:15:59 -0800,positive,@VirginAmerica plus you've added commercials t...
2,570301083672813571,2015-02-24 11:15:48 -0800,neutral,@VirginAmerica I didn't today... Must mean I n...
3,570301031407624196,2015-02-24 11:15:36 -0800,negative,@VirginAmerica it's really aggressive to blast...
4,570300817074462722,2015-02-24 11:14:45 -0800,negative,@VirginAmerica and it's a really big bad thing...


In [0]:
def change_vals(row):
    if row == 'neutral' or row == 'positive':
        return "non-negative"
    else:
        return row
df_air['sentiment'] = df_air['sentiment'].apply(change_vals)

In [0]:
df_air['sentiment'].value_counts()

negative        9178
non-negative    5462
Name: sentiment, dtype: int64

In [0]:
process_data(df_air, True)

Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to detect the language reliably.
Detector is not able to dete

In [0]:
df_air.to_csv('cleaned_airlin_sentiment.csv', index=False)

In [0]:
df_air.head(50)

Unnamed: 0,tweet_id,tweet_created,sentiment,full_text,cleaned_text
1,570301130888122368,2015-02-24 11:15:59 -0800,non-negative,@VirginAmerica plus you've added commercials t...,plus you have added commercials to the experie...
2,570301083672813571,2015-02-24 11:15:48 -0800,non-negative,@VirginAmerica I didn't today... Must mean I n...,i did not today must mean i need to take anoth...
3,570301031407624196,2015-02-24 11:15:36 -0800,negative,@VirginAmerica it's really aggressive to blast...,it is really aggressive to blast obnoxious ent...
4,570300817074462722,2015-02-24 11:14:45 -0800,negative,@VirginAmerica and it's a really big bad thing...,and it is a really big bad thing about it
5,570300767074181121,2015-02-24 11:14:33 -0800,negative,@VirginAmerica seriously would pay $30 a fligh...,seriously would pay $30 a flight for seats tha...
6,570300616901320704,2015-02-24 11:13:57 -0800,non-negative,"@VirginAmerica yes, nearly every time I fly VX...",yes nearly every time i fly vx this ear worm w...
7,570300248553349120,2015-02-24 11:12:29 -0800,non-negative,@VirginAmerica Really missed a prime opportuni...,really missed a prime opportunity for men with...
9,570295459631263746,2015-02-24 10:53:27 -0800,non-negative,"@VirginAmerica it was amazing, and arrived an ...",it was amazing and arrived an hour early you a...
10,570294189143031808,2015-02-24 10:48:24 -0800,non-negative,@VirginAmerica did you know that suicide is th...,did you know that suicide is the second leadin...
11,570289724453216256,2015-02-24 10:30:40 -0800,non-negative,@VirginAmerica I &lt;3 pretty graphics. so muc...,i 3 pretty graphics so much better than minima...


In [0]:
# COMBINING THE FINAL TWITTER COVID MASK CLEANED DATA SET
files = ['jan_masks_1', 'jan_masks_2', 'feb_masks_1', 'feb_masks_2', 'feb_masks_3', 'feb_masks_4', 'feb_masks_5', 'march_masks_1',
          'march_masks_2',  'march_masks_3',  'march_masks_4',  'march_masks_5',  'march_masks_6',  'march_masks_7', 'april_masks_1', 'april_masks_2']
df_combined = pd.read_csv('./cleaned_covid19_mask_tweets_sampling/jan_masks_1.csv')
for i in files:
  df_part = pd.read_csv('./cleaned_covid19_mask_tweets_sampling/%s.csv'%i)
  df_combined = pd.concat([df_combined, df_part])

print(df_combined.shape)

(124277, 5)


In [0]:
df_combined.to_csv('./cleaned_covid19_mask_tweets_sampling/combined_cleaned_covid_mask_sample.csv', index=False)

In [0]:
df_combined.head(50)


Unnamed: 0,created_at,id_str,full_text,user_id,cleaned_text
0,2020-01-22 12:25:55+00:00,1219959373339906048,RT @SaintBurno: To wear or not to wear mask in...,1183485916821590016,to wear or not to wear mask in china either wa...
1,2020-01-22 13:45:44+00:00,1219979461224873984,"After tons of criticisms, #CathayPacific final...",885536695046754304,after tons of criticisms cathaypacific finally...
2,2020-01-22 14:07:26+00:00,1219984924897243136,RT @SaintBurno: To wear or not to wear mask in...,1165459197242163202,to wear or not to wear mask in china either wa...
3,2020-01-22 14:16:17+00:00,1219987150629896192,.@cathaypacific flight attendants demanding to...,83770449,flight attendants demanding to wear face mask...
4,2020-01-22 14:34:47+00:00,1219991805241880576,RT @SaintBurno: To wear or not to wear mask in...,996786531816824832,to wear or not to wear mask in china either wa...
5,2020-01-22 14:54:01+00:00,1219996646898991104,@chigrl Anecdote: Amazon ships MMM N95 masks t...,2288714288,anecdote amazon ships mmm n95 masks to hk and ...
6,2020-01-22 14:52:57+00:00,1219996378652467200,RT @onlyyoontv: All people in public places in...,60311843,all people in public places in wuhan are now r...
7,2020-01-22 14:30:42+00:00,1219990779923333120,RT @SaintBurno: To wear or not to wear mask in...,56075722,to wear or not to wear mask in china either wa...
8,2020-01-22 14:26:40+00:00,1219989763635068928,RT @SaintBurno: To wear or not to wear mask in...,1163964856984145920,to wear or not to wear mask in china either wa...
9,2020-01-22 15:24:53+00:00,1220004412023803904,"Just put on your masks please, HKERS, do not u...",973025903008628737,just put on your masks please hkers do not und...
