# Quick Exploratory Data Analysis

In [21]:
import numpy as np
import pandas as pd

# Importing Data
From my database of 10,747,549 tweets obtained with keywords "coronavirus" or "COVID-19", I'm working with the first 200,000 to make exploratory data analysis quick and easy.

In [1]:
# Some of the data outputted to csv from the MySQL database is corrupted; 
# the `error_bad_lines` parameter skips over those rows.
file = '../../../Documents/First200K_tweets.csv'
df = pd.read_csv(file, error_bad_lines=False)

In [28]:
df.head()

Unnamed: 0,tweet,location
0,To each and every person celebrating this news...,"Far From Home, WY"
1,Fusco-Jackson died a day before her test for c...,
2,"And yet asymptomati… https://t.co/dUlngepDQV""","North Dakota, USA"
3,Coronavirus is one thing. We clearly can’t be ...,"Sacramento, CA"
4,"With @ParentMail down, this also failed to be ...",Balham


In [29]:
# 633 rows were not imported because of some kind of corruption in the data
len(df)

199377

In [37]:
# Taking a look at the full text of the second tweet
df.iloc[1][0]

'Fusco-Jackson died a day before her test for coronavirus came back positive Saturday evening"'

# Dropping rows
Since my goal is to correlate the tweet's content with its location, I'm dropping every row with no location listed.

In [43]:
df = df.dropna()

In [44]:
# From the original 200K tweets, we're down to 144,545
len(df)

144545

In [273]:
# Lowercasing eliminates spelling differences due to capitalization variance
df.location = df.location.str.lower()

In [48]:
df.head()

Unnamed: 0,tweet,location
0,To each and every person celebrating this news...,"far from home, wy"
2,"And yet asymptomati… https://t.co/dUlngepDQV""","north dakota, usa"
3,Coronavirus is one thing. We clearly can’t be ...,"sacramento, ca"
4,"With @ParentMail down, this also failed to be ...",balham
5,I don’t understand why famous people who “ fee...,south jersey


In [61]:
# Pandas 1.0 allows Series to be converted to string datatype for faster processing
df = df.convert_dtypes()

In [62]:
df['tweet'].dtype

StringDtype

In [52]:
# I was curious about how many of these 144,000 tweets claim a Texas location.
# This cell outputs a dataframe of the tweets; its length is 5629 rows, a number I could
# also obtain by wrapping the statement below in `len()`

df[(df['location'].str.contains('texas'))|(df['location'].str.contains('tx'))]

Unnamed: 0,tweet,location
7,BREAKING NFL NEWS ⁦@CenTexBeat⁩ https://t.co/...,"waco, tx"
70,https://t.co/c1JEnxeJ5X Austin Area Food Bank\...,"texas, usa"
121,This tweet takes on a whole new level of “pres...,texas
131,@DonaldJTrumpJr The media? So Trump tv aka Fox...,texas
168,Check it out y'all it's @TumaTime in the Obser...,tx
...,...,...
199152,+/- doubling every 2 days,"k t boundary, tx"
199182,Analysis | Trump’s eruption at an NBC reporter...,"san antonio, tx"
199273,Being quarantined while married gives you valu...,"dallas, tx"
199293,Mortality Rate of Coronavirus in US Slips to 1...,texas


In [279]:
# Using Boolean methods, I can also get a count this way because False = 0 and True = 1
df['location'].str.contains('texas').sum() + df['location'].str.contains('TX').sum()

# Locations
This will be the most time-consuming portion of data cleaning, and is something I'm working on elsewhere. The 39,662 unique locations below are actually many variant spellings and specificity of a smaller number of identical locations. Also, roughly 20% of people put a joke location like "my kitchen" as their current location. My goal is to condense location data to areas of different sizes, from large city to state/province to nation, when such information is available. 

For example, `df[199152]['location'] = 'k t boundary, tx'`. I'll re-classify that as Texas, USA. With `df[199273]['location'] = 'dallas, tx',` I can add additional information to classify it as Dallas, Texas, USA. For `df[199367]['location'] = 'satx'`, I'll have to decide how much work I want to put into decoding cities with common abbreviations such as San Antonio, Texas, USA, vs. just putting the state and nation, i.e., Texas, USA. In some cases, it may be useful to have city-level information when it's available. 

Additional exploratory analysis will be necessary.

In [70]:
# Unique locations
df.location.nunique()

39662

# Cleaning Functions
I'm using these so I can create extra columns in the dataframe and see if there's any correlation between these subsets of data and the location.

In [266]:
def no_urls(string):
    """
    From a string, returns text that is cleaned of any URLs starting in 'http' or 'https'

    """
    wordlist = string.split()
    text = ' '.join(word for word in wordlist if not 'http' in word)
    return text.lower()

In [276]:
# The 'before'
df['tweet'][2]

'And yet asymptomati… https://t.co/dUlngepDQV"'

In [275]:
# After
no_urls(df['tweet'][2])

'and yet asymptomati…'

In [263]:
def hashtags(string):
    """
    From a string, returns any text that contains a hashtag

    """
    wordlist = string.split()
    text = ' '.join(word for word in wordlist if '#' in word)
    return text.lower()

In [260]:
def at_mention(string):
    """
    From a string, returns any text that contains a @

    """
    wordlist = string.split()
    text = ' '.join(word for word in wordlist if '@' in word)
    return text.lower()

In [238]:
def cleaned_tweet(string):
    """
    Removes URLs, hashtags, and @ mentions in tweets
    """
    wordlist = string.split()
    text = ' '.join(word for word in wordlist if 'http' not in word and '@' not in word and '#' not in word)
    return text.lower()

In [268]:
# The `apply` function lets me apply my function to every cell in the Pandas column
df['no_url'] = df['tweet'].apply(no_urls)

In [264]:
df['hashtags'] = df['tweet'].apply(hashtags)

In [261]:
df['at_mention'] = df['tweet'].apply(at_mention)

In [239]:
df['cleaned'] = df['tweet'].apply(cleaned_tweet)

In [265]:
df.head()

Unnamed: 0,tweet,location,no_url,hashtags,at_mention,cleaned
0,To each and every person celebrating this news...,"far from home, wy",To each and every person celebrating this news...,,,to each and every person celebrating this news...
2,"And yet asymptomati… https://t.co/dUlngepDQV""","north dakota, usa",And yet asymptomati…,,,and yet asymptomati…
3,Coronavirus is one thing. We clearly can’t be ...,"sacramento, ca",Coronavirus is one thing. We clearly can’t be ...,,,coronavirus is one thing. we clearly can’t be ...
4,"With @ParentMail down, this also failed to be ...",balham,"With @ParentMail down, this also failed to be ...",,@parentmail,"with down, this also failed to be sent out ear..."
5,I don’t understand why famous people who “ fee...,south jersey,I don’t understand why famous people who “ fee...,,,i don’t understand why famous people who “ fee...


In [272]:
# The number of unique hashtags and @ mentions is misleading; this is showing the unique ordering
# and inclusion of each in any given row. NLTK tokenization will give me a real total count of each.
df.nunique()

tweet         143491
location       39662
no_url        131330
hashtags       13319
at_mention     21731
cleaned       128825
dtype: int64

In [249]:
# Taking a look at the first five cleaned up tweets
# For the purposes of modeling, this is one of the datasets I'll try out.
[item for item in df['cleaned'][1:5]]

['and yet asymptomati…',
 'coronavirus is one thing. we clearly can’t be closing down golf courses though.',
 'with down, this also failed to be sent out earlier this afternoon (attachement to the letter)',
 'i don’t understand why famous people who “ feel like they’re getting a cold “ can automatically get tested for coro…']

In [250]:
# I'm joining the text with line returns so that the tweets will remain discrete.
clean_text = '\n'.join(tweet for tweet in df['cleaned'])

In [251]:
# These cleaned up texts for only 144,000 tweets still generate more than 11 million characters.
len(clean_text)

11270314

In [259]:
# Writing the text file for use elsewhere
with open('clean_text.txt', 'w') as outfile:
    outfile.write(clean_text)