# Step 2 - analyze the geo-tagged and place-tagged tweets

We are assuming that all the archived tweets were read in during step 1, and that an output csv file 
was written that contains only those tweets with a 'place' or 'geo coordinates' included.

Now, we will start with that csv file, and work up some statistics and basic bar graphs
so we can get a better idea of the data by country or region or time.

In [None]:
# necessary imports
import pandas as pd 
import csv
import json
import os
import bz2
import gc
import pytz
import datetime as dt
import time
import math
import matplotlib
import matplotlib.pyplot as plt

# !pip install reverse_geocoder
import reverse_geocoder

# !pip install pycountry-convert
import pycountry_convert as pcountry




In [None]:
# import Geopandas

# There were lots of problems with getting geopandas to run consistently; it might fail one day or another.
# It's recommended to create an environment, install geopandas and try it.  If it doesn't work, try updating it,
# uninstalling and reinstalling it, and anything else suggested on Stack Overflow.  I found conda-forge to have the
# best success rate, but it would still break some days and require re-installation.

# !conda install -c conda-forge geopandas  (geopandas is a pain to install, use conda-forge for best chances of success)
# !pip install descartes
import descartes
import geopandas

# if you are having trouble, you can try upgrading and running with this command
# !pip install geopandas --upgrade 

# or, you can uninstall geopandas and start over again, probably using conda-forge


In [None]:
# Collect the csv files sitting directly in the current working directory.
# There should be one csv file per day, but nothing is checked here: every
# file ending in ".csv" is taken, except those with "copy" in their name.

_, _, all_files_in_dir = next(os.walk("."))   # first walk() entry = the directory itself

csv_files = []
for fi in all_files_in_dir:
    lowered = fi.lower()
    if "copy" in lowered:
        continue                              # leave out files named like copies
    if lowered.endswith(".csv"):
        csv_files.append(fi)

print(csv_files)

In [None]:
# Now read in all those csv files and create one big pandas dataframe.
# Expects `csv_files` (list of file names) from the previous cell.

tic = time.perf_counter() # start a timer

# Read each file on its own so a per-file tweet count can be reported.
# 'id' is kept as a string (presumably to avoid losing digits on very large
# ids - confirm against step 1) and 'created_at' is parsed into datetimes.
frames = []
for f in csv_files:
    tw = pd.read_csv(f, dtype={'id': str}, parse_dates=['created_at'])
    print("file {}, tweets {}". format(f, len(tw)))
    frames.append(tw)

# Concatenate once at the end: DataFrame.append() no longer exists in
# pandas 2.x and copied the whole frame on every call.
# ignore_index=True gives every row its own label; without it the labels of
# each file restart at 0 and label-based writes would hit several rows.
# pd.concat() refuses an empty list, hence the fallback to an empty frame.
if frames:
    tweets = pd.concat(frames, ignore_index=True)
else:
    tweets = pd.DataFrame()

print("total tweets read in: {}".format(len(tweets)))

# drop_duplicates() returns a new frame, so the result has to be assigned back
tweets = tweets.drop_duplicates().reset_index(drop=True)
print("total tweets after dropping duplicate rows: {}".format(len(tweets)))

# the per-file frames are not needed any more
del frames
gc.collect(2)

#print(tweets.describe())
#print(tweets.head())
#print(tweets.tail())
#print(tweets.columns)
tweets.info()   # info() prints its report itself and returns None

# how long did that take?
toc = time.perf_counter()
print(f"reading all the files took {toc - tic:0.4f} seconds")


In [None]:
# The tweets df has several lines with no places, but geo coordinates instead
# so let's look up the place name and country code given the geo coordinates
# and set that value back in the 'place' column of the original dataframe.
# The value written is the string form of a dict with the keys 'name' and
# 'country_code', so it can be parsed back into a dict later on.
import ast

tic = time.perf_counter() # start a timer

coords_list = tweets.loc[tweets['place'].isna(), ['id', 'coordinates']]

# first pass: pull (lat, lon) out of every coordinates string
lookup_ids = []
lookup_coords = []
for tweet_id, raw_coords in zip(coords_list['id'], coords_list['coordinates']):
    try:
        # literal_eval only accepts Python literals; eval() would execute
        # whatever code happens to be stored in the csv file
        line = ast.literal_eval(raw_coords)
        lon, lat = line['coordinates'][:2]      # twitter stores lon first, then lat
    except (ValueError, SyntaxError, TypeError, KeyError):
        print("skipping tweet {}: unusable coordinates {!r}".format(tweet_id, raw_coords))
        continue
    lookup_ids.append(tweet_id)
    lookup_coords.append((lat, lon))

# get the geo data - city, country, country code using geocoder.
# All coordinates are looked up in one call instead of one call per tweet;
# search() is skipped entirely when there is nothing to look up.
gcodes = reverse_geocoder.search(lookup_coords) if lookup_coords else []

# second pass: build the replacement 'place' value for every tweet id
new_places = {}
for tweet_id, (lat, lon), gcode in zip(lookup_ids, lookup_coords, gcodes):
    print("coords {}, {} are in country code {}".format(lat, lon, gcode['cc']))

    # use the first non-empty name: admin1, then admin2, then the place name itself
    new_place_name = gcode['admin1'] or gcode['admin2'] or gcode['name']

    # this must be a real dict (key: value). A set of 'key: value' strings
    # cannot be indexed with ['country_code'] once it is parsed back.
    new_place = {
        'name': new_place_name,
        'country_code': gcode['cc'],
    }
    new_places[tweet_id] = str(new_place)

# write everything back in one go, matching rows by tweet id
if new_places:
    # make sure the column can hold strings even if it was read in as all-NaN
    tweets['place'] = tweets['place'].astype(object)
    matches = tweets['id'].isin(list(new_places))
    # to_numpy() makes the assignment positional, so it does not depend on
    # the index labels of `tweets` being unique
    tweets.loc[matches, 'place'] = tweets.loc[matches, 'id'].map(new_places).to_numpy()

# how long did that take?
toc = time.perf_counter()
print(f"iterating and determining place from geo coords took {toc - tic:0.4f} seconds")

gc.collect(2)

In [None]:
# Now, to make life easier, and to do this only once,
# let's make some separate columns for later manipulation:
#   hour, day              - taken from 'created_at' as stored
#   cc, cont_code          - country / continent code from the 'place' dict ("--" if unknown)
#   local_hour, local_day  - 'created_at' converted to the country's first listed time zone
import ast

tic = time.perf_counter() # start a timer

# add a column each for the hour number and the day number
tweets['hour'] = tweets['created_at'].dt.hour
tweets['day']  = tweets['created_at'].dt.day

# allocate the derived columns up front with placeholder values, so that they
# exist for the later cells even if not a single row can be resolved below
tweets['cc']   = "--"
tweets['cont_code'] = "--"
tweets['local_hour'] = float("nan")
tweets['local_day']  = float("nan")

# how long did that take?
toc = time.perf_counter()
print(f"adding the day/hour and allocating ccode columns took {toc - tic:0.4f} seconds")

tic = time.perf_counter() # start timer again

# for other data, we need to (unfortunately) iterate in order to read the dicts inside a column
# and call multiple functions on them
# so let's iterate once and do multiple things at a time while in that iterating loop.
# Results are collected in plain lists and assigned as whole columns afterwards:
# that is positional, so it stays correct even when index labels are repeated.
n_rows = len(tweets)
cc_values = ["--"] * n_rows
cont_values = ["--"] * n_rows
local_hours = [float("nan")] * n_rows
local_days = [float("nan")] * n_rows

rows_processed = 0   # every row looked at
rows_updated = 0     # rows for which country, continent and local time were all found
rows_skipped = 0     # rows left at their placeholder values

for pos, (place, created_at) in enumerate(zip(tweets['place'], tweets['created_at'])):
    try:
        # get the country code; literal_eval only parses literals and,
        # unlike eval(), cannot execute code stored in the csv file
        ccode = ast.literal_eval(place)['country_code']

        # get the continent
        continent_code = pcountry.country_alpha2_to_continent_code(ccode)

        # determine the local datetime from the country code so we can convert to local time
        # (only the first time zone listed for the country is used)
        tz = pytz.country_timezones[ccode]
        if created_at.tzinfo is None:
            # NOTE(review): naive timestamps are assumed to be UTC - confirm
            # against the csv written in step 1
            created_at = created_at.tz_localize('UTC')
        local_dt = created_at.astimezone(tz[0])
    except (ValueError, SyntaxError, TypeError, KeyError, IndexError, AttributeError):
        # unparsable place, unknown country code or no time zone:
        # the row keeps its placeholder values
        rows_skipped += 1
    else:
        cc_values[pos] = ccode.upper()
        cont_values[pos] = continent_code.upper()
        local_hours[pos] = local_dt.hour
        local_days[pos] = local_dt.day
        rows_updated += 1

    rows_processed += 1

    # uncomment when testing code changes
    #if rows_processed > 100:
    #    break

    # show progress every 1000 rows...
    if rows_processed % 1000 == 0:
        print("processed {} rows...".format(rows_processed))

# add the country code, continent code, local hour and local day columns
tweets['cc'] = cc_values
tweets['cont_code'] = cont_values
tweets['local_hour'] = local_hours
tweets['local_day'] = local_days

tweets.info()   # info() prints its report itself and returns None
print("total rows updated: {}".format(rows_updated))
print("total rows skipped (no usable place / country): {}".format(rows_skipped))

# now how long did the iteration take?
toc = time.perf_counter()
per_row = (toc - tic) / max(rows_processed, 1)   # avoid dividing by zero on an empty dataframe
print(f"iterating and determining country/continent took {toc - tic:0.4f} seconds, or {per_row:0.4f} per row")

gc.collect(2)

# Save (or Load) Tweets DataFrame 
Since we have done so much time-consuming work so far, let's save or load from here
in case the env crashes or you need to close and reopen

In [None]:
# File name used by the save / load lines below.
# Set it by hand here before saving the dataframe to a pickle file.
latest_pickle_file = "tweets_dataframe.202105111550.pickle"

# SAVE: uncomment the next line to write the dataframe to the pickle file
#tweets.to_pickle(latest_pickle_file)

# LOAD: uncomment the next line to read the dataframe back from the pickle file
#tweets = pd.read_pickle(latest_pickle_file)

# Looking at the data
Now let's explore the data a little bit to get a feel for it

In [None]:
# PICK A COUNTRY HERE 
# with two-letter country code 
# (for codes, see https://www.iban.com/country-codes)

country_to_plot = 'AR'
country_name = pcountry.country_alpha2_to_country_name(country_to_plot.upper())

# select data based on that country code
tweets_local = tweets[tweets['cc']==country_to_plot.upper()]

# use the groupby command to count tweets per local hour, no matter the day.
# t1 is always defined (possibly empty), so the plotting code below can never
# trip over a missing variable.
if 'local_hour' in tweets_local.columns:
    t1 = tweets_local.groupby('local_hour')['id'].count()
else:
    t1 = pd.Series(dtype='int64')

# now plot that country's data by number of tweets per local hour of day
if t1.empty:
    print("no data to plot")
else:
    print(t1.describe())

    title_text = country_name + " (" + country_to_plot + ") " + "tweets per local hour of day, histogram"
    ax = t1.plot.bar(title=title_text)

    fig = ax.get_figure()
    fig.savefig("tweets_by_localhour_for_" + country_to_plot + ".png")

In [None]:
# We can look at tweets per country

# group by country code, with the values being the count of tweets per country
t2 = tweets.groupby('cc')['id'].count()

# drop the rows for an empty / placeholder country code, if present
# (Series.iteritems() no longer exists in pandas 2.x, and dropping entries
# while iterating over the same series is fragile anyway)
t2 = t2.drop(labels=["", "--"], errors="ignore")

print(t2.describe())

# fontsize of the tick labels; this has to be set before the plot is created
# to have any effect on it (note: plt.rc changes the setting globally)
plt.rc('xtick', labelsize=8)

if t2.empty:
    print("no data to plot")
else:
    # and plot the whole thing in alphabetical order (groupby already returns
    # the country codes sorted; the old sort_columns argument is gone in pandas 2.x)
    # plot needs to be pretty big in order to get all the countries in, legibly
    ax2 = t2.plot.bar(figsize=(30,5), rot=90, title="Tweets by country")
    ax2.set_xlabel('Country Code',fontdict={'fontsize':12})

    # save before showing, so the file does not depend on what the backend
    # does with the figure once it has been displayed
    fig = ax2.get_figure()
    fig.savefig("tweets_by_country.png")
    plt.show()

In [None]:
# some countries are real outliers, making the majority tough to see
# so let's switch to log scale on y axis to see the smaller ones
# (uses t2, the per-country tweet counts)

# fontsize of the tick labels; has to be set before the plot is created
plt.rc('xtick', labelsize=8)

if t2.empty:
    print("no data to plot")
else:
    # the old sort_columns argument is gone in pandas 2.x, so it is not passed;
    # the bars appear in the order of t2's index
    ax2log = t2.plot.bar(figsize=(30,5), logy=True, rot=90, title="Tweets by country (log)")
    ax2log.set_xlabel('Country Code',fontdict={'fontsize':12})

    # we'll need y axis major and minor grid lines to not lose the sense of enormous scale for the big countries
    # (explicit True instead of toggling / passing the string 'on')
    ax2log.grid(True, which='major', axis='y')
    ax2log.grid(True, which='minor', axis='y')

    # save before showing, so the file does not depend on what the backend
    # does with the figure once it has been displayed
    fig = ax2log.get_figure()
    fig.savefig("tweets_by_country_logscale.png")
    plt.show()

In [None]:
# we can do the same exercise by continent

# group by continent code, with the values being the count of tweets per continent
t3 = tweets.groupby('cont_code')['id'].count()

# drop the placeholder bucket of tweets without a known continent,
# the same way the per-country counts drop their placeholder
t3 = t3.drop(labels=["", "--"], errors="ignore")
print(t3.describe())

# fontsize of the tick labels; has to be set before the plot is created
plt.rc('xtick', labelsize=8)

if t3.empty:
    print("no data to plot")
else:
    # and plot the whole thing in alphabetical order (groupby already returns
    # the continent codes sorted; the old sort_columns argument is gone in pandas 2.x)
    ax3 = t3.plot.bar(figsize=(7,5), rot=90, title="Tweets by continent")
    ax3.set_xlabel('Continent Code',fontdict={'fontsize':12})

    # save before showing, so the file does not depend on what the backend
    # does with the figure once it has been displayed
    fig = ax3.get_figure()
    fig.savefig("tweets_by_continent.png")
    plt.show()

In [None]:
# we can do a basic map: colour every country by its number of tweets
# (uses t2, the per-country tweet counts indexed by country code)

# NOTE(review): geopandas.datasets is reportedly deprecated/removed in recent
# geopandas releases - if this line fails, the naturalearth file has to be
# obtained separately and read from its path instead.
gdf = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

gdf = gdf.assign(cc='--')
gdf = gdf.assign(cont_code='--')

rows_updated = 0
for i, row in gdf.iterrows():
    # reset for every row, so a failed lookup can never reuse the country
    # code that was found for the previous row
    ccode = None
    try:
        # get the country code from the 3-letter code ...
        ccode = pcountry.country_alpha3_to_country_alpha2(row['iso_a3'])
    except (KeyError, TypeError):
        try:
            # ... or, failing that, from the country name
            ccode = pcountry.country_name_to_country_alpha2(row['name'])
        except (KeyError, TypeError):
            pass

    # get the continent
    continent_code = None
    if ccode is not None:
        try:
            continent_code = pcountry.country_alpha2_to_continent_code(ccode)
        except (KeyError, TypeError):
            pass

    if continent_code is None:
        # country or continent unknown: the row keeps its '--' placeholders
        print(f"error {row['name']} {row['iso_a3']}")
        continue

    # add the country code and continent code columns
    gdf.at[i,'cc'] = ccode.upper()
    gdf.at[i,'cont_code'] = continent_code.upper()
    rows_updated += 1

print('Number of rows with a country code:', rows_updated)

# attach the tweet counts; countries without any tweets get 0
gdf = gdf.merge(t2, how='left', on='cc')
gdf.rename(columns = {'id':'num_tweets'}, inplace = True)
gdf['num_tweets'] = gdf['num_tweets'].fillna(0)

print('Number of rows:', len(gdf))

# for debugging, you can write a temporary csv file
#tempdf=pd.DataFrame(gdf)
#tempdf.to_csv("temp.csv" )

print(gdf.head())

# Create a map
fig, ax4 = plt.subplots(dpi=600)
gdf.plot(ax=ax4, column='num_tweets', cmap='RdYlGn_r', legend=True)
fig.savefig("world_tweets_by_country.png")


In [None]:
# we can do a slightly different map; using log function to better visualize 
# across all countries instead of letting the top few countries dominate the scale
# (uses gdf with its 'num_tweets' column from the basic map)
import math

# log10 of the tweet count per country, computed for the whole column at once.
# Counts that are zero, missing or not numeric are mapped to 0.0.
# NOTE(review): log10(1) is also 0, so a country with exactly one tweet gets
# the same colour as a country with none.
tweet_counts = pd.to_numeric(gdf['num_tweets'], errors='coerce')
gdf['log_num_tweets'] = [
    math.log10(n) if n > 0 else 0.0
    for n in tweet_counts
]

print(gdf.head())

# Create a map
fig, ax5 = plt.subplots(dpi=1200)
gdf.plot(ax=ax5, column='log_num_tweets', cmap='RdYlGn_r', legend=True)
fig.savefig("world_log_tweets_by_country.png")

In [None]:
# that's it!