# Step 1 - Read and Import the Archived Twitter Data

The first step is to read and import all the archived data.  Download the "spritzer" archived data first.
Raw data from:  https://archive.org/details/twitterstream
The data is quite large, and is stored in directory/file format as:
./yyyy/mm/dd/hh/{00-99}.json.bz2 

Since our dataframes will overload the computer memory if we read it all in at once, we'll need
to be careful about memory management.  For example, we can read in one directory at a time,
discard data that we don't want or need in foreseeable future, and save to a csv file; then
dump or re-use memory and go again.

After downloading the data you want to analyze, run the portions of this file first, then garbage collect
or refresh your kernel to free up memory.  A csv from this ipynb file will be used as the basis for 
further analysis in parts 2 and 3.

In [None]:
# import necessary modules
import pandas as pd 
import csv
import json
import os
import bz2
import time


In [None]:
# Function to check if a 'place' or a 'coordinates' are included in a tweet
# One or both can exist in a tweet.  For this code, 'place' is checked first and, 
# if it exists, returns true before 'coordinates' is checked

def does_this_tweet_have_a_place(tweet):
    """Return True if the tweet carries a 'place' or 'coordinates' value.

    'place' is checked first; 'coordinates' is only consulted when 'place'
    is missing or empty.

    Args:
        tweet: one decoded JSON record from the archive.  Records that are
            not dicts, or that lack both keys, are treated as having no
            place instead of raising.

    Returns:
        bool: True when either field is present and non-empty, else False.
    """
    # Not every decoded record is a dict with these keys, so look the keys
    # up defensively rather than indexing and relying on the caller to
    # swallow the resulting KeyError/TypeError.
    if not isinstance(tweet, dict):
        return False

    return bool(tweet.get('place') or tweet.get('coordinates'))

In [None]:
# Function to read in all the tweets from any one bz-zipped json  file
def read_tweets_from_bzfile(filename):
    """Read one bz2-compressed, line-delimited JSON file and keep located tweets.

    Each line is decoded as JSON.  Lines that fail to decode are skipped and
    not counted.  Every successfully decoded record is counted as read; it is
    kept only if does_this_tweet_have_a_place() returns a true value for it.
    Kept tweets get a 'file_path' entry holding `filename`.

    Args:
        filename: path of the .bz2 file to read.

    Returns:
        tuple: (tweets, read_count, kept_count) where `tweets` is the list of
        kept tweet dicts, `read_count` the number of decoded records and
        `kept_count` the number of kept tweets.
    """
    tweets = []
    read_count = 0
    kept_count = 0

    # open and unzip the bz2 file
    with bz2.open(filename, "rb") as data_file:
        for line in data_file:
            try:
                # load the tweet on this line of the file
                tweet = json.loads(line)
            except ValueError:
                # json.JSONDecodeError and UnicodeDecodeError are both
                # ValueError subclasses: a malformed line is skipped without
                # hiding unrelated errors (or Ctrl-C) the way a bare
                # `except:` would.
                continue
            read_count += 1

            # a record that is not a JSON object cannot carry a place
            if not isinstance(tweet, dict):
                continue

            try:
                has_place = does_this_tweet_have_a_place(tweet)
            except (KeyError, TypeError):
                # record without the expected 'place'/'coordinates' layout
                has_place = False

            if has_place:
                tweet['file_path'] = filename
                tweets.append(tweet)
                kept_count += 1

    # print some outputs so we can watch it working
    print("file read: {}".format(filename))
    print(" tweets read in file: {}".format(read_count))
    print(" tweets kept from file: {} ".format(kept_count))
    if read_count != 0:
        print(" kept tweets rate: {:0>2f} %".format(100*kept_count/read_count))

    return tweets, read_count, kept_count

### uncomment and run this to test/debug the read_tweets_from_bzfile function using a single file
#tweets = []
#tweets, read_count, kept_count = read_tweets_from_bzfile("00.json.bz2")

In [None]:
# Function to iterate through a directory, get all the archive files, and then 
# read them in one at a time
def read_tweets_from_datetimehour_dir(rootdir):
    """Walk `rootdir` and read every file found with read_tweets_from_bzfile().

    Args:
        rootdir: directory to walk; all files below it are read.

    Returns:
        tuple: (tweets, num_read, num_kept, num_files_read) - the combined
        list of kept tweets plus the summed read/kept counters and the
        number of files processed.
    """
    # kept as a plain list (not a DataFrame) for memory management
    collected = []
    tweets_seen = 0
    tweets_retained = 0
    files_done = 0

    for current_dir, _subdirs, names in os.walk(rootdir):
        archive_paths = (os.path.join(current_dir, name) for name in names)

        for archive_path in archive_paths:
            batch, batch_read, batch_kept = read_tweets_from_bzfile(archive_path)

            # in-place concatenation: add the tweets themselves, not a nested list
            collected += batch

            files_done += 1
            tweets_seen += batch_read
            tweets_retained += batch_kept

            print(" files read so far in this dir: {}".format(files_done))
            print(" results so far in this dir: {} tweets".format(len(collected)))

    print("done. size of tweets array: {}".format(len(collected)))

    # return stats with the tweets array
    return collected, tweets_seen, tweets_retained, files_done

In [None]:
# Function to check if a file exists; was used in development
def check_if_output_file_exists(filepath):
    """Report whether `filepath` exists, offering to delete an existing file.

    If the path is an existing regular file the user is asked interactively
    whether to delete it; the question is repeated until the answer starts
    with 'y' or 'n' (case-insensitive).

    Args:
        filepath: path of the output file to check.

    Returns:
        bool: False if nothing exists at `filepath` or the file was deleted;
        True if the path still exists afterwards (the user declined, or the
        path is not a regular file and therefore cannot be deleted here).
    """
    if not os.path.exists(filepath):
        return False

    print("file {} exists.".format(filepath))

    # A directory (or other non-file) cannot be removed with os.remove();
    # return instead of spinning forever in the prompt loop.
    if not os.path.isfile(filepath):
        return True

    while True:
        overwrite = input('Delete old file? (If no, output will be appended)\n Y = yes, N = no\n')
        # Match on the first letter only: a substring test would treat an
        # answer like "no way" as "yes" and delete the file.
        answer = overwrite.strip().lower()
        if answer.startswith('y'):
            os.remove(filepath)
            return False
        if answer.startswith('n'):
            return True
        print('input not understood; please try again')
                    

# Main run block
Now that we've defined some key functions, we can run through it all.  This will take a while.

The current setup is to run a week's worth of data.  The date range should be defined in the initial declarations of 
this block, with year, month, day, and hour.  Change according to the data you downloaded and want to analyze.
It's a good idea to restrict this to a smaller range for testing and verification before embarking on the
entire run you want to do.

In [None]:
# MAIN BLOCK

# Instead of walking the whole archive, the date range to process is declared
# explicitly so the size of the job is under control and small test runs are easy.
#
# Tweets are read and written one hour-directory at a time to bound memory use;
# output is appended to one csv file per day.


def _print_running_totals(kept, read, files):
    """Print the cumulative counters; the rate is skipped while nothing was read."""
    print("TOTAL tweets kept, tweets read: {}, {}".format(kept, read))
    print("TOTAL files read: {}".format(files))
    if read:
        print("TOTAL percentage tweets kept: {:0>2f} %".format(100*kept/read))
    else:
        # avoids ZeroDivisionError when no directory has produced data yet
        print("TOTAL percentage tweets kept: n/a (no tweets read)")


# set these variables to determine which directories will be read
# in this example, we are going with 1 week in December 2020
year = 2020
month = 12
day_start = 1
day_end = 7
hour_start = 0  # possible range: 0-23
hour_end = 23

# counters
total_tweets_read = 0
total_tweets_kept = 0
total_files_read = 0

# other variables
hour_dir = ""  # named hour_dir so the builtin dir() is not shadowed
output_csv_file = "tweets_with_places"

# columns written to the csv; everything else in the tweet is dropped to
# save file size
columns_to_keep = [
    'created_at', 'id', 'text', 'source', 'user',
    'geo', 'coordinates', 'place', 'entities', 'lang', 'file_path',
]

tic = time.perf_counter()  # start a timer

# now start iterating through files
for day in range(day_start, day_end + 1):

    # one output file per day: tweets_with_places_YYYYMMDD.csv
    output_csv_file = "tweets_with_places_{:0>4d}{:0>2d}{:0>2d}.csv".format(year, month, day)

    # Output is appended, so only write the header when the file is new or
    # empty; otherwise a re-run would put a second header row in the data.
    write_csv_header = (not os.path.exists(output_csv_file)
                        or os.path.getsize(output_csv_file) == 0)

    for hour in range(hour_start, hour_end + 1):

        # the dir structure is hard coded: yyyy/mm/dd/hh
        hour_dir = os.path.join('{:0>4d}'.format(year),
                                '{:0>2d}'.format(month),
                                '{:0>2d}'.format(day),
                                '{:0>2d}'.format(hour))
        print("starting new directory: " + hour_dir)

        if not os.path.exists(hour_dir):
            # skip only this hour; later hours of the day may still exist
            print("directory does not exist; moving on")
            continue

        # read the files and get back only tweets with places or geo coordinates
        tweets, tweets_read, tweets_kept, files_read = read_tweets_from_datetimehour_dir(hour_dir)
        tweets_df = pd.DataFrame(tweets)
        has_tweets = len(tweets_df) > 0

        # print some outputs and statistics
        print("total tweets: {}".format(tweets_read))
        if has_tweets:
            # an empty frame has no 'created_at' column, so only convert
            # and report the range when something was kept
            tweets_df['created_at'] = pd.to_datetime(tweets_df['created_at'])
            print("date time range: {} to {}".format(
                tweets_df['created_at'].min(), tweets_df['created_at'].max()))
        if tweets_read:
            print("  percentage tweets kept for {} d {} h: {:0>2f} %".format(
                day, hour, 100*tweets_kept/tweets_read))
        else:
            print("  no tweets read ")

        # increment the counters
        total_tweets_read += tweets_read
        total_tweets_kept += tweets_kept
        total_files_read += files_read

        if has_tweets:
            filtered_df = tweets_df[columns_to_keep]

            # append this hour to the day's csv file
            filtered_df.to_csv(output_csv_file, mode='a', header=write_csv_header)

            write_csv_header = False  # don't write headers after the first time

            print("wrote to file")
        else:
            print("nothing written to file")

        # print stats for the 'hour' read in
        print("hour {} ended".format(hour))
        _print_running_totals(total_tweets_kept, total_tweets_read, total_files_read)

    # print stats for the 'day' read in
    print("day {} ended".format(day))
    _print_running_totals(total_tweets_kept, total_tweets_read, total_files_read)

# print overall stats
print("all files read")
_print_running_totals(total_tweets_kept, total_tweets_read, total_files_read)

# how long did that take?
toc = time.perf_counter()
print(f"iterating and determining place from geo coords took {toc - tic:0.4f} seconds")
