## **Code Setup**

**Installing Dependencies**

In [1]:
#!pip install pathlib
#!pip install patool
#!pip install internetarchive
#!pip install elasticsearch

**Setup Variables**

In [2]:
from os import listdir

# Root folder that holds one sub-folder per downloaded dataset
path_to_files = '../archive.org/'
#path_to_files = '../archive_test/'


# Names of the dataset folders found under path_to_files.
# os.listdir() returns entries in arbitrary order, so sort them: the
# "Execute Code" section selects a month by its position in this list.
months_list = sorted(listdir(path_to_files))

# Define index name for Elasticsearch
#es_index_name = 'tweet_results_2017-11'

# Substrings that mark a tweet as relevant (matching is case-sensitive,
# hence the spelling variants)
searchterm = ('Misha Collins', 'misha collins', 'mishacollins', 'MishaCollins')

## **Code Pipeline**

**Extract Datasets**

In [3]:
# This code cell extracts all datasets under a predefined path in 2 steps:
# 1) Extract all TAR files to get GZ files
# 2) Extract all GZ files to get JSON files + analyze tweet text from datasets

import os
import glob
import patoolib
from pathlib import Path

# Unzips all tar files
def extract_tar(path, month):
    """Unpack every TAR archive located directly inside ``path + month``.

    Each archive is extracted into the folder it sits in and the archive
    file itself is deleted afterwards.

    Args:
        path: Root folder of the datasets; joined to ``month`` by plain
            string concatenation, so it has to end with a separator.
        month: Name of the dataset sub-folder to process.

    Returns:
        None
    """
    month_dir = path + month

    print("2. Unzip TAR files ...")

    tar_archives = glob.glob(month_dir + '/*.tar')
    for archive in tar_archives:
        target_dir = os.path.dirname(archive)
        patoolib.extract_archive(archive, outdir=target_dir)
        # The archive is no longer needed once its content is unpacked
        os.remove(os.fspath(archive))
    return None

# Unzips all gz files from the folders
def extract_gz(path, month_path, month, searchterm):
    """Unpack each GZ archive below ``path + month_path`` and store matching tweets.

    For every ``*.gz`` file found recursively: extract it into its own
    folder, run analyze_tweet_text() over the whole month folder, append the
    returned entries as JSON lines to ``key_tweets_<month>.json`` (relative
    to the current working directory) and finally delete the archive.

    Args:
        path: Root folder of the datasets; joined to ``month_path`` by plain
            string concatenation.
        month_path: Name of the dataset sub-folder to process.
        month: Label used in the name of the result file.
        searchterm: Search terms handed on to analyze_tweet_text().

    Returns:
        None
    """
    month_dir = path + month_path
    output_name = 'key_tweets_' + month + '.json'

    print("3. Extracting gz files ...")

    gz_archives = glob.glob(month_dir + '/**/*.gz', recursive=True)
    for archive in gz_archives:
        patoolib.extract_archive(archive, outdir=os.path.dirname(archive))

        # Filter the freshly extracted data with analyze_tweet_text()
        matches = analyze_tweet_text(month_dir, searchterm)

        # Append the key tweets to the result file, one JSON document per line
        # can i change this to dumping straight into a mongodb DB?
        with open(output_name, 'a', encoding="utf-8") as out_file:
            out_file.writelines("%s\n" % json.dumps(entry) for entry in matches)

        os.remove(os.fspath(archive))
    return None

**Analyze Tweets**

In [4]:
# This code cell filters JSON files for tweets which contain one of the terms in the predefined searchterm list and 
# extracts only the key attributes from relevant tweets
# Function analyze_tweet_text() gets called in the function extract_gz()

import os
import json
from pathlib import Path
from glob import iglob

def analyze_tweet_text(path, searchterms):
    """Filter the JSON-lines files below ``path`` for tweets mentioning a search term.

    Every ``*.json`` file found recursively under ``path`` is read line by
    line; each non-blank line is decoded as one JSON document. Records
    without a 'source' attribute are skipped. For the remaining tweets the
    text ('extended_tweet'/'full_text' when 'truncated' is true, otherwise
    'text') is checked for the search terms. Each file is deleted as soon as
    it has been processed, so a later call does not scan it again.

    Args:
        path: Folder to search for JSON files.
        searchterms: Iterable of substrings; a tweet matches when its text
            contains at least one of them (case-sensitive).

    Returns:
        list: One entry per matching tweet, each entry being the value
        extract_key_info() returns for that single tweet.
    """
    tweets_final = []  # Array for final Tweets

    print("4. Analyze Tweets ...")

    # Search for JSON files
    print("     Importing data ...")
    json_files = list(Path(path).glob('**/*.json'))  # List with all JSON Files

    # Start filter process
    print("     Filter process started ...")
    for js in json_files:
        with open(js, encoding='utf-8') as json_file:
            for line in json_file:
                if not line.strip():
                    continue

                # A malformed line must not abort the whole run
                try:
                    tweet_line = json.loads(line)
                except ValueError:
                    print("Decoding JSON has failed")
                    continue

                # 1. Filter: Check for deleted tweets (records without 'source')
                if not isinstance(tweet_line, dict) or 'source' not in tweet_line:
                    continue

                # 2. Filter: Check if tweet has more than 140 characters (truncated = true)
                if tweet_line['truncated'] == True:
                    tweet_text = tweet_line['extended_tweet']['full_text']
                else:
                    tweet_text = tweet_line['text']

                # 3. Filter: Check if text contains any of the searchterms
                if any(term in tweet_text for term in searchterms):
                    try:
                        tweets_final.append(extract_key_info([tweet_line]))
                        print(len(tweets_final))
                    except ValueError:
                        print("Decoding JSON has failed")

        # Remove every processed file (after it has been closed). Doing this
        # once after the loop would delete only the last file and fail with a
        # NameError when no JSON file was found at all.
        os.remove(os.fspath(js))

    return tweets_final

**Extract Key Information**

In [5]:
# This code cell extracts only the key attributes from the key files 
# Function extract_key_info() gets called in the function analyze_tweet_text()

# don't do this, do the processing with the mongodb file on the VM instead?
# only need created datetime, tweet text

import json

def extract_key_info(tweets):
    """Reduce raw tweet objects to the few attributes kept for analysis.

    Args:
        tweets: Iterable of decoded tweet dicts. Each one must provide
            'truncated', 'created_at', 'source', 'lang' and 'timestamp_ms',
            plus 'text' or, when 'truncated' is true,
            'extended_tweet'['full_text'].

    Returns:
        list[dict]: One dict per input tweet with the keys 'created_at',
        'text', 'source', 'lang' and 'timestamp_ms'.

    Raises:
        KeyError: If a tweet lacks one of the attributes listed above.
    """
    extracted_tweets = []  # Array for extracted tweets

    for raw in tweets:
        # Text Attributes - Check if text contains more than 140 characters
        if raw['truncated'] == True:
            tweet_text = raw['extended_tweet']['full_text']
        else:
            tweet_text = raw['text']

        # Attributes currently not extracted (add them here if needed):
        #   id, user (id, name, location, url, description, verified,
        #   followers_count, friends_count, favourites_count, statuses_count,
        #   created_at, utc_offset, time_zone, geo_enabled, lang), geo,
        #   coordinates, place, quote_count, reply_count, retweet_count,
        #   favorite_count, entities.hashtags, entities.urls, favorited,
        #   retweeted
        #
        # NOTE: do not "comment out" entries of the dict below with a
        # triple-quoted string. Python concatenates adjacent string literals,
        # which previously merged such a block into the "lang" key.
        jsonobj = {
            "created_at": raw['created_at'],
            "text": tweet_text,
            "source": raw['source'],
            "lang": raw['lang'],
            "timestamp_ms": raw['timestamp_ms'],
        }

        extracted_tweets.append(jsonobj)

    return extracted_tweets

## **Execute Code**

In [6]:
# The last 7 characters of each dataset folder name serve as the month key
month_list = [folder_name[-7:] for folder_name in months_list]
# Look-up from month key to the full dataset folder name
month_dict = {key: folder_name for key, folder_name in zip(month_list, months_list)}

In [10]:
# Display the available month keys
month_list

['2021-11',
 '2021-12',
 '2022-01',
 '2022-02',
 '2022-03',
 '2022-04',
 '2022-05',
 '2022-06',
 '2022-07',
 '2022-08',
 '2022-09',
 '2022-10',
 '2022-11']

In [11]:
# Process one month per run: a whole year could be looped over, but that is
# too much data to download at once. Index -2 picks the second-to-last entry
# of month_list.
month = month_list[-2]
# Display the dataset folder name that belongs to the selected month
month_dict[month]

'archiveteam-twitter-stream-2022-10'

In [None]:
# 1) Download datasets (currently disabled)
#downloader(identifier_list, path_to_files)

# 2) Extract the TAR archives of the selected month
extract_tar(path_to_files, month_dict[month])

# 3) Extract GZ files to JSON files and analyze files for relevant tweets;
#    matches are appended to key_tweets_<month>.json
extract_gz(path_to_files, month_dict[month], month, searchterm) #calls function analyze_tweet_text()

# 4) Index relevant tweets to Elasticsearch (currently disabled)
#index_to_es(es_index_name, rf_key_tweets)

2. Unzip TAR files ...
patool: Extracting ../archive.org/archiveteam-twitter-stream-2022-10\twitter-stream-20221001.tar ...
patool: running C:\WINDOWS\system32\tar.EXE --extract --file ../archive.org/archiveteam-twitter-stream-2022-10\twitter-stream-20221001.tar --directory ../archive.org/archiveteam-twitter-stream-2022-10
