In [None]:
# Adapted from this Stack Overflow answer: https://stackoverflow.com/a/39225039
# and this GitHub repo: https://github.com/nsadawi/Download-Large-File-From-Google-Drive-Using-Python
import requests
import os
from tqdm import tqdm

def download_file_from_google_drive(id, destination):
    """
    Downloads a single Google Drive file to a local path.

    A first request is made with only the file ID. If get_confirm_token finds
    a 'download_warning' cookie on that response, the request is repeated with
    the cookie value as the 'confirm' parameter. The body of the final
    response is streamed to disk by save_response_content.

    id: Google Drive file ID (the name shadows the builtin, but is kept so
        callers passing it by keyword keep working)
    destination: path of the file to write, including the filename

    Raises the HTTP error from response.raise_for_status() when the final
    response has an error status, so an error page is never saved in place of
    the requested file.
    """
    URL = "https://docs.google.com/uc?export=download"

    # The with-block closes the session (and its pooled connections) even if
    # the download fails part-way through.
    with requests.Session() as session:
        response = session.get(URL, params = { 'id' : id }, stream = True)
        token = get_confirm_token(response)

        if token:
            # The first streamed body is never read; release its connection
            # before issuing the confirmed request.
            response.close()
            params = { 'id' : id, 'confirm' : token }
            response = session.get(URL, params = params, stream = True)

        try:
            response.raise_for_status()
            save_response_content(response, destination)
        finally:
            response.close()

def get_confirm_token(response):
    """
    Returns the value of the first cookie on `response` whose name starts
    with 'download_warning', or None when no cookie matches.
    """
    matching_values = (
        cookie_value
        for cookie_name, cookie_value in response.cookies.items()
        if cookie_name.startswith('download_warning')
    )
    # next() with a default yields the first match, or None if there is none.
    return next(matching_values, None)

def save_response_content(response, destination):
    """
    Streams the body of `response` into the file at `destination`.

    The body is read through response.iter_content(1048576) with a tqdm
    progress bar wrapped around the chunk iterator. The file is opened in
    binary write mode, so an existing file is overwritten.
    """
    with open(destination, "wb") as out_file:
        progress = tqdm(
            response.iter_content(1048576),
            unit="MiB", desc=f"Downloading {destination}"
        )
        # Empty chunks (keep-alive chunks, per the original note) are skipped.
        out_file.writelines(piece for piece in progress if piece)
                
def download_multiple(files=None):
    """
    Downloads Gdrive files from IDs into specified paths
    Takes dictionary of {"filepath": "ID"}

    Note:
    filepath must include filename, i.e. /directory/file.ext
    filename does not need to match filename on Gdrive

    Missing parent directories are created, including nested ones. A bare
    filename with no directory part is passed through unchanged. Calling
    with no argument (or an empty dictionary) downloads nothing.
    """
    # None rather than a mutable {} default, which would be one dictionary
    # shared by every call.
    if files is None:
        files = {}

    for fp, ID in files.items():
        dir_name = os.path.dirname(fp)

        # dirname is "" for a bare filename: there is nothing to create, and
        # os.mkdir("") would raise FileNotFoundError.
        if dir_name:
            # makedirs also creates intermediate directories; exist_ok makes
            # an already-existing directory a no-op.
            os.makedirs(dir_name, exist_ok=True)

        download_file_from_google_drive(ID, fp)

In [None]:
"""
#for testing
files={ "1558M/model.ckpt.meta": "10HlhfjbVNSuchYa2hmk9U1OlSRK6ZZxy",
        "774M/model.ckpt.meta": "1-omUh72kLcFCKecKTn6RMduJh51dgnC-",
        "out/shchedule.pdf": "1SZ76oFavmUeXeee1GElfpfwULzOxVUEH"
      }
"""

# Actual downloads
# Maps local destination path -> Google Drive file ID, the {"filepath": "ID"}
# layout that download_multiple takes. The paths are relative, so the 774M/
# and 1558M/ directories are resolved against the current working directory.
files={ "774M/model.ckpt.data-00000-of-00001": "1-iNTSsuloHKMzZA-ZDLSHN88RbWa3WrN",
        "1558M/model.ckpt.data-00000-of-00001": "107pyhj1vKojyoiFS0GJTyo0AWZNntUyo"
      }

# Fetch every entry; a missing destination directory is created first.
download_multiple(files)

# Twitter-stream download and processing

In [None]:
# Download generic tweets
# NOTE(review): download_tools is a project-local module that is not shown
# here. The comments below assume download_files(mapping, directory) fetches
# each URL into `directory` under the mapped filename — confirm against it.
from download_tools import download_files
# Filename -> source URL for three archives, named for 2018-11-01 through
# 2018-11-03, from the archiveteam-twitter-stream-2018-11 item on archive.org.
downloads = {
    "twitter_stream_2018_11_01.tar": "https://archive.org/download/archiveteam-twitter-stream-2018-11/twitter_stream_2018_11_01.tar",
    "twitter_stream_2018_11_02.tar": "https://archive.org/download/archiveteam-twitter-stream-2018-11/twitter_stream_2018_11_02.tar",
    "twitter_stream_2018_11_03.tar": "https://archive.org/download/archiveteam-twitter-stream-2018-11/twitter_stream_2018_11_03.tar"
    }
# Hard-coded absolute path on a D: drive; adjust it for the machine running
# the notebook. The untar cell further down reads from the same directory.
location = "D:/twitter-stream/twitter-stream-2018-11"
download_files(downloads, location)

In [None]:
import tarfile
from glob import glob

# work_dir is searched recursively for .tar archives; save_dir is where they
# are unpacked. Both currently point at the same directory.
work_dir = "D:/twitter-stream/twitter-stream-2018-11/"
save_dir = "D:/twitter-stream/twitter-stream-2018-11/"

for fp in glob(work_dir + "**/*.tar", recursive=True):
    print(f"Untarring {fp}")
    # The with-block closes the archive's file handle as soon as extraction
    # finishes or fails, instead of leaving it open until garbage collection.
    # NOTE(review): extractall trusts the member paths stored in the archive.
    # For archives from an untrusted source, consider an extraction filter
    # (see the tarfile docs) on Python versions that support it.
    with tarfile.open(fp) as archive:
        archive.extractall(save_dir)

    

In [None]:
# Prompt for the directory tree to scan and the directory that receives the CSV.
work_dir = input("work directory (source)")
save_dir = input("save directory (not file):")

# Path of every file under work_dir whose name ends in "bz2". A comprehension
# keeps the walk's loop variables (including `_`) out of the notebook namespace.
dir_list = [
    os.path.join(root, filename)
    for root, _, filenames in os.walk(work_dir)
    for filename in filenames
    if filename.endswith("bz2")
]

print(len(dir_list), "files found")


# NOTE(review): p_map, process, multiprocessing and csv are not defined in the
# cells visible here; they must already be in scope when this cell runs.
# Leave 1 thread for other processes, but never request fewer than one worker
# (cpu_count() - 1 is 0 on a single-core machine).
num_workers = max(1, multiprocessing.cpu_count() - 1)
tweets_per_file = p_map(process, dir_list, num_cpus=num_workers)
# Flatten the per-file results in linear time; sum(lists, []) re-copies the
# accumulated list on every addition.
tweet_list = [tweet for file_tweets in tweets_per_file for tweet in file_tweets]


print(len(tweet_list), 'Tweets gathered\n\n Samples:\n')
for tweet in tweet_list[:5]:
    print(tweet)


# newline='' lets the csv writer control line endings itself; without it,
# platforms that translate "\n" on write get an extra "\r" in every row ending.
with open(os.path.join(save_dir, str(len(tweet_list)) + "tweets.csv"), 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, delimiter=',')
    # NOTE(review): writerow emits ALL tweets as the fields of one single row.
    # If one tweet per row is intended, this should be
    # writer.writerows([t] for t in tweet_list) — confirm before changing the
    # output format.
    writer.writerow(tweet_list)