## Counting Number of Tweets Per Country Per Event
This notebook goes through all the dataframes containing the geocoded tweets for the different events and counts the number of tweets per country per event. For each event, the counts are stored in a dataframe indexed by country, which is pickled as `summary.pickle` in that event's Geocoded folder. 

In [1]:
import os
import pandas as pd
import pickle
from tqdm import tqdm

In [2]:
# Resolve the tweets root relative to the directory the notebook is run from.
cwd = os.getcwd()
path = os.path.join(cwd, "../../../Project Data/Tweets")

# Event folders: every sub-directory of the tweets root whose name contains
# neither "checkpoints" nor "DS_Store".
folders = [
    name
    for name in os.listdir(path)
    if os.path.isdir(os.path.join(path, name))
    and not any(marker in name for marker in ("checkpoints", "DS_Store"))
]

# Set to True to enable the debug prints in this notebook.
do_prints = False

# Load the country mapping and keep its distinct values. These codes are used
# to initialise the dataframe holding the number of tweets per country per event.
country_codes = pd.read_pickle("country_mapping.pickle")
unique_codes = list(set(country_codes.values()))
if do_prints:
    print(type(unique_codes[0]))
# Drop float entries (presumably NaN placeholders for unmapped locations -- TODO confirm).
country_codes = [code for code in unique_codes if type(code) is not float]

# Go through all the different event folders and write one summary per event.
for folder in folders:
    # The geocoded dataframes of an event are read from <event>/Geocoded.
    files_path = os.path.join(path, folder, "Geocoded")
    if not os.path.isdir(files_path):
        # os.listdir would raise FileNotFoundError here and abort the remaining
        # events; skip this event and report it instead.
        print(f"Skipping {folder}: no Geocoded folder found")
        continue

    # Only files whose name contains "Located" are counted; this also keeps a
    # previously written summary.pickle out of the input.
    located_files = [x for x in os.listdir(files_path) if "Located" in x]

    # Start from a zero count for every known country code. The counts live in
    # the "text" column and the index is named "country".
    event_locations = pd.DataFrame(
        {"text": 0}, index=pd.Index(country_codes, name="country")
    )

    # Go through all the different files in the folder and process them.
    for pkl_file in tqdm(located_files):
        df = pd.read_pickle(os.path.join(files_path, pkl_file))
        # Number of non-null "text" values per country in this file.
        interm_df = df[["country", "text"]].groupby("country").count()
        # add() aligns on the country index; fill_value=0 treats a country that
        # is missing on one side as 0, so countries that are not listed in
        # country_codes are appended rather than dropped.
        event_locations = event_locations.add(interm_df, fill_value=0)

    # Pickle the event dataframe next to its input files.
    if do_prints:
        print(event_locations["text"].tolist())
    event_locations.to_pickle(os.path.join(files_path, "summary.pickle"))


100%|██████████| 333/333 [00:10<00:00, 32.12it/s]
100%|██████████| 28/28 [00:00<00:00, 55.42it/s] 
100%|██████████| 5/5 [00:00<00:00, 41.14it/s]
100%|██████████| 10/10 [00:00<00:00, 21.35it/s]
