In [1]:
import sys
import os
import numpy as np
import pandas as pd
import gzip
from script import days_in_month, hours_in_day, leap_year
import time
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import date, timedelta
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [58]:
def _read_hourly_posts(file_path):
    """
    Load one gzipped, tab-separated hourly file.

    @param file_path: str, full path to a ".csv.gz" file

    returns: the parsed DataFrame, or None (after printing a notice) when the
             file does not exist or contains no data
    """
    try:
        with gzip.open(file_path) as f:
            return pd.read_csv(f, sep="\t")
    except FileNotFoundError:
        print(file_path, "does not exist.")
    except pd.errors.EmptyDataError:
        print(file_path, "is empty.")
    return None


def daily_num_posts_by_country(path_1, prefix_1, path_2, prefix_2, year, month, day):
    """
    Count, per NAME_0 value (country), the posts that appear in both data sets on one day.

    For every hour returned by hours_in_day(), the two hourly files are
    inner-joined on "message_id" and the joined rows are counted per "NAME_0".
    An hour is skipped (with a printed notice) when either file is missing or
    empty; the second file is not opened when the first one is unusable.

    @param path_n: str, file path (directory) to df n
    @param prefix_n: str, file prefix, "geography" for geography files, "bert_sentiment" for sentiment files
    @param year: int, year
    @param month: int, month
    @param day: int, day

    returns: an integer pd.Series indexed by NAME_0 (sorted) holding the number
             of common posts on this day; an empty integer Series when no hour
             of the day could be read
    """
    # NOTE: only the day is zero-padded here; the month is used as given.
    day_path_1 = ''.join([path_1, prefix_1, "_", str(year), "_", str(month), "_", str(day).zfill(2)])
    day_path_2 = ''.join([path_2, prefix_2, "_", str(year), "_", str(month), "_", str(day).zfill(2)])

    hourly_counts = []
    for hour in hours_in_day():
        posts_1 = _read_hourly_posts(''.join([day_path_1, "_", hour, ".csv.gz"]))
        if posts_1 is None:
            continue
        posts_2 = _read_hourly_posts(''.join([day_path_2, "_", hour, ".csv.gz"]))
        if posts_2 is None:
            continue
        common_posts = pd.merge(posts_1, posts_2, on="message_id", how="inner")
        hourly_counts.append(common_posts.groupby('NAME_0').size())

    # No usable hour at all: return an empty Series instead of failing on None.
    if not hourly_counts:
        return pd.Series(dtype=int)

    # Stack the hourly counts once and add up the entries that share a country.
    return pd.concat(hourly_counts).groupby(level=0).sum().astype(int)


# Try the daily counter on a single day (2015-12-10) of the 2015 data.
geo_path = "/srv/data/twitter_geography/2015/"
sent_path = "/srv/data/twitter_sentiment/2015/"
daily_num_posts_by_country(
    path_1=geo_path,
    prefix_1="geography",
    path_2=sent_path,
    prefix_2="bert_sentiment",
    year=2015,
    month=12,
    day=10,
)

Afghanistan              52
Albania                  29
Algeria                 322
American Samoa            3
Angola                   81
                       ... 
Vietnam                 504
Virgin Islands, U.S.     52
Yemen                    77
Zambia                  161
Zimbabwe                 49
Length: 160, dtype: int64

In [59]:
# Keep the per-country counts for 2015-12-10 so a following cell can reuse them.
a = daily_num_posts_by_country(
    geo_path, "geography", sent_path, "bert_sentiment", year=2015, month=12, day=10
)

In [43]:
# Prototype of one output row: date fields followed by the per-country counts.
# NOTE(review): `a` is computed for 2015-12-10 in the previous cell, while the
# date fields here say 2012 -- confirm which is intended.
b = pd.Series([2012, 12, 10], index=["year", "month", "day"])
# Series.append was removed in pandas 2.0; pd.concat gives the same result.
pd.concat([b, a])

year                    2012
month                     12
day                       10
Afghanistan               52
Albania                   29
                        ... 
Vietnam                  504
Virgin Islands, U.S.      52
Yemen                     77
Zambia                   161
Zimbabwe                  49
Length: 163, dtype: int64

In [45]:
def generate_daily_num_posts_df_year(year, geo_dir, sent_dir, out_dir):
    """
    Write a per-day, per-country summary of common posts for one year.

    @param year: int, year to generate the dataframe for
    @param geo_dir: directory under which the geography data files are stored
    @param sent_dir: directory under which the sentiment data files are stored
    @param out_dir: directory to which the dataframe will be stored as csv file

    Generates a csv file named "num_posts_summary_by_country<year>.csv" in
    out_dir with one row per day of the given year: the columns "year",
    "month", "day", followed by one column per country holding the number of
    posts found in both the geography and the sentiment files that day.
    Countries with no posts on a given day are filled with 0.

    Missing and empty input files are only reported on stdout by
    daily_num_posts_by_country; no separate report files are written here.

    returns: None
    """
    geo_path = ''.join([geo_dir, str(year), "/"])
    sent_path = ''.join([sent_dir, str(year), "/"])
    data = []
    for month in range(1, 13):
        for day in range(1, days_in_month(month, year) + 1):
            date_part = pd.Series([year, month, day], index=["year", "month", "day"])
            counts = daily_num_posts_by_country(geo_path, "geography", sent_path, "bert_sentiment", year, month, day)
            # Series.append was removed in pandas 2.0; pd.concat is the
            # equivalent that works on old and new versions alike.
            data.append(pd.concat([date_part, counts]))
    # Days do not all share the same set of countries, so the union of columns
    # contains NaN gaps; a gap means "no posts", hence 0.
    df = pd.DataFrame(data=data).fillna(0).astype(int)
    df.to_csv(''.join([out_dir, "num_posts_summary_by_country", str(year), ".csv"]))

In [57]:
# Trial run of the yearly summary logic on the first five days of January 2012,
# kept in memory instead of being written to disk.
geo_dir = "/srv/data/twitter_geography/"
sent_dir = "/srv/data/twitter_sentiment/"
out_dir = "../output/"

year = 2012

geo_path = ''.join([geo_dir, str(year), "/"])
sent_path = ''.join([sent_dir, str(year), "/"])
data = []
for month in range(1, 2):
    for day in range(1, 6):
        date_part = pd.Series([year, month, day], index=["year", "month", "day"])
        counts = daily_num_posts_by_country(geo_path, "geography", sent_path, "bert_sentiment", year, month, day)
        # Series.append was removed in pandas 2.0; pd.concat is equivalent.
        data.append(pd.concat([date_part, counts]))
# Countries absent on a given day show up as NaN after alignment; treat as 0.
df = pd.DataFrame(data=data).fillna(0).astype(int)
df

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,year,month,day,Algeria,Angola,Argentina,Australia,Austria,Azerbaijan,Bangladesh,...,Suriname,Tunisia,Vanuatu,Zimbabwe,French Guiana,Guinea-Bissau,Sudan,Tajikistan,Guinea,Mauritania
0,2012,1,1,1,1,36,93,19,2,6,...,0,0,0,0,0,0,0,0,0,0
1,2012,1,2,0,0,68,170,16,1,2,...,0,0,0,0,0,0,0,0,0,0
2,2012,1,3,1,0,77,185,21,1,3,...,1,2,1,2,0,0,0,0,0,0
3,2012,1,4,0,1,75,142,32,4,5,...,0,3,0,0,1,1,2,1,0,0
4,2012,1,5,1,0,75,169,27,2,4,...,0,3,0,2,0,0,1,0,1,1
