# Geotweet Data Summary

Author: @SirenaYu

This notebook explores the Geotweet data and generates summary dataframes which include the number of geotagged posts, posts with sentiment scores, and common posts by day. The summary dataframes are saved in CSV format, and each year has its own CSV file.

The notebook is divided into 3 sections:
* twitter_geography Folder Analysis: This section explores the data in twitter_geography folder.
* twitter_sentiment Folder Analysis: This section explores the data in twitter_sentiment folder.
* Master Dataframe: This section tests a method to generate master dataframes with the number of geotagged, sentiment, and common posts, and then uses this method to generate dataframes for each year from 2012 to 2020.

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import gzip
from script import days_in_month, hours_in_day
import time

# twitter_geography Folder Analysis

## twitter_geography/2015

In [None]:
# Point at the 2015 geography folder and count the regular files in it
# (sub-directories are excluded by the isfile check).
geo_path = "/srv/data/twitter_geography/2015/"
files = list(
    filter(
        lambda name: os.path.isfile(os.path.join(geo_path, name)),
        os.listdir(geo_path),
    )
)
print("There are a total of", len(files), "files in twitter_geography/2015.")

### Sample Dataframe 

In [None]:
# Load a single hourly geography file (tab-separated, gzip-compressed)
# to inspect its columns.
with gzip.open(geo_path + "geography_2015_1_10_00.csv.gz") as f:
    geo_posts = pd.read_csv(f, sep="\t")

geo_posts.head()

### Summary Dataframe 

In [None]:
def num_posts_on_day(path, prefix, year, month, day):
    """Count the rows found in all hourly files of a single day.

    Hourly files are looked up as
    ``<path><prefix>_<year>_<month>_<DD>_<hour>.csv.gz`` where the day is
    zero-padded to two digits and ``<hour>`` is each value yielded by
    ``hours_in_day()``. ``path`` is joined by plain string concatenation,
    so it must already end with a path separator.

    Args:
        path: Folder holding the hourly files, including the trailing "/".
        prefix: File-name prefix, e.g. "geography" or "bert_sentiment".
        year: Year used in the file name.
        month: Month used in the file name (not zero-padded).
        day: Day of month (zero-padded to two digits in the file name).

    Returns:
        Total number of rows read across the day's hourly files. Files that
        are missing or completely empty are reported on stdout and
        contribute zero rows.
    """
    day_path = ''.join([path, prefix, "_", str(year), "_", str(month), "_", str(day).zfill(2)])
    num_posts = 0
    for hour in hours_in_day():
        hour_file = ''.join([day_path, "_", hour, ".csv.gz"])
        try:
            with gzip.open(hour_file) as f:
                num_posts += len(pd.read_csv(f, sep="\t"))
        except FileNotFoundError:
            print(hour_file, "does not exist.")
        except pd.errors.EmptyDataError:
            # A zero-byte file makes read_csv raise; skip it instead of
            # aborting the whole day (matches num_posts_on_day_two_df).
            print(hour_file, "is empty.")
    return num_posts

In [None]:
# Collect one [year, month, day, count] row per calendar day of 2015.
df = []
for month in range(1, 13):
    last_day = days_in_month(month, 2015)
    for day in range(1, last_day + 1):
        geo_count = num_posts_on_day(geo_path, "geography", 2015, month, day)
        df.append([2015, month, day, geo_count])

In [None]:
# Turn the collected rows into a DataFrame; the bare name on the last line
# makes the notebook render it.
summary_columns = ["year", "month", "day", "num_posts"]
summary_df = pd.DataFrame(df, columns=summary_columns)
summary_df

In [None]:
# Write summary_df to the current working directory (relative path).
summary_df.to_csv("twitter_geography_2015_summary.csv")

# twitter_sentiment Folder Analysis

## twitter_sentiment/2015 

In [None]:
# Point at the 2015 sentiment folder and count the regular files in it
# (sub-directories are excluded by the isfile check).
sent_path = "/srv/data/twitter_sentiment/2015/"
files = list(
    filter(
        lambda name: os.path.isfile(os.path.join(sent_path, name)),
        os.listdir(sent_path),
    )
)
print("There are a total of", len(files), "files in twitter_sentiment/2015.")

### Sample Dataframe 

In [None]:
# Load a single hourly sentiment file (tab-separated, gzip-compressed)
# to inspect its columns.
with gzip.open(sent_path + "bert_sentiment_2015_1_10_00.csv.gz") as f:
    sent_posts = pd.read_csv(f, sep="\t")

sent_posts.head()

### Summary Dataframe

In [None]:
# Collect one [year, month, day, count] row per calendar day of 2015 from
# the sentiment folder. The folder is sent_path; the previous argument
# `path` was never defined in this notebook and raised NameError.
df = []
for month in range(1, 13):
    for day in range(1, days_in_month(month, 2015) + 1):
        df.append([2015, month, day, num_posts_on_day(sent_path, "bert_sentiment", 2015, month, day)])

In [None]:
# Turn the collected rows into a DataFrame with one row per day.
summary_columns = ["year", "month", "day", "num_posts"]
summary_df = pd.DataFrame(df, columns=summary_columns)

In [None]:
# Bare expression: the notebook renders the current summary_df.
summary_df

In [None]:
# Write summary_df to the current working directory (relative path).
summary_df.to_csv("twitter_sentiment_2015_summary.csv")

# Master Dataframe 

In [None]:
def _read_hourly_posts(file_path):
    """Read one gzipped, tab-separated hourly file; return None if it is missing or empty."""
    try:
        with gzip.open(file_path) as f:
            return pd.read_csv(f, sep="\t")
    except FileNotFoundError:
        print(file_path, "does not exist.")
    except pd.errors.EmptyDataError:
        print(file_path, "is empty.")
    return None


def num_posts_on_day_two_df(path_1, prefix_1, path_2, prefix_2, year, month, day):
    """Count one day's rows in two hourly datasets and the rows they share.

    For each hour yielded by ``hours_in_day()`` the files
    ``<path_i><prefix_i>_<year>_<month>_<DD>_<hour>.csv.gz`` (day zero-padded
    to two digits) are read from both datasets. Paths are joined by plain
    string concatenation, so ``path_1`` and ``path_2`` must end with a path
    separator.

    Each dataset is counted independently: a missing or empty file in one
    dataset no longer prevents the other dataset's file for that hour from
    being counted. The common count is only updated for hours where both
    files were read.

    Args:
        path_1: Folder of the first dataset, including the trailing "/".
        prefix_1: File-name prefix of the first dataset.
        path_2: Folder of the second dataset, including the trailing "/".
        prefix_2: File-name prefix of the second dataset.
        year: Year used in the file names.
        month: Month used in the file names (not zero-padded).
        day: Day of month (zero-padded to two digits in the file names).

    Returns:
        Tuple ``(num_posts_1, num_posts_2, num_posts_common)`` where the
        first two are row totals per dataset and the third is the total
        number of rows in the per-hour inner merge on ``message_id``.
    """
    date_part = "_".join([str(year), str(month), str(day).zfill(2)])
    day_path_1 = ''.join([path_1, prefix_1, "_", date_part])
    day_path_2 = ''.join([path_2, prefix_2, "_", date_part])

    num_posts_1 = 0
    num_posts_2 = 0
    num_posts_common = 0
    for hour in hours_in_day():
        posts_1 = _read_hourly_posts(''.join([day_path_1, "_", hour, ".csv.gz"]))
        posts_2 = _read_hourly_posts(''.join([day_path_2, "_", hour, ".csv.gz"]))

        if posts_1 is not None:
            num_posts_1 += len(posts_1)
        if posts_2 is not None:
            num_posts_2 += len(posts_2)
        if posts_1 is None or posts_2 is None:
            # Nothing to intersect for this hour.
            continue

        # NOTE(review): an inner merge yields one row per matching pair, so
        # repeated message_id values within an hour would inflate this
        # count -- confirm message_id is unique per file.
        num_posts_common += len(pd.merge(posts_1, posts_2, on="message_id", how="inner"))
    return num_posts_1, num_posts_2, num_posts_common

### Sample Call

In [None]:
# Spot check for 2015-12-10; the returned tuple is
# (num_posts_1, num_posts_2, num_posts_common), here geography / sentiment / both.
num_posts_on_day_two_df(geo_path, "geography", sent_path, "bert_sentiment", 2015, 12, 10)

## 2015 

In [None]:
# Collect one [year, month, day, geo, sentiment, common] row per day of 2015.
df = []
for month in range(1, 13):
    last_day = days_in_month(month, 2015)
    for day in range(1, last_day + 1):
        counts = num_posts_on_day_two_df(geo_path, "geography", sent_path, "bert_sentiment", 2015, month, day)
        row = [2015, month, day]
        row.extend(counts)
        df.append(row)

In [None]:
# Turn the collected rows into the master frame; the bare name on the last
# line makes the notebook render it.
master_columns = ["year", "month", "day", "num_geo_posts", "num_sent_posts", "num_common_posts"]
master_summary_df = pd.DataFrame(df, columns=master_columns)
master_summary_df

In [None]:
# Write master_summary_df to the current working directory (relative path).
master_summary_df.to_csv("master_2015_summary.csv")

## 2018

In [None]:
# 2018 data folders; the trailing "/" is required because file names are
# appended by plain string concatenation in num_posts_on_day_two_df.
geo_path_2018 = "/srv/data/twitter_geography/2018/"
sent_path_2018 = "/srv/data/twitter_sentiment/2018/"

In [None]:
# Spot check for 2018-10-13; the returned tuple is
# (num_posts_1, num_posts_2, num_posts_common), here geography / sentiment / both.
num_posts_on_day_two_df(geo_path_2018, "geography", sent_path_2018, "bert_sentiment", 2018, 10, 13)

In [None]:
# Collect one [year, month, day, geo, sentiment, common] row per day of 2018,
# printing the date being processed as a progress indicator.
df = []
for month in range(1, 13):
    last_day = days_in_month(month, 2018)
    for day in range(1, last_day + 1):
        print(month, day)
        counts = num_posts_on_day_two_df(geo_path_2018, "geography", sent_path_2018, "bert_sentiment", 2018, month, day)
        row = [2018, month, day]
        row.extend(counts)
        df.append(row)

In [None]:
# Turn the collected rows into the 2018 master frame; the bare name on the
# last line makes the notebook render it.
master_columns = ["year", "month", "day", "num_geo_posts", "num_sent_posts", "num_common_posts"]
master_summary_df_2018 = pd.DataFrame(df, columns=master_columns)
master_summary_df_2018

In [None]:
# Write master_summary_df_2018 to the current working directory (relative path).
master_summary_df_2018.to_csv("master_2018_summary.csv")

## 1/10, 2012-2021 

In [None]:
# Compare the same calendar day (January 10) across the years 2012-2021.
# The per-year folders get their own names so the notebook-level geo_path /
# sent_path (the 2015 folders) are not overwritten; otherwise re-running an
# earlier 2015 cell after this one would silently read the 2021 folders.
df = []
for year in range(2012, 2022):
    year_geo_path = ''.join(["/srv/data/twitter_geography/", str(year), "/"])
    year_sent_path = ''.join(["/srv/data/twitter_sentiment/", str(year), "/"])
    counts = num_posts_on_day_two_df(year_geo_path, "geography", year_sent_path, "bert_sentiment", year, 1, 10)
    df.append([year, 1, 10] + list(counts))

In [None]:
# Turn the per-year rows into a DataFrame; the bare name on the last line
# makes the notebook render it.
master_columns = ["year", "month", "day", "num_geo_posts", "num_sent_posts", "num_common_posts"]
master_summary_df_1_10_2012_2021 = pd.DataFrame(df, columns=master_columns)
master_summary_df_1_10_2012_2021

In [None]:
# Write the multi-year January 10 summary to ../output/ (relative to the
# current working directory; the folder must already exist).
master_summary_df_1_10_2012_2021.to_csv('../output/master_2012_2021_1_10_summary.csv')

## Method to Run 2012 - 2020 

In [None]:
def generate_master_df_year(year):
    """Build the per-day master summary for one year and save it as CSV.

    For every calendar day of ``year`` the geography and sentiment folders
    under ``/srv/data/`` are counted with ``num_posts_on_day_two_df``. The
    result is written to ``../output/master_<year>_summary.csv`` with the
    columns year, month, day, num_geo_posts, num_sent_posts and
    num_common_posts.

    Args:
        year: Year to summarise; also selects the per-year data folders.

    Returns:
        None. The summary is only written to disk.
    """
    output_dir = "../output/"
    # Create the target folder before the long read loop so a missing
    # directory is not discovered only when the CSV is finally written.
    os.makedirs(output_dir, exist_ok=True)

    geo_path = ''.join(["/srv/data/twitter_geography/", str(year), "/"])
    sent_path = ''.join(["/srv/data/twitter_sentiment/", str(year), "/"])
    df = []
    for month in range(1, 13):
        for day in range(1, days_in_month(month, year) + 1):
            counts = num_posts_on_day_two_df(geo_path, "geography", sent_path, "bert_sentiment", year, month, day)
            df.append([year, month, day] + list(counts))
    master_summary_df = pd.DataFrame(
        data=df,
        columns=["year", "month", "day", "num_geo_posts", "num_sent_posts", "num_common_posts"],
    )
    master_summary_df.to_csv(''.join([output_dir, "master_", str(year), "_summary.csv"]))

In [None]:
# Generate one summary CSV per year; range(2012, 2021) covers 2012 through
# 2020 inclusive (2021 is not processed here).
for year in range(2012, 2021):
    generate_master_df_year(year)