# Missing File Analysis

Author: @SirenaYu

In missing_file_analysis, the following definitions are used:

* A file is defined as <b>missing</b> if the filepath does not exist in the directory.
* A file is defined as <b>empty</b> if the filepath exists but the file is empty.
* A file is defined as <b>corrupted</b> if the number of posts it contains is less than the lower of 1) 10,000 or 2) the 10th percentile of the number of posts per file in that year.

The notebook generates the following reports:
* Lists of missing files, empty files, and corrupted files, each in the form of a csv file. An example of such a csv file is shown in the notebook.
* The number of missing files, empty files, and corrupted files from each year, stored in a dataframe. The dataframe is then saved as a csv file.

In [None]:
import sys
import os
import numpy as np
import pandas as pd
import gzip
from script import days_in_month, hours_in_day

# 2012, Geography Folder

In [None]:
# Walk over every expected hourly file of the 2012 geography folder and
# sort it into one of three buckets.
missing_files = []  # expected paths that do not exist on disk
empty_files = []    # paths that exist but contain no data to parse
num_posts = []      # row count of every file that could be read

for month in range(1, 13):
    last_day = days_in_month(month, 2012)
    for day in range(1, last_day + 1):
        for hour in range(24):
            # NOTE(review): the month is not zero-padded while day and hour
            # are -- confirm this matches the real file names.
            file_name = "/srv/data/twitter_geography/2012/geography_2012_{}_{}_{}.csv.gz".format(
                month, str(day).zfill(2), str(hour).zfill(2)
            )
            if not os.path.exists(file_name):
                missing_files.append(file_name)
                continue
            try:
                with gzip.open(file_name) as handle:
                    frame = pd.read_csv(handle, sep="\t")
            except pd.errors.EmptyDataError:
                # read_csv found nothing to parse: the file exists but is empty.
                empty_files.append(file_name)
            else:
                num_posts.append(len(frame))
                

In [None]:
# Summarise the scan of the 2012 geography folder.
print("There are a total of", len(missing_files), "missing files from 2012 geography files.")
print("There are a total of", len(empty_files), "empty files from 2012 geography files.")
# Guard the average: when no file could be read, num_posts is empty and the
# plain division would raise ZeroDivisionError.
average_num_posts = sum(num_posts) / len(num_posts) if num_posts else float("nan")
print("The average number of posts from each file is", average_num_posts, "from 2012 geography files.")

In [None]:
# A file counts as corrupted when it holds fewer posts than the lower of
# 10,000 and the 10th percentile of the per-file post counts.
post_counts = pd.Series(num_posts)
bottom_10_percentile = post_counts.quantile(q=0.1)
threshold = bottom_10_percentile if bottom_10_percentile < 10000 else 10000
# Last expression of the cell: displays the chosen threshold.
threshold

In [None]:
# Second pass over the 2012 geography files: collect every readable file
# whose number of posts is below the threshold.
corrupted_files = []

for month in range(1, 13):
    last_day = days_in_month(month, 2012)
    for day in range(1, last_day + 1):
        for hour in range(24):
            file_name = "/srv/data/twitter_geography/2012/geography_2012_{}_{}_{}.csv.gz".format(
                month, str(day).zfill(2), str(hour).zfill(2)
            )
            if not os.path.exists(file_name):
                # Missing files are not part of this pass.
                continue
            try:
                with gzip.open(file_name) as handle:
                    frame = pd.read_csv(handle, sep="\t")
            except pd.errors.EmptyDataError:
                # Empty files are skipped here as well.
                continue
            if len(frame) < threshold:
                corrupted_files.append(file_name)

In [None]:
# Report how many 2012 files fell below the corruption threshold
# (fixes the "corrputed" typo in the printed message).
print("There are a total of", len(corrupted_files), "corrupted files from 2012.")

In [None]:
# Turn each list of file paths into a one-column dataframe so it can be
# displayed and written out as csv.
missing_files_df = pd.Series(missing_files).to_frame(name="missing_files")
empty_files_df = pd.Series(empty_files).to_frame(name="empty_files")
corrupted_files_df = pd.Series(corrupted_files).to_frame(name="corrupted_files")

### Example of List of Missing Files 

In [None]:
# Display the dataframe of missing 2012 geography files as the cell output.
missing_files_df

In [None]:
# Write the three 2012 geography lists next to each other in the report folder.
report_dir = "../output/missing_file_report/"
report_suffix = "_" + str(2012) + "_geography.csv"
missing_files_df.to_csv(report_dir + "missing_files" + report_suffix)
empty_files_df.to_csv(report_dir + "empty_files" + report_suffix)
corrupted_files_df.to_csv(report_dir + "corrupted_files" + report_suffix)

# All years, 2012-2021

In [None]:
def saving_list_of_files(year, folder):
    """
    Scan every expected hourly file of one year in one data folder and save
    the lists of missing, empty, and corrupted files as csv reports.

    A file is missing if its path does not exist, empty if pandas finds no
    data to parse in it, and corrupted if it holds fewer posts than the lower
    of 10,000 and the 10th percentile of the per-file post counts of the year.

    The reports are written to ../output/missing_file_report/ as
    missing_files_<year>_<folder>.csv, empty_files_<year>_<folder>.csv and
    corrupted_files_<year>_<folder>.csv; the directory is created if needed.

    @param year: int, year
    @param folder: str, "geography" for geography folder, "sentiment" for sentiment folder
    @raise ValueError: if folder is neither "geography" nor "sentiment"
    """
    # Validate first: an unknown folder used to be scanned as "sentiment"
    # while the reports were written under the unknown name.
    if folder == "geography":
        path = "/srv/data/twitter_geography/"
        prefix = "geography"
    elif folder == "sentiment":
        path = "/srv/data/twitter_sentiment/"
        prefix = "bert_sentiment"
    else:
        raise ValueError(
            f'folder must be "geography" or "sentiment", got {folder!r}'
        )

    missing_files = []
    empty_files = []
    post_counts = []  # (file_name, number of posts) for every readable file

    for month in range(1, 13):
        for day in range(1, days_in_month(month, year) + 1):
            for hour in range(24):
                # NOTE(review): the month is not zero-padded while day and
                # hour are -- confirm this matches the real file names.
                file_name = (
                    f"{path}{year}/{prefix}_{year}_{month}_"
                    f"{str(day).zfill(2)}_{str(hour).zfill(2)}.csv.gz"
                )
                if not os.path.exists(file_name):
                    missing_files.append(file_name)
                    continue
                try:
                    with gzip.open(file_name) as f:
                        posts = pd.read_csv(f, sep="\t")
                except pd.errors.EmptyDataError:
                    empty_files.append(file_name)
                    continue
                post_counts.append((file_name, len(posts)))

    # float64 keeps the quantile well defined (NaN) when no file was readable;
    # min(10000, NaN) then falls back to 10000.
    counts = pd.Series([count for _, count in post_counts], dtype="float64")
    bottom_10_percentile = counts.quantile(0.1)
    threshold = min(10000, bottom_10_percentile)

    # The counts were recorded during the scan, so the files do not have to
    # be enumerated a second time.
    corrupted_files = [
        file_name for file_name, count in post_counts if count < threshold
    ]

    output_dir = "../output/missing_file_report"
    # Create the report directory up front so a long scan is not lost to a
    # missing output folder.
    os.makedirs(output_dir, exist_ok=True)
    reports = {
        "missing_files": missing_files,
        "empty_files": empty_files,
        "corrupted_files": corrupted_files,
    }
    for label, files in reports.items():
        report_df = pd.DataFrame({label: files})
        report_df.to_csv(f"{output_dir}/{label}_{year}_{folder}.csv")

### Saving Lists of Missing, Empty, Corrupted Files 

In [None]:
# Produce the three csv lists for every year from 2012 to 2021,
# geography folder first and sentiment folder second.
for year in range(2012, 2022):
    for folder in ("geography", "sentiment"):
        saving_list_of_files(year, folder)

###  Generating Missing File Report 

In [None]:
# Read the saved lists back and build one summary row per (year, folder):
# [year, folder, #missing, #empty, #corrupted].
data = []
report_dir = "../output/missing_file_report/"

for year in range(2012, 2022):
    for folder in ["geography", "sentiment"]:
        suffix = "_" + str(year) + "_" + folder + ".csv"
        counts = [
            len(pd.read_csv(report_dir + kind + suffix))
            for kind in ("missing_files", "empty_files", "corrupted_files")
        ]
        data.append([year, folder] + counts)

In [None]:
# One row per (year, folder) with the number of files in each category.
report_columns = [
    "year",
    "folder",
    "num_missing_files",
    "num_empty_files",
    "num_corrupted_files",
]
missing_file_report_df = pd.DataFrame(data, columns=report_columns)

In [None]:
# Display the per-year, per-folder summary as the cell output.
missing_file_report_df

In [None]:
# Persist the summary table alongside the per-year lists.
report_path = "../output/missing_file_report/missing_file_report.csv"
missing_file_report_df.to_csv(report_path)