In [253]:
import json
import pandas as pd
import plotly.graph_objs as go
import plotly.express as px
import numpy as np

In [167]:
with open('foiaRequests.json') as json_file:
    json_data = json.load(json_file)
    df = pd.DataFrame(json_data)

In [168]:
df = df.rename(columns={'id':'foia_id'})
df = df.set_index('foia_id')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 56459 entries, 49504 to 60945
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   title                  56459 non-null  object 
 1   slug                   56459 non-null  object 
 2   status                 56459 non-null  object 
 3   embargo                56459 non-null  bool   
 4   permanent_embargo      56459 non-null  bool   
 5   user                   56459 non-null  int64  
 6   username               56459 non-null  object 
 7   agency                 56459 non-null  int64  
 8   datetime_submitted     56252 non-null  object 
 9   date_due               40989 non-null  object 
 10  days_until_due         9728 non-null   float64
 11  date_followup          11315 non-null  object 
 12  datetime_done          37654 non-null  object 
 13  date_embargo           12322 non-null  object 
 14  tracking_id            56459 non-null  object 
 15

# df_comms

In [249]:
def expandDefault(row):
    return row[0]

df_comms = pd.DataFrame(df['communications']).explode('communications')
df_comms = df_comms[:]
df_comms = df_comms.apply(expandDefault, result_type='expand', axis=1)

In [250]:
def cleanText(text):
    return text.replace('\n', ' ') \
            .replace('\r', ' ') \
            .replace('  ', ' ') \
            .strip()

def defaultFalse(item):
    if item == True:
        return True
    return False

def defaultNone(item):
    return defaultXToY(item, 'none', None)

def defaultXToY(item, x, y):
    if item == x:
        return y
    return item

df_comms = df_comms.reset_index()
df_comms = df_comms.set_index(['foia_id', 'datetime'])
df_comms = df_comms.sort_index()
df_comms['comm_status'] = df_comms['status']
del df_comms['to_user']
del df_comms['status']
del df_comms['from_user']
del df_comms['foia']
df_comms = df_comms.dropna(subset=['subject'])
df_comms['likely_foia'] = df_comms['likely_foia'].map(defaultFalse)
df_comms['response'] = df_comms['response'].map(defaultFalse)
df_comms['autogenerated'] = df_comms['autogenerated'].map(defaultFalse)
df_comms['thanks'] = df_comms['thanks'].map(defaultFalse)
df_comms['full_html'] = df_comms['full_html'].map(defaultFalse)
df_comms['response'] = df_comms['response'].map(defaultFalse)
df_comms['delivered'] = df_comms['delivered'].map(defaultNone)
df_comms[['communication', 'subject']] = df_comms[['communication', 'subject']].applymap(cleanText)
df_comms.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 586200 entries, (6, '2010-05-20T00:00:00') to (88178, '2020-02-14T13:40:22.761735')
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   subject        586200 non-null  object
 1   response       586200 non-null  bool  
 2   autogenerated  586200 non-null  bool  
 3   thanks         586200 non-null  bool  
 4   full_html      586200 non-null  bool  
 5   communication  586200 non-null  object
 6   likely_foia    586200 non-null  bool  
 7   files          586200 non-null  object
 8   delivered      530422 non-null  object
 9   comm_status    179854 non-null  object
dtypes: bool(5), object(5)
memory usage: 34.2+ MB


# df_comms_lex

In [268]:
def expandCommsStats(row):
    commLenChars = len( str(row['communication']) )
    commLenWords = len( str(row['communication']).split(' ') )
    if commLenChars == 0:
        commLenWords = 0
    subjLenChars = len( str(row['subject']) )
    subjLenWords = len( str(row['subject']).split(' ') )
    if subjLenChars == 0:
        subjLenWords = 0
    return {'msg_num_chars': commLenChars, 'msg_num_words': commLenWords, 'subj_num_chars': subjLenChars, 'subj_num_words': subjLenWords}

df_comms_lex = df_comms
df_comms_lex = df_comms_lex.apply(expandCommsStats, result_type='expand', axis=1)
df_comms_lex

Unnamed: 0_level_0,Unnamed: 1_level_0,msg_num_chars,msg_num_words,subj_num_chars,subj_num_words
foia_id,datetime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6,2010-05-20T00:00:00,1399,236,0,0
6,2011-05-04T12:16:53,173,32,0,0
6,2011-05-05T14:02:58,792,138,0,0
7,2010-05-25T00:00:00,1287,217,0,0
7,2010-06-04T00:00:00,904,167,0,0
...,...,...,...,...,...
88174,2020-02-14T13:40:20.959915,966,160,0,0
88175,2020-02-14T13:40:21.391006,966,160,0,0
88176,2020-02-14T13:40:21.737532,966,160,0,0
88177,2020-02-14T13:40:22.402361,966,160,0,0


In [269]:
df_comms_lex.describe()

Unnamed: 0,msg_num_chars,msg_num_words,subj_num_chars,subj_num_words
count,586200.0,586200.0,586200.0,586200.0
mean,665.935024,107.549954,53.625121,7.378582
std,1807.917192,298.006052,43.977807,6.075632
min,0.0,0.0,0.0,0.0
25%,285.0,51.0,0.0,0.0
50%,325.0,57.0,52.0,7.0
75%,537.0,82.0,87.0,12.0
max,149140.0,36372.0,255.0,42.0


# df_files

In [239]:
def expandDefault(row):
    return row[0]

df_files = df_comms.reset_index()
df_files = df_files.set_index(['foia_id'])
del df_files['datetime']
df_files = pd.DataFrame(df_files['files'].explode())
df_files = df_files.dropna()
df_files = df_files.apply(expandDefault, result_type='expand', axis=1)
df_files.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 238112 entries, 7 to 88142
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           238112 non-null  int64 
 1   ffile        238112 non-null  object
 2   datetime     238112 non-null  object
 3   title        238112 non-null  object
 4   source       238112 non-null  object
 5   description  238112 non-null  object
 6   access       238112 non-null  object
 7   doc_id       238112 non-null  object
 8   pages        238112 non-null  int64 
dtypes: int64(2), object(7)
memory usage: 18.2+ MB


In [240]:
df_files['file_id'] = df_files['id']
del df_files['id']
df_files = df_files.reset_index()
df_files = df_files.set_index(['foia_id', 'file_id'])
df_files['pages_'] = df_files['pages'].map(lambda a: defaultXToY(a, 0, 1))
df_files['pages'] = df_files['pages'].map(lambda a: defaultXToY(a, 0, None))
df_files = df_files.applymap(lambda a: defaultXToY(a, '', None))
df_files.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 238112 entries, (7, 2572) to (88142, 842670)
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   ffile        238110 non-null  object 
 1   datetime     238112 non-null  object 
 2   title        238112 non-null  object 
 3   source       226535 non-null  object 
 4   description  3279 non-null    object 
 5   access       238112 non-null  object 
 6   doc_id       156386 non-null  object 
 7   pages        150165 non-null  float64
 8   pages_       238112 non-null  int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 20.3+ MB


# df_files_stats

In [241]:
df_files_stats = df_files
df_files_stats = df_files_stats.reset_index()
df_files_stats = df_files_stats[['foia_id', 'pages', 'pages_']]
df_files_stats['num_files'] = 1
df_files_stats = df_files_stats.groupby(by=['foia_id']).sum()
df_files_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44746 entries, 7 to 88142
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pages      44746 non-null  float64
 1   pages_     44746 non-null  int64  
 2   num_files  44746 non-null  int64  
dtypes: float64(1), int64(2)
memory usage: 1.4 MB


In [242]:
df_files_stats.sum()

pages        2858562.0
pages_       2946509.0
num_files     238112.0
dtype: float64