In [1]:
import zipfile
import fecfile
from fecfile.fecparser import Fecfile
import random
import glob
from pathlib import Path
import pandas as pd
import sys
from collections import defaultdict
import os


In [22]:
class MyFecfile(object):
    """One raw ``.fec`` filing on disk, with helpers to cache its forms as parquet.

    The parsed filing is not kept on the instance; ``typed_frames`` re-reads
    the file every time it is called.
    """

    def __init__(self, fname, encoding = "unicode"):
        """Record where the filing lives and derive its date and id from the path.

        Parameters
        ----------
        fname : pathlib.Path or str
            Path whose second component is the date directory, e.g.
            ``fec/20200305/1388022.fec``. A ``str`` is converted to ``Path``.
        encoding : str
            Accepted for backward compatibility; not used.
        """
        fname = Path(fname)
        self.fname = fname
        # parts[0] is the root directory ("fec"), parts[1] the date directory.
        self.date = fname.parts[1]
        self.basename = fname.with_suffix('').name

    def cache_forms(self):
        """Write each parsed frame to ``cache/<date>/<form>/<basename>.parquet``.

        Frames that are empty, or that have no ``form_type`` column, are
        skipped. When done, the filing's basename is added to the module-level
        ``cached_forms`` set, which must already exist.
        """
        for frame in self.typed_frames():
            try:
                # Positional access: raises IndexError on an empty frame
                # regardless of how the frame happens to be indexed.
                form = frame.form_type.iloc[0]
            except IndexError:
                continue
            except AttributeError:
                # No form_type column. Frames carrying 'rec_type' are usually
                # "TEXT" records; anything else is reported for inspection.
                if 'rec_type' not in frame.columns:
                    print(frame.columns)
                continue
            out_dir = f"cache/{self.date}/{form}"
            os.makedirs(out_dir, exist_ok=True)
            frame.to_parquet(f"{out_dir}/{self.basename}.parquet", allow_truncated_timestamps=True)
        cached_forms.add(self.basename)

    def __repr__(self):
        return "<FEC filing>" + str(self.fname)

    def typed_frames(self, encoding = None):
        """Parse the filing and return one DataFrame per form key.

        Parameters
        ----------
        encoding
            Accepted for backward compatibility; not used.

        Returns
        -------
        list
            The frames returned by ``to_pandas`` for every key that converts
            cleanly. Keys whose conversion raises ``ValueError`` are skipped.

        Raises
        ------
        Exception
            Any other conversion error is re-raised after printing the key.
        """
        file = fecfile.fecparser.Fecfile(self.fname)
        file.prepare_itemization_buffers()
        frames = []
        for key in file.forms.keys():
            key = key.decode("utf-8")
            try:
                frames.append(file.to_pandas(key))
            except ValueError: # Duplicate names--others may be more pernicious.
                continue
            except Exception:
                print(key)
                raise
        return frames
    


In [19]:
# Sanity check: is this sample filing present relative to the working directory?
Path("fec/20200305/1388022.fec").exists()

True

In [20]:
class Datefecs():
    """All filings for one date directory (e.g. ``"20200305"``)."""

    def __init__(self, date):
        """Remember the date string.

        ``self.path`` points at ``raw/<date>``; the methods below read from
        ``fec/<date>`` and ``cache/<date>`` instead.
        """
        self.path = Path(f"raw/{date}")
        self.date = date

    def wrapup(self):
        """
        Combine all the files of the same form type together for any given date.

        For every form directory ``cache/<date>/<form>/`` the per-filing
        parquet files are concatenated, tagged with a ``source_file`` column,
        and written to ``cache/<date>/<form>.parquet``. Forms whose combined
        file already exists, or that have no per-filing files, are skipped.
        """
        base = Path(f"cache/{self.date}")
        for form_group in base.glob("*"):
            if not form_group.is_dir():
                # Combined outputs of earlier runs (and any stray files) sit
                # next to the per-form directories; they are not form groups.
                continue
            # Append the extension rather than using with_suffix(), which
            # would clip anything after a dot in the form name.
            parquet_loc = base / f"{form_group.name}.parquet"
            if parquet_loc.is_file():
                continue
            files = []
            for filename in form_group.glob("*.parquet"):
                try:
                    file = pd.read_parquet(filename)
                except Exception:
                    print(f"File: {filename}")
                    raise
                file['source_file'] = filename.with_suffix('').name
                files.append(file)
            if not files:
                continue
            pd.concat(files).to_parquet(parquet_loc, allow_truncated_timestamps=True)

    def filings(self):
        """Yield a library ``Fecfile`` for every ``fec/<date>/*.fec`` file."""
        for f in Path(f"fec/{self.date}").glob("*.fec"):
            yield Fecfile(f)

    def nth(self, ix):
        """Return the ``ix``-th filing in glob order, or ``None`` if there are fewer."""
        for i, r in enumerate(self.filings()):
            if i == ix:
                return r
        return None

    def cache_children(self, force = False):
        """Cache every filing for this date via ``MyFecfile.cache_forms``.

        Filings whose basename is already in the module-level ``cached_forms``
        set are skipped unless ``force`` is true.
        """
        for f in Path(f"fec/{self.date}").glob("*.fec"):
            if f.with_suffix('').name in cached_forms and not force:
                continue
            filing = MyFecfile(f)
            filing.cache_forms()
    
# Cache and then combine the filings for a single date.
# cache_children() reads the module-level `cached_forms` set, so that set must
# already be defined in the kernel when this cell runs.
r = Datefecs("20200305")
r.cache_children()
r.wrapup()

In [6]:
# Re-import edited modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [7]:
# Show which installation of the fecfile package is actually being imported.
fecfile.__path__

['/Users/bschmidt/Dropbox/lib/fecfile/fecfile']

In [8]:
# Every raw filing, and every top-level entry under fec/, in random order.
all_paths = list(Path("fec").glob("**/*.fec"))
all_dirs = list(Path("fec").glob("*"))

random.shuffle(all_paths)
random.shuffle(all_dirs)

# Names of everything already written under cache/, with up to two trailing
# suffixes stripped from each file name.
cached_forms = {
    parquet.with_suffix('').with_suffix('').name
    for parquet in Path("cache").glob("**/*.parquet")
}

len(cached_forms)

115341

In [10]:
ncalls  tottime  percall  cumtime  percall filename:lineno(function)
   177015    5.207    0.000   32.687    0.000 fecparser.py:164(parse_line)
  7873592    4.181    0.000    4.515    0.000 cache.py:66(getTypeMapping)
  7873592    3.506    0.000   19.048    0.000 fecparser.py:187(getTyped)
 15785593    2.873    0.000    2.873    0.000 {method 'startswith' of 'str' objects}
  1753430    2.868    0.000    2.868    0.000 {method 'replace' of 'datetime.datetime' objects}
   178140    2.667    0.000    7.618    0.000 fecparser.py:118(fields_from_line)
   175343    1.882    0.000    3.367    0.000 _strptime.py:318(_strptime)
  7892234    1.598    0.000    3.041    0.000 fecparser.py:128(<lambda>)
   175343    1.530    0.000    6.803    0.000 tzinfo.py:258(localize)
8767364/8766948    0.674    0.000    0.674    0.000 {built-in method builtins.len}
   177124    0.600    0.000   33.548    0.000 fecparser.py:68(iter_lines)
   350686    0.525    0.000    1.881    0.000 tzinfo.py:193(fromutc)
   350511    0.510    0.000    0.629    0.000 {method 'add' of 'set' objects}
   701516    0.459    0.000    0.459    0.000 {built-in method builtins.max}
   179266    0.423    0.000    0.423    0.000 {method 'split' of 'str' objects}
     1125    0.396    0.000   34.010    0.030 fecparser.py:45(loads)
   701372    0.379    0.000    0.379    0.000 {built-in method _bisect.bisect_right}
   350686    0.366    0.000    2.794    0.000 tzinfo.py:203(normalize)
   175343    0.350    0.000    3.717    0.000 _strptime.py:574(_strptime_datetime)
   287530    0.289    0.000    0.289    0.000 {method 'match' of 're.Pattern' objects}
   175343    0.201    0.000    3.918    0.000 {built-in method strptime}
   175343    0.196    0.000    0.196    0.000 {built-in method _locale.setlocale}
   971325    0.195    0.000    0.195    0.000 {method 'strip' of 'str' objects}
   175343    0.180    0.000    0.266    0.000 locale.py:384(normalize)
   175979    0.166    0.000    0.193    0.000 cache.py:39(getMapping)
        1    0.162    0.162   34.357   34.357 <ipython-input-13-922219bc4e05>:32(cache_children)
   430949    0.143    0.000    0.143    0.000 {method 'replace' of 'str' objects}
   175343    0.136    0.000    0.710    0.000 locale.py:575(getlocale)
   175343    0.135    0.000    0.135    0.000 {method 'groupdict' of 're.Match' objects}
   175343    0.127    0.000    0.837    0.000 _strptime.py:26(_getlang)
   702098    0.123    0.000    0.123    0.000 {method 'get' of 'dict' objects}
   350212    0.119    0.000    0.119    0.000 tzinfo.py:396(utcoffset)
   175343    0.112    0.000    0.378    0.000 locale.py:467(_parse_localename)
   112190    0.102    0.000    0.214    0.000 re.py:271(_compile)
   175999    0.101    0.000    0.101    0.000 fecparser.py:17(__init__)
   606292    0.100    0.000    0.100    0.000 {method 'lower' of 'str' objects}
   356280    0.081    0.000    0.081    0.000 {built-in method builtins.chr}
     1125    0.066    0.000    0.078    0.000 {built-in method io.open}
   466570    0.063    0.000    0.063    0.000 {built-in method builtins.isinstance}
   350686    0.054    0.000    0.054    0.000 {method 'toordinal' of 'datetime.date' objects}
   176005    0.053    0.000    0.053    0.000 {method 'upper' of 'str' objects}
     3918    0.048    0.000    0.334    0.000 cache.py:50(getTypeMapping_from_regex)
   112187    0.047    0.000    0.309    0.000 re.py:170(match)
   112188    0.047    0.000    0.075    0.000 types.py:164(__get__)
   181031    0.032    0.000    0.032    0.000 {method 'append' of 'list' objects}
   187208    0.030    0.000    0.030    0.000 {method 'keys' of 'dict' objects}
   112188    0.028    0.000    0.028    0.000 enum.py:628(value)
   175343    0.025    0.000    0.025    0.000 {method 'pop' of 'set' objects}
   175343    0.024    0.000    0.024    0.000 {method 'end' of 're.Match' objects}
     1125    0.024    0.000    0.039    0.000 {method 'read' of '_io.TextIOWrapper' objects}
   175343    0.023    0.000    0.023    0.000 {method 'weekday' of 'datetime.date' objects}
     1125    0.020    0.000   34.147    0.030 __init__.py:90(from_file)
     1125    0.013    0.000    0.013    0.000 {built-in method _codecs.utf_8_decode}
    58494    0.011    0.000    0.011    0.000 {method 'endswith' of 'str' objects}
     1125    0.007    0.000   34.165    0.030 <ipython-input-12-c8fea67ea934>:2(__init__)
     2250    0.007    0.000    0.018    0.000 pathlib.py:846(with_suffix)
     1126    0.005    0.000    0.013    0.000 pathlib.py:523(_select_from)
  320/219    0.004    0.000    0.010    0.000 sre_parse.py:469(_parse)
     6750    0.004    0.000    0.004    0.000 pathlib.py:790(name)
     3375    0.004    0.000    0.006    0.000 pathlib.py:677(_from_parsed_parts)
 

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 3)

In [24]:
# Names of the fec/ entries that contain "2018", shuffled.
all_dirs = list(Path("fec").glob("*"))
one_year = [entry.name for entry in all_dirs if "2018" in entry.name]
random.shuffle(one_year)
print(len(one_year))


365


In [25]:
# Forget this filing so the next cache_children() pass re-processes it.
# discard() keeps the cell re-runnable: remove() raises KeyError the second time.
cached_forms.discard('1377999')

In [26]:
# Cache and combine every date in `one_year`. Ctrl-C stops the loop cleanly;
# any other failure names the offending date and then propagates.
for date in one_year:
    print(date)
    try:
        r = Datefecs(date)
        r.cache_children()
        r.wrapup()
    except KeyboardInterrupt:
        break
    except Exception:
        print(f"Error on {date}")
        raise

20180422
20180406
20180224
20180217
20181120
20180826
20180628
20181108
20180601
20180202
20181104
20180128
20180629
20180131
20180221
20180105
20181204
20180503
20180630
20181130
20180626
20180423
20180617
20180712
20180312
20180917
20180122
20180818
20180910
20180506
20180830
20180426
20180526
20180915
20181118
20181215
20180319
20180704
20180924
20180528
20180730
20181006
20181119
20180926
20180809
20180211
20180404
20180208
20181229
20180206
20181106
20180829
20180824
20180623
20180424
20180904
20180323
20180811
20180512
20181210
20180301
20180525
20181112
20180317
20181030
20181203
20180804
20180508
20180618
20181105
20181219
20180831
20180101
20180930
20181001
20180725
20180416
20180517
20180218
20180815
20181124
20180908
20180413
20180204
20180511
20181205
20180410
20180201
20180311
20180531
20181231
20180928
20180706
20180620
__________
b'SB30B\n'
Error on 20180620


ValueError: not enough values to unpack (expected 2, got 1)

In [None]:
#r = Datefecs("20181013")
#r.cache_children()
#r.wrapup()


In [None]:
# Cache every filing under each fec/201* directory.
# cache_forms() is defined on MyFecfile (the wrapper class in this notebook),
# so the wrapper is what gets instantiated here.
for date_dir in Path("fec").glob("201*"):
    print(date_dir)
    for fec_path in date_dir.glob("*.fec"):
        try:
            MyFecfile(fec_path).cache_forms()
        except UnicodeDecodeError:
            # Best effort: an undecodable filing is skipped, but visibly.
            print(f"Skipping undecodable filing: {fec_path}")
    
        


In [None]:
# One (recipient, donor, amt) frame per combined 2019 SA11*/SA17* parquet file.
# NOTE: the name `all` shadows the builtin; it is kept because the next cell
# reads it.
all = []

for parquet_path in Path("cache").glob("2019*/SA1[17]*.parquet"):
    filing = pd.read_parquet(parquet_path)

    # Fallback donor key: name parts plus the first five zip characters, upper-cased.
    mashed_name = (
        filing.contributor_first_name
        + filing.contributor_last_name
        + filing.contributor_organization_name
        + filing.contributor_zip_code.str[:5]
    ).str.upper()
    edges = pd.DataFrame({
        'recipient': filing.filer_committee_id_number,
        'donor': mashed_name,
        'amt': filing.contribution_amount
    })

    # Prefer FEC ids to mashing together names. The candidate id is applied
    # last, so it wins when both ids are present.
    has_committee_id = filing.donor_committee_fec_id != ""
    edges.loc[has_committee_id, 'donor'] = filing.donor_committee_fec_id[has_committee_id]
    has_candidate_id = filing.donor_candidate_fec_id != ""
    edges.loc[has_candidate_id, 'donor'] = filing.donor_candidate_fec_id[has_candidate_id]

    all.append(edges)
    # Progress report every 25 files.
    if len(all) % 25 == 0:
        print((parquet_path, len(all)))
 #   if len(all) > 200:
 #       break

In [None]:
# Stack the per-file edge frames, then total the amount for each
# (recipient, donor) pair.
totals = pd.concat(all)
counts = (
    totals
    .groupby(["recipient", "donor"])['amt']
    .sum()
    .reset_index()
)


In [None]:
# Checkpoint the aggregated (recipient, donor, amt) table to disk.
counts.to_parquet("total_2019_network.parquet")

In [None]:
# Reload the checkpoint written by the previous cell.
# pandas is re-imported here, presumably so the notebook can be resumed from this cell.
import pandas as pd
counts = pd.read_parquet("total_2019_network.parquet")

In [None]:
# Prune the edge table until it stops shrinking: keep only rows whose recipient
# appears in at least `thresh` rows and whose donor appears in at least
# `thresh` rows, and drop non-positive amounts. Removing rows can push other
# recipients/donors below the threshold, hence the loop to a fixed point.
thresh = 3
last_shape = 0
current_shape = counts.shape[0]
while last_shape != current_shape:

    print(current_shape)
    last_shape = current_shape
    # assign() returns a new frame, so no column is ever set on a filtered
    # slice (which is what triggers SettingWithCopyWarning).
    counts = counts.assign(recipient_count=counts.recipient.map(counts.recipient.value_counts()))
    counts = counts[counts['recipient_count'] >= thresh]
    counts = counts.assign(donor_count=counts.donor.map(counts.donor.value_counts()))
    counts = counts[counts['donor_count'] >= thresh]
    counts = counts[counts.amt > 0] # Yerg negative is just confusing
    current_shape = counts.shape[0]
    

In [None]:
# Total amount received per recipient id and total amount given per donor id,
# outer-joined on the shared "ix" column; a missing side becomes 0.
receipts = (
    counts.groupby("recipient")['amt'].sum().reset_index()
    .rename(columns = {"recipient":"ix", "amt": "receipts"})
)
donations = (
    counts.groupby("donor")['amt'].sum().reset_index()
    .rename(columns = {"donor":"ix", "amt": "donations"})
)
individual_table = receipts.merge(donations, how='outer').fillna(0)

In [None]:
# Load the committee summary file, keeping only the five columns listed below.
committees = pd.read_csv("committee_summary_2020.csv")[["CMTE_ID", "CMTE_NM", "CMTE_CITY", "CMTE_ST", "ORG_TP"]]
committees

In [None]:
# Left join: every row of individual_table is kept; ids with no matching
# CMTE_ID get NaN in the committee columns.
metadata = individual_table.merge(committees, left_on = "ix", right_on = "CMTE_ID", how = "left")

In [None]:
# Maps each recipient/donor id to a dense integer node id; filled in by the
# edge-list loop below. (`dinodct` was undefined; later cells use this as a dict.)
id_list = {}

In [None]:
# Display the edges ordered from largest to smallest amount.
counts.sort_values("amt", ascending = False)

In [None]:
import numpy as np
# One row of three int64 slots per row of `counts`.
# NOTE(review): the edge-list loop stores np.log(amount) in the third slot; with
# an int64 dtype that value is truncated to an integer — confirm this is intended.
sparse1 = np.zeros((counts.shape[0], 3), np.int64)

In [None]:
# Write one "recipient_id<TAB>donor_id<TAB>log(amount)" line per row of `counts`,
# assigning dense integer ids in order of first appearance. The `with` block
# guarantees the file is flushed and closed before anything else reads it.
with open("/Users/bschmidt/contribs.edgelist", "w") as contribs:
    # Select the three needed columns by name rather than unpacking every
    # column of the row positionally.
    for i, (recipient, donor, amount) in enumerate(zip(counts.recipient, counts.donor, counts.amt)):
        # setdefault evaluates len(id_list) before inserting, so a new key
        # receives the next unused integer.
        recipient_id = id_list.setdefault(recipient, len(id_list))
        donor_id = id_list.setdefault(donor, len(id_list))
        log_amount = np.log(amount)
        contribs.write(f"{recipient_id}\t{donor_id}\t{log_amount}\n")
        # NOTE(review): sparse1 is int64, so log_amount is truncated here.
        sparse1[i] = [recipient_id, donor_id, log_amount]


In [None]:
# Run the SNAP node2vec binary on the edge list written by the previous cell.
# Flags as I read the SNAP node2vec README (TODO confirm): -i input edge list,
# -o output file, -d:32 embedding dimensions, -l:20 walk length, -k:4 context
# size, -v verbose.
# NOTE(review): the edge list has a third (log-amount) column but -w (weighted)
# is not passed, so the weights are presumably ignored — confirm.
!/Users/bschmidt/snap/examples/node2vec/node2vec -i:/Users/bschmidt/contribs.edgelist -v -o:/Users/bschmidt/embedding_1.bin -l:20 -k:4 -d:32

In [None]:
# Inverse lookup: integer node id -> original recipient/donor id.
reversed_dict = {node_id: original_id for original_id, node_id in id_list.items()}

In [None]:
#import scipy.sparse
#sparsified = scipy.sparse.coo_matrix((sparse1[:,2], (sparse1[:,1], sparse1[:,0])))

In [None]:
import numpy as np

In [None]:
# Load the embedding file produced by node2vec.
# NOTE(review): `SRP` is not imported anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — add the import to the imports cell.
x = SRP.Vector_file("/Users/bschmidt/embedding_1.bin").to_matrix()

In [None]:
# Dimensions of the loaded embedding matrix.
x['matrix'].shape

In [None]:
import umap
# Project the embedding with UMAP; random_state is fixed so reruns use the same seed.
# The result is read as two columns (x, y) by the next cell.
embedded = umap.UMAP(n_neighbors=20, random_state=1, low_memory=False).fit_transform(x['matrix'])


In [None]:
# 2-D coordinates, one row per embedded node, labelled with the original id.
em2 = pd.DataFrame(embedded, columns = ['x', 'y'])
# NOTE(review): this assumes row i of the embedding belongs to node id i —
# confirm against the row order of the embedding file.
em2['ix'] = range(em2.shape[0])
# Assign the result back instead of calling replace(inplace=True) on the
# selected column: under pandas Copy-on-Write the in-place form changes a
# temporary and leaves em2 untouched.
em2['ix'] = em2['ix'].replace(reversed_dict)

In [None]:
# Join on the shared "ix" column. merge() defaults to an inner join, so nodes
# with no row in `metadata` are dropped here.
em2 = em2.merge(metadata)

In [None]:
# Rename "ix" to "label" by copying it to a new last column and deleting the old one.
# Not re-runnable: a second execution raises KeyError because 'ix' is gone.
em2['label'] = em2['ix']
del em2['ix']

In [None]:
# Export the labelled 2-D layout, without the row index.
em2.to_csv("~/embedded_2.csv", index = False)

In [None]:
# Display the layout ordered from largest to smallest total receipts.
em2.sort_values("receipts", ascending = False)

In [None]:
import altair as alt

In [None]:
# Scatter the 300 rows with the largest receipts, sized by receipts, with the
# committee name as the tooltip.
top_recipients = em2.sort_values("receipts", ascending = False).head(300)
alt.Chart(top_recipients).mark_circle().encode(x = "x", y = "y", tooltip="CMTE_NM", size="receipts")