In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

In [10]:
# Input and output directories, relative to the notebook's working directory.
# Both currently point at the same folder.
src = dst = "../data"

# KnowWho candidate list

## Clean candidate list

In [11]:
# Candidate export, apparently downloaded from
# https://kw1.knowwho.com/candidate-data/candidate-lists/
fname = "candidates_exp09142022.csv"

# Columns that are not carried over into the cleaned table.
dropcols = [
    "ELECTIONYEAR", "CYCLEFILINGDATE", "BUILDING", "ROOM",
    "STREET1", "STREET2", "CITY", "ZIPCODE", "PHONE", "FAX",
    "FECLINKNUM", "PROFILE", "INSTAGRAM", "LINKEDIN", "FACEBOOK",
    "WEBSITE", "YOUTUBE", "EMAIL", "TWITTER", "TWITTER2", "FACEBOOK2",
    "PREFIX", "FIRSTNAME", "MIDDLENAME", "LASTNAME", "SUFFIX",
    "NICKNAME", "DISTRICTDESC", "STATE", "LASTUPDATED", "PERSONID",
]

# Races that are kept.
races = ["US Representative", "US Senator", "US Delegate"]

profiles = (
    pd.read_csv(Path(src) / fname, parse_dates=["ELECTIONDATE"])
    # rows without a TWITTER2 value are discarded before the column is dropped
    .dropna(subset=["TWITTER2"])
    .drop(columns=dropcols)
    # restrict to the selected races and to the two parties "R" and "D"
    .loc[lambda frame: frame["RACE"].isin(races)]
    .loc[lambda frame: frame["PARTY"].isin(["R", "D"])]
    # rows without an election date are discarded
    .dropna(subset=["ELECTIONDATE"])
)

In [12]:
# Number of rows per election code.
profiles.loc[:, "ELECTIONCODE"].value_counts()

ELECTIONCODE
P    1609
S       7
G       2
R       1
Name: count, dtype: int64

In [13]:
# The single row coded "R" is a miscoding; its ELECTIONCODE should be "P".
# Rebecca J. Viagran ran in the Democratic primaries in Texas on 2022-03-01,
# like the other Democratic candidates.
# source: https://ballotpedia.org/Texas%27_35th_Congressional_District_election,_2022_(March_1_Democratic_primary)
profiles.loc[profiles["ELECTIONCODE"].eq("R")]

Unnamed: 0,INCUMBENT,STATUS,ELECTIONCODE,ELECTIONDATE,STATEABBR,LEVEL,CHAMBER,DISTRICT,RACE,ETHNICITY,GENDER,PARTY,screen_name,name
3526,,Running,R,2022-03-01,TX,FL,FH,35.0,US Representative,,Female,D,viagrancongress,Rebecca J. Viagran


In [14]:
# Recode the miscoded "R" entry as "P"; all other codes are left unchanged.
profiles = profiles.assign(ELECTIONCODE=profiles["ELECTIONCODE"].replace("R", "P"))

In [15]:
# Rows coded "G" belong to the special election held for Florida's 20th
# Congressional District on Jan 11, 2022; they are excluded further below.
# source: https://ballotpedia.org/Florida%27s_20th_Congressional_District_special_election,_2022
profiles.loc[profiles["ELECTIONCODE"].eq("G")]

Unnamed: 0,INCUMBENT,STATUS,ELECTIONCODE,ELECTIONDATE,STATEABBR,LEVEL,CHAMBER,DISTRICT,RACE,ETHNICITY,GENDER,PARTY,screen_name,name
843,,Defeated in Special Election,G,2022-01-11,FL,FL,FH,20.0,US Representative,White/Caucasian,Male,R,votemariner,Jason Mariner
856,Y,Won Special Election,G,2022-01-11,FL,FL,FH,20.0,US Representative,Black/African American,Female,D,sheila4congress,Sheila Cherfilus-McCormick


In [16]:
# Rows coded "S" are special elections, which we exclude:
#
# * California's 22nd Congressional District, held on Apr 5, 2022.
#   source: https://ballotpedia.org/California%27s_22nd_Congressional_District_special_election,_2022
#
# * Florida's 20th Congressional District, dated Nov 2, 2022 in the data.
#   source: https://ballotpedia.org/Natalia_Allen
#   NOTE(review): the special primary for this seat was presumably held on
#   Nov 2, 2021 (the special general election above is dated Jan 11, 2022),
#   so the ELECTIONDATE of this row may be off by a year - confirm.
profiles.loc[profiles["ELECTIONCODE"].eq("S")]

Unnamed: 0,INCUMBENT,STATUS,ELECTIONCODE,ELECTIONDATE,STATEABBR,LEVEL,CHAMBER,DISTRICT,RACE,ETHNICITY,GENDER,PARTY,screen_name,name
381,,Withdrew From Race,S,2022-04-05,CA,FL,FH,22.0,US Representative,,Male,R,nathanmagsigca,Nathan Magsig
384,,Running,S,2022-04-05,CA,FL,FH,22.0,US Representative,Black/African American,Male,D,lourinhubbard,Lourin Hubbard
385,,Running,S,2022-04-05,CA,FL,FH,22.0,US Representative,Hispanic/Latino,Male,D,ericg1247,Eric Garcia
386,,Withdrew From Race,S,2022-04-05,CA,FL,FH,22.0,US Representative,,Male,D,philarballo,Phil Arballo
389,,Running,S,2022-04-05,CA,FL,FH,22.0,US Representative,Asian/Pacific American,Female,R,elizabethheng,Elizabeth Heng
392,,Withdrew From Race,S,2022-04-05,CA,FL,FH,22.0,US Representative,Hispanic/Latino,Male,R,johnestrada,John A. Estrada
847,,Failed To Qualify,S,2022-11-02,FL,FL,FH,20.0,US Representative,,Female,D,nataliaallenfl,Natalia I. Allen


In [19]:
# Keep only the rows coded "P"; this removes the "G" and "S" special-election rows.
profiles = profiles.loc[profiles["ELECTIONCODE"].eq("P")]

In [20]:
# "Primaries" dated 2022-11-08 are all from Louisiana, which uses the
# Louisiana majority-vote system: all candidates compete in the same primary,
# and a candidate wins the election outright by receiving more than 50% of
# the vote. Otherwise the top two vote recipients from the primary advance to
# the general election, regardless of their partisan affiliation.
# source: https://ballotpedia.org/Louisiana's_3rd_Congressional_District_election,_2022
profiles.loc[profiles["ELECTIONDATE"].eq(pd.Timestamp("2022-11-08"))]

Unnamed: 0,INCUMBENT,STATUS,ELECTIONCODE,ELECTIONDATE,STATEABBR,LEVEL,CHAMBER,DISTRICT,RACE,ETHNICITY,GENDER,PARTY,screen_name,name
2200,Y,Running,P,2022-11-08,LA,FL,FH,1.0,US Representative,White/Caucasian,Male,R,stevescalisegop,Steve Scalise
2202,Y,Running,P,2022-11-08,LA,FL,FH,2.0,US Representative,Black/African American,Male,D,troyc4congress,Troy Carter
2204,,Withdrew From Race,P,2022-11-08,LA,FL,FH,3.0,US Representative,White/Caucasian,Male,D,grangerforla,Dustin Granger
2205,Y,Running,P,2022-11-08,LA,FL,FH,3.0,US Representative,White/Caucasian,Male,R,captclayhiggins,Clay Higgins
2207,Y,Running,P,2022-11-08,LA,FL,FH,4.0,US Representative,White/Caucasian,Male,R,mikejohnson,Mike Johnson
2208,Y,Running,P,2022-11-08,LA,FL,FH,5.0,US Representative,White/Caucasian,Female,R,jbletlow,Julia Letlow
2209,Y,Running,P,2022-11-08,LA,FL,FH,6.0,US Representative,White/Caucasian,Male,R,garretgraves,Garret Graves
2215,,Running,P,2022-11-08,LA,FL,FS,,US Senator,Black/African American,Male,D,garychambersjr,Gary Chambers
2216,Y,Running,P,2022-11-08,LA,FL,FS,,US Senator,White/Caucasian,Male,R,johnkennedyla,John Kennedy


In [21]:
# After filtering, "P" should be the only election code left.
profiles.loc[:, "ELECTIONCODE"].value_counts()

ELECTIONCODE
P    1610
Name: count, dtype: int64

In [23]:
# Collapse rows that repeat the same name / party / state / screen name
# combination (the first occurrence is kept), then show how many rows remain.
profiles = profiles.drop_duplicates(
    subset=["name", "PARTY", "STATEABBR", "screen_name"],
    keep="first",
)
profiles.shape[0]

1596

In [24]:
# Number of rows flagged as incumbent.
int((profiles["INCUMBENT"] == "Y").sum())

401

In [25]:
# Number of rows with a non-missing screen name.
int(profiles["screen_name"].notna().sum())

1596

In [26]:
# There seem to be a few errors in the KnowWho data, which we correct manually.
# First list every row whose screen name occurs more than once.
profile_counts = (
    profiles["screen_name"]
    .value_counts()
    .reset_index()
    .loc[lambda counts: counts["count"] > 1]
)
profiles.loc[profiles["screen_name"].isin(profile_counts["screen_name"])]

Unnamed: 0,INCUMBENT,STATUS,ELECTIONCODE,ELECTIONDATE,STATEABBR,LEVEL,CHAMBER,DISTRICT,RACE,ETHNICITY,GENDER,PARTY,screen_name,name
186,,Running,P,2022-05-24,AL,FL,FS,,US Senator,Black/African American,Male,D,brandaundean,Brandaun L. Dean
262,,Running,P,2022-08-02,AZ,FL,FH,7.0,US Representative,White/Caucasian,Male,R,realjeffzink,Jeffrey N. Zink
293,,Running,P,2022-05-24,AR,FL,FH,2.0,US Representative,Black/African American,Female,D,ncartwrightar,Quintessa Hathaway
294,,Withdrew From Race,P,2022-05-24,AR,FL,FH,2.0,US Representative,,Male,D,ncartwrightar,Nicolas Cartwright
311,,Running,P,2022-06-07,CA,FL,FH,1.0,US Representative,,Male,D,realjeffzink,David Leon Zink
398,,Running,P,2022-06-07,CA,FL,FH,23.0,US Representative,White/Caucasian,Male,D,brunoamato_1,Bruno Amato
1887,,Running,P,2022-08-02,KS,FL,FS,,US Senator,White/Caucasian,Female,R,farrforus,Joan E. Farr
2634,,Running,P,2022-06-07,NJ,FL,FH,7.0,US Representative,,Male,R,rikmehta_nj,Rikin Mehta
2642,Y,Withdrew From Race,P,2022-06-07,NJ,FL,FH,8.0,US Representative,Hispanic/Latino,Male,D,rikmehta_nj,Albio Sires
2657,,Withdrew From Race,P,2022-06-07,NM,FL,FH,2.0,US Representative,White/Caucasian,Male,R,mike4congress2,Michael Rakebrandt


In [27]:
# Manual corrections for screen names that appear on more than one row.
# All row labels below are hard-coded against the loaded export.

# Rows removed outright:
#   3192 - Brandaun L. Dean ran in AL, not PA (presumably this is the PA row).
#          Source: https://ballotpedia.org/Brandaun_Dean
#   2657 - Michael Rakebrandt withdrew from race in NM.
profiles = profiles.drop(index=[3192, 2657])

# Rows whose Twitter profile entry is wrong because the handle belongs to
# another candidate. The row is kept and only its screen name is blanked.
misattributed_handles = {
    311: "David Leon Zink (handle belongs to Jeffrey N. Zink)",
    293: "Quintessa Hathaway (handle belongs to Nicolas Cartwright)",
    3102: "Eric M. Bruno (handle belongs to Bruno Amato)",
    2642: "Albio Sires (handle belongs to Rikin Mehta)",
}

# Assigning through .loc with a label that is not in the index does not fail:
# it silently appends a new row. Verify the labels before assigning.
missing_labels = [
    label for label in misattributed_handles if label not in profiles.index
]
if missing_labels:
    raise KeyError(f"row labels not found in profiles: {missing_labels}")

profiles.loc[list(misattributed_handles), "screen_name"] = np.nan

# Joan E. Farr ran for senate in both OK and KS, so her repeated screen name
# is left untouched.
# Source: https://ballotpedia.org/Joan_Farr

In [28]:
# Sanity check: the only screen name that may still occur more than once is
# "farrforus" (Joan E. Farr). Enforce this instead of relying on a visual
# inspection of the table, then display the remaining rows.
profile_counts = profiles["screen_name"].value_counts().reset_index()
profile_counts = profile_counts[profile_counts["count"] > 1]
remaining_duplicates = profiles[
    profiles["screen_name"].isin(profile_counts["screen_name"])
]
assert set(remaining_duplicates["screen_name"]) == {"farrforus"}, (
    "unexpected duplicated screen names: "
    f"{sorted(set(remaining_duplicates['screen_name']) - {'farrforus'})}"
)
remaining_duplicates

Unnamed: 0,INCUMBENT,STATUS,ELECTIONCODE,ELECTIONDATE,STATEABBR,LEVEL,CHAMBER,DISTRICT,RACE,ETHNICITY,GENDER,PARTY,screen_name,name
1887,,Running,P,2022-08-02,KS,FL,FS,,US Senator,White/Caucasian,Female,R,farrforus,Joan E. Farr
3051,,Running,P,2022-06-28,OK,FL,FS,,US Senator,White/Caucasian,Female,R,farrforus,Joan E. Farr


In [29]:
# Write the cleaned candidate table to the output directory, without the index.
fname = "KnowWho_profiles_clean.csv"
profiles.to_csv(Path(dst) / fname, index=False)