In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from IPython.core.display import display, HTML
import os
%matplotlib inline

In [2]:
# Notebook-wide display configuration.
sns.set()  # apply seaborn's default theme to every figure
# Show every column of wide frames and print floats with 5 decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.5f}'.format)
# Stretch the classic-notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# Relative path to the candidates CSV, built portably from its components.
_csv_path_parts = ('data', 'fivethirtyeight', 'dem_candidates_2018_prim.csv')
CSV_PATH = os.path.join(*_csv_path_parts)

# The file is read as latin1 and the primary date column is parsed to datetime64.
df = pd.read_csv(
    CSV_PATH,
    encoding='latin1',
    parse_dates=['Race Primary Election Date'],
)
df.head()

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Gender,Partisan Lean,Primary %,Won Primary,Race,Veteran?,LGBTQ?,Elected Official?,Self-Funder?,STEM?,Obama Alum?
0,Anthony White (Alabama),AL,Governor of Alabama,Governor,Regular,2018-06-05,Lost,,,Male,-28.88,3.42,No,Nonwhite,Yes,No,No,No,No,No
1,Christopher Countryman,AL,Governor of Alabama,Governor,Regular,2018-06-05,Lost,,,Male,-28.88,1.74,No,White,No,Yes,No,No,No,No
2,"Doug ""New Blue"" Smith",AL,Governor of Alabama,Governor,Regular,2018-06-05,Lost,,,Male,-28.88,3.27,No,White,Yes,No,No,No,No,No
3,James C. Fields,AL,Governor of Alabama,Governor,Regular,2018-06-05,Lost,,,Male,-28.88,8.0,No,Nonwhite,Yes,No,Yes,No,No,No
4,Sue Bell Cobb,AL,Governor of Alabama,Governor,Regular,2018-06-05,Lost,,,Female,-28.88,28.98,No,White,No,No,Yes,No,No,No


In [4]:
# Column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 811 entries, 0 to 810
Data columns (total 20 columns):
Candidate                     811 non-null object
State                         811 non-null object
District                      811 non-null object
Office Type                   811 non-null object
Race Type                     811 non-null object
Race Primary Election Date    811 non-null datetime64[ns]
Primary Status                807 non-null object
Primary Runoff Status         811 non-null object
General Status                807 non-null object
Gender                        811 non-null object
Partisan Lean                 811 non-null float64
Primary %                     801 non-null float64
Won Primary                   785 non-null object
Race                          655 non-null object
Veteran?                      800 non-null object
LGBTQ?                        800 non-null object
Elected Official?             800 non-null object
Self-Funder?                  811 n

In [5]:
# Number of missing values in each column.
df.isna().sum()

Candidate                       0
State                           0
District                        0
Office Type                     0
Race Type                       0
Race Primary Election Date      0
Primary Status                  4
Primary Runoff Status           0
General Status                  4
Gender                          0
Partisan Lean                   0
Primary %                      10
Won Primary                    26
Race                          156
Veteran?                       11
LGBTQ?                         11
Elected Official?              11
Self-Funder?                    0
STEM?                          11
Obama Alum?                     1
dtype: int64

We can see the counts of missing values in the printout above. Understandably, race is the field that is most often missing. Per the list of unique values below, we see that the race category has been grouped into 'White' and 'Nonwhite'.

In [6]:
# Distinct values of the 'Race' column (NaN included when present).
df['Race'].unique()

array(['Nonwhite', 'White', nan], dtype=object)

In [7]:
# Count primaries dated after the moment this cell runs.
# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0, so use
# `pd.Timestamp.now()`; the boolean mask is summed with the vectorised
# Series.sum() rather than the Python builtin.
(df['Race Primary Election Date'] > pd.Timestamp.now()).sum()

0

Ok, so we see that all of the included primaries have already occurred (none of them happened after today).

In [8]:
# Preview the first five rows again before checking for duplicate candidates.
df.head()

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Gender,Partisan Lean,Primary %,Won Primary,Race,Veteran?,LGBTQ?,Elected Official?,Self-Funder?,STEM?,Obama Alum?
0,Anthony White (Alabama),AL,Governor of Alabama,Governor,Regular,2018-06-05,Lost,,,Male,-28.88,3.42,No,Nonwhite,Yes,No,No,No,No,No
1,Christopher Countryman,AL,Governor of Alabama,Governor,Regular,2018-06-05,Lost,,,Male,-28.88,1.74,No,White,No,Yes,No,No,No,No
2,"Doug ""New Blue"" Smith",AL,Governor of Alabama,Governor,Regular,2018-06-05,Lost,,,Male,-28.88,3.27,No,White,Yes,No,No,No,No,No
3,James C. Fields,AL,Governor of Alabama,Governor,Regular,2018-06-05,Lost,,,Male,-28.88,8.0,No,Nonwhite,Yes,No,Yes,No,No,No
4,Sue Bell Cobb,AL,Governor of Alabama,Governor,Regular,2018-06-05,Lost,,,Female,-28.88,28.98,No,White,No,No,Yes,No,No,No


In [9]:
# Rows left after collapsing entries that share both candidate name and district.
no_dupes = len(df.drop_duplicates(subset=['Candidate', 'District']))
total_entries = len(df)
print('Without duplicated candidate names, there are {} entries in the data set'.format(no_dupes))
print('With duplicated candidate names, there are    {} entries in the data set'.format(total_entries))

Without duplicated candidate names, there are 801 entries in the data set
With duplicated candidate names, there are    811 entries in the data set


From the printout, we can see that 10 entries repeat a candidate name and district that already appear in another entry. It seems very strange that multiple people would have the same name and be running in the same district. This deserves further inspection.

Per inspection, we see that these cases involve people running in both a regular and special election.  

In [10]:
# Every row whose (Candidate, District) pair occurs more than once;
# keep=False flags all members of each duplicate group, not just the repeats.
df[df.duplicated(subset=['Candidate', 'District'], keep=False)]

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Gender,Partisan Lean,Primary %,Won Primary,Race,Veteran?,LGBTQ?,Elected Official?,Self-Funder?,STEM?,Obama Alum?
304,Bill Wild,MI,U.S. House Michigan District 13,Representative,Regular,2018-08-07,Lost,,,Male,60.72,14.0,No,White,No,No,Yes,No,No,No
305,Bill Wild,MI,U.S. House Michigan District 13,Representative,Special,2018-08-07,Lost,,,Male,60.72,15.2,No,White,No,No,Yes,No,No,No
306,Brenda Jones,MI,U.S. House Michigan District 13,Representative,Special,2018-08-07,Advanced,,On the Ballot,Female,60.72,37.7,Yes,Nonwhite,No,No,Yes,No,No,No
307,Brenda Jones,MI,U.S. House Michigan District 13,Representative,Regular,2018-08-07,Lost,,,Female,60.72,29.2,No,Nonwhite,No,No,Yes,No,No,No
309,Ian Conyers,MI,U.S. House Michigan District 13,Representative,Regular,2018-08-07,Lost,,,Male,60.72,6.3,No,Nonwhite,No,No,Yes,No,No,No
310,Ian Conyers,MI,U.S. House Michigan District 13,Representative,Special,2018-08-07,Lost,,,Male,60.72,11.2,No,Nonwhite,No,No,Yes,No,No,No
311,Rashida Tlaib,MI,U.S. House Michigan District 13,Representative,Regular,2018-08-07,Advanced,,On the Ballot,Female,60.72,33.2,Yes,Nonwhite,No,No,Yes,No,No,Yes
312,Rashida Tlaib,MI,U.S. House Michigan District 13,Representative,Special,2018-08-07,Lost,,,Female,60.72,35.9,No,Nonwhite,No,No,Yes,No,No,Yes
500,Danny O'Connor,OH,U.S. House Ohio District 12,Representative,Regular,2018-05-08,Advanced,,On the Ballot,Male,-13.63,40.52,Yes,,No,No,Yes,No,No,No
501,Danny O'Connor,OH,U.S. House Ohio District 12,Representative,Special,2018-05-08,Advanced,,On the Ballot,Male,-13.63,40.94,Yes,,No,No,Yes,No,No,No


In [11]:
# Columns to summarise: each one's distinct values are listed below.
cols = [
    'Office Type', 'Race Type', 'Primary Status', 'Primary Runoff Status',
    'General Status', 'Gender', 'Won Primary',
    'Race', 'Veteran?', 'LGBTQ?', 'Elected Official?', 'Self-Funder?',
    'STEM?', 'Obama Alum?',
]
for feature in cols:
    # unique() keeps NaN as its own entry, so missing data shows up in the count.
    unique_vals = df[feature].unique()
    print('Feature: {:^21s}; # of Unique Vals: {};  Unique Values: {}'
          .format(feature, len(unique_vals), unique_vals))

Feature:      Office Type     ; # of Unique Vals: 3;  Unique Values: ['Governor' 'Representative' 'Senator']
Feature:       Race Type      ; # of Unique Vals: 2;  Unique Values: ['Regular' 'Special']
Feature:    Primary Status    ; # of Unique Vals: 3;  Unique Values: ['Lost' 'Advanced' nan]
Feature: Primary Runoff Status; # of Unique Vals: 4;  Unique Values: ['None' 'Lost' 'Advanced' 'On the Ballot']
Feature:    General Status    ; # of Unique Vals: 3;  Unique Values: ['None' 'On the Ballot' nan]
Feature:        Gender        ; # of Unique Vals: 2;  Unique Values: ['Male' 'Female']
Feature:      Won Primary     ; # of Unique Vals: 3;  Unique Values: ['No' 'Yes' nan]
Feature:         Race         ; # of Unique Vals: 3;  Unique Values: ['Nonwhite' 'White' nan]
Feature:       Veteran?       ; # of Unique Vals: 3;  Unique Values: ['Yes' 'No' nan]
Feature:        LGBTQ?        ; # of Unique Vals: 3;  Unique Values: ['No' 'Yes' nan]
Feature:   Elected Official?  ; # of Unique Vals: 3;  Uniq

From a quick inspection of the data, we see that all of the above features are categorical. 

2 of the features ('Office Type' and 'Primary Runoff Status') are multicategory features, while the remainder are binary features, although most of them have missing values that need to be addressed.

## Missing Data

In [12]:
# Pre Adjustment
# Missing 'Race' values before the manual fill.
df['Race'].isna().sum()

156

### Race

I want to see if race is an important feature, so I'll do some leg work to fill in the data. Race (or at least racial appearance) can be identified by sight, and people who run for office tend to be photographed. I've used appearing white as my test for whiteness, and I've required 2 distinct photos from 2 distinct sites as confirmation (to make sure that each photo is attributed to the right person).

So far, I've given up on finding photos of:
* Michael Brown (MD)
* Jack Schofield Jr. (NV)
* Cody James Slatzer-Rose (OH)
* Joseph Schenkenfelder (TN)

Interesting things:
* The confirmation picture I found for [David Matthew Hullum](http://www.nwajailbirds.com/2018-03-28-stacy-charles-digby-arrested-for-sexual-indecency-with-a-child-and-sexual-assault.html) was a mugshot from March 28th, 2018. 

In [13]:
# Manually researched race labels (see the notebook's "Race" section for the
# photo-based criteria), keyed by (state, label) -> candidate names.
# Keeping them in one table replaces one copy-pasted assignment per candidate,
# so adding or correcting an entry is a one-line data change.
MANUAL_RACE_LABELS = {
    ('OH', 'White'): [
        "Danny O'Connor", 'John Russell', 'Jackie Patton', 'Zach Scott',
        'Ed Albertson', 'Doug Wilson', 'Joseph Schiavoni', 'Paul Ray',
        'Michael Milisits', 'Robert Klepinger', 'Theresa Gasper', 'John Peters',
        'Betsy Rader', 'Rick Neal', 'Rob Jarvis', 'Aaron Godfrey',
        'Grant Goodrich', 'Mark Dent', 'Susan Moran Palmer', 'Janet Everhard',
        'Jill Schiller', 'William Smith', 'John Wilson', 'Janet Garrett',
        'James Neu Jr.', 'John Michael Galbraith', 'Shawna Roberts',
        'Werner Lange', 'Ken Harbaugh', 'Patrick Pikus', 'Bill Ebben',
        'Matthew Guyette', 'Ted Jones',
    ],
    ('ME', 'White'): [
        'Craig Olson', 'Jared Golden', 'Lucas St. Clair', 'Zak Ringelstein',
    ],
    ('MI', 'White'): [
        'Michael McCarthy (Michigan)',
    ],
    ('MO', 'White'): [
        'Ed Andres', 'Kenneth Hatfield', 'Vincent Jennings',
    ],
    ('MS', 'White'): [
        'Randy Wadkins', 'Michael Aycox', 'Michael Evans', 'Jensen Bohren',
    ],
    ('MT', 'White'): [
        'Grant Kier', 'Jared Pettinato', 'John Heenan', 'John Meyer',
        'Kathleen Williams', 'Lynda Moss',
    ],
    ('ND', 'White'): [
        'Mac Schneider',
    ],
    ('NE', 'White'): [
        'Dennis Crawford', 'Jessica McClure', 'Brad Ashford', 'Kara Eastman',
        'Paul Theobald', 'Chris Janicek', 'Frank Svoboda', 'Jane Raybould',
        'Larry Marvin',
    ],
    ('NJ', 'White'): [
        'Alison Heslin', 'Mark Washburne', 'Mikie Sherrill', 'Mitchell Cobert',
        'Nathan Kleinman', 'Josh Welle', 'Tom Malinowski',
    ],
    ('NM', 'White'): [
        'Madeline Hildebrandt',
    ],
    ('NV', 'White'): [
        'Chris Giunchigliani', 'John Bonaventura', 'Steve Sisolak',
        'Clint Koble', 'Jesse Hurley', 'Patrick Fogarty', 'Rick Shepherd',
        'Vance Alm', 'Eric Stoltz', 'Guy Pinjuv', 'Jack Love', 'Michael Weiss',
        'Richard Hart', 'Steve Schiffman', 'Susie Lee', 'Amy Vilela',
        'John Anzalone', 'Sid Zeller', 'Daniel Burleigh', 'Jacky Rosen',
        'David Drew Knight',
    ],
    ('NV', 'Nonwhite'): [
        'David Jones',
    ],
    ('NY', 'White'): [
        'David Pechefsky', 'Elaine DiMasi', 'Perry Gershon', 'Max Rose',
        'Paul Sperling', 'Zach Emig', 'David Clegg', 'Erin Collier',
        'Gareth Rhodes', 'Jeff Beals', 'Pat Ryan', 'Liuba Grechen Shirley',
        'Dylan Ratigan', 'Emily Martz', 'Katie Wilson', 'Tedra Cobb',
        'Anthony Brindisi', 'Eddie Sundquist', 'Ian Golden', 'Linda Andrei',
        'Max Della Pia', 'Tracy Mitrano', 'Dana Balter', 'Joseph Morelle',
        'Rachel Barnhart',
    ],
    ('OK', 'White'): [
        'Amanda Douglas', 'David Matthew Hullum', 'Mark Keeter', 'Tim Gilpin',
        'Clay Padgett', 'Virginia Jenner', 'Frankie Robbins',
        'Murray Thibodeaux', 'Fred Gipson', 'Mallory Varner', 'Mary Brannon',
        'Roxann Klutts', 'Eddie Porter', 'Elysabeth Britt', 'Kendra Horn',
        'Leona Kelley-Leonard', 'Tom Guild', 'Tyson Todd Meade',
    ],
    ('OK', 'Nonwhite'): [
        'Gwendolyn Fields',
    ],
    ('OR', 'White'): [
        'Eric Burnette', 'Jim Crary', 'Michael Byrne', 'Raz Mason',
        'Timothy White',
    ],
    ('PA', 'White'): [
        'Robert Multari', 'Ronald DiNicola', 'Richard Lazer',
    ],
    ('TN', 'White'): [
        'Christopher Finley', 'Peter Heffernan', 'Gary Davis',
    ],
}

# Apply each label to every row matching both the candidate name and the state
# (a candidate listed in several races, e.g. regular + special, gets all rows set).
unmatched_race_labels = []
for (state, race), candidates in MANUAL_RACE_LABELS.items():
    in_state = df['State'] == state
    df.loc[in_state & df['Candidate'].isin(candidates), 'Race'] = race

    # Record entries that matched no row so a typo in a name or state is not
    # silently ignored.
    names_in_state = set(df.loc[in_state, 'Candidate'])
    unmatched_race_labels.extend(
        (name, state) for name in candidates if name not in names_in_state
    )

if unmatched_race_labels:
    print('No matching row for manual race labels: {}'.format(unmatched_race_labels))

In [14]:
# Rows whose 'Race' is still missing after the manual fill.
df.loc[df['Race'].isna()]

Unnamed: 0,Candidate,State,District,Office Type,Race Type,Race Primary Election Date,Primary Status,Primary Runoff Status,General Status,Gender,Partisan Lean,Primary %,Won Primary,Race,Veteran?,LGBTQ?,Elected Official?,Self-Funder?,STEM?,Obama Alum?
271,Michael Brown,MD,U.S. House Maryland District 1,Representative,Regular,2018-06-26,Lost,,,Male,-29.66,15.0,No,,,,,No,,
428,Jack Schofield Jr.,NV,U.S. House Nevada District 2,Representative,Regular,2018-06-12,Lost,,,Male,-13.78,7.5,No,,No,No,No,No,No,No
525,Cody James Slatzer-Rose,OH,U.S. House Ohio District 4,Representative,Regular,2018-05-08,Lost,,,Male,-31.23,16.15,No,,No,No,No,No,Yes,No
639,Joseph Schenkenfelder,TN,U.S. House Tennessee District 2,Representative,Regular,2018-08-02,Lost,,,Male,-38.18,4.51,No,,,,,No,,No


In [15]:
# Distinct 'Race' values after the manual fill (NaN counts as a value here).
race_values = df['Race'].unique()
race_summary = 'Feature: {:30s}, Number of Unique Vals: {},  Unique Values: {}'.format(
    'Race', len(race_values), race_values)
print(race_summary)

Feature: Race                          , Number of Unique Vals: 3,  Unique Values: ['Nonwhite' 'White' nan]


In [16]:
# Post Adjustment
# Missing 'Race' values remaining after the manual fill.
df['Race'].isna().sum()

4