In [1]:
import pandas as pd
import numpy as np
import os
import folium
from sklearn.preprocessing import OneHotEncoder

In [2]:
FOLDER_PATH = "data"
def print_sources():
    onlyfiles = [f for f in os.listdir(FOLDER_PATH) if os.path.isfile(os.path.join(FOLDER_PATH, f))]
    return onlyfiles

In [3]:
def load_csv(path):
    return pd.read_csv(os.path.join(FOLDER_PATH, path))

In [4]:
def combine_third_parties(df):
    third_f = (df['party'] != 'DEM') & (df['party'] != 'REP')
    df.loc[third_f,'party'] = '3RD'
    return df

In [5]:
def combine_party_and_encode(df):
    df['county_state'] = df['county'] +'_'+ df['state']
    df = df.groupby(['county', 'state', 'won','county_state','party'], as_index=False)['total_votes'].sum()
    df = df.sort_values(['county_state','party'])
    #One hot encoder
    cat_encoder = OneHotEncoder()
    party_cat = df[['party']]
    one_hot_party = cat_encoder.fit_transform(party_cat)
    one_hot_party = one_hot_party.toarray()
    df[['3RD','DEM','REP']] = pd.DataFrame(one_hot_party, index=df.index)
    df['index_col'] = df.index
    return df

In [6]:
print_sources()

['governors_county.csv',
 'governors_county_candidate.csv',
 'governors_state.csv',
 'house_candidate.csv',
 'house_state.csv',
 'president_county.csv',
 'president_county_candidate.csv',
 'president_state.csv',
 'senate_county.csv',
 'senate_county_candidate.csv',
 'senate_state.csv']

In [7]:
house_data = load_csv('house_candidate.csv')
president_data = load_csv('president_county_candidate.csv')
senate_data = load_csv('senate_county_candidate.csv')

In [18]:
def combine_party_and_encode2(df):
    df['county_state'] = df['county'] +'_'+ df['state']
    votes = df.groupby(['county_state','party'], as_index=False).sum()
    print(type(votes))
    votes = votes.sort_values(['county_state','party']).reset_index(drop=True)
    df = df.drop_duplicates(['county_state','party'])
    df = df.sort_values(['county_state','party']).reset_index(drop=True)
    df[['party_votes']] = votes['total_votes']
    #print(df.head())
    #One hot encoder
    cat_encoder = OneHotEncoder()
    party_cat = df[['party']]
    one_hot_party = cat_encoder.fit_transform(party_cat)
    one_hot_party = one_hot_party.toarray()
    df[['3RD','DEM','REP']] = pd.DataFrame(one_hot_party, index=df.index)
    return df, votes

In [19]:
president_data_3 = combine_third_parties(president_data)

In [20]:
president_data_p, votes = combine_party_and_encode2(president_data_3)

<class 'pandas.core.frame.DataFrame'>
            state            county     candidate party  total_votes    won  \
0  South Carolina  Abbeville County  Jo Jorgensen   3RD           88  False   
1  South Carolina  Abbeville County     Joe Biden   DEM         4101  False   
2  South Carolina  Abbeville County  Donald Trump   REP         8215   True   
3           Maine             Abbot  Jo Jorgensen   3RD            7  False   
4           Maine             Abbot     Joe Biden   DEM          121  False   

                      county_state  party_votes  
0  Abbeville County_South Carolina          117  
1  Abbeville County_South Carolina         4101  
2  Abbeville County_South Carolina         8215  
3                      Abbot_Maine            8  
4                      Abbot_Maine          121  


In [22]:
print(president_data_p.head())

            state            county     candidate party  total_votes    won  \
0  South Carolina  Abbeville County  Jo Jorgensen   3RD           88  False   
1  South Carolina  Abbeville County     Joe Biden   DEM         4101  False   
2  South Carolina  Abbeville County  Donald Trump   REP         8215   True   
3           Maine             Abbot  Jo Jorgensen   3RD            7  False   
4           Maine             Abbot     Joe Biden   DEM          121  False   

                      county_state  party_votes  3RD  DEM  REP  
0  Abbeville County_South Carolina          117  1.0  0.0  0.0  
1  Abbeville County_South Carolina         4101  0.0  1.0  0.0  
2  Abbeville County_South Carolina         8215  0.0  0.0  1.0  
3                      Abbot_Maine            8  1.0  0.0  0.0  
4                      Abbot_Maine          121  0.0  1.0  0.0  


In [116]:
votes['total_votes']

0         117
1        4101
2        8215
3           8
4         121
         ... 
13894      25
13895    1490
13896     481
13897      21
13898     404
Name: total_votes, Length: 13899, dtype: int64

In [25]:
president_data.sort_values('county').head(20)

Unnamed: 0,state,county,candidate,party,total_votes,won,county_state
17481,South Carolina,Abbeville County,Donald Trump,REP,8215,True,Abbeville County_South Carolina
17485,South Carolina,Abbeville County,Rocky De La Fuente,3RD,6,False,Abbeville County_South Carolina
17484,South Carolina,Abbeville County,Howie Hawkins,3RD,23,False,Abbeville County_South Carolina
17483,South Carolina,Abbeville County,Jo Jorgensen,3RD,88,False,Abbeville County_South Carolina
17482,South Carolina,Abbeville County,Joe Biden,DEM,4101,False,Abbeville County_South Carolina
7226,Maine,Abbot,Joe Biden,DEM,121,False,Abbot_Maine
7228,Maine,Abbot,Howie Hawkins,3RD,1,False,Abbot_Maine
7227,Maine,Abbot,Jo Jorgensen,3RD,7,False,Abbot_Maine
7225,Maine,Abbot,Donald Trump,REP,288,True,Abbot_Maine
7230,Maine,Abbot,Write-ins,3RD,0,False,Abbot_Maine


In [231]:
president_data_p[president_data_p['state'] == 'Florida'].iloc[210]

KeyError: 'state'

In [232]:
df = df.groupby(['county_state','party'], as_index=False).sum()
df.head()

Unnamed: 0,county_state,party,total_votes,won
0,Abbeville County_South Carolina,3RD,117,0
1,Abbeville County_South Carolina,DEM,4101,0
2,Abbeville County_South Carolina,REP,8215,1
3,Abbot_Maine,3RD,8,0
4,Abbot_Maine,DEM,121,0


In [92]:
df.head()

Unnamed: 0,county_state,party,total_votes,won
0,Abbeville County_South Carolina,3RD,117,0
1,Abbeville County_South Carolina,DEM,4101,0
2,Abbeville County_South Carolina,REP,8215,1
3,Abbot_Maine,3RD,8,0
4,Abbot_Maine,DEM,121,0


In [96]:
cat_encoder = OneHotEncoder()
party_cat = df[['party']]
x = cat_encoder.fit_transform(party_cat)
x = x.toarray()

In [98]:
pd.DataFrame(x)

Unnamed: 0,0,1,2
0,1.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,0.0,1.0
3,1.0,0.0,0.0
4,0.0,1.0,0.0
...,...,...,...
13894,0.0,1.0,0.0
13895,0.0,0.0,1.0
13896,1.0,0.0,0.0
13897,0.0,1.0,0.0


In [94]:
df[['1','2','3']] = pd.DataFrame([x])

ValueError: Columns must be same length as key

In [61]:
df.size

311480

In [62]:
x.shape

(31148, 3)