In [1]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

## Retrieving the data

We needed data regarding which party is in control in each state. We scraped this information from Wikipedia.

In [2]:
url = "https://en.wikipedia.org/wiki/Political_party_strength_in_U.S._states"

# Fail fast: the timeout keeps the cell from hanging on a stalled connection,
# and raise_for_status() turns an HTTP error response into an exception so an
# error page is never parsed as if it were the article.
req = requests.get(url, timeout=30)
req.raise_for_status()
soup = BeautifulSoup(req.content, "html.parser")

# Preview only the beginning of the parsed document; printing the whole page
# floods the notebook output.
print(str(soup)[:2000])


<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Political party strength in U.S. states - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"6498d345-4e9d-4869-84f4-6ff2b6b6f5e8","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Political_party_strength_in_U.S._states","wgTitle":"Political party strength in U.S. states","wgCurRevisionId":965316519,"wgRevisionId":965316519,"wgArticleId":898541,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 maint: date and year","Use American English from March 2019","All Wikipedia articles w

The data we need is in a table, where each row is contained in a _tr_ tag.

In [3]:
# Take the first table matched by the "sortable wikitable" class lookup, then
# collect all of its <tr> rows.
party_table = soup.find_all("table", class_="sortable wikitable")[0]
trs = party_table.find_all("tr")

In [4]:
# Column titles are the stripped text of the <th> cells in the first table row.
headers = [th.text.strip() for th in trs[0].find_all("th")]

headers

['State',
 '2016 presidentialelection',
 'Governor',
 'State Senate',
 'State House',
 'SeniorU.S. Senator',
 'JuniorU.S. Senator',
 'U.S. House of Representatives',
 'Partisan split (as of 2018[update])']

In [5]:
# Every row after the header row contributes the list of its <td> cells.
state_cards = [tr.find_all("td") for tr in trs[1:]]

state_cards

[[<td><a href="/wiki/Alabama" title="Alabama">Alabama</a>
  </td>,
  <td style="background-color:#FFB6B6">Republican
  </td>,
  <td style="background-color:#FFB6B6">Republican
  </td>,
  <td style="background-color:#FFB6B6">Republican 27–8
  </td>,
  <td style="background-color:#FFB6B6">Republican 77–28
  </td>,
  <td style="background-color:#FFB6B6">Republican
  </td>,
  <td style="background-color:#B0CEFF">Democratic
  </td>,
  <td style="background-color:#FFB6B6">Republican 6–1
  </td>,
  <td style="background-color:#FFB6B6">Republican<br/>52–35<sup class="reference" id="cite_ref-gallup_9-0"><a href="#cite_note-gallup-9">[a]</a></sup>
  </td>],
 [<td><a href="/wiki/Alaska" title="Alaska">Alaska</a>
  </td>,
  <td style="background-color:#FFB6B6">Republican
  </td>,
  <td style="background-color:#FFB6B6">Republican
  </td>,
  <td style="background-color:#FFB6B6">Republican 13–7
  </td>,
  <td style="background:thistle;">Coalition 23–17<sup class="reference" id="cite_ref-alaska_10-0">

The next step was to extract the information from the tr elements and store it in a DataFrame.
We had to take into account that one of the rows has 1 cell fewer than the others (the one corresponding to Nebraska), where two cells are merged into one (State Senate and State House).

In [6]:
# Build a column-oriented mapping: header text -> list of stripped cell texts,
# one entry per table row.
dict_for_df = {}

for col_idx, header in enumerate(headers):
    column_values = []
    for cells in state_cards:
        if col_idx < len(cells):
            column_values.append(cells[col_idx].text.strip())
        else:
            # Rows with fewer cells than there are headers get a missing value
            # in the trailing column(s). Checking the length explicitly (rather
            # than a bare `except:`) means unrelated errors are not hidden.
            column_values.append(pd.NA)
    dict_for_df[header] = column_values

In [7]:
# Display the header -> list-of-cell-texts mapping for inspection.
dict_for_df

{'State': ['Alabama',
  'Alaska',
  'Arizona',
  'Arkansas',
  'California',
  'Colorado',
  'Connecticut',
  'Delaware',
  'Florida',
  'Georgia',
  'Hawaii',
  'Idaho',
  'Illinois',
  'Indiana',
  'Iowa',
  'Kansas',
  'Kentucky',
  'Louisiana',
  'Maine',
  'Maryland',
  'Massachusetts',
  'Michigan',
  'Minnesota',
  'Mississippi',
  'Missouri',
  'Montana',
  'Nebraska',
  'Nevada',
  'New Hampshire',
  'New Jersey',
  'New Mexico',
  'New York',
  'North Carolina',
  'North Dakota',
  'Ohio',
  'Oklahoma',
  'Oregon',
  'Pennsylvania',
  'Rhode Island',
  'South Carolina',
  'South Dakota',
  'Tennessee',
  'Texas',
  'Utah',
  'Vermont',
  'Virginia',
  'Washington',
  'West Virginia',
  'Wisconsin',
  'Wyoming'],
 '2016 presidentialelection': ['Republican',
  'Republican',
  'Republican',
  'Republican',
  'Democratic',
  'Democratic',
  'Democratic',
  'Democratic',
  'Republican',
  'Republican',
  'Democratic',
  'Republican',
  'Democratic',
  'Republican',
  'Republican',

In [8]:
# Each key of the mapping becomes a column; each list becomes that column's values.
party_data = pd.DataFrame(data=dict_for_df)

party_data

Unnamed: 0,State,2016 presidentialelection,Governor,State Senate,State House,SeniorU.S. Senator,JuniorU.S. Senator,U.S. House of Representatives,Partisan split (as of 2018[update])
0,Alabama,Republican,Republican,Republican 27–8,Republican 77–28,Republican,Democratic,Republican 6–1,Republican52–35[a]
1,Alaska,Republican,Republican,Republican 13–7,Coalition 23–17[b],Republican,Republican,Republican,Republican24.3–13.3[c][9]
2,Arizona,Republican,Republican,Republican 17–13,Republican 31–29,Democratic,Republican,Democratic 5–4,Republican34.9–32.5[c][10]
3,Arkansas,Republican,Republican,Republican 26–9,Republican 76–24,Republican,Republican,Republican 4,Republican48–35[a]
4,California,Democratic,Democratic,Democratic 29–11,Democratic 61–19,Democratic,Democratic,Democratic 46–7,Democratic45.3–23.9[c][11]
5,Colorado,Democratic,Democratic,Democratic 19–16,Democratic 41–24,Democratic,Republican,Democratic 4–3,Democratic30.4–28.0[c][12]
6,Connecticut,Democratic,Democratic,Democratic 22–14,Democratic 91–60,Democratic,Democratic,Democratic 5,Democratic36.6–21.0[c][13]
7,Delaware,Democratic,Democratic,Democratic 12–9,Democratic 26–15,Democratic,Democratic,Democratic,Democratic47.6–27.7[c][14]
8,Florida,Republican,Republican,Republican 23–17,Republican 73–47,Republican,Republican,Republican 14–13,Democratic37.4–35.3[c][15]
9,Georgia,Republican,Republican,Republican 35–21,Republican 105–75,Republican,Republican,Republican 9–5,Democratic43–42[a]


## Cleaning the data

Before proceeding, we had to fix the row corresponding to Nebraska. This was done by shifting the contents of columns 5-8 one position to the right and filling the "State House" column for that row with the merged "Unicameral nonpartisan legislature" entry from the "State Senate" column.

In [9]:
# The five adjacent columns involved in the Nebraska one-column shift, in
# table order.
nebraska_shift_cols = [
    "State House",
    "SeniorU.S. Senator",
    "JuniorU.S. Senator",
    "U.S. House of Representatives",
    "Partisan split (as of 2018[update])",
]

# what_we_have[i] is the column that is written to and what_we_want[i] is the
# column its value is read from, i.e. each destination is the column
# immediately to the right of its source.
what_we_have = nebraska_shift_cols[1:]
what_we_want = nebraska_shift_cols[:-1]

In [10]:
# Shift row 26 one column to the right: each what_we_have[i] cell receives the
# value currently in what_we_want[i].
#
# The copy must run from the right-most pair to the left-most. Going left to
# right overwrites each source cell before it is read, so the original
# "State House" value would be smeared across all four destination columns.
#
# NOTE: this cell is not idempotent - running it twice shifts the row twice.
for dest_col, src_col in zip(reversed(what_we_have), reversed(what_we_want)):
    party_data.loc[26, dest_col] = party_data.loc[26, src_col]

In [11]:
# Display the frame to inspect the result of the row-26 shift.
party_data

Unnamed: 0,State,2016 presidentialelection,Governor,State Senate,State House,SeniorU.S. Senator,JuniorU.S. Senator,U.S. House of Representatives,Partisan split (as of 2018[update])
0,Alabama,Republican,Republican,Republican 27–8,Republican 77–28,Republican,Democratic,Republican 6–1,Republican52–35[a]
1,Alaska,Republican,Republican,Republican 13–7,Coalition 23–17[b],Republican,Republican,Republican,Republican24.3–13.3[c][9]
2,Arizona,Republican,Republican,Republican 17–13,Republican 31–29,Democratic,Republican,Democratic 5–4,Republican34.9–32.5[c][10]
3,Arkansas,Republican,Republican,Republican 26–9,Republican 76–24,Republican,Republican,Republican 4,Republican48–35[a]
4,California,Democratic,Democratic,Democratic 29–11,Democratic 61–19,Democratic,Democratic,Democratic 46–7,Democratic45.3–23.9[c][11]
5,Colorado,Democratic,Democratic,Democratic 19–16,Democratic 41–24,Democratic,Republican,Democratic 4–3,Democratic30.4–28.0[c][12]
6,Connecticut,Democratic,Democratic,Democratic 22–14,Democratic 91–60,Democratic,Democratic,Democratic 5,Democratic36.6–21.0[c][13]
7,Delaware,Democratic,Democratic,Democratic 12–9,Democratic 26–15,Democratic,Democratic,Democratic,Democratic47.6–27.7[c][14]
8,Florida,Republican,Republican,Republican 23–17,Republican 73–47,Republican,Republican,Republican 14–13,Democratic37.4–35.3[c][15]
9,Georgia,Republican,Republican,Republican 35–21,Republican 105–75,Republican,Republican,Republican 9–5,Democratic43–42[a]


In [12]:
# Copy row 26's "State Senate" value into its "State House" cell, so both
# chamber columns carry the same entry for that row.
party_data.loc[26, "State House"] = party_data.loc[26, "State Senate"]

In [13]:
# Display the frame to inspect row 26 after the "State House" fill.
party_data

Unnamed: 0,State,2016 presidentialelection,Governor,State Senate,State House,SeniorU.S. Senator,JuniorU.S. Senator,U.S. House of Representatives,Partisan split (as of 2018[update])
0,Alabama,Republican,Republican,Republican 27–8,Republican 77–28,Republican,Democratic,Republican 6–1,Republican52–35[a]
1,Alaska,Republican,Republican,Republican 13–7,Coalition 23–17[b],Republican,Republican,Republican,Republican24.3–13.3[c][9]
2,Arizona,Republican,Republican,Republican 17–13,Republican 31–29,Democratic,Republican,Democratic 5–4,Republican34.9–32.5[c][10]
3,Arkansas,Republican,Republican,Republican 26–9,Republican 76–24,Republican,Republican,Republican 4,Republican48–35[a]
4,California,Democratic,Democratic,Democratic 29–11,Democratic 61–19,Democratic,Democratic,Democratic 46–7,Democratic45.3–23.9[c][11]
5,Colorado,Democratic,Democratic,Democratic 19–16,Democratic 41–24,Democratic,Republican,Democratic 4–3,Democratic30.4–28.0[c][12]
6,Connecticut,Democratic,Democratic,Democratic 22–14,Democratic 91–60,Democratic,Democratic,Democratic 5,Democratic36.6–21.0[c][13]
7,Delaware,Democratic,Democratic,Democratic 12–9,Democratic 26–15,Democratic,Democratic,Democratic,Democratic47.6–27.7[c][14]
8,Florida,Republican,Republican,Republican 23–17,Republican 73–47,Republican,Republican,Republican 14–13,Democratic37.4–35.3[c][15]
9,Georgia,Republican,Republican,Republican 35–21,Republican 105–75,Republican,Republican,Republican 9–5,Democratic43–42[a]


Now that that's fixed, we proceeded to remove all the numbers and Wikipedia links/references.

In [14]:
def remove_stuff(column, data=None):
    """Reduce every cell of one column to a bare party/status label.

    A cell whose text starts with one of the known prefixes (for example
    "Republican 27–8" or "Democratic45.3–23.9[c][11]") is replaced by the
    corresponding label, which drops trailing seat counts, percentages and
    footnote markers. Cells that match no prefix, and missing cells, are
    returned unchanged.

    Parameters
    ----------
    column : str
        Name of the column to clean.
    data : pandas.DataFrame, optional
        Frame to read the column from. Defaults to the notebook-level
        ``party_data`` frame, which keeps existing ``remove_stuff(colname)``
        calls working.

    Returns
    -------
    pandas.Series
        A cleaned copy of the column carrying the same index as the input, so
        it can be assigned straight back to the frame. The input frame is not
        modified.
    """
    # (prefix to look for, label to store). "EVEN" is normalised to "Tied".
    prefix_labels = (
        ("Republican", "Republican"),
        ("Democratic", "Democratic"),
        ("Unicameral nonpartisan legislature", "Unicameral nonpartisan legislature"),
        ("Tied", "Tied"),
        ("EVEN", "Tied"),
        ("Independent", "Independent"),
        ("Coalition", "Coalition"),
    )

    frame = party_data if data is None else data
    cleaned = frame[column].copy()

    for prefix, label in prefix_labels:
        # na=False: a missing cell simply does not match, instead of yielding
        # a missing value inside the boolean mask.
        matches = cleaned.str.startswith(prefix, na=False)
        cleaned.loc[matches] = label

    return cleaned



In [15]:
# Columns that are passed through remove_stuff and overwritten with the result.
cols_to_clean = [
    "State Senate",
    "State House",
    "SeniorU.S. Senator",
    "JuniorU.S. Senator",
    "U.S. House of Representatives",
    "Partisan split (as of 2018[update])",
]

# Replace each listed column, one at a time, with its cleaned version.
for colname in cols_to_clean:
    party_data[colname] = remove_stuff(colname)

In [16]:
# Display the frame after the listed columns were cleaned.
party_data

Unnamed: 0,State,2016 presidentialelection,Governor,State Senate,State House,SeniorU.S. Senator,JuniorU.S. Senator,U.S. House of Representatives,Partisan split (as of 2018[update])
0,Alabama,Republican,Republican,Republican,Republican,Republican,Democratic,Republican,Republican
1,Alaska,Republican,Republican,Republican,Coalition,Republican,Republican,Republican,Republican
2,Arizona,Republican,Republican,Republican,Republican,Democratic,Republican,Democratic,Republican
3,Arkansas,Republican,Republican,Republican,Republican,Republican,Republican,Republican,Republican
4,California,Democratic,Democratic,Democratic,Democratic,Democratic,Democratic,Democratic,Democratic
5,Colorado,Democratic,Democratic,Democratic,Democratic,Democratic,Republican,Democratic,Democratic
6,Connecticut,Democratic,Democratic,Democratic,Democratic,Democratic,Democratic,Democratic,Democratic
7,Delaware,Democratic,Democratic,Democratic,Democratic,Democratic,Democratic,Democratic,Democratic
8,Florida,Republican,Republican,Republican,Republican,Republican,Republican,Republican,Democratic
9,Georgia,Republican,Republican,Republican,Republican,Republican,Republican,Republican,Democratic


## Writing to a csv file


In [18]:
# Write the frame to CSV; index=False leaves the row index out of the file.
party_data.to_csv("data/party_data_by_state.csv", index=False)