In [36]:
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable
import torch.nn as nn
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
import locale
from locale import atof

# Switch the numeric locale to the environment's default (that is what the '' argument
# selects), so locale-aware parsing such as `atof` follows the user's number format.
locale.setlocale(locale.LC_NUMERIC, '')

### First we'll import the dataset as a pandas DataFrame. Next we'll use some basic functions that describe the data in order to better understand it.

In [37]:
# Read the raw CSV; `thousands=','` lets pandas parse comma-grouped figures as numbers.
df = pd.read_csv(
    filepath_or_buffer=r'../input/dataset1csv/dataset1.csv',
    thousands=',',
)
# Preview the first five rows.
df.head(n=5)

In [38]:
# Print the column names, dtypes and non-null counts.
df.info()

In [39]:
# Summary statistics for the DataFrame's columns.
df.describe() 

In [40]:
# Show the rows whose 'State' value contains "Columbia".
df[df['State'].str.contains("Columbia")]

In [41]:
# Display the DataFrame together with its row index labels.
df

#### The immediate problem I noticed was that there were 52 rows entered, while there are in fact only 50 states. With the help of the .head method, we can see that the first entry is the US as a whole, which we can remove. This still leaves us with one extra row. After some searching, it seems that people often mistake D.C. (the District of Columbia) for a state. Sure enough, I found a row for D.C., which does not belong in a state-level analysis.

### Now onto the first problem, which is pre-processing the 'State' column

#### As seen previously, the two rows we need to delete for a state-by-state analysis are entries 0 and 9, which represent the US and D.C. respectively. We'll create a separate DataFrame for that, as we don't want to alter the original one.

In [42]:
# States-only frame: remove the rows labelled 0 and 9 from a copy, leaving `df` unchanged.
state_df = df.drop(index=[0, 9])
# Confirm the remaining row count and dtypes.
state_df.info()

#### Above we can see that the state_df dataframe now only contains 50 entries (i.e. all the states).

### Next up we need to find all rows where the vote for highest office (President) contains a NaN value and return the VEP turnout rate for them.


In [43]:
# Select the rows that have no value recorded for the presidential vote count.
na_df = df.loc[df['Vote for Highest Office (President)'].isna()]
na_df

#### Above are all the entries with a NaN value in the 'Vote for Highest Office (President)' column. Next we'll just print out the state name and VEP turnout rate.

In [44]:
# Narrow the NaN-vote rows down to the state name and its VEP turnout rate.
na_df[['State','VEP Turnout Rate']]

### 

### Now onto the predictive model. I'll be using a simple linear regression model as it works well with the data provided. I'll write all the parts of the model out instead of making one class or function.

#### We need two arrays, one for the inputs and the other for the targets

In [45]:
# Feature matrix and regression target. `.copy()` yields an independent frame, so the
# cleanup below is a plain column assignment: it does not touch `df` and does not rely on
# chained in-place mutation of a slice (which pandas may silently apply to a temporary).
inputs = df[['Total Ineligible Felon', 'Parole', 'Prison', '% Non-citizen', 'Voting-Age Population (VAP)']].copy()
targets = df['Total Ballots Counted (Estimate)']

# '% Non-citizen' is presumably text such as "8.4%" (verify against the CSV). Remove only
# the percent sign so the decimal point survives; stripping every non-digit character
# would turn "8.4%" into 84. `astype(str)` keeps this safe if the column is already numeric.
inputs['% Non-citizen'] = pd.to_numeric(
    inputs['% Non-citizen'].astype(str).str.replace('%', '', regex=False).str.strip()
)

# float32 NumPy arrays, ready to be wrapped as torch tensors.
inputs = inputs.to_numpy().astype('float32')
targets = targets.to_numpy().astype('float32')


#### I had forgotten about it, but at this point I realized that a lot of entries were in string form, with commas as thousands separators. To convert them I made use of locale and changed the way I read the CSV file at the beginning, as well as removing the % from the non-citizen column. This would normally belong in the data cleanup right at the beginning.

In [46]:
# Wrap the NumPy arrays as tensors. `torch.as_tensor` also accepts an existing tensor,
# so re-running this cell does not fail the way `torch.from_numpy` would.
inputs = torch.as_tensor(inputs)
# Make the targets a column of shape (N, 1) so they line up with the model's one-column
# output. `reshape(-1, 1)` infers N from the data instead of hard-coding 52 rows.
targets = torch.as_tensor(targets).reshape(-1, 1)

In [47]:
# Datasets and loaders.
# NOTE(review): the validation set reuses exactly the training rows, so it cannot measure
# generalisation; hold out separate rows if validation is ever used.
train_ds = TensorDataset(inputs, targets)
val_ds = TensorDataset(inputs, targets)
batch_size = 16
train_dl = DataLoader(train_ds, batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size*2)

# Linear regression: 5 input features -> 1 predicted value.
model = nn.Linear(5, 1)
preds = model(inputs)  # predictions from the still-untrained model
loss_fn = F.l1_loss

# The optimizer must be given the model's weight and bias. Passing the data tensors
# (`[inputs, targets]`) handed it tensors that never receive gradients, so `opt.step()`
# changed nothing and the model was never trained.
params = list(model.parameters())
# NOTE(review): lr=1e-10 on unscaled features will move the weights extremely slowly;
# consider normalising the inputs/targets and choosing a larger learning rate.
opt = torch.optim.RMSprop(params, lr=1e-10, alpha=0.99, eps=1e-10, weight_decay=0, momentum=0, centered=False)

In [48]:
def fit(num_epochs, model, loss_fn, opt, train_dl):
    """Run a basic mini-batch training loop.

    Args:
        num_epochs: number of full passes over `train_dl`.
        model: callable mapping a batch of inputs to predictions.
        loss_fn: callable `(predictions, targets) -> scalar loss` supporting `.backward()`.
        opt: optimizer exposing `step()` and `zero_grad()`.
        train_dl: iterable yielding `(inputs, targets)` batches.

    Prints the loss of the last batch seen on every 100th epoch. Returns nothing.
    """
    report_every = 100
    progress_line = 'Epoch [{}/{}], Loss: {:.4f}'
    for epoch_no in range(1, num_epochs + 1):
        for features, labels in train_dl:
            batch_loss = loss_fn(model(features), labels)
            # Backpropagate, apply the update, then clear gradients for the next batch.
            batch_loss.backward()
            opt.step()
            opt.zero_grad()
        if epoch_no % report_every == 0:
            print(progress_line.format(epoch_no, num_epochs, batch_loss.item()))

In [49]:
# Train for 2000 epochs; `fit` prints the loss on every 100th epoch.
fit(2000, model, loss_fn, opt, train_dl)

In [50]:
# Recompute predictions over all input rows after the training call, and display them.
preds = model(inputs)
preds

In [51]:
# Display the true target values for comparison with the predictions.
targets

### Unfortunately, I have made quite a few mistakes in how I went about building the model. I would scrap it and build a better feedforward model, but I'm short on time, so I'll focus on the last part first.

#### We'll start off by looking at the top 10 states with the highest VEP turnout

In [52]:
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from PIL import Image
%matplotlib inline

In [53]:
# Order the rows from highest to lowest 'VEP Turnout Rate' and show the first ten.
# NOTE(review): `df` still holds the two rows that `state_df` drops; confirm whether a
# states-only ranking should be built from `state_df` instead.
turn_out = df.sort_values('VEP Turnout Rate', ascending=False)
turn_out[['State','VEP Turnout Rate']].head(10)

In [54]:
# Last ten rows of the descending sort, i.e. the lowest-ranked turnout rates.
turn_out[['State','VEP Turnout Rate']].tail(10)

In [57]:
# Open the two image files (presumably charts of the rankings above — confirm) and
# display the first one as the cell output.
top10 = Image.open('../input/election-images/top10.png')
bottom10 = Image.open('../input/election-images/bottom10.png')
top10

In [58]:
# Display the second image opened in the previous cell.
bottom10