In [185]:
import os
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [229]:
df = pd.read_csv("../data/county_2016_preprocessed1.csv")

# encode dem/gop wins
df = df.dropna()
df['DEM'] = df['DEM'].astype('int32')
df['GOP'] = df['GOP'].astype('int32')

df['DEMWin']=0
for idx, row in df.iterrows():
    if row['DEM'] >= row['GOP']:
        df['DEMWin'][idx]=1

df['MedianIncome'] = df['MedianIncome'].replace(',','', regex=True)
df['MedianIncome'] = df['MedianIncome'].astype('int32')

df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['DEMWin'][idx]=1


Unnamed: 0,FIPS,County,DEM,GOP,MedianIncome,MigraRate,BirthRate,DeathRate,BachelorRate,UnemploymentRate,State,zip,primary_city,latitude,longitude,estimated_population,DEMWin
0,1001,Autauga,5908,18110,54487,5.3,12.2,10.0,27.7,5.1,AL,36003.0,Autaugaville,32.43,-86.65,1628.0,0
1,1003,Baldwin,18409,72780,56460,21.5,11.2,9.8,31.3,5.3,AL,31034.0,Hardwick,32.99,-83.29,1741.0,0
2,1005,Barbour,4848,5431,32884,-18.2,10.6,10.8,12.2,8.3,AL,26238.0,Volga,39.06,-80.11,698.0,0
3,1007,Bibb,1874,6733,43079,-0.8,12.1,10.8,11.5,6.4,AL,31052.0,Lizella,32.76,-83.86,7592.0,0
4,1009,Blount,2150,22808,47213,-1.0,12.2,11.2,12.6,5.4,AL,35013.0,Allgood,33.9,-86.51,0.0,0


We set up x and edge indices:

In [230]:
x = df[['MedianIncome', 'MigraRate', 'BirthRate', 'DeathRate', 'BachelorRate', 'UnemploymentRate']] #, 'latitude', 'longitude', 'estimated_population']]
x = np.array(x)

In [231]:
graph_df = pd.read_csv("../data/county_graph_preprocessed.csv")
graph_df.head()

Unnamed: 0,SRC,DST
0,1001,1001
1,1001,1021
2,1001,1047
3,1001,1051
4,1001,1085


In [232]:
all_fips = df['FIPS']
all_fips = set(all_fips.tolist())

# encode fips to index
fips_node_idx = {}
i = 0
for f in graph_df['SRC'].tolist():
    if (f not in fips_node_idx) and (f in all_fips):
        fips_node_idx[f] = i
        i += 1

In [233]:
assert len(fips_node_idx) == len(all_fips)

In [234]:
src = graph_df['SRC'].tolist()
dst = graph_df['DST'].tolist()

final_src = []
final_dst = []
for s, d in zip(src, dst):
    if s in all_fips and d in all_fips:
        final_src.append(fips_node_idx[s])
        final_dst.append(fips_node_idx[d])
    
src_lst = final_src + final_dst
dst_lst = final_dst + final_src

Set up train, validation, and test splits:

In [235]:
trainval_idx = np.random.choice(len(df), int(len(df) * 0.7), replace=False)

test_mask = np.ones(len(df), dtype=bool)
test_mask[trainval_idx] = 0

num_train = int(0.3 * len(trainval_idx))
train_idx = trainval_idx[num_train:]
val_idx = trainval_idx[:num_train]

train_mask = np.zeros(len(df), dtype=bool)
val_mask = np.zeros(len(df), dtype=bool)
train_mask[train_idx] = 1
val_mask[val_idx] = 1

In [236]:
np.sum(train_mask), np.sum(val_mask)

(1495, 640)

In [237]:
y = df['DEMWin'].tolist()
train_y = np.compress(train_mask, y)
val_y = np.compress(val_mask, y)
test_y = np.compress(test_mask, y)

In [238]:
np.sum(train_mask + val_mask + test_mask)

3050

In [245]:
import torch
from torch_geometric.data import Data

x = torch.tensor(np.array(x), dtype=torch.float)
edge_index = torch.tensor([src_lst,
                           dst_lst], dtype=torch.long)

data_loader = Data(x=x, edge_index=edge_index, y_train=torch.from_numpy(train_y).long(), y_val=torch.from_numpy(val_y).long(), y_test=torch.from_numpy(test_y).long(), train_mask=train_mask, val_mask=val_mask, test_mask=test_mask, num_classes=2)

In [246]:
import torch.nn.functional as F
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, ChebConv

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.h_layers = 4

        self.conv1 = ChebConv(data_loader.num_node_features, self.h_layers, K=2)
        self.conv2 = ChebConv(self.h_layers, data_loader.num_classes, K=2)

    def forward(self, data_loader):
        x, edge_index, edge_weight = data_loader.x, data_loader.edge_index, data_loader.edge_attr
        x = F.relu(self.conv1(x, edge_index, edge_weight))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
#         x = F.relu(self.conv2(x, edge_index, edge_weight))
#         x = F.dropout(x, training=self.training)
        return F.log_softmax(x, dim=1)

In [247]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model, data_loader = Net().to(device), data_loader.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [252]:
def weighted_accuracy(pred, true):
    assert(len(pred) == len(true))
    num_labels = len(true)
    num_pos = sum(true)
    num_neg = num_labels - num_pos
    frac_pos = num_pos/num_labels
    weight_pos = 1/frac_pos
    weight_neg = 1/(1-frac_pos)
    num_pos_correct = 0
    num_neg_correct = 0
    for pred_i, true_i in zip(pred, true):
        num_pos_correct += (pred_i == true_i and true_i == 1)
        num_neg_correct += (pred_i == true_i and true_i == 0)
    weighted_accuracy = ((weight_pos * num_pos_correct) 
                         + (weight_neg * num_neg_correct))/((weight_pos * num_pos) + (weight_neg * num_neg))
    return weighted_accuracy

In [257]:
from sklearn.metrics import balanced_accuracy_score

pos_w = np.sum(train_y) / len(train_y)
neg_w = 1 - pos_w

def train():
    model.train()
    optimizer.zero_grad()
#     import pdb; pdb.set_trace()
    loss = F.nll_loss(model(data_loader)[data_loader.train_mask], data_loader.y_train, weight=torch.as_tensor([pos_w, neg_w], dtype=torch.float))
    loss.backward()
    optimizer.step()

@torch.no_grad()
def test():
    model.eval()
    logits, accs = model(data_loader), []
    
    import pdb; pdb.set_trace()
    pred = logits[data_loader.train_mask].max(1)[1]
#     acc = pred.eq(data_loader.y_train).sum().item() / data_loader.train_mask.sum().item()
#     acc = balanced_accuracy_score(pred.detach().numpy(), data_loader.y_train)
    acc = weighted_accuracy(pred.detach().numpy(), data_loader.y_train)
    accs.append(acc)
    
    pred = logits[data_loader.val_mask].max(1)[1]
#     acc = pred.eq(data_loader.y_val).sum().item() / data_loader.val_mask.sum().item()
#     acc = balanced_accuracy_score(pred.detach().numpy(), data_loader.y_val)
    acc = weighted_accuracy(pred.detach().numpy(), data_loader.y_val)
    accs.append(acc)
    
    return accs

In [258]:
# train val
for epoch in range(1, 501):
    train()
    train_acc, val_acc = test()
    log = 'Epoch: {:03d}, Train: {:.4f}, Val: {:.4f}'
    if epoch % 10 == 0:
        print(log.format(epoch, train_acc, val_acc))

> [0;32m<ipython-input-257-1661a3222e9f>[0m(20)[0;36mtest[0;34m()[0m
[0;32m     18 [0;31m[0;34m[0m[0m
[0m[0;32m     19 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 20 [0;31m    [0mpred[0m [0;34m=[0m [0mlogits[0m[0;34m[[0m[0mdata_loader[0m[0;34m.[0m[0mtrain_mask[0m[0;34m][0m[0;34m.[0m[0mmax[0m[0;34m([0m[0;36m1[0m[0;34m)[0m[0;34m[[0m[0;36m1[0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     21 [0;31m[0;31m#     acc = pred.eq(data_loader.y_train).sum().item() / data_loader.train_mask.sum().item()[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     22 [0;31m[0;31m#     acc = balanced_accuracy_score(pred.detach().numpy(), data_loader.y_train)[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m
ipdb> data_loader.y_train
tensor([0, 0, 0,  ..., 0, 0, 0])
ipdb> exit


BdbQuit: 

Test accuracy:

In [122]:
logits = model(data_loader)
pred = logits[data_loader.test_mask].max(1)[1]
test_acc = pred.eq(data_loader.y_test).sum().item() / data_loader.test_mask.sum().item()
print("Test accuracy: " + str(test_acc))

Test accuracy: 0.8437158469945355
