In [1]:
import pandas as pd
import numpy as np

# Get the winners and the margin of win

In [2]:
results = pd.read_excel("2015 Constituency Results.xlsx")
results.rename(columns={"Unnamed: 1": "Name"}, inplace=True)
results.columns

Index(['Constituency', 'Name', 'Country', 'Region', 'Electorate', 'Turnout',
       'Unnamed: 6', 'Candidate', 'Party', 'Votes', 'Share', 'Place'],
      dtype='object')

In [3]:
winners = results[results["Place"] == 1]
assert np.all( winners["Constituency"] == np.arange(1,651) )
winners.head()

Unnamed: 0,Constituency,Name,Country,Region,Electorate,Turnout,Unnamed: 6,Candidate,Party,Votes,Share,Place
0,1,Aberavon,Wales,Wales,49821,31523,0.633,Stephen Kinnock,Labour,15416,0.489,1
9,2,Aberconwy,Wales,Wales,45540,30148,0.662,Guto Bebb,Conservative,12513,0.415,1
15,3,Aberdeen North,Scotland,Scotland,67745,43936,0.649,Kirsty Blackman,Scottish National Party,24793,0.564,1
21,4,Aberdeen South,Scotland,Scotland,68056,48551,0.713,Callum McCaig,Scottish National Party,20221,0.416,1
28,5,Aberdeenshire West & Kincardine,Scotland,Scotland,73445,55196,0.752,Stuart Donaldson,Scottish National Party,22949,0.416,1


In [4]:
seconds = results[results["Place"] == 2]
assert np.all( winners["Constituency"] == np.arange(1,651) )
seconds.head()

Unnamed: 0,Constituency,Name,Country,Region,Electorate,Turnout,Unnamed: 6,Candidate,Party,Votes,Share,Place
1,1,Aberavon,Wales,Wales,49821,31523,0.633,Peter Bush,UKIP,4971,0.158,2
10,2,Aberconwy,Wales,Wales,45540,30148,0.662,Mary Wimbury,Labour,8514,0.282,2
16,3,Aberdeen North,Scotland,Scotland,67745,43936,0.649,Richard Baker,Labour,11397,0.259,2
22,4,Aberdeen South,Scotland,Scotland,68056,48551,0.713,Anne Begg,Labour,12991,0.268,2
29,5,Aberdeenshire West & Kincardine,Scotland,Scotland,73445,55196,0.752,Alexander Burnett,Conservative,15916,0.288,2


In [5]:
winning_votes = winners.set_index("Constituency")["Votes"]
seconds_votes = seconds.set_index("Constituency")["Votes"]
margins = pd.DataFrame({"win":winning_votes, "2nd":seconds_votes})
margins["margin"] = margins["win"] - margins["2nd"]
margins.head()

Unnamed: 0_level_0,2nd,win,margin
Constituency,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,4971,15416,10445
2,8514,12513,3999
3,11397,24793,13396
4,12991,20221,7230
5,15916,22949,7033


# Process the demographic data

And then normalise is a _vaguely_ reasonable way.

In [6]:
demo = pd.read_csv("demo_new.csv")
demo.head()

Unnamed: 0.1,Unnamed: 0,name,population,age 18-29,age 30-44,age 45-64,age 65+,households,with a car,white,immigrants,gcse+
0,0,Berwick-upon-Tweed,75718.0,8867.0,12655.0,23898.0,16759.0,38261.0,27098.0,74698.0,2403.0,37274.0
1,1,Bishop Auckland,87143.0,11815.0,15764.0,25820.0,16621.0,41200.0,28573.0,86058.0,2102.0,38016.0
2,2,Blaydon,88281.0,11388.0,17010.0,25004.0,17491.0,40046.0,27851.0,86692.0,2256.0,41253.0
3,3,Blyth Valley,82174.0,11435.0,15887.0,23834.0,14069.0,37070.0,26912.0,81071.0,1912.0,36921.0
4,4,"Durham, City of",94375.0,22609.0,16647.0,24033.0,15408.0,39659.0,28525.0,90135.0,6662.0,52625.0


In [30]:
df = pd.DataFrame({"name" : demo["name"], "population" : demo["population"]})
df["age 18-29"] = demo["age 18-29"] / demo["population"]
df["age 30-44"] = demo["age 30-44"] / demo["population"]
df["age 45-64"] = demo["age 45-64"] / demo["population"]
df["age 65+"] = demo["age 65+"] / demo["population"]
df["households"] = demo["households"] / demo["population"]
df["car"] = demo["with a car"] / demo["population"]
df["white"] = demo["white"] / demo["population"]
df["immigrants"] = demo["immigrants"] / demo["population"]
df["gcse+"] = demo["gcse+"] / (demo["age 18-29"] + demo["age 30-44"] + demo["age 45-64"] + demo["age 65+"])
df.head()

Unnamed: 0,name,population,age 18-29,age 30-44,age 45-64,age 65+,households,car,white,immigrants,gcse+
0,Berwick-upon-Tweed,75718.0,0.117106,0.167133,0.315618,0.221334,0.505309,0.357881,0.986529,0.031736,0.599463
1,Bishop Auckland,87143.0,0.135582,0.180898,0.296295,0.190732,0.472786,0.327886,0.987549,0.024121,0.542931
2,Blaydon,88281.0,0.128997,0.19268,0.283232,0.198129,0.45362,0.315481,0.982001,0.025555,0.581905
3,Blyth Valley,82174.0,0.139156,0.193334,0.290043,0.17121,0.451116,0.3275,0.986577,0.023268,0.566056
4,"Durham, City of",94375.0,0.239566,0.176392,0.254654,0.163264,0.420228,0.302252,0.955073,0.070591,0.668704


# Stupidly, throw a random forest at it

In [12]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

In [44]:
win = pd.DataFrame({"name":winners["Name"], "party":winners["Party"]})
df = df.join(win.set_index("name"), on="name", how="inner")
df.head()

Unnamed: 0,name,population,age 18-29,age 30-44,age 45-64,age 65+,households,car,white,immigrants,gcse+,party
0,Berwick-upon-Tweed,75718.0,0.117106,0.167133,0.315618,0.221334,0.505309,0.357881,0.986529,0.031736,0.599463,Conservative
1,Bishop Auckland,87143.0,0.135582,0.180898,0.296295,0.190732,0.472786,0.327886,0.987549,0.024121,0.542931,Labour
2,Blaydon,88281.0,0.128997,0.19268,0.283232,0.198129,0.45362,0.315481,0.982001,0.025555,0.581905,Labour
3,Blyth Valley,82174.0,0.139156,0.193334,0.290043,0.17121,0.451116,0.3275,0.986577,0.023268,0.566056,Labour
4,"Durham, City of",94375.0,0.239566,0.176392,0.254654,0.163264,0.420228,0.302252,0.955073,0.070591,0.668704,Labour


In [68]:
train = np.random.random(size=len(df)) <= 0.8
train, score = df[train], df[~train]

features = train.columns[1:11]
clf = RandomForestClassifier()
y, nums_to_party = pd.factorize(train['party'])
clf.fit(train[features], y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

Doesn't actually do too badly...

In [69]:
preds = nums_to_party[clf.predict(score[features])]
pd.crosstab(score['party'], preds, rownames=['actual'], colnames=['preds'])

preds,Conservative,Labour
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
Conservative,48,7
Labour,9,42
Liberal Democrat,2,0
Plaid Cymru,0,1
