In [1]:
import pandas as pd
import numpy as np


# clustering and modelling
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#plotting
from bokeh.charts import *
from bokeh.plotting import figure, show
from bokeh.models import HoverTool
import matplotlib.pyplot as plt
# Ask Bokeh to render plots inline in the notebook.
# NOTE(review): `output_notebook` is not imported by name; presumably it is
# pulled in by the wildcard `bokeh.charts` import above — confirm.
output_notebook()

In [7]:
# Load the cleaned votes table; the first CSV column is used as the index.
# NOTE(review): the path starts with '/', so it is absolute (on POSIX '/..'
# resolves back to '/'). Confirm a notebook-relative '../data/...' was not intended.
votes_path = '/../data/cleaned_votes.csv'
votes = pd.read_csv(votes_path, index_col=0)

In [8]:
# Drop the last column of `votes`, then swap rows and columns so that each
# remaining column of `votes` becomes one row of `c`.
c = votes.iloc[:, :-1].transpose()

In [9]:
# Split the rows of `c` into two clusters.
# KMeans initialisation is random; pin the seed so the cluster ids (and thus the
# cluster -> colour mapping used when plotting) are the same after every
# Restart & Run All instead of flipping between runs.
km = KMeans(n_clusters=2, random_state=0)
km.fit(c)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [10]:
# Wrap the fitted cluster labels (one per row of `c`) in a Series so that pandas
# helpers such as value_counts() and map() can be applied to them later.
predicted_parties = pd.Series(km.labels_)

In [11]:
# Cluster sizes: how many rows of `c` were assigned to each cluster id.
predicted_parties.value_counts()

0    54
1    46
dtype: int64

In [12]:
# Sanity check: there should be exactly one cluster label per row of `c`.
c.shape, predicted_parties.shape

((100, 438), (100,))

In [13]:
# PCA: keep only the first two principal components; they are used further
# down as the x/y coordinates of the scatter plots.
pca = PCA(n_components=2)

In [14]:
# Fit on the transpose of `c`: here each row of `c` acts as a feature, so every
# fitted component ends up with one entry per row of `c`. The plots below rely
# on that layout (they use the component vectors themselves as coordinates).
pca.fit(c.T)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [15]:
# Share of the total variance captured by each of the two retained components.
pca.explained_variance_ratio_

array([ 0.50643735,  0.18673049])

In [16]:
# Unpack the two fitted component vectors; they become the x (c1) and y (c2)
# coordinates of the scatter plots.
c1, c2 = pca.components_

In [17]:
# Sanity check: both coordinate vectors and the cluster labels must have the
# same length, because they are placed side by side in one ColumnDataSource.
c1.shape, c2.shape, predicted_parties.shape

((100,), (100,), (100,))

In [18]:
# Hover labels: every column name of `votes` except the last one, i.e. the
# same columns that were kept when `c` was built.
names = votes.columns.values[:-1]
len(names)

100

In [19]:
# Colour every point by the KMeans cluster it was assigned to.
# Cluster ids carry no inherent meaning, so which group ends up red or blue
# depends only on how KMeans happened to number the clusters.
color_dict = {
    0: 'darkred',
    1: 'darkblue',
    2: 'green',
    3: 'orange',
    4: 'yellow',
}
cs = predicted_parties.map(color_dict)
color = cs.values

# Column data backing the glyphs: PCA coordinates, per-point colour, hover label.
source = ColumnDataSource(data={
    'c1': c1,
    'c2': c2,
    'color': color,
    'names': names,
})

p1 = figure(
    plot_width=800,
    plot_height=600,
    title="A House Divided",
    tools='hover, save, box_zoom,reset',
)
p1.circle('c1', 'c2', color='color', source=source, size=13)

# Show the label stored in the `names` column when hovering over a point.
p1.select_one(HoverTool).tooltips = [('Senator', '@names')]

# NOTE(review): this is an absolute path (on POSIX '/..' resolves back to '/');
# confirm a notebook-relative '../gallery/...' was not intended.
output_file('/../gallery/senate_divided.html')

show(p1)

In [20]:
# Take the single character five positions from the end of every name.
# Presumably the names end in a '(P-ST)' style suffix, making this the party
# letter — confirm against the data. A slice (not an index) is used so that
# names shorter than five characters yield '' instead of raising.
parties = [senator[-5:-4] for senator in names]

In [21]:
# Party letter -> plot colour; letters missing from the mapping become NaN.
colors_dict = dict(R='red', D='blue', I='green')
party_series = pd.Series(parties)
cp = party_series.map(colors_dict)

In [23]:
# Same scatter as the first plot, but coloured by the party letter extracted
# from each name (`cp`) rather than by KMeans cluster.
colors = cp.values

# Column data backing the glyphs; this rebinds `source` from the first plot.
source = ColumnDataSource(data={
    'c1': c1,
    'c2': c2,
    'color': colors,
    'names': names,
})

p2 = figure(
    plot_width=800,
    plot_height=600,
    title="A House Divided",
    tools='hover, save, box_zoom,reset',
)
p2.circle('c1', 'c2', color='color', source=source, size=13)

# Show the label stored in the `names` column when hovering over a point.
p2.select_one(HoverTool).tooltips = [('Senator', '@names')]

# Relative path: the HTML file is written to the current working directory.
output_file('senate_divided_2.html')

show(p2)