In [1]:
# Name of the dataset to work with; it is passed to get_manifest()/get_weights()
# and used to build the data subfolder path in the cells below.
dataset_name = "W14_comb"
# Names of the dataframes the loading cell should read into global variables.
# Only 'BES_Panel' is active; "BESnumeric" is currently commented out.
df_list = ['BES_Panel']#,"BESnumeric"]

In [2]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import pickle, os, gc, re

from IPython.display import display, display_html 

import Jupyter_module_loader
from utility import *

# Clone this git repository into a subdirectory called 'BES_analysis_code' (the parent directory can have any name - I call mine BES_analysis)
%matplotlib inline
encoding = "ISO-8859-1"

# Resolve the project folders and the file manifest.
# setup_directories is not defined in this notebook - presumably it comes from the
# project's `utility` module (star-imported above); confirm.
(BES_code_folder, BES_small_data_files, BES_data_folder,
 BES_output_folder, BES_file_manifest, BES_R_data_files) = setup_directories()


# NOTE(review): a `global` statement at module level has no effect; it only lists the
# names the loading loop below may create through globals().
global BES_Panel, BES_numeric, BES_reduced, BES_reduced_with_na, BES_non_numeric
# Folder holding this dataset's files: <BES_data_folder><dataset_name><os.sep>
data_subfolder = BES_data_folder + dataset_name + os.sep

# Look up the manifest fields for this dataset (filename, description, citation, ...).
(manifest, dataset_filename, dataset_description, dataset_citation,
 dataset_start, dataset_stop, dataset_wave) = get_manifest(dataset_name, BES_file_manifest)

# Load every requested dataframe into a global variable named after its entry in df_list.
# Here `df` is the *name* (a string), not a DataFrame.
# NOTE(review): pd.read_msgpack was deprecated in pandas 0.25 and removed in 1.0,
# so this cell only runs on an older pandas.
for df in df_list:
    if df=="BES_Panel":
        # The main panel is stored under the manifest filename with '.dta' swapped for '.msgpack'.
        globals()[df]  = pd.read_msgpack(data_subfolder + dataset_filename.replace('.dta','.msgpack'))

    else:
        # Any other dataframe is stored as <name>.msgpack; after loading, every -1 is
        # replaced by NaN (presumably -1 encodes a missing value - confirm).
        globals()[df]  = pd.read_msgpack(data_subfolder + df + '.msgpack' )
        globals()[df].replace(-1,np.nan,inplace=True)

# Load the small companion objects for this dataset (variable types, category dictionary,
# and the two column-name mappings) using the configured encoding.
(var_type, cat_dictionary, new_old_col_names, old_new_col_names) = get_small_files(data_subfolder, encoding)

# Print a memory-usage report for the names currently in scope (see the cell output).
memory_use(locs = locals().items())

var_type (5692, 13)
                     BES_Panel: 506.4MiB
                      var_type:   4.3MiB
             new_old_col_names: 288.1KiB
                cat_dictionary: 144.1KiB
             old_new_col_names: 144.1KiB
             BES_file_manifest:  42.4KiB
                      manifest:   1.7KiB
                           _i2:   1.3KiB
                  gaussian_kde:   1.0KiB
                         cycle:   400.0B


In [3]:
# Obtain the wave/weight lookups for this dataset from the loaded panel.
# get_weights is a project helper; the exact structure of the four returned values
# is not visible in this notebook.
max_wave, num_to_wave, num_to_weight, weights = get_weights(dataset_name,BES_Panel)

In [4]:
# Read the BES 2017 General Election results file (Stata format) from the small-data folder.
BES_census_data = pd.read_stata( BES_small_data_files + "BES-2017-General-Election-results-file-v1.0.dta" )
# Quick sanity check: print the (rows, columns) shape of the loaded frame.
print("BES_census_data", BES_census_data.shape )

BES_census_data (632, 308)


In [11]:
# Correlate every column of the results file with the "leaveHanretty" column and
# sort the coefficients in ascending order.
# NOTE(review): on recent pandas versions corrwith may need numeric_only=True because the
# frame also holds non-numeric columns - confirm against the pandas version in use.
corrs = BES_census_data.corrwith(BES_census_data["leaveHanretty"]).sort_values()

In [12]:
# Show the leaveHanretty correlations for the columns selected by search(..., 'Age').
# search is a project helper; presumably it returns the labels of columns whose name
# matches the given pattern - confirm.
corrs[search(BES_census_data, 'Age')]
# p_surridge note: the Hanretty constituency Leave estimates were derived using MRP,
# which itself uses age data.

c11Age0to4     -0.021106
c11Age5to7      0.144198
c11Age8to9      0.210472
c11Age10to14    0.406411
c11Age15        0.508010
c11Age16to17    0.527042
c11Age18to19   -0.217939
c11Age20to24   -0.379248
c11Age25to29   -0.493926
c11Age30to44   -0.468753
c11Age45to59    0.348489
c11Age60to64    0.440701
c11Age65to74    0.446599
c11Age75to84    0.388374
c11Age85to89    0.314405
c11Age90plus    0.223094
dtype: float64

In [53]:
def make_corr_summary(input_df, name, corr_type = "spearman", sample_size_text = "N", correlation_text = "r",
                      abs_correlation_text = "abs_r", p_value_text = "p",
                      min_p_value = 0.01, min_variance = 0.0, min_sample_size = 500,
                      exclude_pattern = None):
    """Build a table of correlations between one focal column and every other column.

    Parameters
    ----------
    input_df : DataFrame
        Source data. It is copied, so the caller's frame is never modified.
    name : str
        Column of ``input_df`` used as the focal variable.
    corr_type : {"spearman", "pearson"}
        Which project helper computes the correlation
        (``corr_simple_spearmanr`` or ``corr_simple_pearsonr``).
    sample_size_text, correlation_text, abs_correlation_text, p_value_text : str
        Column names used in the returned table.
    min_p_value : float
        Rows whose p-value is greater than this are dropped.
    min_variance : float
        Columns whose variance (over rows where the focal variable is present) is
        strictly below this are dropped. With the default 0.0 this removes nothing.
    min_sample_size : int
        Columns with fewer non-null values than this (over rows where the focal
        variable is present) are dropped.
    exclude_pattern : str or None
        Regular expression; every column whose name matches it (``re.search``) is
        excluded. When None, the module-level variable ``pattern`` is used if it
        exists (this is how the function was originally configured); if neither is
        set, no column is excluded by name.

    Returns
    -------
    (DataFrame, str)
        The summary table indexed by variable name, sorted by descending absolute
        correlation, and ``corr_type`` unchanged.

    Raises
    ------
    ValueError
        If ``corr_type`` is neither "pearson" nor "spearman".
    """
    # Fail early with a clear message instead of hitting an unbound local further down.
    if corr_type == "pearson":
        corr_func = corr_simple_pearsonr
    elif corr_type == "spearman":
        corr_func = corr_simple_spearmanr
    else:
        raise ValueError("corr_type must be 'pearson' or 'spearman', got %r" % (corr_type,))

    # Backward compatibility: earlier versions read the exclusion regex from a global.
    if exclude_pattern is None:
        exclude_pattern = globals().get("pattern")

    df1 = input_df.copy()
    focal_var = df1[name]
    focal_mask = focal_var.notnull()

    if exclude_pattern is None:
        pattern_list = []
    else:
        pattern_list = [x for x in df1.columns if re.search(exclude_pattern, x)]

    # Screening statistics are computed only over rows where the focal variable is present.
    variances = df1[focal_mask].var()
    low_var_list = list(variances[variances<min_variance].index)
    sample_sizes = df1[focal_mask].notnull().sum()
    low_sample_size_list = list(sample_sizes[sample_sizes<min_sample_size].index)

    # A column can fail several screens; de-duplicate while keeping the order.
    drop_list = list(dict.fromkeys(pattern_list+low_var_list+low_sample_size_list))
    df1 = df1.drop(drop_list,axis=1)

    # The helper's three results per column are labelled, in order, as
    # (correlation, p-value, sample size) just below.
    df = df1.apply(lambda x: corr_func(x,focal_var)).apply(pd.Series).T
    df.columns = [correlation_text,p_value_text,sample_size_text]

    df[sample_size_text] = df[sample_size_text].astype('int')
    df[abs_correlation_text] = df[correlation_text].abs()

    # Rows containing NaN (e.g. an undefined correlation) are discarded.
    df = df.dropna()

    # Discard correlations that are not significant at min_p_value.
    df = df.drop(df.index[df[p_value_text]>min_p_value])

    df = df.sort_values(by=abs_correlation_text,ascending=False)

    # Keep only the strongest entry per variable "stub": remove_wave (project helper,
    # presumably strips the wave identifier from a variable name - confirm) maps each
    # label to its stub, and because rows are already sorted by |r| the first label
    # seen for a stub is the one with the largest absolute correlation.
    seen_stubs = set()
    repeat_list = []
    for ind in df.index:
        waveless = remove_wave(ind)
        if waveless in seen_stubs:
            repeat_list.append(ind)
        else:
            seen_stubs.add(waveless)
    df = df.drop(repeat_list)
    return df, corr_type

In [56]:
def display_corr(df, name, corr_type, top_num = 20, round_places = 2,
                 correlation_text = "r", p_value_text = "p", sample_size_text = "N"):
    """Render the strongest positive and negative correlations as two side-by-side tables.

    Parameters
    ----------
    df : DataFrame
        Correlation summary (as returned by ``make_corr_summary``) indexed by
        variable name. It is not modified: all formatting happens on a copy.
    name : str
        Focal variable name, used only in the table captions.
    corr_type : str
        Correlation type label, used only in the table captions.
    top_num : int
        Maximum number of rows shown in each table.
    round_places : int
        Number of decimals the correlation is rounded to before sorting.
    correlation_text, p_value_text, sample_size_text : str
        Names of the columns of ``df`` to display.

    Returns
    -------
    None
        The two tables are emitted through ``display_html``.
    """
    shown_cols = [correlation_text, p_value_text, sample_size_text]

    # Work on a copy so the caller's summary keeps its full labels and unrounded
    # values (previously the input frame was truncated and rounded in place).
    summary = df[shown_cols].copy()
    summary.index = [x[0:60] for x in summary.index]  # shorten long variable names for display
    summary[correlation_text] = summary[correlation_text].round(round_places)

    def _top_table(ascending):
        # First top_num rows after sorting by correlation, with p-values as 2-decimal strings.
        table = summary.sort_values(by=correlation_text, ascending=ascending)[0:top_num].copy()
        table[p_value_text] = table[p_value_text].apply(lambda x: "{0:0.2f}".format(x))
        return table

    df1 = _top_table(ascending=False)
    df2 = _top_table(ascending=True)

    df1_caption = "Top "+str(top_num)+ " positive "+"("+corr_type+")"+" correlations for "+name
    df2_caption = "Top "+str(top_num)+ " negative "+"("+corr_type+")"+" correlations for "+name

    # 'display:inline' lets the two tables sit next to each other in the output cell.
    df1_styler = df1.style.set_table_attributes("style='display:inline'").set_caption(df1_caption)
    df2_styler = df2.style.set_table_attributes("style='display:inline'").set_caption(df2_caption)

    display_html(df1_styler._repr_html_()+df2_styler._repr_html_(), raw=True)


In [61]:


# Regex of column names to leave out of the summary. make_corr_summary picks this up
# from the module-level name `pattern`, so it must be set before the call below.
pattern = "remainHanretty|leaveHanretty"
# Focal variable that every other column is correlated against.
name = "leaveHanretty"
corr_type = "spearman"
# NOTE(review): `df` was the loop variable (a string) in the loading cell; from here on
# it holds the correlation summary table.
df,corr_type = make_corr_summary(BES_census_data, name, corr_type)
display_corr(df, name, corr_type)




Unnamed: 0,r,p,N
UKIP15,0.88,0.0,614
UKIP1015,0.84,0.0,614
UKIPVote15,0.78,0.0,614
c11HouseholdCohabitDependents,0.72,0.0,632
c11HealthFair,0.68,0.0,632
c11PassportNone,0.68,0.0,573
c11IndustryManufacturing,0.65,0.0,632
c11NSSECSemiRoutine,0.65,0.0,632
c11HouseholdCohabitNodependents,0.63,0.0,632
c11NSSECLowerSupervisor,0.62,0.0,632

Unnamed: 0,r,p,N
c11Degree,-0.9,0.0,573
UKIP1517,-0.83,0.0,614
c11HealthVeryGood,-0.78,0.0,632
c11QualLevel4,-0.75,0.0,632
c11PassportOceania,-0.7,0.0,573
c11PassportNorthAmerica,-0.7,0.0,573
c11PassportAny,-0.7,0.0,573
c11PassportCentralAmerica,-0.65,0.0,573
c11NSSECHigherProfessional,-0.62,0.0,632
c11PassportIreland,-0.62,0.0,573


In [12]:
# List the categories of the categorical column "Winner15" (party names, see output).
BES_census_data["Winner15"].cat.categories

Index(['Conservative', 'Labour', 'Liberal Democrat', 'Scottish National Party',
       'Plaid Cymru', 'UKIP', 'Green', 'Speaker.'],
      dtype='object')

In [15]:
# Give "Winner17" the same categories (and category order) as "Winner15" so the two
# categorical columns can be compared element-wise in the next cell.
# The result is assigned back rather than using inplace=True, which newer pandas
# versions no longer accept and which silently mutated the frame on every re-run.
# NOTE(review): any Winner17 value that is not one of the Winner15 categories becomes NaN.
BES_census_data["Winner17"] = BES_census_data["Winner17"].cat.set_categories(BES_census_data["Winner15"].cat.categories)

In [16]:
# Boolean mask: True where the "Winner15" and "Winner17" values differ.
# NOTE(review): presumably rows where either winner is missing also count as different - confirm.
changed_hands = BES_census_data["Winner15"]!=BES_census_data["Winner17"]

In [23]:
# Seats that changed winner AND were won by Conservative or Labour in both columns,
# i.e. direct switches between those two parties.
# isin() is the vectorized membership test (missing values simply yield False),
# replacing the slower per-element apply(lambda ...).
changed_hands_lab_con = (
    changed_hands
    & BES_census_data["Winner15"].isin(["Conservative","Labour"])
    & BES_census_data["Winner17"].isin(["Conservative","Labour"])
)

In [27]:
# Unweighted mean of the "remainHanretty" column over all rows of the results file.
BES_census_data["remainHanretty"].mean()

47.94174237705698

In [29]:
# Winners and Remain estimate for the seats that switched between Conservative and Labour,
# ordered by the "Winner15" column.
# A single .loc call selects rows and columns together, avoiding chained indexing
# (which first copied the three columns for every row and only then filtered).
census_lab_con_changed_hands = (
    BES_census_data
    .loc[changed_hands_lab_con, ["Winner15","Winner17","remainHanretty"]]
    .sort_values("Winner15")
)
census_lab_con_changed_hands

Unnamed: 0,Winner15,Winner17,remainHanretty
34,Conservative,Labour,77.953259
581,Conservative,Labour,48.889946
569,Conservative,Labour,43.440918
530,Conservative,Labour,54.137841
521,Conservative,Labour,42.235608
449,Conservative,Labour,61.746673
443,Conservative,Labour,48.236966
438,Conservative,Labour,45.594765
436,Conservative,Labour,38.690405
355,Conservative,Labour,42.567366


In [33]:
# Restrict the Conservative/Labour switches to rows whose "Winner15" is Conservative;
# since that frame only holds switches between the two parties, these are the Labour gains.
census_lab_gains_only = census_lab_con_changed_hands.loc[
    lambda seats: seats["Winner15"] == "Conservative"
]
census_lab_gains_only

Unnamed: 0,Winner15,Winner17,remainHanretty
34,Conservative,Labour,77.953259
581,Conservative,Labour,48.889946
569,Conservative,Labour,43.440918
530,Conservative,Labour,54.137841
521,Conservative,Labour,42.235608
449,Conservative,Labour,61.746673
443,Conservative,Labour,48.236966
438,Conservative,Labour,45.594765
436,Conservative,Labour,38.690405
355,Conservative,Labour,42.567366


In [36]:
# Count the Labour gains by whether their "remainHanretty" value is above 50
# (presumably a percentage, i.e. a Remain-majority seat - confirm).
(census_lab_gains_only["remainHanretty"]>50).value_counts()

False    17
True     11
Name: remainHanretty, dtype: int64

In [38]:
# Count the Labour gains by whether their "remainHanretty" value is at or above the
# mean of that column over the whole results file.
(census_lab_gains_only["remainHanretty"]>=BES_census_data["remainHanretty"].mean()).value_counts()


True     18
False    10
Name: remainHanretty, dtype: int64

In [39]:
# Mean of "remainHanretty" again - the threshold used in the comparison cell above
# (same expression as evaluated in an earlier cell).
BES_census_data["remainHanretty"].mean()

47.94174237705698

In [40]:
# Preview the first rows only; the frame has hundreds of rows and columns, so
# displaying it whole bloats the notebook without adding information.
BES_census_data.head(10)

Unnamed: 0,pano,ONSConstID,ConstituencyName,Country,Region,ConstituencyType,Winner17,Con17,Lab17,LD17,...,c11HealthBad,c11HealthVeryBad,c11NoAdultsEmployed,c11NoAdultsEmployedChildren,c11NoAdultsEmployedNoChildren,c11DeprivedNone,c11Deprived1,c11Deprived2,c11Deprived3,c11Deprived4
0,1,W07000049,Aberavon,Wales,Wales,County,Labour,17.737766,68.119514,1.800529,...,8.299941,2.691546,42.462109,6.230530,36.231580,32.230740,29.679023,27.795863,9.832336,0.462039
1,2,W07000058,Aberconwy,Wales,Wales,County,Conservative,44.594090,42.618974,2.926905,...,4.679606,1.313480,40.390082,3.115130,37.274953,40.875711,34.258528,19.590967,4.761529,0.513266
2,3,S14000001,Aberdeen North,Scotland,Scotland,Borough,Scottish National Party,22.692276,30.010610,4.605925,...,3.650631,1.059666,31.955123,3.209807,28.745316,40.373828,33.189665,19.629204,6.051418,0.755886
3,4,S14000002,Aberdeen South,Scotland,Scotland,Borough,Conservative,42.132470,20.549300,5.866091,...,2.563387,0.700666,26.960984,1.742846,25.218138,51.914644,30.473801,13.691849,3.465347,0.454361
4,5,S14000058,West Aberdeenshire and Kincardine,Scotland,Scotland,County,Conservative,47.852785,11.052785,8.641162,...,1.922364,0.522299,24.974422,1.354262,23.620160,53.012762,31.304184,13.572236,1.968122,0.142696
5,6,S14000003,Airdrie and Shotts,Scotland,Scotland,County,Scottish National Party,23.190885,37.092785,2.110415,...,5.846584,1.851011,36.973800,5.544289,31.429511,32.539465,32.287327,25.350800,8.953629,0.868779
6,7,E14000530,Aldershot,England,South East,Borough,Conservative,55.056180,31.617978,7.430031,...,2.774196,0.801563,23.736783,3.122279,20.614504,47.943774,32.837418,15.180993,3.624829,0.412987
7,8,E14000531,Aldridge-Brownhills,England,West Midlands,Borough,Conservative,65.408227,29.849633,3.337890,...,4.878271,1.442045,37.254111,3.282656,33.971455,39.975178,33.062364,22.137760,4.520633,0.304065
8,9,E14000532,Altrincham and Sale West,England,North West,Borough,Conservative,51.019132,38.846372,7.673802,...,3.236326,0.907952,30.050828,2.381131,27.669696,54.415484,29.019254,13.370390,3.014597,0.180275
9,10,W07000043,Alyn and Deeside,Wales,Wales,County,Labour,40.393208,52.088919,2.406166,...,4.197321,1.147809,31.830069,3.562504,28.267566,43.926890,32.332772,19.082350,4.355785,0.302203


In [43]:
# Sorted array of the "pano" identifiers for the Labour-gain rows
# (looked up in the full results file through the rows' index labels).
BES_census_data.loc[census_lab_gains_only.index,"pano"].sort_values().values

array([ 38,  41,  97, 100, 111, 125, 127, 158, 169, 170, 181, 238, 273,
       314, 332, 338, 340, 368, 451, 453, 458, 464, 536, 546, 588, 600,
       601, 607], dtype=int16)

In [46]:
# Look up the "pano" identifier of the row whose ConstituencyName is "Wakefield".
# A single .loc call selects the rows and the column together instead of chaining
# two indexing operations.
BES_census_data.loc[BES_census_data["ConstituencyName"]=="Wakefield", "pano"]

572    591
Name: pano, dtype: int16