In [1]:
dataset_name = "W13_comb"
df_list = ["BES_Panel","BES_reduced_with_na"]#,"BESnumeric"]

In [2]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import pickle, os, gc, re

from IPython.display import display

import Jupyter_module_loader
from utility import *

In [3]:
# you should clone this git to a subdirectory called 'BES_analysis_code' (in some directory - I call it BES_analysis - doesn't matter though)
%matplotlib inline
encoding = "ISO-8859-1"

(BES_code_folder, BES_small_data_files, BES_data_folder,
 BES_output_folder, BES_file_manifest, BES_R_data_files) = setup_directories()




In [4]:
%%time
global BES_Panel, BES_numeric, BES_reduced, BES_reduced_with_na, BES_non_numeric
data_subfolder = BES_data_folder + dataset_name + os.sep

(manifest, dataset_filename, dataset_description, dataset_citation,
 dataset_start, dataset_stop, dataset_wave) = get_manifest(dataset_name, BES_file_manifest)

for df in df_list:
    if df=="BES_Panel":
        globals()[df]  = pd.read_msgpack(data_subfolder + dataset_filename.replace('.dta','.msgpack'))

    else:
        globals()[df]  = pd.read_msgpack(data_subfolder + df + '.msgpack' )
#         globals()[df].replace(-1,np.nan,inplace=True)

(var_type, cat_dictionary, new_old_col_names, old_new_col_names) = get_small_files(data_subfolder, encoding)

var_type (5173, 8)
Wall time: 29.9 s


In [30]:
memory_use(locs = locals().items())

           BES_reduced_with_na: 673.6MiB
                     BES_Panel: 410.7MiB
                       weights:   7.3MiB
                      var_type:   2.3MiB
                            _7: 804.2KiB
                           _10: 804.2KiB
                cat_dictionary: 192.1KiB
             old_new_col_names:  96.1KiB
             new_old_col_names:  96.1KiB
             BES_file_manifest:  42.4KiB


In [31]:
from sklearn import datasets
from sklearn.decomposition import PCA, IncrementalPCA, NMF, TruncatedSVD, FastICA, FactorAnalysis, SparsePCA
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [32]:
max_wave, num_to_wave, num_to_weight, weights = get_weights(dataset_name,BES_Panel)

In [35]:
import shap
import xgboost as xgb
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import ElasticNet
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, explained_variance_score, r2_score

In [36]:
def get_base_from_BES(im_var, waves=None, noNans = False):

    if waves:
        wave_list = waves
    else:
        wave_list = [x.replace(im_var,"") for x in BES_Panel.columns if re.match(im_var+"W\d+$",  x)]
    
    immig_vars = [im_var+x for x in wave_list]

    if noNans:
        all_in_dataset = BES_Panel[immig_vars].notnull().all(axis=1)

        max_range = len(BES_Panel[immig_vars[0]].cat.categories.drop("Don't know",errors='ignore'))-1
        max_cat = BES_Panel[immig_vars[0]].cat.categories.drop("Don't know",errors='ignore')[max_range]
        BES_immig = BES_Panel[immig_vars][all_in_dataset]\
        .replace("Don't know",np.nan)\
        .apply(lambda x: x.cat.codes)\
        .replace(-1,np.nan)\
        .apply(lambda x: x.fillna(x.median())) / max_range
    else:
        all_in_dataset = BES_Panel[immig_vars].isnull()
        
        max_range = len(BES_Panel[immig_vars[0]].cat.categories.drop("Don't know",errors='ignore'))-1
        max_cat = BES_Panel[immig_vars[0]].cat.categories.drop("Don't know",errors='ignore')[max_range]
        BES_immig = BES_Panel[immig_vars]\
        .replace("Don't know",np.nan)\
        .apply(lambda x: x.cat.codes)\
        .replace(-1,np.nan)\
        .apply(lambda x: x.fillna(x.median())) / max_range
        
        BES_immig[all_in_dataset] = np.nan
        all_in_dataset = ~all_in_dataset
        
        
    return BES_immig, all_in_dataset

def get_base(im_var, waves=None, noNans = False, nan_value = False):
# get df containing all waves of some variable
# specify waves with integers! (makes sorting easier) - crash if you choose waves not present?

    wave_dict = {int(re.match(im_var+"W(\d+)",  x).groups()[0]):x for x in BES_reduced_with_na.columns if re.match(im_var+"W\d+($|_)",  x)}

    if waves:
        immig_vars = [wave_dict[x] for x in sorted(intersection(waves,wave_dict.keys()))]
    else:
        immig_vars = [wave_dict[x] for x in sorted(wave_dict.keys())]

    if noNans:
        all_in_dataset = BES_reduced_with_na[immig_vars].notnull().all(axis=1)
        max_range = BES_reduced_with_na[immig_vars].max().max()
        BES_immig = BES_reduced_with_na[immig_vars][all_in_dataset]\
        .apply(lambda x: x.fillna(x.median())) / max_range
    elif nan_value:
        all_in_dataset = BES_reduced_with_na[immig_vars].notnull().all(axis=1)
        max_range = BES_reduced_with_na[immig_vars].max().max()
        BES_immig = BES_reduced_with_na[immig_vars][all_in_dataset]\
        .apply(lambda x: x.fillna(nan_value)) / max_range
    else:
        all_in_dataset = BES_reduced_with_na[immig_vars].isnull()
        max_range = BES_reduced_with_na[immig_vars].max().max()
        BES_immig = BES_reduced_with_na[immig_vars] / max_range

        all_in_dataset = ~all_in_dataset
        
        
    return BES_immig, all_in_dataset



def get_diff(imvar_list, nan_value = False):
    num = 0
    for im_var in imvar_list:

        print(im_var)
        immig, mask = get_base(im_var, noNans = False, nan_value = nan_value)

        waves_tmp = [re.search("(W\d+)",x).groups()[0] for x in immig.columns]
        replace_wave = ["("+waves_tmp[a]+"->"+waves_tmp[a+1]+")" for a,b in enumerate(immig.columns[1:])]
        temp = [immig.columns[0]]
        temp.extend([re.sub("W\d+",replace_wave[i],x) for i,x in enumerate(immig.columns[1:])])
        immig.columns = temp     
        if num ==0:
            immigDiff = immig.astype('float').diff(axis=1).drop(immig.columns[0],axis=1)
        else:
            immigDiff = pd.concat( [immigDiff, immig.astype('float').diff(axis=1).drop(immig.columns[0],axis=1)], axis=1)
        num = num + 1
        
        assert (immigDiff.max().max()<=1) and (immigDiff.min().min()>=-1)
    return immigDiff

In [37]:
# temp = pd.read_msgpack(data_subfolder + "BES_reduced" + '.msgpack')
# nans = pd.read_msgpack(data_subfolder + "nans" + '.msgpack')
# temp[nans] = np.nan
# globals()[df] = temp[nans]
# del temp, nans

In [12]:
# %%time
# global BES_Panel, BES_numeric, BES_reduced, BES_reduced_with_na, BES_non_numeric
# data_subfolder = BES_data_folder + dataset_name + os.sep

# (manifest, dataset_filename, dataset_description, dataset_citation,
#  dataset_start, dataset_stop, dataset_wave) = get_manifest(dataset_name, BES_file_manifest)
# df="BES_reduced_with_na"
# globals()[df] = pd.read_msgpack(data_subfolder + "BES_reduced" + '.msgpack')
# # globals()[df][pd.read_msgpack(data_subfolder + "nans" + '.msgpack')] = np.nan


In [13]:
# nans = pd.read_msgpack(data_subfolder + "nans" + '.msgpack')

In [14]:
# BES_reduced_with_na.where(nans)

In [15]:
## augment dataset
# pan_dataset_values
# census data
# 

# pan_dataset_values = pd.read_csv( BES_small_data_files + "pan_dataset_values.csv", encoding=encoding)    

In [16]:
# # http://www.britishelectionstudy.com/data-object/2017-bes-constituency-results-with-census-and-candidate-data/
# BES_census_data = pd.read_stata( BES_small_data_files + "BES-2017-General-Election-results-file-v1.0.dta" )
# print("BES_census_data", BES_census_data.shape )

# ## SHOULD BE REPLACED WITH pan_dataset_variables reference
# pano_var_dict = {int(x.replace("panoW","")):x for x in BES_Panel.columns if "pano" in x}
# sorted_pano_vars = [ pano_var_dict[x] for x in sorted( pano_var_dict.keys() ) ]

# BES_reduced_with_na["pano"] = BES_Panel[sorted_pano_vars].bfill(axis=1).iloc[:, 0]


# # len()
# census_vars = [x for x in BES_census_data.columns if re.match('c\w',x) ]
# census_vars.append("leaveHanretty")
# census_vars.append("remainHanretty")
# census_vars.append("pano")
# # ConstituencyType
# BES_reduced_with_na = pd.merge(BES_reduced_with_na, BES_census_data[census_vars], how = 'left', on = 'pano')


In [17]:
# import sys, gc

# def sizeof_fmt(num, suffix='B'):
#     ''' By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254'''
#     for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
#         if abs(num) < 1024.0:
#             return "%3.1f%s%s" % (num, unit, suffix)
#         num /= 1024.0
#     return "%.1f%s%s" % (num, 'Yi', suffix)

# def memory_use(locs = locals().items()):
#     gc.collect()
#     # locals().items()
#     for name, size in sorted(((name, sys.getsizeof(value)) for name,value in locs),
#                              key= lambda x: -x[1])[:10]:
#         print("{:>30}: {:>8}".format(name,sizeof_fmt(size)))
# memory_use()

In [60]:
# [x for x in BES_reduced_with_na.columns if re.match("profile_euref",x)]


In [61]:
# optional_mask_fn(8).sum()

In [12]:
Treatment = "euRefVote_change_shiftsonly_fixed"

### focal variable??

In [13]:
# var_stub = "partyId"
# var_list = [x for x in BES_reduced_with_na.columns if re.match(var_stub+"W\d+_No - none",x)]
# # "partyIdStrength",
# var_stub_list = ["partyId", "partyIdSqueeze",  "generalElectionVote", "ashcroft", "generalElectionCertainty"]
# # base_list = [x for x in BES_reduced_with_na.columns if var_stub in x][0:len(var_list)]
# var_list

In [14]:
var_stub = "euRefVote"
var_list = [x for x in BES_reduced_with_na.columns if re.match(var_stub+"W",x)]
# "partyIdStrength",
# var_stub_list = ["partyId", "partyIdSqueeze",  "generalElectionVote", "ashcroft", "generalElectionCertainty"]
var_stub_list = ["euRefVote"]
# base_list = [x for x in BES_reduced_with_na.columns if var_stub in x][0:len(var_list)]
var_list

In [15]:
imvar_list = [var_stub]

immigDiff = get_diff(imvar_list)

euRefVote


In [16]:
var_stub = "euRefVote"
var_list = [x for x in immigDiff.columns if re.match(var_stub,x)]
var_list
# # "partyIdStrength",
# # var_stub_list = ["partyId", "partyIdSqueeze",  "generalElectionVote", "ashcroft", "generalElectionCertainty"]
# # var_stub_list = ["euRefVote","profile_eurefvote_Leave the EU"]
# var_stub_list = ["immigSelf"]
# # base_list = [x for x in BES_reduced_with_na.columns if var_stub in x][0:len(var_list)]
# var_list

In [17]:
# immigDiff

In [18]:
thresh = 0.1 # -> 76 variables
# thresh = 0.05 # -> 65 variables!
# thresh = 0.1 # -> 134 variables
# thresh = 0.5 # -> 134 variables
focal_df = BES_reduced_with_na
focal_df = immigDiff

In [19]:
# %%time



diff_feature_candidate_list = []
ordinalised_vars = var_type[ var_type["type"].apply(lambda x: x in [0, 1, 2, 3, 5, 6])].index
# ordinalised_vars = var_type[ np.logical_and( var_type["type"].apply(lambda x: x in [0, 1, 2, 3, 5, 6]),
#                                              var_type["dataset_name"] == dataset_name )].index
reduced_ordinal_vars = [x for x in new_old_col_names.keys() if new_old_col_names[x] in ordinalised_vars]
vars_present = intersection(reduced_ordinal_vars, BES_reduced_with_na.columns)


for focal_var in var_list:
    print(focal_var)
    corr = BES_reduced_with_na[vars_present].corrwith(focal_df[focal_var])
    reduced_corr = corr.sort_values()

    high_corr = reduced_corr[ np.logical_or( reduced_corr< reduced_corr.quantile(thresh) ,
                                             reduced_corr> reduced_corr.quantile(1-thresh) ) ]
    diff_feature_candidate_list.extend(list(high_corr.index))
print(len(diff_feature_candidate_list))

euRefVote(W1->W2)_Leave the EU
euRefVote(W2->W3)_Leave the EU
euRefVote(W3->W4)_Leave the EU
euRefVote(W4->W6)_Leave the EU
euRefVote(W6->W7)_Leave the EU
euRefVote(W7->W8)_Leave the EU
euRefVote(W8->W9)_Leave the EU
euRefVote(W9->W10)_Leave the EU
euRefVote(W10->W11)_Leave the EU
euRefVote(W11->W12)_Leave the EU


  c = cov(x, y, rowvar)
  c *= np.true_divide(1, fact)


euRefVote(W12->W13)_Leave the EU
euRefVote(W13->W14)_Leave the EU
16310


In [20]:
# diff_feature_candidate_list

In [21]:
temp  = [x for x in diff_feature_candidate_list if re.search("W\d+",x)]
temp2 = [x for x in temp if not re.search("(W\d+){2}",x)]
temp3 = [re.match("([a-zA-Z0-9_]+?)(W\d+){1}",x).groups(0)[0] if re.match("([a-zA-Z0-9_]+)(W\d+){1}",x) else x for x in temp2 ]
imvar_list = list(set(temp3))
auto_gen = imvar_list

In [22]:
auto_gen

In [23]:
imvar_list = auto_gen

immigDiff = get_diff(imvar_list)

ptvGrn
selfOccOrgSizeLast
labPartnerGreen
responsibleCrimeConservatives
candChoice2
referendumContact_none
regretsIHaveAFewEU
partySiteOth
twitterInfof2f
ukipLookAfterMC
scotSNP
warmMC
discussantEthnicity1
partyIdSqueeze
generalElectionVote
localElectionVote
partyContactOthRemain
achieveReduceImmigUKIP
lrgreens
labPriorities_immig
leaveVoteUKIPCon
labPriorities_crime
winConstituencySNP
gor
ethno1
tryReduceImmigUKIP
discussantEthnicity2
winConstituencyLab
labPriorities2_immig
moreParl_2
welfarePreference
empathy9
moreParl_8
likeAlanJohnson
partyContactStrongerIn
childBenefitsRepatriate
al1
partyContactGreen_6
profile_religion
propMPWC
partyContactLD_6
euKnow4
resourceAccess3_99
conPartnerLD
warmEastern
responsibleImmigLibDems
mpSecondJob4
snpFear
certaintyUKRemain
immigLab
likeFarage
savings
noChanceCoalitionUKIP
allSourceIncome_6
warmGreece
resourceAccess3_4
motive1
certaintyEULab
cspl5
likeRudd
scotReferendumIntention
cspl1
EUIntegrationLD
earlyElecSupport
voteMethodEurope_3
selfOccSu

conPriorities2_crime
britRespectLaw
responsibleDebtLiberalDems
approveScotGovt
finlit2
labSupport
locus1
spendLeast3
ukipPriorities_immig
forcedSubjClass
tryReduceImmigLab
changeEducation
scotIndepJoinEU
groupempathy3
partyContactUKIP_2
ldTone
discussPolDaysD3
ukipHope
ukipNone
homeAmtb
wcVmc2
likeMiliband
resourceAccess1_3
achieveReduceImmigCon
certaintyUKLeave
immigEcon
partySiteGreen
redistGreen
propMPYoung
ukipPriorities2_none
likeLab
snpLookAfterWC
britishness
spendMost6
UKsovereigntyPost
payoffsizeB5
csplImproveStandards
discrimWhite
accessMedia
participation_6
discussantApprovalVoteName3
creditLeave_4
immigContributeTake
satDemEU
accessFixComputer
gayEquality
responsibleEconScottishGovt
education
relationshipName3
conGovTrust
pidPraiseGood
likeBatten
polForTheRich
britChristian
eesEUIntegrationLab
scotElectionVoteConst
discussPolDays
ptvGrnExp
ldAngry
radical
empathy7
handleMIICon
happyLeave
ldPriorities_immig
efficacyEU
grnHope
regSatisfaction
responsibleImmigScottishGovt
labUn

In [24]:
# lr_pos =  [x for x in BES_reduced_with_na.columns if re.match("lrCon|lrLab",x)]
# lr_pos.remove('lrConW10__Right')
# lr_pos

In [25]:
# immigDiff

In [26]:
# immigDiff = BES_reduced_with_na[lr_pos].diff(axis=1)[lr_pos[1::2]]

In [27]:
# lrLab-lrCon

In [28]:
# var_stub = "partyId"
# var_list = [x for x in BES_reduced_with_na.columns if re.match(var_stub+"W\d+_No - none",x)]
# # "partyIdStrength",
# var_stub_list = ["partyId", "partyIdSqueeze",  "generalElectionVote", "ashcroft", "generalElectionCertainty"]
# # base_list = [x for x in BES_reduced_with_na.columns if var_stub in x][0:len(var_list)]
# var_list

In [29]:
# party_id_none_delta = BES_reduced_with_na[var_list].astype('float').diff(axis=1).drop(var_list[0],axis=1)
# party_id_none_delta
# var_list = var_list[1:]
# var_list


In [30]:
# var_list = immigDiff.columns
# var_stub_list = ['lrLab','lrCon']

In [31]:
# var_list.remove('partydiffconlabW5__There is not much difference between them')
# var_list = var_list[-2:]

In [32]:
# BES_Panel['happyEULeaveW11'].notnull().sum()

In [33]:
# BES_reduced_with_na['euRefVoteW13_Leave the EU'] = BES_Panel["euRefVoteW13"].cat.codes
# BES_reduced_with_na['euRefVoteW13_Leave the EU'] = BES_reduced_with_na['euRefVoteW13_Leave the EU'].replace(2, np.nan).replace(3, np.nan).replace(-1, np.nan)

# BES_reduced_with_na['euRefVoteW11_Leave the EU'] = BES_Panel["euRefVoteW11"].cat.codes
# BES_reduced_with_na['euRefVoteW11_Leave the EU'] = BES_reduced_with_na['euRefVoteW11_Leave the EU'].replace(2, np.nan).replace(3, np.nan).replace(-1, np.nan)

# BES_reduced_with_na['generalElectionVoteW10__Conservative'] = (BES_Panel['generalElectionVoteW10']=="Conservative")
# BES_reduced_with_na.loc[BES_Panel['generalElectionVoteW10'].isnull(),'generalElectionVoteW10__Conservative'] = np.nan

# BES_reduced_with_na['generalElectionVoteW11__Conservative'] = BES_Panel['generalElectionVoteW11']=="Conservative"
# BES_reduced_with_na.loc[BES_Panel['generalElectionVoteW11'].isnull(),'generalElectionVoteW11__Conservative'] = np.nan

# BES_reduced_with_na['generalElectionVoteW13__Conservative'] = BES_Panel['generalElectionVoteW13']=="Conservative"
# BES_reduced_with_na.loc[BES_Panel['generalElectionVoteW11'].isnull(),'generalElectionVoteW11__Conservative'] = np.nan

In [1]:
# var_list

In [35]:
# var_list = ['euRefDoOverW13_Yes']
# mask = immigDiff["polAttention(W7->W8)__Pay a great deal of attention"].notnull()
# immigDiff["polAttention(W7->W8)__Pay a great deal of attention"] = np.random.randn(immigDiff["polAttention(W7->W8)__Pay a great deal of attention"].size)
# immigDiff["polAttention(W7->W8)__Pay a great deal of attention"][~mask] = np.nan

In [36]:
# [x for x in BES_reduced_with_na.columns if "euRefVote" in x]

In [62]:
def shap_outputs(shap_values, train, target_var, output_subfolder, dependence_plots = False, threshold = .1, min_features = 30):

    #################################
#     threshold = .1
#     min_features = 30
    global_shap_vals = np.abs(shap_values).mean(0)[:-1]
    n_top_features = max( sum(global_shap_vals[np.argsort(global_shap_vals)]>=threshold),
                          min_features )
#     if n_top_features <min_features:
#         n_top_features = min_features

    ##########################

    inds = np.argsort(global_shap_vals)[-n_top_features:]

    y_pos = np.arange(n_top_features)
    plt.figure(figsize=(16,10))
    plt.title(target_var);
    plt.barh(y_pos, global_shap_vals[inds], color="#1E88E5")
    plt.yticks(y_pos, train.columns[inds])
    plt.gca().spines['right'].set_visible(False)
    plt.gca().spines['top'].set_visible(False)
    plt.xlabel("mean SHAP value magnitude (change in log odds)")
    plt.gcf().set_size_inches(6, 4.5)

    plt.savefig( output_subfolder + "mean_impact" + ".png", bbox_inches='tight' )

    plt.show()

    ####################
    
    fig = plt.figure()
    fig.suptitle(target_var);
    shap.summary_plot( shap_values, train, max_display=n_top_features, plot_type='dot' );
    if shap_problem:
        summary_text = "summary_plot(approx)"
    else:
        summary_text = "summary_plot"
    
    fig.savefig( output_subfolder + summary_text + ".png", bbox_inches='tight' )
    
        ##################
    if dependence_plots:
        count = 0
        for name in train.columns[inds[::-1]]:
            fig = plt.figure(figsize = (16,10))
            fig.suptitle(target_var);
            shap.dependence_plot(name, shap_values, train)
            fig.savefig(output_subfolder + "featureNo "+str(count) + " " + name.replace("/","_").replace(">","")[0:30] + ".png", bbox_inches='tight')
            count = count + 1

In [63]:
memory_use(locs = locals().items())

           BES_reduced_with_na: 673.6MiB
                     BES_Panel: 410.7MiB
                         train:  31.4MiB
                       weights:   7.3MiB
                      var_type:   2.3MiB
                            _7: 804.2KiB
                           _10: 804.2KiB
                          mask: 603.2KiB
                sample_weights: 380.9KiB
                        target: 285.7KiB


In [16]:
def get_wave_no(var):
    return int( re.search( 'W(\d+)($|_|\))', target_var ).groups()[0] )

def get_other_wave_pattern(wave_no, max_wave, num_to_wave):
    wave_seq = [num_to_wave[x] for x in range(1,max_wave+1) if (x>wave_no) or (x<wave_no) ]
    return "("+"|".join(wave_seq) +")" +"([^0-9-]|$)"

def remove_sublist(lst, sublist):
    # remove sublist from lst
    return list(set(lst) - set(sublist))

In [65]:
# split data into train and test sets
# fit, validate and find overfitting limit
seed = 27
test_size = 0.33
minimum_sample = 100
early_stoppping_fraction = .1

objective = 'binary:logistic' # logistic regression for binary classification, output probability
eval_metric = 'error' # Binary classification error rate
# eval_metric = 'rmse' # Binary classification error rate
eval_metric = 'softmax'

alg = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=.85,
       colsample_bytree=.9, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, n_estimators=2000,
       n_jobs=3, objective='binary:logistic', random_state=seed,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       subsample=.75)


# alg = XGBRegressor(
#  learning_rate =0.01,
#  n_estimators= 200,
#  max_depth=3,
#  min_child_weight=1,
#  gamma=0,
#  subsample=0.75,
#  colsample_bytree=0.9,
#  colsample_bylevel=.85,
#  objective= objective,
#  scale_pos_weight=1.0,
#  reg_alpha=0,
#  reg_lambda=1,
#  njobs=3,
#  seed=seed**2)




In [66]:
def get_non_overfit_settings( train, target, alg, seed, early_stoppping_fraction, test_size, eval_metric, verbose = True,
                              sample_weights = None ):

    if sample_weights is not None:

        X_train, X_test, y_train, y_test = train_test_split( pd.concat( [train,sample_weights], axis=1 ),
                                                             target, test_size=test_size,
                                                             random_state=seed, stratify=round(target) )

        eval_set = [(X_test, y_test)]

        sample_weight = X_train[weight_var].values
        sample_weight_eval_set = X_test[weight_var].values
        X_train.drop(weight_var, axis=1, inplace=True)
        X_test.drop(weight_var, axis=1, inplace=True)

        alg.fit(X_train, y_train, eval_metric=eval_metric, 
                early_stopping_rounds = alg.get_params()['n_estimators']*early_stoppping_fraction,
                eval_set=eval_set, verbose=True, sample_weight = sample_weight)
        
    else:
        X_train, X_test, y_train, y_test = train_test_split( train,
                                                             target, test_size=test_size,
                                                             random_state=seed, stratify=round(target) )

        eval_set = [(X_test, y_test)]

        alg.fit(X_train, y_train, eval_metric=eval_metric, 
                early_stopping_rounds = alg.get_params()['n_estimators']*early_stoppping_fraction,
                eval_set=eval_set, verbose=True )        
        

    # make predictions for test data
    predictions = alg.predict(X_test)

    # evaluate predictions
    MSE = mean_squared_error(y_test, predictions)
    MAE = mean_absolute_error(y_test, predictions)
    EV = explained_variance_score(y_test, predictions)
    R2 = r2_score(y_test, predictions)

    print("MSE: %.2f, MAE: %.2f, EV: %.2f, R2: %.2f" % (MSE, MAE, EV, R2) )
    alg.set_params(n_estimators=alg.best_iteration)

In [67]:
BES_reduced_with_na[ search(BES_reduced_with_na, 'bestPM') ]['bestPMW12_Jeremy Corbyn'].value_counts()

0.0    16495
1.0     7879
Name: bestPMW12_Jeremy Corbyn, dtype: int64

In [68]:
Treatment = "bestPMW12"
var_stub = "bestPMW12"

var_list = [x for x in BES_reduced_with_na.columns if re.match(var_stub,x)]
var_stub_list = [var_stub]

var_list


# default no wave
def optional_mask_fn(wave=[]):
    if wave:
        return 1



def create_train(drop_other_waves):
    keep_list = BES_reduced_with_na.columns
    
    if drop_other_waves:
        # drop variables from other waves
        other_waves = get_other_wave_pattern(wave_no, max_wave, num_to_wave)
        keep_list = [x for x in keep_list if not re.search( other_waves, x )]
        
    # drop key variables
    keep_list = [x for x in keep_list if not any([var_stub in x for var_stub in var_stub_list])] 
    
    return BES_reduced_with_na[keep_list][mask]


def create_target(target_var):
    
    return BES_reduced_with_na[target_var]





In [69]:
# Treatment = "Sov_vs_imm"
# var_stub = "euMIISmallW\d_Sovereignty"

# var_list = [x for x in BES_reduced_with_na.columns if re.match(var_stub,x)]
# var_stub_list = ["euMIISmallW","EUMIICategory","small_mii_cat","mii_cat"]

# optional_mask = True
# var_list

# # default no wave
# def optional_mask_fn(wave=[]):
#     if wave:
#         return (BES_reduced_with_na['profile_eurefvote_Leave the EU']==1.0) & \
#             (BES_reduced_with_na[[ re.sub('\d+',str(wave),x) for x in ['euMIISmallW7_Immigration','euMIISmallW7_Sovereignty/EU bureaucracy'] ]]==1.0).any(axis=1)



# def create_train(drop_other_waves):
#     keep_list = BES_reduced_with_na.columns
    
#     if drop_other_waves:
#         # drop variables from other waves
#         other_waves = get_other_wave_pattern(wave_no, max_wave, num_to_wave)
#         keep_list = [x for x in keep_list if not re.search( other_waves, x )]
        
#     # drop key variables
#     keep_list = [x for x in keep_list if not any([var_stub in x for var_stub in var_stub_list])] 
    
#     return BES_reduced_with_na[keep_list][mask]


# def create_target(target_var):
    
#     return BES_reduced_with_na[target_var]




In [77]:
search(BES_Panel,"generalElectionVoteW12")

10

In [None]:
Treatment = "generalElectionVoteW12"
var_stub = "generalElectionVoteW12"

var_list = [x for x in BES_Panel.columns if re.match(var_stub,x)]
var_stub_list = [var_stub]

objective = 'multi:softmax'
eval_metric = 'mlogloss'

## SUPER SLOW!
## hits memory problem in shap
    
alg = XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=.85,
       colsample_bytree=.9, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=3, min_child_weight=1, n_estimators=20,
       n_jobs=3, objective=objective, random_state=seed,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       subsample=.75,
       num_class = len( BES_Panel["generalElectionVoteW12"].cat.categories )
                   )

# # default no wave
# def optional_mask_fn(wave=[]):
#     if wave:
#         return 1



# def create_train(drop_other_waves):
#     keep_list = BES_reduced_with_na.columns
    
#     if drop_other_waves:
#         # drop variables from other waves
#         other_waves = get_other_wave_pattern(wave_no, max_wave, num_to_wave)
#         keep_list = [x for x in keep_list if not re.search( other_waves, x )]
        
#     # drop key variables
#     keep_list = [x for x in keep_list if not any([var_stub in x for var_stub in var_stub_list])] 
    
#     return BES_reduced_with_na[keep_list][mask]


def create_target(target_var):
    
    return BES_Panel[target_var].cat.codes



var_list

In [93]:
treatment_subfolder = create_subdir(BES_output_folder,"xgb_classifier"+Treatment)
sample_wts = True
drop_other_waves = True



In [94]:
# for target_var,base_var in zip(var_list,base_list):
for target_var in var_list:
    
    wave_no = get_wave_no( target_var )
    weight_var = num_to_weight[wave_no]    
    print( target_var, wave_no )
    
    target = create_target(target_var)
    mask   = target.notnull()
    if optional_mask:
        mask = mask&optional_mask_fn(wave_no)
    target = target[mask]
    
    if sum(mask) < minimum_sample:
        continue
    
    train = create_train(drop_other_waves)
    
    output_subfolder = create_subdir(treatment_subfolder,target_var)

    
    if sample_wts:
        sample_weights = weights[weight_var][mask]
        print("missing vals in sample weights: "+ str( sample_weights.isnull().sum() ) )
        sample_weights = sample_weights.fillna(sample_weights.median())
    else:
        sample_weights = None
#         get_non_overfit_settings( train, target, alg, seed, early_stoppping_fraction, test_size, sample_weights )
#         # fit to full dataset at non-overfitting level
#         alg.fit(train, target, verbose = True, sample_weight = sample_weights)        
#     else:

    get_non_overfit_settings( train, target, alg, seed, early_stoppping_fraction, test_size, eval_metric, verbose = True,
                              sample_weights=sample_weights )
    # fit to full dataset at non-overfitting level
    alg.fit(train, target, verbose = True, sample_weight = sample_weights)


#################

    shap_values = shap.TreeExplainer(alg).shap_values(train);
    
    shap_problem = np.isnan(np.abs(shap_values).mean(0)).any()
    if shap_problem:
        print("hit problem!")
        shap_values = shap.TreeExplainer(alg).shap_values(train, approximate=True);

    shap_outputs(shap_values, train, target_var, output_subfolder, threshold = .1, min_features = 30)


generalElectionVoteW12 12
missing vals in sample weights: 34161
[0]	validation_0-mlogloss:2.35307
Will train until validation_0-mlogloss hasn't improved in 20.0 rounds.
[1]	validation_0-mlogloss:2.31034
[2]	validation_0-mlogloss:2.27045
[3]	validation_0-mlogloss:2.23148
[4]	validation_0-mlogloss:2.19526
[5]	validation_0-mlogloss:2.15918
[6]	validation_0-mlogloss:2.12544
[7]	validation_0-mlogloss:2.09217
[8]	validation_0-mlogloss:2.0605
[9]	validation_0-mlogloss:2.02951
[10]	validation_0-mlogloss:1.9997
[11]	validation_0-mlogloss:1.9709
[12]	validation_0-mlogloss:1.94299
[13]	validation_0-mlogloss:1.9158
[14]	validation_0-mlogloss:1.88984
[15]	validation_0-mlogloss:1.86442
[16]	validation_0-mlogloss:1.83994
[17]	validation_0-mlogloss:1.81601
[18]	validation_0-mlogloss:1.79271
[19]	validation_0-mlogloss:1.77006
[20]	validation_0-mlogloss:1.74792
[21]	validation_0-mlogloss:1.72634
[22]	validation_0-mlogloss:1.70551
[23]	validation_0-mlogloss:1.68499
[24]	validation_0-mlogloss:1.66488
[25]

MemoryError: 

In [None]:
eval_metric

In [None]:
# WORKS but makes page slow!
# shap.initjs()
# shap.force_plot(explainer.expected_value, shap_values[::130,:],train.iloc[::130,:])

In [70]:
# lr, mcw, ss

alg = XGBRegressor(
 learning_rate =0.05,
 n_estimators= 200,
 max_depth=3,
 min_child_weight=6,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.9,
 colsample_bylevel=.85,
 objective= 'reg:linear',
 scale_pos_weight=1.0,
 reg_alpha=1,
 reg_lambda=100,
 njobs=3,
 seed=seed**2)

X_train, X_test, y_train, y_test = train_test_split( pd.concat( [train,sample_weights], axis=1 ),
                                                     target, test_size=test_size,
                                                     random_state=seed, stratify=round(target) )

eval_set = [(X_test, y_test)]

sample_weight = X_train[weight_var].values
sample_weight_eval_set = X_test[weight_var].values
X_train.drop(weight_var, axis=1, inplace=True)
X_test.drop(weight_var, axis=1, inplace=True)

param_test1 ={
 'min_child_weight':[1,6],
 'learning_rate':[0.01, 0.02, 0.03, 0.04, 0.05, 0.06],
 'subsample':[.8, .75],
}

from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=3)

gsearch1 = GridSearchCV(estimator = alg,
                        param_grid = param_test1,
                        scoring='r2', #'r2'
                        n_jobs=4, 
                        iid=False,
                        cv=skf,
                        verbose=True)

gsearch1.fit(X_train, y_train, eval_metric='rmse',
        early_stopping_rounds=alg.get_params()['n_estimators']*.1, eval_set=eval_set,
        verbose=True, sample_weight= sample_weight)

display(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)


Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done  72 out of  72 | elapsed:  2.2min finished


[0]	validation_0-rmse:0.498485
Will train until validation_0-rmse hasn't improved in 20.0 rounds.
[1]	validation_0-rmse:0.497161
[2]	validation_0-rmse:0.495783
[3]	validation_0-rmse:0.494419
[4]	validation_0-rmse:0.493166
[5]	validation_0-rmse:0.491863
[6]	validation_0-rmse:0.490867
[7]	validation_0-rmse:0.489868
[8]	validation_0-rmse:0.488858
[9]	validation_0-rmse:0.487766
[10]	validation_0-rmse:0.486907
[11]	validation_0-rmse:0.485976
[12]	validation_0-rmse:0.484926
[13]	validation_0-rmse:0.484175
[14]	validation_0-rmse:0.483328
[15]	validation_0-rmse:0.482605
[16]	validation_0-rmse:0.481957
[17]	validation_0-rmse:0.481268
[18]	validation_0-rmse:0.480465
[19]	validation_0-rmse:0.479854
[20]	validation_0-rmse:0.479218
[21]	validation_0-rmse:0.478535
[22]	validation_0-rmse:0.478049
[23]	validation_0-rmse:0.477426
[24]	validation_0-rmse:0.476776
[25]	validation_0-rmse:0.476157
[26]	validation_0-rmse:0.475672
[27]	validation_0-rmse:0.475031
[28]	validation_0-rmse:0.474474
[29]	validation



{'mean_fit_time': array([7.24741475, 7.66377171, 7.48009451, 7.4984289 , 7.19474483,
        7.60276818, 7.465427  , 6.9643983 , 6.75371957, 7.16907676,
        6.95339775, 7.10940663, 6.89639449, 7.12740787, 6.88372715,
        6.81572318, 6.73638511, 6.92806284, 6.7977221 , 6.81238969,
        6.71338399, 6.26802516, 7.05373669, 6.00767692]),
 'std_fit_time': array([0.08402072, 0.35027736, 0.21193084, 0.15789704, 0.02776657,
        0.21301654, 0.62516667, 0.0700182 , 0.02469058, 0.20789381,
        0.06497043, 0.18358787, 0.09509827, 0.10044485, 0.11109888,
        0.05700804, 0.06207343, 0.01515205, 0.06600533, 0.08703382,
        0.04699899, 1.04728152, 0.37625942, 1.05212988]),
 'mean_score_time': array([0.03733524, 0.04100227, 0.03333529, 0.03200181, 0.03533546,
        0.03500199, 0.03266859, 0.03266859, 0.03266851, 0.03333529,
        0.03433537, 0.03200189, 0.03266851, 0.03200181, 0.03300174,
        0.03266851, 0.03500215, 0.03333529, 0.03300182, 0.03500199,
        0.032335

{'learning_rate': 0.04, 'min_child_weight': 1, 'subsample': 0.75}

0.1641235151483538

In [62]:
### xgboost_tuner
# https://github.com/cwerner87/xgboost-tuner

from xgboost_tuner.tuner import tune_xgb_params

# train = X.values
# label = rounded_targets

# metric_sklearn = ‘neg_mean_squared_error’ or 'remse'
# estimator_cls = xgb.XGBClassifier, xgb.XGBRegressor


# Tune the parameters incrementally and limit the range for colsample_bytree and subsample
best_params, history = tune_xgb_params(
    cv_folds = 3,
    label = target,
    metric_sklearn = 'neg_mean_squared_error',
    metric_xgb = "rmse",
    n_jobs = 4,
    objective = 'reg:linear',
    random_state = seed,
    strategy = 'incremental',
    train = train,
    init_colsample_bytree = 0.9,
    init_gamma = 0.0,
    init_max_depth = 3,
    init_min_child_weight  = 1,
    init_subsample = 0.75,
    learning_rates = [0.03,.04,.05],
)

[0]	train-rmse:0.497144+6.08331e-05	test-rmse:0.497818+0.000292193
[1]	train-rmse:0.494301+0.000139045	test-rmse:0.49545+0.000314938
[2]	train-rmse:0.491724+0.000204943	test-rmse:0.49354+0.000512929
[3]	train-rmse:0.489185+0.000241751	test-rmse:0.491277+0.000471619
[4]	train-rmse:0.486836+0.000259562	test-rmse:0.489383+0.000607157
[5]	train-rmse:0.484573+0.000342245	test-rmse:0.487802+0.000863712
[6]	train-rmse:0.482502+0.000399881	test-rmse:0.486272+0.00102169
[7]	train-rmse:0.480602+0.000548731	test-rmse:0.484854+0.00106231
[8]	train-rmse:0.478689+0.000549005	test-rmse:0.483417+0.00107709
[9]	train-rmse:0.476885+0.00051608	test-rmse:0.482045+0.00128936
[10]	train-rmse:0.475152+0.000561471	test-rmse:0.480761+0.00146676
[11]	train-rmse:0.473531+0.000611852	test-rmse:0.479591+0.00152938
[12]	train-rmse:0.471956+0.000548956	test-rmse:0.478455+0.00171126
[13]	train-rmse:0.470417+0.000558394	test-rmse:0.477304+0.00185039
[14]	train-rmse:0.468933+0.000569547	test-rmse:0.476398+0.0020622
[15

[123]	train-rmse:0.40169+0.000634805	test-rmse:0.449926+0.00163249
[124]	train-rmse:0.401347+0.000666019	test-rmse:0.449923+0.00172488
[125]	train-rmse:0.401054+0.00062853	test-rmse:0.449865+0.00174506
[126]	train-rmse:0.400683+0.000615499	test-rmse:0.449781+0.00166399
[127]	train-rmse:0.400331+0.000610755	test-rmse:0.449812+0.00171421
[128]	train-rmse:0.399996+0.000543265	test-rmse:0.449642+0.00173735
[129]	train-rmse:0.39967+0.000549646	test-rmse:0.44964+0.0016838
[130]	train-rmse:0.399386+0.000561967	test-rmse:0.449663+0.00170887
[131]	train-rmse:0.399047+0.000532703	test-rmse:0.449725+0.0017258
[132]	train-rmse:0.398685+0.000528528	test-rmse:0.449731+0.00170655
[133]	train-rmse:0.398279+0.000501243	test-rmse:0.449646+0.00169244
[134]	train-rmse:0.398017+0.000542756	test-rmse:0.44961+0.0016682
[135]	train-rmse:0.397719+0.000534865	test-rmse:0.44953+0.00168958
[136]	train-rmse:0.397376+0.000521864	test-rmse:0.449488+0.00171812
[137]	train-rmse:0.397096+0.000522962	test-rmse:0.449469+

[245]	train-rmse:0.367098+0.00102075	test-rmse:0.448558+0.00145608
[246]	train-rmse:0.366887+0.000978461	test-rmse:0.448553+0.00149559
[247]	train-rmse:0.366601+0.000967354	test-rmse:0.448554+0.00146326
[248]	train-rmse:0.366416+0.00100973	test-rmse:0.448602+0.00148216
[249]	train-rmse:0.36615+0.000991049	test-rmse:0.448502+0.00146387
[250]	train-rmse:0.365911+0.00100072	test-rmse:0.448503+0.00155194
[251]	train-rmse:0.365682+0.00104839	test-rmse:0.448452+0.00153608
[252]	train-rmse:0.365444+0.0010256	test-rmse:0.448457+0.00161004
[253]	train-rmse:0.3652+0.00102778	test-rmse:0.448488+0.0015847
[254]	train-rmse:0.365001+0.00102455	test-rmse:0.448475+0.00152963
[255]	train-rmse:0.364799+0.00108571	test-rmse:0.448441+0.00151006
[256]	train-rmse:0.364483+0.00109017	test-rmse:0.448451+0.00154492
[257]	train-rmse:0.364224+0.00113293	test-rmse:0.448475+0.0015106
[258]	train-rmse:0.364014+0.00111902	test-rmse:0.448441+0.00155046
[259]	train-rmse:0.363793+0.00107987	test-rmse:0.448453+0.0015396

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   29.8s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   43.8s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  3.1min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  5.6min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  7.4min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:  9.1min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed: 11.6min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed: 14.1min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed: 17.7min
[Parallel(n_jobs=4)]: Done 144 out of 144 | elapsed: 18.9min finished


Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   28.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   44.0s
[Parallel(n_jobs=4)]: Done  13 out of  18 | elapsed:   57.8s remaining:   22.2s
[Parallel(n_jobs=4)]: Done  15 out of  18 | elapsed:   59.2s remaining:   11.8s
[Parallel(n_jobs=4)]: Done  18 out of  18 | elapsed:  1.2min finished


Fitting 3 folds for each of 25 candidates, totalling 75 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   19.7s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   30.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:   50.6s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:  4.2min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   28.9s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   43.8s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  3.5min finished


Fitting 3 folds for each of 19 candidates, totalling 57 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   28.4s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   44.1s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.6min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.1min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done  57 out of  57 | elapsed:  3.7min finished


[0]	train-rmse:0.498157+6.62437e-05	test-rmse:0.498552+0.000260671
[1]	train-rmse:0.496308+0.000139333	test-rmse:0.497111+0.000214739
[2]	train-rmse:0.494537+0.000189839	test-rmse:0.495736+0.000286271
[3]	train-rmse:0.492838+0.000246085	test-rmse:0.494429+0.000243608
[4]	train-rmse:0.491208+0.000339918	test-rmse:0.49316+0.00037361
[5]	train-rmse:0.489643+0.000400221	test-rmse:0.492044+0.00058393
[6]	train-rmse:0.488171+0.000422539	test-rmse:0.490813+0.000763348
[7]	train-rmse:0.486682+0.00049199	test-rmse:0.489705+0.000784951
[8]	train-rmse:0.485299+0.000520992	test-rmse:0.488625+0.000876746
[9]	train-rmse:0.484014+0.000580476	test-rmse:0.487753+0.000886955
[10]	train-rmse:0.482741+0.000691065	test-rmse:0.486804+0.000894354
[11]	train-rmse:0.481475+0.000789651	test-rmse:0.485834+0.00100036
[12]	train-rmse:0.480278+0.000779227	test-rmse:0.484959+0.00105789
[13]	train-rmse:0.479076+0.000735585	test-rmse:0.484016+0.00119329
[14]	train-rmse:0.477924+0.000717794	test-rmse:0.483189+0.0013235

[123]	train-rmse:0.424824+0.000744085	test-rmse:0.453092+0.00226002
[124]	train-rmse:0.424586+0.000738601	test-rmse:0.453038+0.00234002
[125]	train-rmse:0.424309+0.000711751	test-rmse:0.452937+0.00231692
[126]	train-rmse:0.424045+0.000724641	test-rmse:0.45286+0.00231889
[127]	train-rmse:0.423793+0.000715731	test-rmse:0.452738+0.00232186
[128]	train-rmse:0.423545+0.000714844	test-rmse:0.452722+0.00234035
[129]	train-rmse:0.423322+0.000728208	test-rmse:0.452656+0.00233479
[130]	train-rmse:0.423044+0.000711179	test-rmse:0.452686+0.00232227
[131]	train-rmse:0.422799+0.000759198	test-rmse:0.452674+0.00237064
[132]	train-rmse:0.422588+0.000752548	test-rmse:0.452588+0.00237088
[133]	train-rmse:0.422388+0.000777233	test-rmse:0.452483+0.00239125
[134]	train-rmse:0.42217+0.00078747	test-rmse:0.452401+0.00235824
[135]	train-rmse:0.42193+0.000774783	test-rmse:0.452282+0.00239326
[136]	train-rmse:0.421656+0.000779427	test-rmse:0.452186+0.00240794
[137]	train-rmse:0.421405+0.000769713	test-rmse:0.45

[245]	train-rmse:0.400128+0.000929678	test-rmse:0.448847+0.00235607
[246]	train-rmse:0.399971+0.000944955	test-rmse:0.44885+0.00238973
[247]	train-rmse:0.399823+0.000950411	test-rmse:0.44884+0.00236343
[248]	train-rmse:0.399659+0.000948041	test-rmse:0.448782+0.00238079
[249]	train-rmse:0.399495+0.000950573	test-rmse:0.44877+0.00242194
[250]	train-rmse:0.399367+0.000958445	test-rmse:0.448765+0.00239142
[251]	train-rmse:0.399189+0.000949588	test-rmse:0.4487+0.00238233
[252]	train-rmse:0.398997+0.000928609	test-rmse:0.448669+0.00239595
[253]	train-rmse:0.398813+0.000926384	test-rmse:0.448654+0.00236794
[254]	train-rmse:0.398653+0.000944992	test-rmse:0.44865+0.00238307
[255]	train-rmse:0.398501+0.000956406	test-rmse:0.448649+0.00243648
[256]	train-rmse:0.39834+0.000957571	test-rmse:0.448654+0.00243947
[257]	train-rmse:0.398183+0.000959916	test-rmse:0.448657+0.00240814
[258]	train-rmse:0.398003+0.000967334	test-rmse:0.448603+0.00238841
[259]	train-rmse:0.397856+0.00095089	test-rmse:0.448613

[55]	train-rmse:0.45573+0.000646363	test-rmse:0.468308+0.00225716
[56]	train-rmse:0.455271+0.000661018	test-rmse:0.468041+0.00228327
[57]	train-rmse:0.454817+0.000690345	test-rmse:0.467697+0.00226237
[58]	train-rmse:0.454363+0.00067649	test-rmse:0.467474+0.00227711
[59]	train-rmse:0.453919+0.000658061	test-rmse:0.467187+0.00221796
[60]	train-rmse:0.453502+0.000639707	test-rmse:0.46695+0.00217845
[61]	train-rmse:0.453085+0.00062786	test-rmse:0.46669+0.00213872
[62]	train-rmse:0.452655+0.000597941	test-rmse:0.46642+0.0021184
[63]	train-rmse:0.452229+0.000613649	test-rmse:0.466204+0.00217252
[64]	train-rmse:0.451846+0.000588965	test-rmse:0.465984+0.00219332
[65]	train-rmse:0.451438+0.000558935	test-rmse:0.465739+0.0021568
[66]	train-rmse:0.45103+0.000570102	test-rmse:0.465532+0.00215924
[67]	train-rmse:0.450633+0.000559718	test-rmse:0.465271+0.002108
[68]	train-rmse:0.450247+0.000556738	test-rmse:0.465035+0.00206711
[69]	train-rmse:0.449868+0.000572536	test-rmse:0.464869+0.00209103
[70]	t

[178]	train-rmse:0.422117+0.000811495	test-rmse:0.452676+0.0024861
[179]	train-rmse:0.421931+0.000830468	test-rmse:0.452643+0.00247489
[180]	train-rmse:0.421746+0.00081491	test-rmse:0.452608+0.00245625
[181]	train-rmse:0.421546+0.000818271	test-rmse:0.452584+0.00247029
[182]	train-rmse:0.421373+0.000834138	test-rmse:0.452541+0.00245114
[183]	train-rmse:0.421189+0.000849398	test-rmse:0.452505+0.00244771
[184]	train-rmse:0.421001+0.000841272	test-rmse:0.452437+0.0024131
[185]	train-rmse:0.420808+0.000815937	test-rmse:0.452401+0.00239698
[186]	train-rmse:0.420629+0.000835666	test-rmse:0.452334+0.00233723
[187]	train-rmse:0.420472+0.000845623	test-rmse:0.452266+0.00230199
[188]	train-rmse:0.420284+0.000822612	test-rmse:0.452211+0.00229531
[189]	train-rmse:0.42009+0.000806328	test-rmse:0.452168+0.00226908
[190]	train-rmse:0.419924+0.000804143	test-rmse:0.452133+0.00225347
[191]	train-rmse:0.419759+0.000805018	test-rmse:0.452118+0.00227195
[192]	train-rmse:0.419579+0.000808855	test-rmse:0.45

[300]	train-rmse:0.403309+0.00094234	test-rmse:0.449196+0.00209925
[301]	train-rmse:0.403172+0.000936532	test-rmse:0.449165+0.00209064
[302]	train-rmse:0.403051+0.000947898	test-rmse:0.449172+0.00209318
[303]	train-rmse:0.402937+0.000953179	test-rmse:0.449172+0.00210004
[304]	train-rmse:0.402805+0.000960677	test-rmse:0.449144+0.00210361
[305]	train-rmse:0.402667+0.000962915	test-rmse:0.44912+0.00206875
[306]	train-rmse:0.402548+0.000957962	test-rmse:0.449113+0.00205572
[307]	train-rmse:0.402435+0.000958443	test-rmse:0.449108+0.00202527
[308]	train-rmse:0.402312+0.000952405	test-rmse:0.449115+0.00200853
[309]	train-rmse:0.402198+0.000956754	test-rmse:0.449108+0.00200598
[310]	train-rmse:0.402079+0.000975488	test-rmse:0.449077+0.00203027
[311]	train-rmse:0.401944+0.000975816	test-rmse:0.449061+0.002053
[312]	train-rmse:0.401804+0.000973785	test-rmse:0.449048+0.00208862
[313]	train-rmse:0.401686+0.000978032	test-rmse:0.44905+0.00212175
[314]	train-rmse:0.401561+0.000974223	test-rmse:0.449

[423]	train-rmse:0.388766+0.00106957	test-rmse:0.448528+0.00218765
[424]	train-rmse:0.38866+0.0010662	test-rmse:0.448532+0.00223418
[425]	train-rmse:0.388579+0.0010435	test-rmse:0.448514+0.00221718
[426]	train-rmse:0.388475+0.00103704	test-rmse:0.448504+0.00223265
[427]	train-rmse:0.388349+0.0010372	test-rmse:0.448495+0.00224538
[428]	train-rmse:0.388247+0.00103996	test-rmse:0.448496+0.00222342
[429]	train-rmse:0.388148+0.00101639	test-rmse:0.448497+0.00222984
[430]	train-rmse:0.388031+0.0010204	test-rmse:0.448515+0.00222281
[431]	train-rmse:0.387928+0.00104092	test-rmse:0.448497+0.00220734
[432]	train-rmse:0.387828+0.00103538	test-rmse:0.448519+0.00218266
[433]	train-rmse:0.387727+0.00102493	test-rmse:0.448525+0.00215003
[434]	train-rmse:0.387634+0.00102808	test-rmse:0.448531+0.00211851
[435]	train-rmse:0.387526+0.00101835	test-rmse:0.448549+0.00211487
[436]	train-rmse:0.387419+0.00100883	test-rmse:0.448553+0.00210417
[437]	train-rmse:0.387317+0.00102199	test-rmse:0.448559+0.00209304


In [63]:
best_params

{'colsample_bytree': 0.9,
 'gamma': 0.0,
 'learning_rate': 0.05,
 'max_depth': 3,
 'min_child_weight': 6,
 'nthread': 4,
 'objective': 'reg:linear',
 'scale_pos_weight': 1,
 'subsample': 0.8,
 'random_state': 27,
 'n_estimators': 263,
 'reg_alpha': 1,
 'reg_lambda': 100}

In [72]:
history

In [None]:
# lr, mcw, ss


alg = XGBRegressor(
 learning_rate =0.05,
 n_estimators= 200,
 max_depth=3,
 min_child_weight=6,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.9,
 colsample_bylevel=.85,
 objective= 'reg:linear',
 scale_pos_weight=1.0,
 reg_alpha=1,
 reg_lambda=100,
 njobs=3,
 seed=seed**2)

In [14]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
def score(params):
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    dtrain = xgboost.DMatrix(X_train, label=y_train)
    dvalid = xgboost.DMatrix(X_val, label=y_val)
    watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
    gbm_model = xgboost.train(params, 
                              dtrain, 
                              num_round,
                              evals=watchlist,
                              verbose_eval=False)
    predictions = gbm_model.predict(dvalid, ntree_limit=gbm_model.best_iteration)
    print(gini_normalized(y_val, np.array(predictions)))
    loss = 1 - gini_normalized(y_val, np.array(predictions))
    return {'loss': loss, 'status': STATUS_OK}
 

In [29]:
def score(space):
    seed = 27
    test_size = 0.33
#     minimum_sample = 100
    early_stoppping_fraction = .1



    alg = XGBRegressor(
     learning_rate =space['learning_rate'],
     n_estimators= space['n_estimators'],
     max_depth=space['max_depth'],
     min_child_weight=space['min_child_weight'],
     gamma=space['gamma'],
     subsample=space['subsample'],
     colsample_bytree=space['colsample_bytree'],
#      colsample_bylevel=.85,
     objective= 'reg:linear',
     scale_pos_weight=1.0,
     reg_alpha=0,
     reg_lambda=1,
     njobs=3,
     seed=seed**2)

    get_non_overfit_settings( train, target, alg, seed, early_stoppping_fraction, test_size, sample_weights )
    
    print("SCORE:", alg.best_score)

    return{'loss':alg.best_score, 'status': STATUS_OK }


In [None]:
def optimize(evals, cores, trials, optimizer=tpe.suggest, random_state=0):

    space = {
        'n_estimators': hp.quniform('n_estimators', 100, 1000, 1),
        'learning_rate': hp.quniform('learning_rate', 0.025, 0.5, 0.025),
        'max_depth':  hp.choice('max_depth', np.arange(1, 14, dtype=int)),
        'min_child_weight': hp.quniform('min_child_weight', 1, 6, 1),
        'subsample': hp.quniform('subsample', 0.5, 1, 0.05),
        'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
        'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
    }    
    
    best = fmin(score, space, algo=tpe.suggest, max_evals=evals, trials = trials)
    return best

In [None]:
trials = Trials()
cores = 4
n = 100
start = time.time()
best_param = optimize(evals = n,
                      optimizer=tpe.suggest,
                      cores = cores,
                      trials = trials)
print("------------------------------------")
print("The best hyperparameters are: ", "\n")
print(best_param)
end = time.time()
print('Time elapsed to optimize {0} executions: {1}'.format(n,end - start))
s

In [None]:
#### add tensorflow and neural net explainers!

In [None]:
# needs BES_reduced

In [None]:
import tensorflow as tf
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.callbacks import EarlyStopping
import time

In [None]:
# warnings - don't know why, online comments suggest just blocking
tf.logging.set_verbosity(tf.logging.ERROR)

In [None]:
drop_vars = []
drop_vars.extend( [x for x in BES_reduced.columns if "eurefvote" in x.lower() ] )
drop_vars.extend( [x for x in BES_reduced.columns if "euid_" in x.lower() ] )
drop_vars.extend( [x for x in BES_reduced.columns if "happyeuleave" in x.lower() ] )
drop_vars                 

In [None]:
# [x for x in BES_reduced.columns if "euref" in x.lower()]
y = BES_reduced['euRefVote_Leave the EU']
X = BES_reduced.drop(drop_vars,axis=1)

In [None]:
from tensorflow.python.keras.utils import to_categorical
y = to_categorical(y)

In [None]:
start_time = time.time()

early_stopping_monitor = EarlyStopping(patience=3)

n_neurons_layer = 500

# Create the model: model
model = Sequential()

# Add the first hidden layer
input_shape = (X.shape[1],)
model.add( Dense(n_neurons_layer, activation='relu', input_shape = input_shape) )

# Add the second hidden layer
model.add( Dense(n_neurons_layer, activation='relu') )

# Add the third hidden layer
model.add( Dense(n_neurons_layer, activation='relu') )

# Add the fourth hidden layer
model.add( Dense(n_neurons_layer, activation='relu') )

# Add the output layer
no_of_outputs = len(np.unique(y))
model.add( Dense( no_of_outputs , activation='softmax') )

# Compile the model
model.compile( optimizer = 'adam',
               loss = 'categorical_crossentropy',
               metrics=['accuracy'])

# Fit the model
model_training = model.fit(X, y, epochs=15, validation_split=0.2, callbacks=[early_stopping_monitor], verbose=False)

# Create the plot
plt.plot(model_training.history['val_loss'], 'r')
plt.xlabel('Epochs')
plt.ylabel('Validation score')
plt.show()

elapsed_time = time.time() - start_time
print(elapsed_time)

In [None]:
import shap
from shap.explainers.deep import DeepExplainer
from shap.explainers.gradient import GradientExplainer
# e = DeepExplainer(model,X)

In [None]:
tf.__version__

In [None]:
%%time
background = X.loc[np.random.choice(X.shape[0], 200, replace=False)]

# e = shap.DeepExplainer((model.layers[0].input, model.layers[-1].output), background)
e = shap.DeepExplainer(model, background)
shap_values = e.shap_values(background.values)
# 100: 1.2s
# 500: 16.4s
# 1000: 66s

# same size but larger neural net -> 7 mins

In [None]:
shap.summary_plot(shap_values[0], background, plot_type='dot')

In [29]:
eurefcorr = BES_reduced_with_na.corrwith(BES_reduced_with_na["profile_eurefvote_Leave the EU"])
eurefcorr.sort_values()

euRefFinalPostW9_Yes                                    -0.473737
euLawsLevelW8__Strongly agree                           -0.427015
bestPMW12_Jeremy Corbyn                                 -0.426421
euUKRichW8__Strongly agree                              -0.422679
satDemEUW9__Very satisfied                              -0.421705
creditLeave_6W9_Yes                                     -0.418926
europeannessW9__Very strongly European                  -0.414238
europeannessW11__Very strongly European                 -0.406890
europeannessW8__Very strongly European                  -0.405208
immigCulturalW8__Enriches cultural life                 -0.393074
immigEconW13__Good for economy                          -0.391046
europeannessW10__Very strongly European                 -0.386257
europeannessW14__Very strongly European                 -0.383990
approveEUW9__Strongly approve                           -0.382175
satDemEUW8__Very satisfied                              -0.381733
euUKRichW7

In [32]:
eurefcorr.drop([x for x in eurefcorr.index if "euRef" in x]).sort_values()

euLawsLevelW8__Strongly agree                           -0.427015
bestPMW12_Jeremy Corbyn                                 -0.426421
euUKRichW8__Strongly agree                              -0.422679
satDemEUW9__Very satisfied                              -0.421705
creditLeave_6W9_Yes                                     -0.418926
europeannessW9__Very strongly European                  -0.414238
europeannessW11__Very strongly European                 -0.406890
europeannessW8__Very strongly European                  -0.405208
immigCulturalW8__Enriches cultural life                 -0.393074
immigEconW13__Good for economy                          -0.391046
europeannessW10__Very strongly European                 -0.386257
europeannessW14__Very strongly European                 -0.383990
approveEUW9__Strongly approve                           -0.382175
satDemEUW8__Very satisfied                              -0.381733
euUKRichW7__Strongly agree                              -0.380728
socialIden

In [33]:
%%time
Sovcorr = BES_reduced_with_na.corrwith(BES_reduced_with_na["euLawsLevelW8__Strongly agree"])
Sovcorr.sort_values()


EUIntegrationSelfW8__Protect our independence          -0.659486
euParlOverRideW8__Strongly agree                       -0.644185
UKsovereigntyW8__Strongly agree                        -0.612591
euUndermineIdentityW8__Strongly agree                  -0.599770
ukCoopMovementW8__Bad for Britain                      -0.593671
effectsEUEconW8__Much better                           -0.572376
happyEULeaveW9__Extremely happy                        -0.566978
EUIntegrationSelfW9__Protect our independence          -0.565935
ukCoopWorkersW8__Bad for Britain                       -0.561525
likeUKIPW8__Strongly like                              -0.546492
effectsEUNHSW8__Much better                            -0.534426
euRedTapeW8__Strongly agree                            -0.534305
effectsEUWorkersW8__Much better                        -0.531734
euIDW12_The leave side                                 -0.522181
euLeaveVoiceW8__Much higher                            -0.521240
effectsEUTradeW8__Much hi

In [35]:
Sovcorr = BES_reduced_with_na.corrwith(BES_reduced_with_na["UKsovereigntyW8__Strongly agree"])
Sovcorr.sort_values()
# UKsovereignty "How much do you agree or disagree that the EU has… Undermined the powers of the UK Parliament"
# immigSelf "Some people think that the UK should allow *many more* immigrants to come to the UK to
#           live and others think that the UK should allow *many fewer* immigrants. Where would you
#           place yourself on this scale?"


echrW8__Strongly agree                                 -0.623543
euLawsLevelW8__Strongly agree                          -0.612591
euUKRichW8__Strongly agree                             -0.601917
immigSelfW8__Allow many more                           -0.578368
approveEUW9__Strongly approve                          -0.565704
euPreventWarW8__Strongly agree                         -0.549970
euCloserW8__Strongly agree                             -0.542856
immigCulturalW8__Enriches cultural life                -0.537796
immigEconW8__Good for economy                          -0.520597
euLeaveBigBusinessW8__Much more likely                 -0.520386
asylumMoreW8__Many more                                -0.504113
euMoreW8__Many more                                    -0.502763
immigSelfW9__Allow many more                           -0.502158
europeannessW8__Very strongly European                 -0.496763
satDemEUW8__Very satisfied                             -0.490473
effectsEUUnemploymentW8__

In [9]:
welshness_cols = [x for x in BES_Panel.columns if "welshness" in x]


In [10]:
[x for x in BES_Panel.columns if "languageSkillsWelsh" in x]

['languageSkillsWelshW8']

In [11]:
welshness_cols = [x for x in BES_Panel.columns if "welshness" in x]
latest_welshness = BES_Panel[welshness_cols].fillna(method="ffill",axis=1)[welshness_cols[-1]]
latest_welshness = latest_welshness.astype(
                    pd.api.types.CategoricalDtype(categories = BES_Panel[welshness_cols[0]].cat.categories) )

In [12]:
englishness_cols = [x for x in BES_Panel.columns if "englishness" in x]
latest_englishness = BES_Panel[englishness_cols].fillna(method="ffill",axis=1)[englishness_cols[-1]]
latest_englishness = latest_englishness.astype(
                    pd.api.types.CategoricalDtype(categories = BES_Panel[englishness_cols[0]].cat.categories) )

In [13]:
britishness_cols = [x for x in BES_Panel.columns if "britishness" in x]
latest_britishness = BES_Panel[britishness_cols].fillna(method="ffill",axis=1)[britishness_cols[-1]]
latest_britishness = latest_britishness.astype(
                    pd.api.types.CategoricalDtype(categories = BES_Panel[britishness_cols[0]].cat.categories) )

In [16]:
euRefVote_cols = [x for x in BES_Panel.columns if re.match("euRefVoteW\d", x)]
# euRefVote_cols
latest_euRefVote = BES_Panel[euRefVote_cols].fillna(method="ffill",axis=1)[euRefVote_cols[-1]]
latest_euRefVote = latest_euRefVote.astype(
                    pd.api.types.CategoricalDtype(categories = BES_Panel[euRefVote_cols[0]].cat.categories) )

49745

In [76]:
BES_Panel['countryOfBirth'].value_counts()

England                                 25181
Scotland                                 5993
Wales                                    2821
Other: European Union member country      703
Other: Rest of world                      694
Other: Commonwealth member country        598
Prefer not to answer                      268
Republic of Ireland                       193
Northern Ireland                          184
Name: countryOfBirth, dtype: int64

In [78]:
WelshResident = (BES_Panel['country']=="Wales")
ScottishResident = (BES_Panel['country']=="Scotland")
EnglishResident = (BES_Panel['country']=="England")
BornInWales = (BES_Panel['countryOfBirth']=="Wales")
BornInWales.name = "BornInWales"
BornInEngland = (BES_Panel['countryOfBirth']=="England")
BornInEngland.name = "BornInEngland"
BornInScotland = (BES_Panel['countryOfBirth']=="Scotland")
BornInScotland.name = "BornInScotland"
BornInBritain = BornInWales|BornInEngland|BornInScotland
BornInBritain.name = "BornInBritain"

print("WelshResident:",WelshResident.sum(),
      "BornInWales:",  BornInWales.sum(),
      "BornInEngland:",BornInEngland.sum(),
     "BornInBritain:",BornInBritain.sum())

WelshResident: 5447 BornInWales: 2821 BornInEngland: 25181 BornInBritain: 33995


In [83]:
pd.crosstab( BES_Panel['euRefVoteW14'][WelshResident], BornInWales[WelshResident],
             values= BES_Panel['wt_new_W14'][WelshResident], aggfunc=sum,
             normalize='columns')*100

BornInWales,False,True
euRefVoteW14,Unnamed: 1_level_1,Unnamed: 2_level_1
Stay/remain in the EU,44.949271,48.755024
Leave the EU,42.775066,43.978919
I would/will not vote,6.19621,2.673487
Don't know,6.079453,4.592571


In [74]:
pd.crosstab( BES_Panel['profile_eurefvote'][WelshResident], BornInWales[WelshResident],
             values= BES_Panel['wt_new_W8'][WelshResident], aggfunc=sum, normalize='columns')*100

BornInWales,False,True
profile_eurefvote,Unnamed: 1_level_1,Unnamed: 2_level_1
Stay/remain in the EU,49.731851,51.565513
Leave the EU,49.303835,47.535946
Don't know,0.964314,0.898541


In [106]:
pd.crosstab( BES_Panel['profile_eurefvote'][WelshResident], BornInEngland[WelshResident],
             values= BES_Panel['wt_new_W8'][WelshResident], aggfunc=sum, normalize='columns')*100

BornInEngland,False,True
profile_eurefvote,Unnamed: 1_level_1,Unnamed: 2_level_1
Stay/remain in the EU,50.74742,49.979228
Leave the EU,48.269109,49.197794
Don't know,0.983472,0.822979


In [94]:

# BES_Panel[welshness_cols[0]]
# from pandas.api.types import CategoricalDtype
# t = CategoricalDtype(categories=BES_Panel[welshness_cols[0]].cat.categories, ordered=True)
# latest_welshness.astype(t)
# .astype(pd.api.types.CategoricalDtype(categories = ["a", "b", "c"]))

In [97]:
latest_welshness[WelshResident].value_counts()

Very strongly Welsh    2058
Not at all Welsh        927
6                       702
5                       539
4                       507
2                       341
3                       290
Don't know               69
Name: welshnessW14, dtype: int64

In [99]:
latest_englishness[WelshResident].value_counts()

Not at all English       2327
Very strongly English     700
4                         661
2                         467
5                         446
3                         372
6                         356
Don't know                112
Name: englishnessW14, dtype: int64

In [124]:
latest_britishness[WelshResident].value_counts()

Very strongly British    2151
6                        1078
5                         892
4                         656
Not at all British        232
3                         196
2                         168
Don't know                 68
Name: britishnessW14, dtype: int64

In [120]:
round(pd.crosstab( BES_Panel['profile_eurefvote'][WelshResident], latest_welshness[WelshResident],
            normalize='columns')*100,1)

welshnessW14,Not at all Welsh,2,3,4,5,6,Very strongly Welsh,Don't know
profile_eurefvote,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Stay/remain in the EU,47.6,62.9,63.9,57.8,58.6,63.3,52.6,60.0
Leave the EU,51.6,37.1,35.6,41.5,40.8,35.5,46.7,36.7
Don't know,0.9,0.0,0.6,0.7,0.6,1.2,0.6,3.3


In [121]:
round(pd.crosstab( BES_Panel['profile_eurefvote'][WelshResident], latest_englishness[WelshResident],
            normalize='columns')*100,1)

englishnessW14,Not at all English,2,3,4,5,6,Very strongly English,Don't know
profile_eurefvote,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Stay/remain in the EU,58.3,62.8,65.6,58.9,52.5,53.5,37.5,52.8
Leave the EU,40.9,36.5,34.0,40.4,46.7,45.7,62.3,43.4
Don't know,0.8,0.7,0.5,0.8,0.7,0.8,0.2,3.8


In [123]:
round(pd.crosstab( BES_Panel['profile_eurefvote'][WelshResident], latest_britishness[WelshResident],
            normalize='columns')*100,1)

britishnessW14,Not at all British,2,3,4,5,6,Very strongly British,Don't know
profile_eurefvote,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Stay/remain in the EU,76.6,77.6,76.3,68.0,67.4,60.5,38.6,69.2
Leave the EU,23.4,22.4,23.7,30.9,31.5,38.9,60.6,30.8
Don't know,0.0,0.0,0.0,1.0,1.1,0.6,0.8,0.0


In [126]:
latest_welshness_numeric = latest_welshness[WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_welshness_numeric.name = "welshness"
latest_englishness_numeric = latest_englishness[WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_englishness_numeric.name = "englishness"
latest_britishness_numeric = latest_britishness[WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_britishness_numeric.name = "britishness"
pd.concat([
          BornInWales[WelshResident],BornInEngland[WelshResident],
          latest_welshness_numeric,
          latest_englishness_numeric,
          latest_britishness_numeric
          ],axis=1).corrwith(BES_Panel['profile_eurefvote'][WelshResident]=="Leave the EU")

BornInWales     -0.054593
BornInEngland    0.003712
welshness       -0.020250
englishness      0.093911
britishness      0.182532
dtype: float64

In [25]:
latest_welshness_numeric = latest_welshness[WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_welshness_numeric.name = "welshness"
latest_englishness_numeric = latest_englishness[WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_englishness_numeric.name = "englishness"
latest_britishness_numeric = latest_britishness[WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_britishness_numeric.name = "britishness"
pd.concat([
          BornInWales[WelshResident],BornInEngland[WelshResident],
          latest_welshness_numeric,
          latest_englishness_numeric,
          latest_britishness_numeric
          ],axis=1).corrwith(latest_euRefVote[WelshResident]=="Leave the EU")

BornInWales      0.003093
BornInEngland    0.029401
welshness       -0.021917
englishness      0.114942
britishness      0.228011
dtype: float64

In [22]:
latest_welshness_numeric = latest_welshness[~WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_welshness_numeric.name = "welshness"
latest_englishness_numeric = latest_englishness[~WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_englishness_numeric.name = "englishness"
latest_britishness_numeric = latest_britishness[~WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_britishness_numeric.name = "britishness"
pd.concat([
          BornInWales[~WelshResident],BornInEngland[~WelshResident],
          latest_welshness_numeric,
          latest_englishness_numeric,
          latest_britishness_numeric
          ],axis=1).corrwith(latest_euRefVote[~WelshResident]=="Leave the EU")

BornInWales     -0.004466
BornInEngland    0.072430
welshness       -0.034346
englishness      0.288416
britishness      0.209402
dtype: float64

In [24]:
latest_welshness_numeric = latest_welshness[~WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_welshness_numeric.name = "welshness"
latest_englishness_numeric = latest_englishness[~WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_englishness_numeric.name = "englishness"
latest_britishness_numeric = latest_britishness[~WelshResident].cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_britishness_numeric.name = "britishness"
pd.concat([
          BornInWales[~WelshResident],BornInEngland[~WelshResident],
          latest_welshness_numeric,
          latest_englishness_numeric,
          latest_britishness_numeric
          ],axis=1).corrwith(BES_Panel['profile_eurefvote'][~WelshResident]=="Leave the EU")

BornInWales     -0.008091
BornInEngland    0.009166
welshness       -0.049665
englishness      0.259873
britishness      0.184013
dtype: float64

In [130]:
BES_Panel['languageSkillsWelshW8'].value_counts()

No            1631
Yes            391
Don't know      23
Name: languageSkillsWelshW8, dtype: int64

In [137]:
welsh_speaker = BES_Panel[['languageSkillsWelshW8']]=="Yes"
welsh_speaker_asked = BES_Panel['languageSkillsWelshW8'].notnull()

welsh_speaker[welsh_speaker_asked].corrwith(BES_Panel['profile_eurefvote'][welsh_speaker_asked]=="Leave the EU")

languageSkillsWelshW8   -0.102898
dtype: float64

In [142]:
latest_welshness_numeric.notnull().sum()

67796

In [151]:
sizes = BES_reduced_with_na[latest_welshness_numeric.notnull()].notnull().sum()
too_small = sizes[sizes <1000].index

latest_welshness_numeric = latest_welshness.cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_welshness_numeric.name = "welshness"
welsh_corr = BES_reduced_with_na[latest_welshness_numeric.notnull()].corrwith(latest_welshness_numeric[latest_welshness_numeric.notnull()], drop=True)

welsh_corr.drop(intersection(too_small,welsh_corr.index)).dropna().sort_values()

In [166]:
Leave_voting = BES_Panel['profile_eurefvote'][WelshResident]=="Leave the EU"

sizes = BES_reduced_with_na[WelshResident][BES_Panel['profile_eurefvote'][WelshResident].notnull()].notnull().sum()
too_small = sizes[sizes <1000].index

# latest_welshness_numeric = latest_welshness.cat.codes.replace(-1, np.nan).replace(7,np.nan)
# latest_welshness_numeric.name = "welshness"
Leave_wales_corr = BES_reduced_with_na[WelshResident][BES_Panel['profile_eurefvote'][WelshResident].notnull()].corrwith(Leave_voting, drop=True)

Leave_wales_corr.drop(intersection(too_small,Leave_wales_corr.index)).dropna().sort_values()


approveEUW9__Strongly approve                           -0.735330
euRefDoOverW14_Yes                                      -0.714102
euRefFinalPostW9_Yes                                    -0.712617
euRefDoOverW10_Yes                                      -0.702982
euRefDoOverW11_Yes                                      -0.694791
euRefDoOverW13_Yes                                      -0.684459
euUKRichW8__Strongly agree                              -0.679281
euLawsLevelW8__Strongly agree                           -0.667896
euUKRichW7__Strongly agree                              -0.662577
euLeaveBigBusinessW8__Much more likely                  -0.661266
approveEUW10__Strongly approve                          -0.661187
approveEUW7__Strongly approve                           -0.656469
euPreventWarW8__Strongly agree                          -0.648819
euLeaveBigBusinessW14__Much more likely                 -0.642325
europeannessW9__Very strongly European                  -0.636138
euLeaveBig

In [27]:
scottishness_cols = [x for x in BES_Panel.columns if "scottishness" in x]
latest_scottishness = BES_Panel[scottishness_cols].fillna(method="ffill",axis=1)[scottishness_cols[-1]]
latest_scottishness = latest_scottishness.astype(
                    pd.api.types.CategoricalDtype(categories = BES_Panel[scottishness_cols[0]].cat.categories) )

In [73]:
# latest_al_scale

In [56]:
al_scale_dict = {int(re.match("al_scaleW(\d+)", x).groups()[0]):x for x in BES_Panel.columns if re.match("al_scaleW(\d+)", x)}
al_scale_cols = [al_scale_dict[x] for x in sorted(al_scale_dict.keys())]
latest_al_scale = BES_Panel[al_scale_cols].fillna(method="ffill",axis=1)[al_scale_cols[-1]]
latest_al_scale = latest_al_scale.astype(
                    pd.api.types.CategoricalDtype(categories = BES_Panel[al_scale_cols[0]].cat.categories) )

In [57]:
lr_scale_dict = {int(re.match("lr_scaleW(\d+)", x).groups()[0]):x for x in BES_Panel.columns if re.match("lr_scaleW(\d+)", x)}
lr_scale_cols = [lr_scale_dict[x] for x in sorted(lr_scale_dict.keys())]
latest_lr_scale = BES_Panel[lr_scale_cols].fillna(method="ffill",axis=1)[lr_scale_cols[-1]]
latest_lr_scale = latest_lr_scale.astype(
                    pd.api.types.CategoricalDtype(categories = BES_Panel[lr_scale_cols[0]].cat.categories) )

In [87]:
latest_al_scale_numeric = latest_al_scale.cat.codes.replace(-1,np.nan)
latest_al_scale_numeric.name = "al_scale"
latest_lr_scale_numeric = latest_lr_scale.cat.codes.replace(-1,np.nan)
latest_lr_scale_numeric.name = "lr_scale"

latest_scottishness_numeric = latest_scottishness.cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_scottishness_numeric.name = "scottishness"
latest_welshness_numeric = latest_welshness.cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_welshness_numeric.name = "welshness"
latest_englishness_numeric = latest_englishness.cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_englishness_numeric.name = "englishness"
latest_britishness_numeric = latest_britishness.cat.codes.replace(-1, np.nan).replace(7,np.nan)
latest_britishness_numeric.name = "britishness"

In [95]:
def values_by_ness_and_region(regionMask, value_series):
    global BornInWales,BornInEngland,BornInScotland,BornInBritain, geo_values
    global latest_welshness_numeric,latest_englishness_numeric,latest_britishness_numeric,latest_scottishness_numeric
    return geo_values[regionMask].corrwith(value_series[regionMask])

In [96]:
geo_values  = pd.concat([
              BornInWales,BornInEngland,BornInScotland,BornInBritain,
              latest_welshness_numeric,
              latest_englishness_numeric,
              latest_britishness_numeric,
              latest_scottishness_numeric,
              ],axis=1)
# geo_values

In [126]:
print("Welsh Residents")
df = pd.concat([values_by_ness_and_region(WelshResident, latest_al_scale_numeric),
                values_by_ness_and_region(WelshResident, latest_lr_scale_numeric),
                geo_values[WelshResident].notnull().sum()
                ], axis=1)
df.columns = ["al_scale(Lib->Auth)", "lr_scale(Left->Right)", "sample size"]
df.drop(df[df["sample size"]<1000].index).round(2)

Welsh Residents


Unnamed: 0,al_scale(Lib->Auth),lr_scale(Left->Right),sample size
BornInWales,0.08,-0.06,5447
BornInEngland,-0.05,0.06,5447
BornInScotland,0.0,0.02,5447
BornInBritain,0.03,-0.0,5447
welshness,0.07,-0.12,5364
englishness,0.11,0.13,5329
britishness,0.31,0.19,5373


In [127]:
print("Scottish Residents")
df = pd.concat([values_by_ness_and_region(ScottishResident, latest_al_scale_numeric),
                values_by_ness_and_region(ScottishResident, latest_lr_scale_numeric),
                geo_values[ScottishResident].notnull().sum()
               ],  axis=1)
df.columns = ["al_scale(Lib->Auth)", "lr_scale(Left->Right)", "sample size"]
df.drop(df[df["sample size"]<1000].index).round(2)

Scottish Residents


Unnamed: 0,al_scale(Lib->Auth),lr_scale(Left->Right),sample size
BornInWales,-0.02,0.02,9430
BornInEngland,-0.05,0.04,9430
BornInScotland,0.07,-0.05,9430
BornInBritain,0.04,-0.02,9430
englishness,0.07,0.13,9221
britishness,0.31,0.35,9295
scottishness,0.06,-0.17,9304


In [128]:
print("English Residents")
df = pd.concat([values_by_ness_and_region(EnglishResident, latest_al_scale_numeric),
                values_by_ness_and_region(EnglishResident, latest_lr_scale_numeric),
                geo_values[EnglishResident].notnull().sum()
               ],  axis=1)
df.columns = ["al_scale(Lib->Auth)", "lr_scale(Left->Right)", "sample size"]
df.drop(df[df["sample size"]<1000].index).round(2)

English Residents


Unnamed: 0,al_scale(Lib->Auth),lr_scale(Left->Right),sample size
BornInWales,-0.01,0.01,51817
BornInEngland,0.05,0.0,51817
BornInScotland,0.0,0.01,51817
BornInBritain,0.05,0.0,51817
englishness,0.39,0.09,50527
britishness,0.25,0.11,50574


In [68]:
pd.concat([
          BornInWales[WelshResident],BornInEngland[WelshResident],BornInScotland[WelshResident],
          latest_welshness_numeric[WelshResident],
          latest_englishness_numeric[WelshResident],
          latest_britishness_numeric[WelshResident],
          latest_scottishness_numeric[WelshResident],
          ],axis=1).corrwith(latest_al_scale_numeric[WelshResident])

BornInWales      0.075204
BornInEngland   -0.052951
welshness        0.065090
englishness      0.114135
britishness      0.306061
scottishness    -0.455932
dtype: float64

In [69]:
pd.concat([
          BornInWales[WelshResident],BornInEngland[WelshResident],
          latest_welshness_numeric[WelshResident],
          latest_englishness_numeric[WelshResident],
          latest_britishness_numeric[WelshResident],
          latest_scottishness_numeric[WelshResident],
          ],axis=1).corrwith(latest_lr_scale_numeric[WelshResident])

BornInWales     -0.062292
BornInEngland    0.060179
welshness       -0.118067
englishness      0.133635
britishness      0.186472
scottishness    -0.598862
dtype: float64

In [65]:
pd.concat([
          BornInWales[WelshResident],BornInEngland[WelshResident],
          latest_welshness_numeric[WelshResident],
          latest_englishness_numeric[WelshResident],
          latest_britishness_numeric[WelshResident],
          latest_scottishness_numeric[WelshResident],
          ],axis=1).corrwith(latest_al_scale_numeric[WelshResident])

BornInWales      0.075204
BornInEngland   -0.052951
welshness        0.065090
englishness      0.114135
britishness      0.306061
welshness       -0.455932
dtype: float64