In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import gc
import re

from itertools import cycle
from IPython.display import display
import pickle, os

from utility import *

import seaborn as sns

encoding = "ISO-8859-1"

import Jupyter_module_loader

In [2]:
# you should clone this git to this subdirectory (in some directory - I call it BES_analysis - doesn't matter though)

if os.getcwd().split(os.sep)[-1] != 'BES_analysis_code':
    raise Exception("Stop! You're in the wrong directory - should be in 'BES_analysis_code'")

BES_code_folder   = "../BES_analysis_code/" # we should be here!
BES_small_data_files = BES_code_folder + "small data files" + os.sep
if not os.path.exists( BES_small_data_files ):
    os.makedirs( BES_small_data_files )

# we should create these if they don't already exist
BES_data_folder   = "../BES_analysis_data/"
if not os.path.exists( BES_data_folder ):
    os.makedirs( BES_data_folder )

BES_output_folder = "../BES_analysis_output/"
if not os.path.exists( BES_output_folder ):
    os.makedirs( BES_output_folder )
    
BES_file_manifest = pd.read_csv( BES_small_data_files + "BES_file_manifest.csv" )

BES_R_data_files = BES_data_folder + "R_data" + os.sep
if not os.path.exists( BES_R_data_files ):
    os.makedirs( BES_R_data_files )


In [3]:
dataset_name = "W16_comb"

In [4]:
%%time

manifest = BES_file_manifest[ BES_file_manifest["Name"] == dataset_name ]

data_subfolder = BES_data_folder + dataset_name + os.sep

dataset_filename = manifest["Stata_Filename"].values[0]
# dataset_description = manifest["Friendlier_Description"].values[0]
# dataset_citation = manifest["Citation"].values[0]
# dataset_start = manifest["Date_Start"].values[0]
# dataset_stop = manifest["Date_Stop"].values[0]
# dataset_wave = manifest["Wave No"].values[0]

# BES_Panel = pd.read_stata( data_subfolder + dataset_filename )
BES_Panel  = pd.read_msgpack(data_subfolder + dataset_filename.replace('.dta','.msgpack'))
print("BES_Panel", BES_Panel.shape )


It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code, glob, local_ns)


BES_Panel (92587, 6516)
Wall time: 5.12 s


In [5]:
encoding = "ISO-8859-1"

In [6]:
## HELPER FUNCTIONS / REPLACEMENT VALUE DICTIONARIES

# Rename -> Reorder

# changing the order of some sets of categories
change_cat_dict = {"Bad time to buy|Good time to buy|Neither good nor bad time to buy|Don't know": ["Bad time to buy",
                                                                                                    "Neither good nor bad time to buy",
                                                                                                    "Good time to buy",
                                                                                                    "Don't know"],
                   "Larger|Smaller|About the same|Don't know": ["Larger", "About the same", "Smaller","Don't know"],
                   "Yes|No|99.0":       ['No', 'Yes', '99.0'],
                   "Yes|No|Don't know": ['No', 'Yes', "Don't know"],
                   "Yes|No" :           ['No', 'Yes'],                   
                   "Yes|No|Did not vote|Don't know" : ["No","Yes","Did not vote","Don't know"],
                   "Yes, voted|No, did not vote|Don't know" : ["No, did not vote", "Yes, voted", "Don't know"],
                   "I would/will not vote|Leave the EU|Stay in the EU|Don't know":
                       ['Stay in the EU', 'Leave the EU', 'I would/will not vote', "Don't know"],
                   "Mainly leave|Mainly remain|Fairly evenly divided|Don't know": ["Mainly remain",
                                                                                   "Fairly evenly divided", "Mainly leave", "Don't know"],
                   "An individual share in a company|A portfolio of different company shares|The risk is the same|Don't know|Prefer not to say":
                       ['An individual share in a company', 'The risk is the same', 'A portfolio of different company shares',"Prefer not to say","Don't know"],
                   "No, I have never been a member|Yes, I am a member of a party|I am not a member now but I used to be|Don't know":
                       ['No, I have never been a member', 'I am not a member now but I used to be', 'Yes, I am a member of a party', "Don't know"],
                   "Never or practically never|Less often than once a year|Less often but at least once a year|Less often but at least twice a year|Less often but at least once a month|Less often but at least once in two weeks|Once a week or more|Varies too much to say|I am not religious|Don't know":
                       ['I am not religious', 'Never or practically never', 'Less often than once a year',
                        'Less often but at least once a year', 'Less often but at least twice a year',
                        'Less often but at least once a month', 'Less often but at least once in two weeks',
                        'Once a week or more', "Varies too much to say","Don't know"],
                   "under £5,000 per year|£5,000 to £9,999 per year|£10,000 to £14,999 per year|£15,000 to £19,999 per year|£20,000 to £24,999 per year|£25,000 to £29,999 per year|£30,000 to £34,999 per year|£35,000 to £39,999 per year|£40,000 to £44,999 per year|£45,000 to £49,999 per year|£50,000 to £59,999 per year|£60,000 to £69,999 per year|£70,000 to £99,999 per year|£100,000 to £149,999 per year|£150,000 and over|Don't know|Prefer not to answer":
                       [ 'under £5,000 per year',
                         '£5,000 to £9,999 per year',
                         '£10,000 to £14,999 per year',
                         '£15,000 to £19,999 per year',
                         '£20,000 to £24,999 per year',
                         '£25,000 to £29,999 per year',
                         '£30,000 to £34,999 per year',
                         '£35,000 to £39,999 per year',
                         '£40,000 to £44,999 per year',
                         '£45,000 to £49,999 per year',
                         '£50,000 to £59,999 per year',
                         '£60,000 to £69,999 per year',
                         '£70,000 to £99,999 per year',
                         '£100,000 to £149,999 per year',
                         '£150,000 and over',                         
                         'Prefer not to answer',
                         "Don't know",], # change order of "don't know" and "prefer not to answer" to keep don't knows last
                   "1|2|3|4|5|6|7|8 or more|Don't know|Prefer not to say":
                       ["1","2","3","4","5","6","7","8 or more","Prefer not to say","Don't know"],
                   "The Yes side|The No side|Neither|Don't know":
                       ["The Yes side","Neither","The No side","Don't know"], # is this ordinal - meh?
                   "1|2|3|4|5|6|7|8|9|Right  10|Don't know|Left  0":
                       ["Left  0","1","2","3","4","5","6","7","8","9","Right  10","Don't know"], # lrMayW12
                   "1|2|3|4|5|6|7|8|9|Government should increase taxes a lot and spend much more on health and social services  10|Don't know|Government should cut taxes a lot and spend much less on health and social services  0":
                       ['Government should cut taxes a lot and spend much less on health and social services  0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'Government should increase taxes a lot and spend much more on health and social services  10', "Don't know"],#taxSpendSelfW14
                   "1|2|3|4|5|More than 5|0|Prefer not to say":
                       ["0","1","2","3","4","5","More than 5","Prefer not to say"], # numChildrenW14
                   "None|1|2 to 5|6 to 10|11 to 20|51 to 100|Over 100|21 to 50|Not applicable":
                       ["None","1","2 to 5","6 to 10","11 to 20","21 to 50","51 to 100","Over 100","Not applicable"], # profile_work_responsibilityW14
                   "None|Other Newspaper|Other local daily morning newspaper|The Daily Mail / The Scottish Daily Mail|The Daily Star / The Daily Star of Scotland|The Daily Telegraph|The Express|The Financial Times|The Guardian|The Herald (Glasgow)|The Independent|The Mirror / Daily Record|The Scotsman|The Sun|The Times|The Western Mail":
                       ['The Express', 'The Daily Mail / The Scottish Daily Mail', 'The Mirror / Daily Record', 'The Daily Star / The Daily Star of Scotland', 'The Sun', 'The Daily Telegraph', 'The Financial Times', 'The Guardian', 'The Independent', 'The Times', 'The Scotsman', 'The Herald (Glasgow)', 'The Western Mail', 'Other local daily morning newspaper', 'Other Newspaper', 'None'],# profiles_newspaper2W16
                   "Agree|Disagree|Don't know|Neither agree nor disagree|Strongly agree|Strongly disagree":
                       ['Strongly disagree','Disagree', 'Neither agree nor disagree', 'Agree', 'Strongly agree', "Don't know"],#W16_comb: immigExpDVW2
                   "About right|Don't know|Much too high|Much too low|Too high|Too low":
                       ['Much too high', 'Too high','About right', 'Too low',  'Much too low', "Don't know"], #W16_comb:welfarePreferenceExpW2|welfarePreferenceExpW3|welfarePreferenceExpW4|welfarePreferenceExpW6
                   "Agree|Disagree|Don't know|Neither agree nor disagree|Somewhat agree|Somewhat disagree|Strongly agree|Strongly disagree":
                       ['Strongly disagree','Disagree', 'Somewhat disagree', 'Neither agree nor disagree','Somewhat agree','Agree', 'Strongly agree', "Don't know"],#W16_comb: immigExpDVW7
                   "Don't know|No, I did not vote|No, I was not eligible to vote|Yes, I voted":
                       ['Yes, I voted', 'No, I did not vote', 'No, I was not eligible to vote', "Don't know"], #W16_comb:profile_scotref_turnout
                   "Don't know|I voted 'No' (Scotland should not be an independent country)|I voted 'Yes' (Scotland should be an independent country)":
                       ["I voted 'No' (Scotland should not be an independent country)", "I voted 'Yes' (Scotland should be an independent country)", "Don't know"], #W16_comb:profile_scotref_vote
                    "1.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|Don't know|Leave the EU without a deal|Remain in the EU":
                       ['Leave the EU without a deal', '1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0', 'Remain in the EU', "Don't know"],
                   '0.5|1.0|1.5|2.0|2.5|3.0|3.5|4.0|4.5|5.0|5.5|6.0|6.5|7.0|7.5|8.0|8.5|9.0|9.5|Left|Right':
                       ['Left', '0.5', '1.0', '1.5', '2.0', '2.5', '3.0', '3.5', '4.0', '4.5', '5.0', '5.5', '6.0', '6.5', '7.0', '7.5', '8.0', '8.5', '9.0', '9.5', 'Right'], # W16_comb:lr_scaleW16

 
                  }

reorder_variable_dict = pd.DataFrame.from_dict({k : "|".join(v) for k, v in change_cat_dict.items()},orient='index').reset_index()
reorder_variable_dict.columns = ["original_cat_list","reordered_cat_list"]
reorder_variable_dict.to_csv( BES_small_data_files + "reorder_variable_dict.csv" )

# reorder categories
def re_order(ques):
    if ques in change_cat_dict.keys():
        return "|".join( change_cat_dict[ques] )
    else:
        return ques


In [7]:
# '0.5|1.0|1.5|2.0|2.5|3.0|3.5|4.0|4.5|5.0|5.5|6.0|6.5|7.0|7.5|8.0|8.5|9.0|9.5|Left|Right'.split("|")


In [8]:
## typos - more directly useful for the BES!
# typos = set(['Do\x92t know', 'Dont know', 'Donât know', 'Don??t know','DonaÂ€Â™t know'])# ,
#          "9999.0", "997.0", "222.0", "99.0", "0.0", "1.0", "2.0"   ]) # problem here, is this picks up numeric sequences ...



# Big set of actual answers **I interpet** as non-answers (and set to NaN)
# REALLY MERITS RECHECKING WHAT THE IMPACT OF THIS IS!
Weasel_answers = ["Don't know","Donâ€™t Know",
                  "Prefer not to say", "Prefer not to answer", "Refused", "Unknown",
                  "Neither", "Other", "I would/will not vote", "Will not vote",
                  "I would not vote", "It depends", "Other",
                  "Don't follow politics on twitter",
                  "Yes, other", "Haven't thought about it",
                  "There wasn't a local election in my area", "No, haven't received it",
                  "I don't know what was negotiated", "I never received a response",
                  "There are not local elections in my area", "Can't remember",
                  "Varies too much to say", "Will not state a choice",
                  "All leaders equally good", "They are not eligible to vote",
                  "There are not local elections in my area", "Both/neither",
                  "Did not vote","Can't remember","I did not vote",
                  "Not sure", "Don't know/Prefer not to say", "Don't know / Prefer not to say",
                  "Not applicable","Did not choose a candidate"]

# BES codes for NaN/other/misc/none of the above
Weasel_number_answers = [ "9999.0", "997.0", "222.0", "99.0", "0.0", "9999", "98.0" ]

# non-answer answers
Weasel_set = set(Weasel_answers) # gets rid of duplicates!


## define 'de_Weasel' function to remove Weasel Words from lists of options
## ie. "Yes|No|Don't know" -> "Yes|No"

# Weasel_answers = ["Don't know", 'Don?t know', 'Donâ??t know', 'Do\x92t know', 'Dont know', 'Donât know',
#                   "Prefer not to say", "Prefer not to answer", "Refused", "Unknown", "Neither", "Other",
#                   "I would/will not vote", "Will not vote", "No - not decided", "I would not vote", "It depends",
#                   "Other", "Don’t follow politics on Facebook", "Don't follow politics on twitter", "9999.0", "997.0",
#                   "222.0", "Yes, other", "Haven't thought about it", "There wasn't a local election in my area",
#                   "No, haven't received it", "I don't know what was negotiated", "I never received a response",
#                   "There are not local elections in my area", "Can't remember", "Varies too much to say" ]

# # non-answer answers
# Weasel_set = set(Weasel_answers) # gets rid of duplicates!

# remove weasel phrases
def de_weasel(ques): 
    return "|".join( [x for x in ques.split("|") if x not in Weasel_answers] )

def de_num_el(el):
    if el.isdigit():
        el = "%.1f" % int( el )
    return el

def de_number(ques):
    return "|".join( [de_num_el(x) for x in ques.split("|")] )

def de_num(ques):
    return [de_num_el(x) for x in ques]

def floatable(flt):
    try:
        float(flt)
        return True
    except:
        return False

# Weasel_number_answers
# Remove 'weasel' numbers
# but only if they are the last element
# or not the last element, but the next is not a number
# to avoid catching parts of sequential numerical categories
def de_weasel_numbers(ques):
    el_list = ques.split("|")
    el_list_len = len(el_list)
    remove_list = []
    for el_pos in range( 0, el_list_len ):
        if el_list[el_pos] in Weasel_number_answers:
            # last element, or not last element but next element is a not a number
            if el_pos==(el_list_len-1) or not floatable(el_list[el_pos+1]):
                remove_list.append(el_list[el_pos])

    return "|".join( [x for x in el_list if x not in remove_list] )


# version to act directly on cat.categories array
def de_weasel_nums(el_list):

    el_list_len = len(el_list)
    remove_list = []
    for el_pos in range( 0, el_list_len ):
        if el_list[el_pos] in Weasel_number_answers:
            # last element, or not last element but next element is a not a number
            if el_pos==(el_list_len-1) or not floatable(el_list[el_pos+1]):
                remove_list.append(el_list[el_pos])

    return remove_list

In [9]:
# s.cat.rename_categories([1,2,3])
# EUContactRemainConW8|EUContactRemainLabW8|EUContactRemainLDW8|
# EUContactRemainSNPW8|EUContactRemainPCW8|EUContactRemainUKIPW8|
# EUContactRemainGreenW8|EUContactRemainOthW8|EUContactRemainNoneW8|
# EUContactRemainDKW8|EUContactLeaveConW8|EUContactLeaveLabW8|
# EUContactLeaveLDW8|EUContactLeaveSNPW8|EUContactLeavePCW8|
# EUContactLeaveUKIPW8|EUContactLeaveGreenW8|EUContactLeaveOthW8|
# EUContactLeaveNoneW8|EUContactLeaveDKW8

# pattern match "EUContact*****W8"
# debateOneWatchW8|debateTwoWatchW8

# "1.0|2.0|99.0" -> 

# euRefVoteSqueezeW7 "Will not vote|Yes - Leave|Yes - Remain|No - not decided"
#    -> Stay/remain in the EU|Leave the EU|I would/will not vote|Don't know
#    HMM - RENAME AND REORDER!

# miieuW7
# "Issue stated|Nothing|Don't know" -> "Issue stated|None|Don't know"
# MIIEUW8
# "1.0|Nothing|Don't know" -> "Issue stated|None|Don't know"
# partyIdEUW7|partyIdEUW8
# "Mainly leave|Mainly remain|Fairly evenly split|Don't know" -> "Mainly remain|Fairly evenly divided|Mainly leave|Don't know"
#    HMM - RENAME AND REORDER!

# 1. campaignVisionYesW3|campaignVisionNoW3, govtNatSecuritySuccessW4
# Very unsuccessful|Fairly unsuccessful|Neither successful nor unsuccessful|Fairly successful|Very successful|Don't know
# Very unsuccessful|Somewhat unsuccessful|Neither successful or unsuccessful|Somewhat successful|Very successful|Don't know

# Fairly <-> Somewhat

# 2. euroTurnoutW1, scotReferendumTurnoutW1|scotReferendumTurnoutW2|welshTurnoutW7|scotTurnoutW7, turnoutUKGeneralW1|turnoutUKGeneralW2|turnoutUKGeneralW3|turnoutUKGeneralW4|turnoutUKGeneralW5|euRefTurnoutW7|euRefTurnoutW8
# Very unlikely that I vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I vote|Don't know
# Very unlikely that I would vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I would vote|Don't know
# There are not local elections in my area
    #|Very unlikely that I will vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I will vote|Don't know
# Very unlikely that I will vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I will vote|Don't know

# "Very unlikely that I vote", "Very unlikely that I would vote" ->  "Very unlikely that I will vote" 

rename_cat_dict = {"North East|North West": [ "No", "Yes" ],
                   "1.0|2.0|99.0": ["No", "Yes", "99.0"],
                   "Will not vote|Yes - Leave|Yes - Remain|No - not decided":
                       ['I would/will not vote', 'Leave the EU','Stay in the EU', "Don't know"], 
                   "Stay/remain in the EU|Leave the EU|I would/will not vote|Don't know":
                       ['Stay in the EU','Leave the EU',  'I would/will not vote', "Don't know"],   # euRefVote    
                   "Stay/remain in the EU|Leave the EU|Don't know":
                       ['Stay in the EU','Leave the EU', "Don't know"],   # profile_eurefvote                    
                   "Issue stated|Nothing|Don't know":  ['Issue stated', 'None', "Don't know"],
                   "1.0|Nothing|Don't know":           ['Issue stated', 'None', "Don't know"],
                   "a|b|C1|C2|d|e|Refused|Unknown" : ['A', 'B', 'C1', 'C2', 'D', 'E', 'Refused', 'Unknown'],
                   "a|b|C1|C2|d|e" : ["A","B","C1","C2","D","E"],
                   "Mainly leave|Mainly remain|Fairly evenly split|Don't know":
                       ['Mainly leave','Mainly remain', 'Fairly evenly divided', "Don't know"],
                   "Very unsuccessful|Somewhat unsuccessful|Neither successful or unsuccessful|Somewhat successful|Very successful|Don't know":
                       ['Very unsuccessful', 'Fairly unsuccessful', 'Neither successful nor unsuccessful',
                        'Fairly successful', 'Very successful', "Don't know"],
                   "Very unlikely that I vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I vote|Don't know":
                       ['Very unlikely that I will vote', 'Fairly unlikely', 'Neither likely nor unlikely',
                        'Fairly likely', 'Very likely that I will vote', "Don't know"],
                   "Very unlikely that I would vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I would vote|Don't know":
                       ['Very unlikely that I will vote', 'Fairly unlikely', 'Neither likely nor unlikely',
                        'Fairly likely', 'Very likely that I will vote', "Don't know"],
                   "No, did not vote|Yes, voted|3.0|4.0|5.0|Don't know":   
                       ["Very unlikely that I will vote", "Fairly unlikely", 'Neither likely nor unlikely',
                        "Fairly likely", "Very likely that I will vote", "Don't know"], #londonTurnoutW7
                   'No, I do not regard myself as belonging to any particular religion.|Yes - Church of England/Anglican/Episcopal|Yes - Roman Catholic|Yes - Presbyterian/Church of Scotland|Yes - Methodist|Yes - Baptist|Yes - United Reformed Church|Yes - Free Presbyterian|Yes - Brethren|Yes - Judaism|Yes - Hinduism|Yes - Islam|Yes - Sikhism|Yes - Buddhism|Yes - Other|Prefer not to say|Yes Orthodox Christian|Yes - Pentecostal (e.g. Assemblies of God, Elim Pentecostal Church, New Testament Church of God, Redeemed Christian Chur|Yes - Evangelical independent/non-denominational (e.g. FIEC, Pioneer, Vineyard, Newfrontiers)':
                       ["No, I do not regard myself as belonging to any particular religion.","Yes - Church of England/Anglican/Episcopal",
                        "Yes - Roman Catholic","Yes - Presbyterian/Church of Scotland","Yes - Methodist","Yes - Baptist",
                        "Yes - United Reformed Church","Yes - Free Presbyterian","Yes - Brethren","Yes - Judaism","Yes - Hinduism",
                        "Yes - Islam","Yes - Sikhism","Yes - Buddhism","Yes - Other","Prefer not to say","Yes - Orthodox Christian",
                        "Yes - Pentecostal","Yes - Evangelical /independent/non-denominational"], #xprofile_religionW10
                   'No, I do not regard myself as belonging to any particular religion.|Yes - Church of England/Anglican/Episcopal|Yes - Roman Catholic|Yes - Presbyterian/Church of Scotland|Yes - Methodist|Yes - Baptist|Yes - United Reformed Church|Yes - Free Presbyterian|Yes - Brethren|Yes - Judaism|Yes - Hinduism|Yes - Islam|Yes - Sikhism|Yes - Buddhism|Yes - Other|Prefer not to say|Yes - Orthodox Christian|Yes - Pentecostal (e.g. Assemblies of God, Elim Pentecostal Church, New Testament Church of God, Redeemed Christian Chur|Yes - Evangelical - independent/non-denominational (e.g. FIEC, Pioneer, Vineyard, Newfrontiers)':
                       ["No, I do not regard myself as belonging to any particular religion.","Yes - Church of England/Anglican/Episcopal",
                        "Yes - Roman Catholic","Yes - Presbyterian/Church of Scotland","Yes - Methodist","Yes - Baptist",
                        "Yes - United Reformed Church","Yes - Free Presbyterian","Yes - Brethren","Yes - Judaism","Yes - Hinduism",
                        "Yes - Islam","Yes - Sikhism","Yes - Buddhism","Yes - Other","Prefer not to say","Yes - Orthodox Christian",
                        "Yes - Pentecostal","Yes - Evangelical /independent/non-denominational"], #xprofile_religionW10                   
                   'Own - outright|Own - with a mortgage|Own (part-own) - through shared ownership scheme (i.e. pay part mortgage, part rent)|Rent - from a private landlord|Rent - from my local authority|Rent - from a housing association|Neither - I live with my parents, family or friends but pay some rent to them|Neither - I live rent-free with my parents, family or friends|Other|9999':
                       [ 'Own outright',
                         'Own with a mortgage',
                         'Own (part-own) through shared ownership scheme (i.e. pay part mortgage, part rent)',
                         'Rent from a private landlord',
                         'Rent from my local authority',
                         'Rent from a housing association',
                         'Neither I live with my parents, family or friends but pay some rent to them',
                         'Neither I live rent-free with my parents, family or friends',
                         'Other',
                         '9999'], #profile_house_tenureW11|profile_house_tenureW12|profile_house_tenureW13
                   "I voted 'No' (Scotland should not be an independent country)|I voted 'Yes' (Scotland should be an independent country)|111.0|Don't know":
                       ["No","Yes","Did not vote","Don't know"], # referendumrecall
                   "Voted Yes|Voted No|Did not vote|Can't remember":
                       ["Yes","No","Did not vote","Don't know"], # scotRefVoteW4_
                   "No|Yes|3.0|Don't know":
                       ["No","Yes","Did not vote","Don't know"], # regretsIHaveAFewEUW10|regretsIHaveAFewEUW11   
                   "No|Yes|3|Don't know":
                       ["No","Yes","Did not vote","Don't know"], # regretsIHaveAFewEU W11_only|regretsIHaveAFew W10_only 
                   "Professional or higher technical work - work that requires at least degree-level qualifications (e.g. doctor, accountant|Manager or Senior Administrator (e.g. company director, finance manager, personnel manager, senior sales manager, senior|Clerical (e.g. clerk, secretary)|Sales or Services (e.g. commercial traveller, shop assistant, nursery nurse, care assistant, paramedic)|Foreman or Supervisor of Other Workers (e.g building site foreman, supervisor of cleaning workers)|Skilled Manual Work (e.g. plumber, electrician, fitter)|Semi-Skilled or Unskilled Manual Work (e.g. machine operator, assembler, postman, waitress, cleaner, labourer, driver, b|Other|Have never worked":
                       ['Professional or higher technical work / higher managerial - work that requires at least degree-level qualifications (e.g',
                        'Manager or Senior Administrator / intermediate managerial / professional (e.g. company director, finance manager, person',
                        'Clerical/junior managerial/professional/administrator (e.g. office worker, student doctor, sales person, clerk, secretar',
                        'Sales or Services (e.g. commercial traveller, shop assistant, nursery nurse, care assistant, paramedic)',
                        'Foreman or Supervisor of Other Workers (e.g. building site foreman, supervisor of cleaning workers)',
                        'Skilled Manual Work (e.g. plumber, electrician, fitter)',
                        'Semi-Skilled or Unskilled Manual Work (e.g. machine operator, assembler, postman, waitress, cleaner, labourer, driver, b',
                        'Other',
                        'Have never worked'], # work_type -> profile_work_typeW7
                   "No formal qualifications|Youth training certificate/skillseekers|Recognised trade apprenticeship completed|Clerical and commercial|City & Guilds certificate|City & Guilds certificate - advanced|onc|CSE grades 2-5|CSE grade 1, GCE O level, GCSE, School Certificate|Scottish Ordinary/ Lower Certificate|GCE A level or Higher Certificate|Scottish Higher Certificate|Nursing qualification (eg SEN, SRN, SCM, RGN)|Teaching qualification (not degree)|University diploma|University or CNAA first degree (eg BA, B.Sc, B.Ed)|University or CNAA higher degree (eg M.Sc, Ph.D)|Other technical, professional or higher qualification|Don't know|Prefer not to say":
                       ['No formal qualifications','Youth training certificate/skillseekers','Recognised trade apprenticeship completed',
                        'Clerical and commercial','City and Guild certificate','City and Guild certificate - advanced','onc','CSE grades 2-5',
                        'CSE grade 1, GCE O level, GCSE, School Certificate','Scottish Ordinary/ Lower Certificate','GCE A level or Higher Certificate',
                        'Scottish Higher Certificate','Nursing qualification (eg SEN, SRN, SCM, RGN)','Teaching qualification (not degree)',
                        'University diploma','University or CNAA first degree (eg BA, B.Sc, B.Ed)','University or CNAA higher degree (eg M.Sc, Ph.D)',
                        'Other technical, professional or higher qualification',"Don't know",'Prefer not to say'], # W6_comb: qeducationW6
                   "Strongly disapprove|Disapprove|Don't know":
                       ["Approve","Disapprove","Don't know"], # approveEUW2 # W7_comb, W10_comb, W13_comb, W8_comb, W9_comb
                   '1 to 24 employees|25 to 499 employees|500 or more employees|':
                       ['1 to 24 employees','25 to 499 employees','500 or more employees',"Don't know"], #fatherNumEmployees,motherNumEmployees #W6_comb,W5_comb,W5_only,W3_comb
                   "Yes, voted|No, did not vote|Don't know":
                       ['Yes',"No","Don't know"],
                   "No, did not vote|Yes, voted|Don't know":
                       ['No','Yes',"Don't know"],
                   "No, did not vote|Yes, voted|2.0":
                       ['No','Yes',"Don't know"],
                   "Strongly disagree|Disagree|Neither nor disagree|Agree|Strongly agree|Don't know":
                       ["Strongly disagree","Disagree","Neither agree nor disagree","Agree","Strongly agree","Don't know"],# euFinancialHelpW2 W3-6_comb
                   "Strongly disagree|Disagree|Neither agree nor disagree|Agree|Strongly agree|99.0":
                       ["Strongly disagree","Disagree","Neither agree nor disagree","Agree","Strongly agree","Don't know"],# nhsEUW14
                   "I am very unsure what will happen|I am quite unsure what will happen|I am quite sure what will happen|I am very sure what will happen|Don't know":
                       ["I am very unsure what would happen","I am quite unsure what would happen","I am quite sure what would happen","I am very sure what would happen","Don't know"], # certaintyScotUnionW3 W3-5_comb
                   "0.0|1.0|2.0|3.0|4.0|5.0|6.0|7.0|997.0":
                       ["0 days","1 day","2 days","3 days","4 days","5 days","6 days","7 days","Don't know"], # discussPolDaysW5	W5_comb
                   "A major transfer of powers from Westminster to the Scottish Parliament (\"devo-max\")|Some powers will be transferred but well short of \"devo-max\"|No change to the relationship between Westminster and the Scottish Parliament":
                       ["A major transfer of powers from Westminster to the Scottish Parliament (devo-max)","Some powers will be transferred but well short of devo-max","No change to the relationship between Westminster and the Scottish Parliament"], # expectationManipCheckW1 # W13,10,9,8,7 vs W6-3_comb
                   "No, I did not vote|Yes, I voted|There wasn't a local election in my area|Don't know":
                       ["No, did not vote","Yes, voted","There wasn't a local election in my area","Don't know"], # localTurnoutRetroW2 W3-6_comb
                   "Focuses mainly on criticising other parties|2.0|3.0|4.0|Focuses mainly on putting forward their own policies and personalities|Don't know":
                       ["1 - Focused mainly on criticising other parties","2.0","3.0","4.0","5 - Focused mainly on putting forward their own policies and personalities","Don't know"], # <party>ToneW5 # W5-6_comb, W5_only
                   "Environmental Policy|Defence|Education|Pensions":
                       ["No, I think they *will not* vote","Yes, I think they *will* vote","They are not eligible to vote","Don't know"], # discussantturnoutName1-3W4 # W4-5_comb
                   "Employers in large organisations and higher managerial|Higher professional occupations|Lower professional and managerial and higher supervisory|Intermediate occupations|Employers in small organisations and own account workers|Lower suprivsory and technical occupations|Semi-routine occupations|Routine occupations":
                       ['Employers in large organisations and higher managerial', 'Higher professional occupations',
                        'Lower professional and managerial and higher supervisory', 'Intermediate occupations',
                        'Employers in small organisations and own account workers', 'Lower supervisory and technical occupations',
                        'Semi-routine occupations', 'Routine occupations'], # ns_sec_analytic	 W5_only, W3-6_comb                   
                   "Employers in large organisations and higher managerial|Higher professional occupations|Lower professional and managerail and higher supervisory|Intermediate occupations|Employers in small organisations and own account workers|Lower suprivsory and technical occupations|Semi-routine occupations|Routine occupations":
                       ['Employers in large organisations and higher managerial', 'Higher professional occupations',
                        'Lower professional and managerial and higher supervisory', 'Intermediate occupations',
                        'Employers in small organisations and own account workers', 'Lower supervisory and technical occupations',
                        'Semi-routine occupations', 'Routine occupations'], # ns_sec_analytic	 W5_only, W3-6_comb    # v slight typo!
                   "A major transfer of powers from Westminster to the Scottish Parliament (\"devo-max\")|Some powers will be transferred but well short of \"devo-max\"|No change to the relationship between Westminster and the Scottish Parliament|Don't know":
                       ['A major transfer of powers from Westminster to the Scottish Parliament (devo-max)',
                        'Some powers will be transferred but well short of devo-max',
                        'No change to the relationship between Westminster and the Scottish Parliament',"Don't know"], # expectationManipCheckW1 W3-6_comb
                   "Employers in large establishments|Higher managerial and administrative occupations|L3.1 'Traditional' employees|L3.2 'New' employees|L3.3 'Traditional' self-employed|L3.4 'New' self-employed|L4.1 'Traditional' employees|L4.2 'New' employees|L4.3 'Traditional' self-employed|L4.4 'New' self-employed|Lower managerial and administrative occupations|Higher supervisory occupations|L7.1 Intermediate clerical and administrative occupations|L7.2 Intermediate sales and service occupations|L7.3 Intermediate technical and auxiliary occupations|L7.4 Intermediate engineering occupations|L8.1 Employers in small establishments in industry, commerce, services etc.|L8.2 Employers in small establishments in agriculture|L9.1 Own account workers (non-professional)|L9.2 Own account workers (agriculture)|Lower supervisory occupations|L11.1 Lower technical craft occupations|L11.2 Lower technical process operative occupations|L12.1 Semi-routine sales occupations|L12.2 Semi-routine service occupations|L12.3 Semi-routine technical occupations|L12.4 Semi-routine operative occupations|L12.5 Semi-routine agricultural occupations|L12.6 Semi-routine clerical occupations|L12.7 Semi routine childcare occupations|L13.1 Routine sales and service occupations|L13.2 Routine production occupations|L13.3 Routine technical occupations|L13.4 Routine operative occupations|L13.5 Routine agricultural occupations":
                       ['Employers in large establishments', 'Higher managerial and administrative occupations',
                        'L3.1 Traditional employees', 'L3.2 New employees', 'L3.3 Traditional self-employed',
                        'L3.4 New self-employed', 'L4.1 Traditional employees', 'L4.2 New employees',
                        'L4.3 Traditional self-employed', 'L4.4 New self-employed', 'Lower managerial and administrative occupations',
                        'Higher supervisory occupations', 'L7.1 Intermediate clerical and administrative occupations',
                        'L7.2 Intermediate sales and service occupations', 'L7.3 Intermediate technical and auxiliary occupations',
                        'L7.4 Intermediate engineering occupations', 'L8.1 Employers in small establishments in industry, commerce, services etc.',
                        'L8.2 Employers in small establishments in agriculture', 'L9.1 Own account workers (non-professional)',
                        'L9.2 Own account workers (agriculture)', 'Lower supervisory occupations', 'L11.1 Lower technical craft occupations',
                        'L11.2 Lower technical process operative occupations', 'L12.1 Semi-routine sales occupations',
                        'L12.2 Semi-routine service occupations', 'L12.3 Semi-routine technical occupations', 'L12.4 Semi-routine operative occupations',
                        'L12.5 Semi-routine agricultural occupations', 'L12.6 Semi-routine clerical occupations', 'L12.7 Semi routine childcare occupations',
                        'L13.1 Routine sales and service occupations', 'L13.2 Routine production occupations', 'L13.3 Routine technical occupations',
                        'L13.4 Routine operative occupations', 'L13.5 Routine agricultural occupations'],
                   "Employers in large establishments|Higher managerial and administrative occupations|L3.1 ?Traditional? employees|L3.2 ?New? employees|L3.3 ?Traditional? self-employed|L3.4 ?New? self-employed|L4.1 ?Traditional? employees|L4.2 ?New? employees|L4.3 ?Traditional? self-employed|L4.4 ?New? self-employed|Lower managerial and administrative occupations|Higher supervisory occupations|L7.1 Intermediate clerical and administrative occupations|L7.2 Intermediate sales and service occupations|L7.3 Intermediate technical and auxiliary occupations|L7.4 Intermediate engineering occupations|L8.1 Employers in small establishments in industry, commerce, services etc.|L8.2 Employers in small establishments in agriculture|L9.1 Own account workers (non-professional)|L9.2 Own account workers (agriculture)|Lower supervisory occupations|L11.1 Lower technical craft occupations|L11.2 Lower technical process operative occupations|L12.1 Semi-routine sales occupations|L12.2 Semi-routine service occupations|L12.3 Semi-routine technical occupations|L12.4 Semi-routine operative occupations|L12.5 Semi-routine agricultural occupations|L12.6 Semi-routine clerical occupations|L12.7 Semi routine childcare occupations|L13.1 Routine sales and service occupations|L13.2 Routine production occupations|L13.3 Routine technical occupations|L13.4 Routine operative occupations|L13.5 Routine agricultural occupations":
                       ['Employers in large establishments', 'Higher managerial and administrative occupations',
                        'L3.1 Traditional employees', 'L3.2 New employees', 'L3.3 Traditional self-employed',
                        'L3.4 New self-employed', 'L4.1 Traditional employees', 'L4.2 New employees',
                        'L4.3 Traditional self-employed', 'L4.4 New self-employed', 'Lower managerial and administrative occupations',
                        'Higher supervisory occupations', 'L7.1 Intermediate clerical and administrative occupations',
                        'L7.2 Intermediate sales and service occupations', 'L7.3 Intermediate technical and auxiliary occupations',
                        'L7.4 Intermediate engineering occupations', 'L8.1 Employers in small establishments in industry, commerce, services etc.',
                        'L8.2 Employers in small establishments in agriculture', 'L9.1 Own account workers (non-professional)',
                        'L9.2 Own account workers (agriculture)', 'Lower supervisory occupations', 'L11.1 Lower technical craft occupations',
                        'L11.2 Lower technical process operative occupations', 'L12.1 Semi-routine sales occupations',
                        'L12.2 Semi-routine service occupations', 'L12.3 Semi-routine technical occupations', 'L12.4 Semi-routine operative occupations',
                        'L12.5 Semi-routine agricultural occupations', 'L12.6 Semi-routine clerical occupations', 'L12.7 Semi routine childcare occupations',
                        'L13.1 Routine sales and service occupations', 'L13.2 Routine production occupations', 'L13.3 Routine technical occupations',
                        'L13.4 Routine operative occupations', 'L13.5 Routine agricultural occupations'],
                   "1|2":
                       ["No","Yes"], # tryReduceImmigDKW4, achieveReduceImmigUKIPW4, achieveReduceImmigGrnW4, achieveReduceImmigDKW4, tryReduceInequalityDKW4, successReduceInequalityDKW4 # W4-5_comb # sharedContentOnline_1-5W4 W5_comb # voteMethodEurope_dkW2, discussantsAskedYouToVote_DKW2 ,discussantsAccompaniedVote_dkW2, referendumContact_dkW2 # W3_comb
                   "1.0|2.0":
                       ["No","Yes"], # tryReduceImmigDKW4, achieveReduceImmigUKIPW4, achieveReduceImmigGrnW4, achieveReduceImmigDKW4, tryReduceInequalityDKW4, successReduceInequalityDKW4 # W4-5_comb # sharedContentOnline_1-5W4 W5_comb # voteMethodEurope_dkW2, discussantsAskedYouToVote_DKW2 ,discussantsAccompaniedVote_dkW2, referendumContact_dkW2 # W3_comb
                   "Should definitely be illegal|Should probably be illegal|Should probably be legal|Should definitely be legal|5.0":
                       ["Should definitely be illegal","Should probably be illegal","Should probably be legal","Should definitely be legal","Don't know"], # zeroHourContractW6
                   "1|2|3|4|5|More than 5|111.0|Prefer not to say":
                       ["1","2","3","4","5","More than 5","0","Prefer not to say"], # numChildrenW14
                   "No|Yes|99.0":
                       ["No","Yes","Don't know"], # debtW14|studentloanW14
                   "Yes|2.0|Don't know":
                       ["Yes","No","Don't know"], # smallEmergency_1W14|smallEmergency_2W14|smallEmergency_3W14|smallEmergency_4W14|smallEmergency_5W14|childvalues_1W14|childvalues_2W14|childvalues_3W14|childvalues_4W14|childvalues_5W14|childvalues_6W14|childvalues_7W14|childvalues_8W14|childvalues_9W14|childvalues_10W14|childvalues_11W14
                   "0.0|Yes|9999.0":
                       ["No","Yes","Don't know"], # moreParl_1W14|moreParl_2W14|moreParl_3W14|moreParl_4W14|moreParl_5W14|moreParl_6W14|moreParl_7W14|moreParl_8W14|moreParl_9W14|moreParl_10W14|moreParl_111W14
                   "Not at all important|Not very important|Important|Very important|Don't know":
                       ["Not at all important","Not very important","Somewhat important","Very important","Don't know"], # impOccW14|impRaceW14|ImpReligW14|impLocalW14|impGenderW14|impAgeW14|impEdW14|impGayW14
                   "Strongly oppose|Oppose|Neither oppose nor support|Support|Strongly support":
                       ["Strongly oppose","Oppose","Neither support nor oppose","Support","Strongly support"], #wcVmc1W14|wcVmc2W14|wcVself1W14|wcVself2W14|mcVSelf1W14|mcVSelf2W14

                  }


rename_variable_dict = pd.DataFrame.from_dict( {k : "|".join(v) for k, v in rename_cat_dict.items()} , orient='index' ).reset_index()
rename_variable_dict.columns = ["original_cat_list","renameed_cat_list"]
rename_variable_dict.to_csv( BES_small_data_files + "rename_variable_dict.csv" )

def re_name(ques):
    if ques in rename_cat_dict.keys():
        return "|".join( rename_cat_dict[ques] )
    else:
        return ques

In [10]:
## COLUMNS THAT EITHER LACK ALL DATA OR HAVE ACTUAL ERRORS
# check back on these periodically - one assumes they will get fixed!
# maybe tell them about them so that they can?

# {'changeIssue1W9', 'conLeaderLikeW9'}
# these variables appear to have disappeared! Fixed in an updated version?

ignore_list = ['whichPartiesHelped_99W6',
               'partyContactGrnW1',
               'partyContactGrnW2',
               'partyContactGrnW3',
               'reasonNotRegistered_noneW2',               
               'reasonNotRegistered_noneW3',
               'reasonNotRegistered_noneW4',
               'reasonNotRegistered_noneW6',
               'reasonNotRegistered_noneW7',
               'reasonNotRegistered_noneW8',
               'reasonNotRegistered_none',
               'partyContactSNPW1',
               'partyContactSNPW2',
               "locusControlW9",
               "generalElecCertaintyW1", # wave 10 forwards
               "generalElecCertaintyW2",
               "generalElecCertaintyW3",
               "londonMayorVoteW7",
               "fatherNumEmployeesW4",
               "motherNumEmployeesW4",
               "profile_pcon_2010_newW3", # W3_comb: this is parl. constit. ... but by number!
               "euroElectionVoteYoungW2", # W3_comb: all NaNs!
               "profile_GOR_pdlW4", # W4_comb: misnamed selection, probably fixable 
               "participation_111W5", ### -->
               "sharedContentOnline_111W5",
               "sharedContentOnline_99W5", ### <-- W5_comb "Got a lot worse|Got a little worse" doesn't look right (indicator vars?)
               "csplScotRefW3", ### W5_comb: "North East" - just broken!
              ]

#- approveEUW2 'Strongly disapprove|Disapprove|Don't know' - should be "approve|disapprove|don't know"??? NOT SURE (distribution weird)
#- whichPartiesHelped_99W6 - answer set = ["No"]
#- partyContactGrnW1 ... reasonNotRegistered_noneW8 answer set = ["No", "Don't know"]
# -partyContactSNPW1, partyContactSNPW2 - answer set = ["Don't know"]
# -changeIssue1W9|conLeaderLikeW9|locusControlW9 - answer set = ["No formal qualifications"]

In [11]:
## define 'prune' function to prune wave indicators and return question stubs
## ie. "ptvConW1|ptvLabW1" -> "ptvCon|ptvLab"

def prune(x):
    
    y= []
    for el in x.split("|"):
        match_attempt = re.match('(\w*?)_?(W[0-9]+)+' , el )   
        if match_attempt:
            el = match_attempt.groups()[0]
        y.append(el)
    # should we ditch identical repeats?
    # return "|".join(set(y)) NEEDS TO BE TESTED
    return "|".join(y)

               
def prune2(x):
    
    y= []
    for el in x.split("|"):
        # fgdfhfghg_5, fgdfhfghg_4, fgdfhfghg_3 -> fgdfhfghg
        # problem - indicator variables fgdfhfghg_99, fgdfhfghg_111 really are different!
        # solution - leave them distinct
        indicator_variable = re.match('(\w*?)_?(99|111)' , el )       
        match_attempt = re.match('(\w*?)_?[0-9]+' , el )   
        if (not indicator_variable) and (match_attempt):
            el = match_attempt.groups()[0]
        y.append(el)
    # should we ditch identical repeats?
    # return "|".join(set(y)) NEEDS TO BE TESTED
    return "|".join(y)


def hardcoded_fix(col,cat_list):
    
    var_type.loc[ col , "dtype" ]           = BES_Panel[col].dtype.name
    if (var_type.loc[ col , "dtype" ] == 'category'):
        var_type.loc[ col , "cat_all_strings" ] = np.all([isinstance(x,str) for x in BES_Panel[ col ].cat.categories])
        
    BES_Panel[col] = \
        BES_Panel[col].astype("category").cat.rename_categories( cat_list.split("|") )
        
    var_type.loc[ col , "dataset_specific_hardcoded_fix" ] = "|".join( BES_Panel[col].cat.categories.values )
    
# "Â–" -> "-"
# "Â£" -> "£"

# " â€“ " -> " "
# " Â‘" -> " "
# "Â’ " -> " "

# "Â‘" -> "'"
# "Â’" -> "'"
# "Â€Â™" -> "'"
# "â??" -> "'"
# "â€™" -> "'"    

# detect any matching pattern of weird Â stuff in cat1|cat2|cat3... string
# return the fixed version of string if present
# return None if not
def fix_a_hat_chars(cat_string):
    cat_array = cat_string.split("|")
    a_hat_present = False
    for el_no in range( 0, len(cat_array) ):
        el = cat_array[el_no]
        el = re.sub( "SiÃƒÂ¢n C. Jame|SiÃ¢n C. James|SiÃ¢n C. Jame|Siân C. James", "Sian C. James", el)
        el = re.sub( "ThÃ©rÃ¨se  Coff|Thérèse  Coffey", "Therese  Coffey", el)
        el = re.sub( "RA©union|RÃ©union|RAÂ©union|RÃƒÂ©union", "Reunion", el)
        el = re.sub( "\xa0Lower supervisory occupations", "Lower supervisory occupations", el)
        el = re.sub( "Don‘t know|Don?t know|Dona??t know|Dona€™t know|Donâ€™t know|Don’t know|Don‘t know|Don\x91t know|Don\x92t know|Dona\x80\x99t know|Do\x92t know","Don't know", el  )
        el = re.sub( "Â–|\x96|–", "-", el )
        el = re.sub( "Â£|\xc2£", "£", el )
        el = re.sub( "\xa0|\sâ€“\s|\s\xe2\x80\x93\s|\sÂ‘|Â’\s" , " ", el )
        el = re.sub( "Â‘|Â’|Â€Â™|â\?\?|\x80\x99|â€™|\xe2\x80\x99|â|â\x80\x99|\?\?|\x92|‘|\x91|’", "'", el )
        
        if el != cat_array[el_no]:
            a_hat_present = True
            cat_array[el_no] = el
            
    if a_hat_present:
        return cat_array
    else:
        return None
        
## typos - more directly useful for the BES!
# typos = set(['Do\x92t know', 'Dont know', 'Donât know', 'Don??t know','DonaÂ€Â™t know'])# ,

In [12]:
def create_var_list( variable_categories ):
    # load question_categories_correct (it could have been updated)
    # input: 
    # output:
    # var_cat_dict_pruned, var_cat_dict_pruned_2

    # flipping list
    var_cat_dict = dict()
    # range defined by types that exist in question_categories_correct.csv
    type_range = set(variable_categories["type"].values)

    for typ in type_range:

        e = variable_categories[variable_categories.type==typ]["column_name"].values
        var_cat_dict[typ] = [item for sublist in [i.split("|") for i in e] for item in sublist]
        var_cat_dict[typ] = [item for item in var_cat_dict[typ] if item not in ignore_list]

    # dictionary comprehension to prune column-names to wave non-specific stubs
    # list(set()) gets rid of repetitions
    var_cat_dict_pruned   = {k: list(set([prune(x)  for x in v])) for k, v in var_cat_dict.items()}
    var_cat_dict_pruned_2 = {k: list(set([prune2(x) for x in v])) for k, v in var_cat_dict_pruned.items()}
    
    return ( var_cat_dict_pruned , var_cat_dict_pruned_2 )

In [13]:
def careful_isnan(x):
    return ( (not isinstance(x,str)) and np.isnan(x) )

def careful_replace( col,replace_dict ):
    var_type.loc[col,"dtype"] = BES_Panel[col].dtype.name
    if (var_type.loc[ col , "dtype" ] == 'category'):
        var_type.loc[ col , "cat_all_strings" ] = np.all([isinstance(x,str) for x in BES_Panel[ col ].cat.categories])     
    
    BES_Panel[col] = BES_Panel[col]\
        .apply(lambda x: x if careful_isnan(x) else replace_dict[x] )\
        .astype('category').cat.set_categories( replace_dict.values() , ordered = True)
        
    var_type.loc[ col , "dataset_specific_hardcoded_fix" ] = "|".join( BES_Panel[col].cat.categories.values )

def careful_replace_and_set_cats( col, replace_dict, final_cats ):
    var_type.loc[col,"dtype"] = BES_Panel[col].dtype.name
    if (var_type.loc[ col , "dtype" ] == 'category'):
        var_type.loc[ col , "cat_all_strings" ] = np.all([isinstance(x,str) for x in BES_Panel[ col ].cat.categories])     
    
    BES_Panel[col] = BES_Panel[col]\
        .apply(lambda x: x if x not in replace_dict.keys() else replace_dict[x] )\
        .astype('category').cat.set_categories( final_cats , ordered = True)
        
    var_type.loc[ col , "dataset_specific_hardcoded_fix" ] = "|".join( BES_Panel[col].cat.categories.values )    
    

In [14]:
def hard_coded_fixes( dataset_name ):


    ## dataset specific issues
    # (i.e. probably what I should have done all along!)

    # "BES2017_W13_v1.0.dta"

    ## Should I make this *filename specific* or *wave specific*?
    ## Comes down to a question of whether it's safer to assume that things get fixed
    ## or that they probably won't get fixed


    # gor W3_only, W2_only (3->-4, category -> object)
    # # grr - some point BES switched from ONS codes to text names
    # # I feel like percolating the change backwards would have been a good idea
    # ONS codes available here:
    # http://webarchive.nationalarchives.gov.uk/20160128190831/http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html

    
    
    
    if ( dataset_name in ["W14_comb","W14_only"] ):
        col = "profile_turnout_2017"
        replace = dict(zip(BES_Panel["profile_turnout_2017"].value_counts().sort_index().index,
                           BES_Panel["profile_turnout_2015"].cat.categories))
        final_cats = BES_Panel["profile_turnout_2015"].cat.categories
        careful_replace_and_set_cats( col,  replace, final_cats )     
    
    if ( dataset_name in ["W14_comb","W14_only"] ):
        col = "profile_past_vote_2017"
        replace = dict(zip([1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,9999.0],
                           BES_Panel["profile_past_vote_2015"].cat.categories))
        final_cats = BES_Panel["profile_past_vote_2015"].cat.categories
        careful_replace_and_set_cats( col,  replace, final_cats )      
    
    if ( dataset_name in ["W14_comb"] ):
        col = 'pconW1'
        replace = {}
        ex_str = "Aldershot|Aldridge-Brownhills|Altrincham and Sale West|Amber Valley|Arundel and South Downs|Ashfield|Ashford|Ashton-under-Lyne|Aylesbury|Banbury|Barking|Barnsley Central|Barnsley East|Barrow and Furness|Basildon and Billericay|Basingstoke|Bassetlaw|Bath|Batley and Spen|Battersea|Beaconsfield|Beckenham|Bedford|Bermondsey and Old Southwark|Berwick-upon-Tweed|Bethnal Green and Bow|Beverley and Holderness|Bexhill and Battle|Bexleyheath and Crayford|Birkenhead|Birmingham, Edgbaston|Birmingham, Erdington|Birmingham, Hall Green|Birmingham, Hodge Hill|Birmingham, Ladywood|Birmingham, Northfield|Birmingham, Perry Barr|Birmingham, Selly Oak|Birmingham, Yardley|Bishop Auckland|Blackburn|Blackley and Broughton|Blackpool North and Cleveleys|Blackpool South|Blaydon|Blyth Valley|Bognor Regis and Littlehampton|Bolsover|Bolton North East|Bolton South East|Bolton West|Bootle|Boston and Skegness|Bosworth|Bournemouth East|Bournemouth West|Bracknell|Bradford East|Bradford South|Bradford West|Braintree|Brent Central|Brent North|Brentford and Isleworth|Brentwood and Ongar|Bridgwater and West Somerset|Brigg and Goole|Brighton, Kemptown|Brighton, Pavilion|Bristol East|Bristol North West|Bristol South|Bristol West|Broadland|Bromley and Chislehurst|Bromsgrove|Broxbourne|Broxtowe|Buckingham|Burnley|Burton|Bury North|Bury South|Bury St Edmunds|Calder Valley|Camberwell and Peckham|Camborne and Redruth|Cambridge|Cannock Chase|Canterbury|Carlisle|Carshalton and Wallington|Castle Point|Central Devon|Central Suffolk and North Ipswich|Charnwood|Chatham and Aylesford|Cheadle|Chelmsford|Chelsea and Fulham|Cheltenham|Chesham and Amersham|Chesterfield|Chichester|Chingford and Woodford Green|Chippenham|Chipping Barnet|Chorley|Christchurch|Cities of London and Westminster|City of Chester|City of Durham|Clacton|Cleethorpes|Colchester|Colne Valley|Congleton|Copeland|Corby|Coventry North East|Coventry North West|Coventry South|Crawley|Crewe and Nantwich|Croydon Central|Croydon North|Croydon South|Dagenham and Rainham|Darlington|Dartford|Daventry|Denton and Reddish|Derby North|Derby South|Derbyshire Dales|Devizes|Dewsbury|Don Valley|Doncaster Central|Doncaster North|Dover|Dudley North|Dudley South|Dulwich and West Norwood|Ealing Central and Acton|Ealing North|Ealing, Southall|Easington|East Devon|East Ham|East Hampshire|East Surrey|East Worthing and Shoreham|East Yorkshire|Eastbourne|Eastleigh|Eddisbury|Edmonton|Ellesmere Port and Neston|Elmet and Rothwell|Eltham|Enfield North|Enfield, Southgate|Epping Forest|Epsom and Ewell|Erewash|Erith and Thamesmead|Esher and Walton|Exeter|Fareham|Faversham and Mid Kent|Feltham and Heston|Filton and Bradley Stoke|Finchley and Golders Green|Folkestone and Hythe|Forest of Dean|Fylde|Gainsborough|Garston and Halewood|Gateshead|Gedling|Gillingham and Rainham|Gloucester|Gosport|Grantham and Stamford|Gravesham|Great Grimsby|Great Yarmouth|Greenwich and Woolwich|Guildford|Hackney North and Stoke Newington|Hackney South and Shoreditch|Halesowen and Rowley Regis|Halifax|Haltemprice and Howden|Halton|Hammersmith|Hampstead and Kilburn|Harborough|Harlow|Harrogate and Knaresborough|Harrow East|Harrow West|Hartlepool|Harwich and North Essex|Hastings and Rye|Havant|Hayes and Harlington|Hazel Grove|Hemel Hempstead|Hemsworth|Hendon|Henley|Hereford and South Herefordshire|Hertford and Stortford|Hertsmere|Hexham|Heywood and Middleton|High Peak|Hitchin and Harpenden|Holborn and St Pancras|Hornchurch and Upminster|Hornsey and Wood Green|Horsham|Houghton and Sunderland South|Hove|Huddersfield|Huntingdon|Hyndburn|Ilford North|Ilford South|Ipswich|Isle of Wight|Islington North|Islington South and Finsbury|Jarrow|Keighley|Kenilworth and Southam|Kensington|Kettering|Kingston and Surbiton|Kingston upon Hull East|Kingston upon Hull North|Kingston upon Hull West and Hessle|Kingswood|Knowsley|Lancaster and Fleetwood|Leeds Central|Leeds East|Leeds North East|Leeds North West|Leeds West|Leicester East|Leicester South|Leicester West|Leigh|Lewes|Lewisham East|Lewisham West and Penge|Lewisham, Deptford|Leyton and Wanstead|Lichfield|Lincoln|Liverpool, Riverside|Liverpool, Walton|Liverpool, Wavertree|Liverpool, West Derby|Loughborough|Louth and Horncastle|Ludlow|Luton North|Luton South|Macclesfield|Maidenhead|Maidstone and The Weald|Makerfield|Maldon|Manchester Central|Manchester, Gorton|Manchester, Withington|Mansfield|Meon Valley|Meriden|Mid Bedfordshire|Mid Derbyshire|Mid Dorset and North Poole|Mid Norfolk|Mid Sussex|Mid Worcestershire|Middlesbrough|Middlesbrough South and East Cleveland|Milton Keynes North|Milton Keynes South|Mitcham and Morden|Mole Valley|Morecambe and Lunesdale|Morley and Outwood|New Forest East|New Forest West|Newark|Newbury|Newcastle upon Tyne Central|Newcastle upon Tyne East|Newcastle upon Tyne North|Newcastle-under-Lyme|Newton Abbot|Normanton, Pontefract and Castleford|North Cornwall|North Devon|North Dorset|North Durham|North East Bedfordshire|North East Cambridgeshire|North East Derbyshire|North East Hampshire|North East Hertfordshire|North East Somerset|North Herefordshire|North Norfolk|North Shropshire|North Somerset|North Swindon|North Thanet|North Tyneside|North Warwickshire|North West Cambridgeshire|North West Durham|North West Hampshire|North West Leicestershire|North West Norfolk|North Wiltshire|Northampton North|Northampton South|Norwich North|Norwich South|Nottingham East|Nottingham North|Nottingham South|Nuneaton|Old Bexley and Sidcup|Oldham East and Saddleworth|Oldham West and Royton|Orpington|Oxford East|Oxford West and Abingdon|Pendle|Penistone and Stocksbridge|Penrith and The Border|Peterborough|Plymouth, Moor View|Plymouth, Sutton and Devonport|Poole|Poplar and Limehouse|Portsmouth North|Portsmouth South|Preston|Pudsey|Putney|Rayleigh and Wickford|Reading East|Reading West|Redcar|Redditch|Reigate|Ribble Valley|Richmond (Yorks)|Richmond Park|Rochdale|Rochester and Strood|Rochford and Southend East|Romford|Romsey and Southampton North|Rossendale and Darwen|Rother Valley|Rotherham|Rugby|Ruislip, Northwood and Pinner|Runnymede and Weybridge|Rushcliffe|Rutland and Melton|Saffron Walden|Salford and Eccles|Salisbury|Scarborough and Whitby|Scunthorpe|Sedgefield|Sefton Central|Selby and Ainsty|Sevenoaks|Sheffield Central|Sheffield South East|Sheffield, Brightside and Hillsborough|Sheffield, Hallam|Sheffield, Heeley|Sherwood|Shipley|Shrewsbury and Atcham|Sittingbourne and Sheppey|Skipton and Ripon|Sleaford and North Hykeham|Slough|Solihull|Somerton and Frome|South Basildon and East Thurrock|South Cambridgeshire|South Derbyshire|South Dorset|South East Cambridgeshire|South East Cornwall|South Holland and The Deepings|South Leicestershire|South Norfolk|South Northamptonshire|South Ribble|South Shields|South Staffordshire|South Suffolk|South Swindon|South Thanet|South West Bedfordshire|South West Devon|South West Hertfordshire|South West Norfolk|South West Surrey|South West Wiltshire|Southampton, Itchen|Southampton, Test|Southend West|Southport|Spelthorne|St Albans|St Austell and Newquay|St Helens North|St Helens South and Whiston|St Ives|Stafford|Staffordshire Moorlands|Stalybridge and Hyde|Stevenage|Stockport|Stockton North|Stockton South|Stoke-on-Trent Central|Stoke-on-Trent North|Stoke-on-Trent South|Stone|Stourbridge|Stratford-on-Avon|Streatham|Stretford and Urmston|Stroud|Suffolk Coastal|Sunderland Central|Surrey Heath|Sutton and Cheam|Sutton Coldfield|Tamworth|Tatton|Taunton Deane|Telford|Tewkesbury|The Cotswolds|The Wrekin|Thirsk and Malton|Thornbury and Yate|Thurrock|Tiverton and Honiton|Tonbridge and Malling|Tooting|Torbay|Torridge and West Devon|Totnes|Tottenham|Truro and Falmouth|Tunbridge Wells|Twickenham|Tynemouth|Uxbridge and South Ruislip|Vauxhall|Wakefield|Wallasey|Walsall North|Walsall South|Walthamstow|Wansbeck|Wantage|Warley|Warrington North|Warrington South|Warwick and Leamington|Washington and Sunderland West|Watford|Waveney|Wealden|Weaver Vale|Wellingborough|Wells|Welwyn Hatfield|Wentworth and Dearne|West Bromwich East|West Bromwich West|West Dorset|West Ham|West Lancashire|West Suffolk|West Worcestershire|Westminster North|Westmorland and Lonsdale|Weston-Super-Mare|Wigan|Wimbledon|Winchester|Windsor|Wirral South|Wirral West|Witham|Witney|Woking|Wokingham|Wolverhampton North East|Wolverhampton South East|Wolverhampton South West|Worcester|Workington|Worsley and Eccles South|Worthing West|Wycombe|Wyre and Preston North|Wyre Forest|Wythenshawe and Sale East|Yeovil|York Central|York Outer|Ynys Mon|Delyn|Alyn and Deeside|Wrexham|Llanelli|Gower|Swansea West|Swansea East|Aberavon|Cardiff Central|Cardiff North|Rhondda|Torfaen|Monmouth|Newport East|Newport West|Arfon|Aberconwy|Clwyd West|Vale of Clwyd|Dwyfor Meirionnydd|Clwyd South|Montgomeryshire|Ceredigion|Preseli Pembrokeshire|Carmarthen West and South Pembrokeshire|Carmarthen East and Dinefwr|Brecon and Radnorshire|Neath|Cynon Valley|Merthyr Tydfil and Rhymney|Blaenau Gwent|Bridgend|Ogmore|Pontypridd|Caerphilly|Islwyn|Vale of Glamorgan|Cardiff West|Cardiff South and Penarth|Aberdeen North|Aberdeen South|Airdrie and Shotts|Angus|Argyll and Bute|Ayr, Carrick and Cumnock|Banff and Buchan|Berwickshire, Roxburgh and Selkirk|Caithness, Sutherland and Easter Ross|Central Ayrshire|Coatbridge, Chryston and Bellshill|Cumbernauld, Kilsyth and Kirkintilloch East|Dumfries and Galloway|Dumfriesshire, Clydesdale and Tweeddale|Dundee East|Dundee West|Dunfermline and West Fife|East Dunbartonshire|East Kilbride, Strathaven and Lesmahagow|East Lothian|East Renfrewshire|Edinburgh East|Edinburgh North and Leith|Edinburgh South|Edinburgh South West|Edinburgh West|Falkirk|Glasgow Central|Glasgow East|Glasgow North|Glasgow North East|Glasgow North West|Glasgow South|Glasgow South West|Glenrothes|Gordon|Inverclyde|Inverness, Nairn, Badenoch and Strathspey|Kilmarnock and Loudoun|Kirkcaldy and Cowdenbeath|Lanark and Hamilton East|Linlithgow and East Falkirk|Livingston|Midlothian|Moray|Motherwell and Wishaw|Na h-Eileanan an Iar|North Ayrshire and Arran|North East Fife|Ochil and South Perthshire|Orkney and Shetland|Paisley and Renfrewshire North|Paisley and Renfrewshire South|Perth and North Perthshire|Ross, Skye and Lochaber|Rutherglen and Hamilton West|Stirling|West Aberdeenshire and Kincardine|West Dunbartonshire|South Down"
        final_cats = ex_str.split("|")
        careful_replace_and_set_cats( col,  replace, final_cats )      

    if ( dataset_name in ["W14_comb","W14_only","W15_only","W15_comb"] ):    
        col = 'profile_pcon'

        cats = list(BES_Panel['profile_pcon'].cat.categories)
        replace = {cats[x-1]:int(x) for x in range(1,8)}
        cats[0:7] = [int(x) for x in range(1,8)]
        final_cats = [str(x) for x in cats if float(x) <=650]
        careful_replace_and_set_cats( col,  replace, final_cats ) 
    

    # variable name collision (BES 'disability' (wave 6 variable) and yougov profile 'disability)
    if ("disability" in BES_Panel.columns) and (dataset_name != "W6_only"):
        BES_Panel.rename(columns={"disability":"profile_disability"}, inplace=True)
    # similar collision 
#     if ("housing" in BES_Panel.columns) and (dataset_name == "W13_only"):
#         BES_Panel.rename(columns={"housing":"profile_house_tenure"}, inplace=True)  

    # whole column is NaN!
    col = "profile_socialgrade_cie"
    if (col in BES_Panel.columns) and (dataset_name in [ "W6_only", "W4_only", "W3_only", "W2_only", "W1_only" ]):
        var_type.loc[col,"type"] = -2 # set to ignore

    # whole column is NaN!
    col = 'discussPolDays'
    if (col in BES_Panel.columns) and (dataset_name in [ "W3_only" ]):
        var_type.loc[col,"type"] = -2 # set to ignore
        
    # whole column is NaN!
    col = 'partyContactSNP'
    if (col in BES_Panel.columns) and (dataset_name in [ "W2_only","W1_only" ]):
        var_type.loc[col,"type"] = -2 # set to ignore        

        
        
    # now we have actual categories that don't match different versions *of that exact same variable*
    # and can't even be attributed to weasel terms (e.g. 99 -> Don't know, 98 -> Other)
    # so, I'll try just replacing them with NaNs
    
    if ( dataset_name in ["W13_comb"] ):
        col = 'scotRefVoteW4_W13'
        replace = {99.0:"Don't know",111.0:"Don't know"}
        final_cats = ['Not at all certain', 'Somewhat certain', 'Very certain', "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )        
    

    if ( dataset_name in ["W13_comb","W10_comb"] ):
        col = "profile_turnout_2015"
        
        replace = {}
        final_cats = ['No, did not vote',
                      'Yes, voted',
                      "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )


    if ( dataset_name in ["W6_comb"] ):
        col = "zeroHourContractW6"
        
        replace = {}
        final_cats = ['Should definitely be illegal',
                     'Should probably be illegal',
                     'Should probably be legal',
                     'Should definitely be legal',
                     "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )

    if ( dataset_name in ["W6_comb","W5_comb","W4_comb","W3_comb"] ):
        col = "certaintyEUGreenW2"
        
        replace = {99.0:"Don't know"}
        final_cats = ['Not at all certain', 'Somewhat certain', 'Very certain', "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )
        
    if ( dataset_name in ["W6_comb"] ):
        col = "certaintyEUGreenW4"
        
        replace = {99.0:"Don't know"}
        final_cats = ['Not at all certain', 'Somewhat certain', 'Very certain', "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )
        
        col = "certaintyEUGreenW6"
        
        replace = {99.0:"Don't know"}
        final_cats = ['Not at all certain', 'Somewhat certain', 'Very certain', "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )        

        
    if ( dataset_name in ["W10_only"] ):
        col = "econPersonalProsp"
        
        replace = {}
        final_cats = ['Get a lot worse',
                     'Get a little worse',
                     'Stay the same',
                     'Get a little better',
                     'Get a lot better',
                     "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )        

    if ( dataset_name in ["W13_comb","W10_comb"] ):
        col = "econPersonalProspW10"
        
        replace = {}
        final_cats = ['Get a lot worse',
                     'Get a little worse',
                     'Stay the same',
                     'Get a little better',
                     'Get a lot better',
                     "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )

    if ( dataset_name in ["W6_comb","W5_only"] ):
        col = "noDependentsInHousehold"
        
        replace = {}
        final_cats = ['No',
                     'Yes']
        careful_replace_and_set_cats( col,  replace, final_cats )            
        
        
    if ( dataset_name in ["W2_only"] ):
        col = "gor"

        ons_gor_dict = {"E12000001":"North East",
                        "E12000002":"North West",
                        "E12000003":"Yorkshire and The Humber",
                        "E12000004":"East Midlands",
                        "E12000005":"West Midlands",
                        "E12000006":"East of England",
                        "E12000007":"London",
                        "E12000008":"South East",
                        "E12000009":"South West",
                        "N99999999":"Northern Ireland",
                        "S99999999":"Scotland",
                        "W99999999":"Wales",
                        "":"Non UK & Invalid"}

        careful_replace(  col , ons_gor_dict )


    if ( dataset_name in ["W3_comb","W4_comb","W5_comb"] ):
        col = "mapNamesW3"

        BES_Panel[col] = \
            BES_Panel[col].astype('float64')
        var_type.loc[col,"dtype"] = BES_Panel[col].dtype.name        
        var_type.loc[ col , "dataset_specific_hardcoded_fix" ] = list(BES_Panel[col].unique())
    

    if ( dataset_name in ["W12_only","W11_only","W3_only","W2_only","W1_only"] ):
        partyContact = {1.0:"No",
                        2.0:"Yes",
                        9999.0:"Don't know"}
        col = "partyContactGrn"
        careful_replace( col , {el:el for el in partyContact.values()} )     

#                    'Own - outright|Own - with a mortgage|Own (part-own) - through shared ownership scheme (i.e. pay part mortgage, part rent)|Rent - from a private landlord|Rent - from my local authority|Rent - from a housing association|Neither - I live with my parents, family or friends but pay some rent to them|Neither - I live rent-free with my parents, family or friends|Other|9999':
#                        [ 'Own outright',
#                          'Own with a mortgage',
#                          'Own (part-own) through shared ownership scheme (i.e. pay part mortgage, part rent)',
#                          'Rent from a private landlord',
#                          'Rent from my local authority',
#                          'Rent from a housing association',
#                          'Neither I live with my parents, family or friends but pay some rent to them',
#                          'Neither I live rent-free with my parents, family or friends',
#                          'Other',
#                          '9999'], #profile_house_tenureW11|profile_house_tenureW12|profile_house_tenureW13
        
# housing	W13_only	W6_comb	category	3	housing	Own the leasehold/freehold outright|Buying leasehold/freehold on a mortgage|Rented from local authority|Rented from private landlord|It belongs to a Housing Association	Own - outright|Own - with a mortgage|Own (part-own) - through shared ownership scheme (i.e. pay part mortgage, part rent)|Rent - from a private landlord|Rent - from my local authority|Neither - I live with my parents, family or friends but pay some rent to them|Neither - I live rent-free with my parents, family or friends

        
    if ( dataset_name in ["W6_comb"] ):
        housing_replace = {'Own \x96 outright': 'Own outright',
                         'Own \x96 with a mortgage': 'Own with a mortgage',
                         'Own (part-own) \x96 through shared ownership scheme (i.e. pay part mortgage, part rent)': 'Own (part-own) through shared ownership scheme (i.e. pay part mortgage, part rent)',
                         'Rent \x96 from a private landlord': 'Rent from a private landlord',
                         'Rent \x96 from my local authority': 'Rent from my local authority',
                         'Neither \x96 I live with my parents, family or friends but pay some rent to them': 'Neither I live with my parents, family or friends but pay some rent to them',
                         'Neither \x96 I live rent-free with my parents, family or friends': 'Neither I live rent-free with my parents, family or friends',
                         'Other':'Other',
                         '9999':'Rent from a housing association'}
        
        housing_final_cats = ['Own outright',
                         'Own with a mortgage',
                         'Own (part-own) through shared ownership scheme (i.e. pay part mortgage, part rent)',
                         'Rent from a private landlord',
                         'Rent from my local authority',
                         'Rent from a housing association',
                         'Neither I live with my parents, family or friends but pay some rent to them',
                         'Neither I live rent-free with my parents, family or friends',
                         'Other']
        
        col = "housing" 
        careful_replace_and_set_cats( col,  housing_replace, housing_final_cats )
        
# None/ No leader|David Cameron|Ed Miliband|Nick Clegg|Nicola Sturgeon|Leanne Wood|Nigel Farage|Natalie Bennett|222.0|Don't know
# None/ No leader|David Cameron|Ed Miliband|Nick Clegg|Nicola Sturgeon|Leanne Wood|Nigel Farage|Natalie Bennett|222|Don't know
# bestLeaderCampaign	W6_only
# worstLeaderCampaign	W6_only
        
        

    BestWorstLeader_replace = {"None/ No leader":"None/No leader",
                               10.0:"All leaders equally bad",
                               222.0:"All leaders equally bad",
                               222:"All leaders equally bad"}
    BestWorstLeader_final_cats = ["None/No leader","David Cameron","Ed Miliband","Nick Clegg","Nicola Sturgeon",
                                  "Leanne Wood","Nigel Farage","Natalie Bennett","All leaders equally bad"]
    # run on all datasets - wait - only ones in which it exists
    
#     if ( dataset_name in ["W6_comb","W5_comb"] ):
    col = "bestLeaderCampaignW5"
    if ( col in BES_Panel.columns ):
        careful_replace_and_set_cats( col,  BestWorstLeader_replace, BestWorstLeader_final_cats )
    col = "worstLeaderCampaignW5"        
    if ( col in BES_Panel.columns ):  
        careful_replace_and_set_cats( col, BestWorstLeader_replace, BestWorstLeader_final_cats )

#     if ( dataset_name in ["W5_only","W6_only"] ):
    col = "bestLeaderCampaign"
    if ( col in BES_Panel.columns ):
        careful_replace_and_set_cats( col,  BestWorstLeader_replace, BestWorstLeader_final_cats )
        
    col = "worstLeaderCampaign"
    if ( col in BES_Panel.columns ):
        careful_replace_and_set_cats( col,  BestWorstLeader_replace, BestWorstLeader_final_cats )        
      

    scotReferendumIntention_replace = {'Scotland should become an independent country':"Will vote 'Yes'",
                                       111.0:'Will vote no',
                                       99.0:"Don't know",
                                       2.0:"Will not vote",}
    scotReferendumIntention_final_cats = ['Will vote no', "Will vote 'Yes'", 'Will not vote', "Don't know"]
        
    if ( dataset_name in ["W6_comb"] ):
        careful_replace_and_set_cats( "scotReferendumIntentionW6",  scotReferendumIntention_replace, scotReferendumIntention_final_cats )

        
    
    Religion = {'No, I do not regard myself as belonging to any particular religion.': 'No, I do not regard myself as belonging to any particular religion.',
         'Yes - Church of England/Anglican/Episcopal': 'Yes - Church of England/Anglican/Episcopal',
         'Yes - Roman Catholic': 'Yes - Roman Catholic',
         'Yes - Presbyterian/Church of Scotland': 'Yes - Presbyterian/Church of Scotland',
         'Yes - Methodist': 'Yes - Methodist',
         'Yes - Baptist': 'Yes - Baptist',
         'Yes - United Reformed Church': 'Yes - United Reformed Church',
         'Yes - Free Presbyterian': 'Yes - Free Presbyterian',
         'Yes - Brethren': 'Yes - Brethren',
         'Yes - Judaism': 'Yes - Judaism',
         'Yes - Hinduism': 'Yes - Hinduism',
         'Yes - Islam': 'Yes - Islam',
         'Yes - Sikhism': 'Yes - Sikhism',
         'Yes - Buddhism': 'Yes - Buddhism',
         'Yes - Other': 'Yes - Other',
         16.0: 'Prefer not to say',
         17.0: 'Yes - Orthodox Christian',
         18.0: 'Yes - Pentecostal',
         19.0: 'Yes - Evangelical /independent/non-denominational'}

    
    if ( dataset_name in ["W6_comb","W5_comb","W5_only","W4_comb","W3_comb"] ):

        col = "profile_religion"
        careful_replace( col , Religion )        

    if ( dataset_name in ["W1_only"] ):

        col = "profile_religion"
        careful_replace( col , {el:el for el in Religion.values()} )            
        
    if ( dataset_name in ["W7_only"] ):
        col = "ns_sec"
        ns_sec = "Employers in large establishments|Higher managerial and administrative occupations|L3.1 Traditional employees|L3.2 New employees|L3.3 Traditional self-employed|L3.4 New self-employed|L4.1 Traditional employees|L4.2 New employees|L4.3 Traditional self-employed|L4.4 New self-employed|Lower managerial and administrative occupations|Higher supervisory occupations|L7.1 Intermediate clerical and administrative occupations|L7.2 Intermediate sales and service occupations|L7.3 Intermediate technical and auxiliary occupations|L7.4 Intermediate engineering occupations|L8.1 Employers in small establishments in industry, commerce, services etc.|L8.2 Employers in small establishments in agriculture|L9.1 Own account workers (non-professional)|L9.2 Own account workers (agriculture)|Lower supervisory occupations|L11.1 Lower technical craft occupations|L11.2 Lower technical process operative occupations|L12.1 Semi-routine sales occupations|L12.2 Semi-routine service occupations|L12.3 Semi-routine technical occupations|L12.4 Semi-routine operative occupations|L12.5 Semi-routine agricultural occupations|L12.6 Semi-routine clerical occupations|L12.7 Semi routine childcare occupations|L13.1 Routine sales and service occupations|L13.2 Routine production occupations|L13.3 Routine technical occupations|L13.4 Routine operative occupations|L13.5 Routine agricultural occupations"
        
        careful_replace( col , {el:el for el in ns_sec.split("|")} )
#         BES_Panel[col].cat.set_categories(ns_sec.split("|"),inplace=True)
        
        
    if ( dataset_name in ["W1_only"] ):
        ageGroup = {1.0:"Under 18",
                    2.0:"18-25",
                    3.0:"26-35",
                    4.0:"36-45",
                    5.0:"46-55",
                    6.0:"56-65",
                    7.0:"66+"}
        col = "ageGroup"
        careful_replace( col , {el:el for el in ageGroup.values()})      
        
        
    if ( dataset_name in [ "W13_comb" , "W11_only" ] ):
        
        # None|Church of England/Anglican/Episcopal|Roman Catholic|Presbyterian/Church of Scotland|Methodist|Baptist
        # A|B|C1|C2|D|E|Refused|Unknown
        # DOUBLE CHECK DISTRIBUTION
        SocialGrades = {"None":"A",
                        "Church of England/Anglican/Episcopal":"B",
                        "Roman Catholic":"C1",
                        "Presbyterian/Church of Scotland":"C2",
                        "Methodist":"D",
                        "Baptist":"E",
                        "<placeholder1>":"Refused",
                        "<placeholder2>":"Unknown"}
        col = "profile_socialgrade_cie"        
        careful_replace( col , SocialGrades )
        
    NumEmployees = {1.0:"1 to 24 employees",
                    2.0:"25 to 499 employees",
                    3.0:"500 or more employees",
                    9999.0:"Don't know"}

    if ( dataset_name in ["W1_only","W2_only","W3_only","W4_only","W11_only","W12_only","W13_only","W13_comb", "W10_only"] ):
        # necessary because motherNumEmployees lacks some categories!

        col = "fatherNumEmployees"
        careful_replace( col , NumEmployees )

        col = "motherNumEmployees"
        careful_replace( col , NumEmployees )
        
    if ( dataset_name in ["W14_comb", "W14_only"] ):
        
        col = "fatherNumEmployees"
        careful_replace( col , NumEmployees )        
        
        col = "motherNumEmployees"
        careful_replace( col , NumEmployees )        
        
    if ( dataset_name in ["W9_only"] ):        
        
        col = "motherNumEmployees"
        careful_replace( col , {el:el for el in NumEmployees.values()} )        

    if ( dataset_name in ["W6_comb"] ):
        # not entirely necessary to implement it this way, it's just a bit clearer

        churchAttendance = {111.0:"Never or practically never",
                            "Less often than once a year":"Less often than once a year",
                            "Less often but at least once a year":"Less often but at least once a year",
                            "Less often but at least twice a year":"Less often but at least twice a year",
                            "Less often but at least once a month":"Less often but at least once a month",
                            "Less often but at least once in two weeks":"Less often but at least once in two weeks",
                            "Once a week or more":"Once a week or more",
                            222.0:"Varies too much to say",
                            98.0:"I am not religious",
                            99.0:"Don't know"}

        col = "churchAttendanceW6"
        careful_replace( col , churchAttendance )


        partyMember =      {0.0:"No, I have never been a member",
                            "I am not a member now but I used to be":"I am not a member now but I used to be",
                            "Yes, I am a member of a party":"Yes, I am a member of a party",
                            9999.0:"Don't know"}

        col = "partyMemberW6"
        careful_replace( col , partyMember )       


    headHouseholdPast_cat_list = "My father|My mother|Someone else|No one in my house worked|Don't know"
    if ( dataset_name in [ "W3_only","W4_only","W11_only","W12_only","W13_only", "W13_comb","W10_only","W14_comb","W14_only" ] ):
        hardcoded_fix("headHouseholdPast",
                      headHouseholdPast_cat_list)

    generalElectionCertainty_cat_list = "Not at all certain|2|3|4|5|6|Completely certain|Don't know"
    if ( dataset_name in ["W4_comb","W5_comb"] ):
        # array of floats, should be a categorical
        hardcoded_fix("generalElectionCertaintyW1",
                      generalElectionCertainty_cat_list)
        hardcoded_fix("generalElectionCertaintyW2",
                      generalElectionCertainty_cat_list)

    if ( dataset_name in ["W5_comb"] ):
        # array of floats, should be a categorical
        hardcoded_fix("generalElectionCertaintyW3",
                      generalElectionCertainty_cat_list)        


    scotReferendumIntention_cat_list = "Will vote no|Will vote 'Yes'|Will not vote|Don't know"
    if ( dataset_name in ["W4_comb","W5_comb","W6_comb"] ):
        # array of floats, should be a categorical  
        hardcoded_fix("scotReferendumIntentionW4",
                      scotReferendumIntention_cat_list)  

    selfNumEmployees_cat_list = "1 to 24 employees|25 to 499 employees|500 or more employees|Don't know"
#     selfNumEmployeesW6_W12, selfNumEmployeesLastW6_W12
    if ( dataset_name in [ 'W13_comb' ] ):
        hardcoded_fix("selfNumEmployeesW6_W12",
                      selfNumEmployees_cat_list )
        hardcoded_fix("selfNumEmployeesLastW6_W12",
                      selfNumEmployees_cat_list )    

    if ( dataset_name in [ 'W12_only' ] ):
        hardcoded_fix("selfNumEmployeesW6_",
                      selfNumEmployees_cat_list )
        hardcoded_fix("selfNumEmployeesLastW6_",
                      selfNumEmployees_cat_list )          
    
    if ( dataset_name in [ "W7_comb" ] ):  
        hardcoded_fix("selfNumEmployeesW6W7",
                      selfNumEmployees_cat_list )           
        hardcoded_fix("selfNumEmployeesLastW6W7",
                      selfNumEmployees_cat_list )          

    if ( dataset_name in [ "W8_comb" ] ):
        hardcoded_fix("selfNumEmployeesW6W7W8",
                      selfNumEmployees_cat_list )           
        hardcoded_fix("selfNumEmployeesLastW6W7W8",
                      selfNumEmployees_cat_list )  

    if ( dataset_name in [ "W10_comb", "W9_comb", "W9_only" ] ): #"W13_comb", 
        hardcoded_fix("selfNumEmployeesW6W7W8W9",
                      selfNumEmployees_cat_list )
        
    if ( dataset_name in [ "W10_comb", "W9_comb", "W9_only" ] ): #"W13_comb",         
        hardcoded_fix("selfNumEmployeesLastW6W7W8W9",
                      selfNumEmployees_cat_list )
    
    achieveReduceImmigW15_cat_list = "No|Yes"
    if ( dataset_name in [ "W15_only" ] ): #achieveReduceImmigTIG|achieveReduceImmigBrexit     
        hardcoded_fix("achieveReduceImmigTIG",
                      achieveReduceImmigW15_cat_list )    
    if ( dataset_name in [ "W15_only" ] ): #achieveReduceImmigTIG|achieveReduceImmigBrexit     
        hardcoded_fix("achieveReduceImmigBrexit",
                      achieveReduceImmigW15_cat_list )           
        
#     if ( dataset_name in [ "W12_only","W11_only","W10_only","W13_comb" ] ):
        
# #         careful_replace( "selfNumEmployees" , {el:el for el in NumEmployees.values()} )  
# #         careful_replace( "selfNumEmployeesLast" , {el:el for el in NumEmployees.values()} )
        
#         careful_replace_and_set_cats( "selfNumEmployees", {}, NumEmployees.values() )
#         careful_replace_and_set_cats( "selfNumEmployeesLast", {}, NumEmployees.values() )        


    #    "knowf2f2","knowf2f3", #  floats (0.0, 1.0, 99.0)  that should be categories True|False|Don't know
    knowf2_cat_list = "True|False|Don't know"
    if ( dataset_name in ["W12_only"]):
        hardcoded_fix("knowf2f2",
                      knowf2_cat_list )            
        hardcoded_fix("knowf2f3",
                      knowf2_cat_list )  

    if ( dataset_name in [ "W13_comb" ] ):  
        hardcoded_fix("knowf2f2W12",
                      knowf2_cat_list )             
        hardcoded_fix("knowf2f3W12",
                      knowf2_cat_list )

    likeSalmond_list = "Strongly dislike|1.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|Strongly like|Don't know"
    if ( dataset_name in [ "W4_comb","W4_comb","W5_comb" ] ):
        hardcoded_fix("likeSalmondW1",
                      likeSalmond_list )   
        hardcoded_fix("likeSalmondW2",
                      likeSalmond_list )
        hardcoded_fix("likeSalmondW3",
                      likeSalmond_list )

    eesEUIntegration_list = "Unification has already gone too far|1.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|Unification should be pushed further|Don't know"    
    if ( dataset_name in [ "W3_comb","W4_comb","W4_comb","W5_comb" ] ):
        hardcoded_fix("eesEUIntegrationGreenW2",
                      eesEUIntegration_list )    

    likeSturgeon_list = "Strongly dislike|1.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|Strongly like|Don't know"    
    if ( dataset_name in [ "W4_comb","W4_comb","W5_comb" ] ):
        hardcoded_fix("likeSturgeonW4",
                      likeSturgeon_list )

    # W5_comb
    # No|Yes	Got a lot worse|Got a little worse
    # partyContactDKW5, participation_1-6W5, sharedContentOnline_1-5W5, participation_99W5
    participation_list = "No|Yes"    
    if ( dataset_name in [ "W5_comb" ] ):
        hardcoded_fix("partyContactDKW5",
                      participation_list )    
        hardcoded_fix("participation_1W5",
                      participation_list )   
        hardcoded_fix("participation_2W5",
                      participation_list )   
        hardcoded_fix("participation_3W5",
                      participation_list )   
        hardcoded_fix("participation_4W5",
                      participation_list )   
        hardcoded_fix("participation_5W5",
                      participation_list )   
        hardcoded_fix("participation_6W5",
                      participation_list )   
        hardcoded_fix("sharedContentOnline_1W5",
                      participation_list )  
        hardcoded_fix("sharedContentOnline_2W5",
                      participation_list )      
        hardcoded_fix("sharedContentOnline_3W5",
                      participation_list )      
        hardcoded_fix("sharedContentOnline_4W5",
                      participation_list )  
        hardcoded_fix("sharedContentOnline_5W5",
                      participation_list )      
        hardcoded_fix("participation_99W5",
                      participation_list ) 
        
    if ( dataset_name in [ "W15_comb" ] ):        
        hardcoded_fix("profile_education_levelW10",
                      "No formal qualifications|Youth training certificate/skillseekers|Recognised trade apprenticeship completed|Clerical and commercial|City and Guild certificate|City and Guild certificate - advanced|ONC|CSE grades 2-5|CSE grade 1, GCE O level, GCSE, School Certificate|Scottish Ordinary/ Lower Certificate|GCE A level or Higher Certificate|Scottish Higher Certificate|Nursing qualification (eg SEN, SRN, SCM, RGN)|Teaching qualification (not degree)|University diploma|University or CNAA first degree (eg BA, B.Sc, B.Ed)|University or CNAA higher degree (eg M.Sc, Ph.D)|Other technical, professional or higher qualification|Don't know|Prefer not to say" )         
        
        hardcoded_fix("jobzoneW1W2W3W4W5",
              "occupations that need little or no preparation|occupations that need some preparation|occupations that need medium preparation|occupations that need considerable preparation|occupations that need extensive preparation" )
        
        hardcoded_fix("jobzoneW6W7W8W9",
              "occupations that need little or no preparation|occupations that need some preparation|occupations that need medium preparation|occupations that need considerable preparation|occupations that need extensive preparation" )        
                
        hardcoded_fix("scotReferendumOutsideW1",
              "Scotland should remain part of the UK|Scotland should become an independent country|Don't know" )         

        hardcoded_fix("scotReferendumOutsideW2",
              "Scotland should remain part of the UK|Scotland should become an independent country|Don't know" ) 
        
        hardcoded_fix("discussantturnoutName1W2",
              "No, didn't vote|Yes, they voted|Don't know" ) 
        
        hardcoded_fix("discussantturnoutName2W2",
              "No, didn't vote|Yes, they voted|Don't know" )         

        hardcoded_fix("discussantturnoutName3W2",
              "No, didn't vote|Yes, they voted|Don't know" )         
        
        discussantturnout_cats = {0.0:"No, didn't vote",
                    1.0:"Yes, they voted",
                    9997.0:"Other", # not super sure about this, but patience low!
                    9998.0:"They were not eligible to vote",
                    9999.0:"Don't know"}       

        col = "discussantturnoutName1W4"
        careful_replace( col , discussantturnout_cats )
        
        col = "discussantturnoutName2W4"
        careful_replace( col , discussantturnout_cats )
        
        col = "discussantturnoutName3W4"
        careful_replace( col , discussantturnout_cats )
        
        col = "discussantturnoutName1W12"
        careful_replace( col , discussantturnout_cats )
        
        col = "discussantturnoutName2W12"
        careful_replace( col , discussantturnout_cats )
        
        col = "discussantturnoutName3W12"
        careful_replace( col , discussantturnout_cats )  
        
    if (dataset_name in ["W16_comb"] ):
        hardcoded_fix("discussantturnoutName1W2",
              "No, didn't vote|Yes, they voted|Don't know" )
        hardcoded_fix("discussantturnoutName2W2",
              "No, didn't vote|Yes, they voted|Don't know" )         
        hardcoded_fix("discussantturnoutName3W2",
              "No, didn't vote|Yes, they voted|Don't know" )    
        col = "discussantturnoutName1W4"
        discussantturnout_cats = {0.0:"No, didn't vote",
                    1.0:"Yes, they voted",
                    9997.0:"Other", # not super sure about this, but patience low!
                    9998.0:"They were not eligible to vote",
                    9999.0:"Don't know"}         
        careful_replace( col , discussantturnout_cats )     
        col = "discussantturnoutName2W4"
        careful_replace( col , discussantturnout_cats )
        col = "discussantturnoutName3W4"
        careful_replace( col , discussantturnout_cats )
        col = "discussantturnoutName1W12"
        careful_replace( col , discussantturnout_cats )
        col = "discussantturnoutName2W12"
        careful_replace( col , discussantturnout_cats )
        col = "discussantturnoutName3W12"
        careful_replace( col , discussantturnout_cats )         
        
        
        BES_Panel["brexitSNPW16"]       = BES_Panel["brexitSNPW16"].cat.add_categories(1.0).cat.reorder_categories(['Leave the EU without a deal', 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 'Remain in the EU', "Don't know"])
        BES_Panel["immigManipCheckW7"]  = BES_Panel["immigManipCheckW7"].cat.rename_categories([1.0,2.0,3.0,4.0,5.0,6.0,7.0,997.0])
        BES_Panel["immigManipCheck2W7"] = BES_Panel["immigManipCheck2W7"].cat.rename_categories([1.0,2.0,3.0,4.0,5.0,6.0,7.0,997.0])
        
        BES_Panel['identityStrengthAllW11'] = BES_Panel['identityStrengthAllW11'].astype('category')
        BES_Panel['ageW16'] = BES_Panel['ageW16'].astype('category')
        BES_Panel['jobzoneW1W2W3W4W5'] = BES_Panel['jobzoneW1W2W3W4W5'].astype('category')
        BES_Panel['jobzoneW6W7W8W9']   = BES_Panel['jobzoneW6W7W8W9'].astype('category')
        
        
        

In [15]:
# BES_Panel["brexitSNPW16"].cat.add_categories(1.0).cat.categories

In [16]:
# 479	2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|Don't know|Leave the EU without a deal|Remain in the EU	1	87	11	brexitSNPW16	-99

# BES_Panel["brexitSNPW16"].cat.categories

In [17]:
def fix_100_seq(col, start, finish, weasel, rng=100):
    lst = list([weasel,start, finish])
    lst_dict = {"0":start,str(rng):finish}

    fullseq = [start]
    [fullseq.append(str(x)) for x in range(1,rng)]
    fullseq.append(finish)
    fullseq.append(weasel)
    # make sure all numbers in same format (string integers)
    BES_Panel[col] = BES_Panel[col].cat.rename_categories( [str(int(x)) if x not in lst else x for x in BES_Panel[col].cat.categories ] )
    BES_Panel[col] = BES_Panel[col].cat.rename_categories( [lst_dict[x] if x in lst_dict.keys() else x for x in BES_Panel[col].cat.categories ] )
    
    # change categories to correct range
    BES_Panel[col] = BES_Panel[col].cat.set_categories(fullseq)
    if len( BES_Panel[col].cat.categories ) != rng+2:
        raise Exception("wrong number of categories!")




def number_and_string_sequences(  ):

# How to deal with large sequences of numbers (e.g. %)
# Which have some values missing (presumably because no entries)
# But also have strings at the ends

# Want to keep the string categories (because they're useful for clarification)
# But also want the numeric coding to be remain accurate
# e.g. "0% no support for X, 1% ... 45%, 83%, 100% complete support for X" -> would normally turn into [0,1...45,46,47]
# should turn into [0,1...45,83,100]

# It's *POSSIBLE* that question answerers don't think this way - might get cleaner results by just assuming positional placement
# Would be useful to have a switch to test that



# run on everything like this

#

# re.match( "(winConstituency[a-zA-Z0-9_]+)", "winConstituencyConW4").groups()[0]

# maybe simply run this on all variables marked 6?
# tweak the ends, drop the DKS, then turn to floats?


    str_float_0_100_cats = [str(float(x)) for x in range(0,101)] # ['0.0', '1.0', '2.0', '3.0' ... '98.0', '99.0', '100.0']

    for col in ["propMPLocalW14","propMPWCW14","propMPFemaleW14","propMPDisabW14","propMPYoungW14",
                "propMPRaceW14","propMPChristW14","propMPLGBTW14","propMPMuslimW14","propMPDegreeW14"]:
        if ( col in  BES_Panel.columns ):    
            start = "0 - None of them"
            finish = "100 - All of them"
            weasel = "Don't know"        
            fix_100_seq(col, start, finish, weasel)     

    for col in ["warmDenmarkW7","warmFranceW7","warmPolandW7","warmRomaniaW7","warmGreeceW7",
                "warmSpainW7","warmUKW7"]:
        if ( col in  BES_Panel.columns ):  
            BES_Panel[col] = BES_Panel[col].astype('category')
            start = "Very cold"
            finish = "Very warm"
            weasel = "Don't know"        
            fix_100_seq(col, start, finish, weasel)     
            
           
    
    
    ### this isn't an error so much as a matter of practicality
    # if I make all these values integers then we don't have to
    # worry about missing categories
    # (assuming they're only missing because of legit. lack of entries)
    col = "scotRefExpectationTurnout"

    if ( col in  BES_Panel.columns ):    

        start = "0% of people will vote"
        finish = "100% of people will vote"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel)         
        
#         scotRefExpectationTurnout_list = ["100.0" if x=="100% of people will vote" else x for x in BES_Panel[col].cat.categories]
#         BES_Panel[col].cat.rename_categories( scotRefExpectationTurnout_list, inplace=True )
#         add_categories()


    col = "winConstituencyPC"    
    if ( col in  BES_Panel.columns ):
        
        start = "0 - Very unlikely to win"
        finish = "100 - Very likely to win"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel)          
        
#         winConstituencyPC_list = ["100.0" if x=="100 - Very likely to win" else x for x in BES_Panel[col].cat.categories]
#         winConstituencyPC_list = ["0.0" if x=="0 - Very unlikely to win" else x for x in winConstituencyPC_list]
#         BES_Panel[col].cat.rename_categories( winConstituencyPC_list, inplace=True )

    col = "winConstituencySNP"
    if ( col in  BES_Panel.columns ):    
        start = "0 - Very unlikely to win"
        finish = "100 - Very likely to win"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel)   

    col = "winConstituencyGreen"
    if ( col in  BES_Panel.columns ):    
        start = "0 - Very unlikely to win"
        finish = "100 - Very likely to win"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel)    
        
    col = "winConstituencyTIG"
    if ( col in  BES_Panel.columns ):    
        start = "0 - Very unlikely to win"
        finish = "100 - Very likely to win"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel)   
        
    col = "winConstituencyBrexit"
    if ( col in  BES_Panel.columns ):    
        start = "0 - Very unlikely to win"
        finish = "100 - Very likely to win"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel)   
        
        
# Allow many fewer|2|4|5|6|7|8|9|Allow many more|Don't know        

    col = "immigSNP"
    if ( col in  BES_Panel.columns ):    
        start = "Allow many fewer"
        finish = "Allow many more"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel,10)       


    col = "immigPC"
    if ( col in  BES_Panel.columns ):    
        start = "Allow many fewer"
        finish = "Allow many more"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel,10)      
## NEED TO SET THESE AS TYPE 6!    
    

In [18]:
def process_dataset(dataset_name):

    BES_file_manifest = pd.read_csv( BES_small_data_files + "BES_file_manifest.csv" )
    manifest = BES_file_manifest[ BES_file_manifest["Name"] == dataset_name ]

    data_subfolder = BES_data_folder + dataset_name + os.sep

    filename = manifest["Stata_Filename"].values[0]

    global BES_Panel
    BES_Panel = pd.read_stata( data_subfolder + filename)
    ####################################################

    # use this dataframe to store *everything* we're doing to transform/ignore variables!
    global var_type
    var_type = pd.DataFrame(columns = ["dataset_name","dtype","cat_all_strings","type","pruned","original_cat_list",
                                       "renamed_cat_list","reordered_cat_list","final_cat_list",
                                       "dataset_specific_hardcoded_fix",
                                       "numerical_dont_knows",
                                       "weasel_words","typos" ] )
    ####################################################

    hard_coded_fixes( dataset_name ) # side effects on BES_Panel and var_type
    number_and_string_sequences() # side effects on BES_Panel

    variable_categories = pd.read_csv( BES_small_data_files + "question_categories_correct.csv",
                                       encoding = encoding )
    variable_categories.drop('Unnamed: 0', axis=1,inplace=True)

    ( var_cat_dict_pruned , var_cat_dict_pruned_2 ) = create_var_list( variable_categories )
    ####################################################

    missing_col_names = []
    try:
        for col in BES_Panel.columns:
            print(col)
            dt =  BES_Panel[col].dtype.name # data type
            not_found = False

            var_type.loc[col,"dataset_name"] = dataset_name
            # dtype is either nan because not set -> set
            if not isinstance(var_type.loc[col,"dtype"],str):
                var_type.loc[ col , "dtype"] = dt    
            # if dtype == category *and* cat_all_strings not already set, set
            if (var_type.loc[ col , "dtype" ] == 'category') and careful_isnan( var_type.loc[ col , "cat_all_strings" ] ):
                var_type.loc[ col , "cat_all_strings" ] = np.all([isinstance(x,str) for x in BES_Panel[ col ].cat.categories])

            not_found = False      

            if (col in ignore_list) or (var_type.loc[col,"type"] == -2): # exclude values from ignore_list *and manually coded errors*
                var_type.loc[col,"type"] = -2
                if var_type.loc[ col , "cat_all_strings" ]==True:
                    var_type.loc[ col, "original_cat_list" ] = "|".join( BES_Panel[col].cat.categories )
                elif ('float' in dt) or ('int' in dt):
                    var_type.loc[ col, "original_cat_list" ] = list(BES_Panel[col].unique())

            elif (col in ["id"] ): # id
                var_type.loc[col,"type"] = -5

            elif (dt == 'object'): # (probably) text
                var_type.loc[col,"type"] = -4

            elif ("datetime" in dt): # datetime
                var_type.loc[col,"type"] = -3

        # 0 - personality measures (in steps of .5?), personality_agreeableness ...etc, riskScaleW8        
            elif (col in ["personality_agreeableness",
                         "personality_conscientiousness",
                         "personality_extraversion",
                         "personality_neuroticism",
                         "personality_openness"]) or (re.match("riskScale(W[0-9]+)?",col) is not None):
                var_type.loc[col,"type"] = 0

        # 7 - soc2010(W3-6_comb,W5_only), v1(W5_comb), RandomIDW1(W3-6_comb), mapNames(W3_only), mapNamesW3 (W3-10_comb,W13_comb)        
            elif re.match("soc2010|v1|RandomIDW1|mapNames(W[0-9]+)?" ,col) is not None:
                var_type.loc[col,"type"] = 7

        # 8 - pano, electoratepcon, <party>sh10pcon, turnout10pcon, winnersh10pcon, runnerupsh10pcon, marginsh10pcon
        # don't include 'runnerup10pcon', 'winner10pcon'- these are categorical!
        # all relate to parliamentary constituency (pano applies to different waves - rest are about 2010 general election)
            elif re.match( "pano(W[0-9]+)?|electoratepcon|[a-zA-Z]+sh10pcon|turnout10pcon" , col ) is not None:
                var_type.loc[col,"type"] = 8

            elif col in ['cciW1W2W3W4W5','ccinoITW1W2W3W4W5','justITW1W2W3W4W5','cciW6W7W8W9','ccinoITW6W7W8W9','justITW6W7W8W9']:
                var_type.loc[col,"type"] = 9

            # wave flags/weights (int and float)
            elif re.match("wave[0-9]+|"\
                          "w[0-9]+core|"\
                          "w[0-9]+full|"\
                          "wt_daily_W[0-9]+|"\
                          "wt_core_W[0-9]+|"\
                          "wt_full_[W0-9]+|"\
                          "wt_new_[W0-9]+|"\
                          "CampaignDay(W[0-9]+)?|"\
                          "miilabelcertainty(W[0-9]+)?|"\
                          "Dailyweight(W[0-9]+)?|"\
                          "new_full_weight|"\
                          "w8_wave6_and_wave7|w8_wave2_and_wave6|w8_wave2_and_wave6_and_wave7|w8_wave9_to_wave13"

                          , col) is not None: 

                var_type.loc[col,"type"] = -1

            # waveX - wave int wave 0/1 flag
            # wave 1-11: wt_full_W6, wt_core_W6, wt_full_W1W2W3W4W5W6W7W8W9), 
            # waves 10: wt_new_W10, wt_full_W1_W13
            # CampaignDayWX
            # miilabelcertaintyWX

            else:
                not_found = True
                type_range = set(variable_categories["type"].values)
                for typ in type_range:
                    pruned_variable_name = prune2( prune(col) )
                    if pruned_variable_name in var_cat_dict_pruned_2[typ]:
                        var_type.loc[col,"type"] = typ
                        var_type.loc[col,"pruned"] = pruned_variable_name
                        not_found = False

            if not_found == True:
                var_type.loc[col,"type"] = -99
                pruned_variable_name = prune2( prune(col) )
                var_type.loc[col,"pruned"] = pruned_variable_name
                missing_col_names.append(col)
    except Exception as e:
        print(col, e)            

    var_type["type"] = var_type["type"].astype("int8")

    # reset order of var_type rows to be same as BES_Panel
    var_type = var_type.loc[BES_Panel.columns]
    
    ####################################################
    
    missing_col_names_cat_only = []

    for col in missing_col_names:
        if BES_Panel[col].dtypes.name == 'category':
            missing_col_names_cat_only.append(col)
            
    ####################################################

    if missing_col_names:
        updated_variable_categories = variable_categories.copy()
        # question	frequency	question_length	question_options	column_name	type

        for i in missing_col_names_cat_only:
            str_list = [ str(cat) for cat in BES_Panel[i].cat.categories ]
            joined_list = "|".join(str_list)
            match  = (joined_list == updated_variable_categories["question"])

            if match.any(): # answer set already in records
                index = updated_variable_categories[match].index
                if len(index)>1: # answer set ("question") index should be unique!
                    raise ValueError('answer set ("question") index should be unique!')

                # add column name and increase frequency
                updated_variable_categories.loc[index,"frequency"] = updated_variable_categories.loc[index,"frequency"]+1
                current_list_col_names = updated_variable_categories.loc[index,"column_name"].values[0].split("|")
                current_list_col_names.append(i)
                updated_variable_categories.loc[index,"column_name"] = "|".join( current_list_col_names )

            else: # answer set not already in records - add new line to dataframe
                df = pd.DataFrame([],  columns = updated_variable_categories.columns )

                # no need to add index
                # updated_variable_categories.shape[0], 
                df.loc[0] = [joined_list,
                             1,
                             len(joined_list),
                             len(str_list),
                             i,-99]
                updated_variable_categories = updated_variable_categories.append(df, ignore_index=True)

        variable_categories = updated_variable_categories
        updated_variable_categories.to_csv(BES_small_data_files + "question_categories_correct_updatesneeded!.csv",
                                           encoding = encoding )


        display([x for x in zip(missing_col_names, BES_Panel[missing_col_names].dtypes)])

        manual_fixing_advice_string = "Stop - new variables detected\n"\
                                      "Go look at question_categories_correct_updatesneeded!.csv\n"\
                                      "fill in types, save as question_categories_correct.csv and rerun this code"


        raise Exception(manual_fixing_advice_string)
    ####################################################
    
    # [-5, -4, -3, -2, -1, 4, 7, 8, 9] -> meta list
    # [0, 1, 2, 3, 5, 6] ->     
    content_list = [0, 1, 2, 3, 5, 6]
    meta_list = [-5, -4, -3, -2, -1, 7, 8, 9] # -99, 4 excluded because could be categorical
    # 'numeric' columns (ones that can be transformed into numbers)
    num_cols     = BES_Panel.columns[ (var_type["type"].apply( lambda x: x in [0,1,2,3,5,6] )).values ]
    # can't be transformed into numbers / are numbers but are meta-data rather than raw content (e.g. weights)
    non_num_cols = BES_Panel.columns[ (var_type["type"].apply( lambda x: x in [-99,-5,-4,-3,-1 ]  )).values ]

    BES_numeric  = BES_Panel[num_cols].copy()
    for col in BES_numeric:

        if col not in var_type["type"].index:
            raise Exception( "variable not registered - and somehow slipped past!" )

        if var_type.loc[ col, "type" ] in [0,7]:
            continue

        # force all category elements into strings
        # ARE THEY EVER NOT?
        BES_numeric[col].cat.rename_categories( BES_numeric[col].cat.categories.map(str), inplace=True )

        join_list = "|".join( BES_numeric[col].cat.categories ) # create category_list_string "strongly agree|agree|neither|..."
        var_type.loc[ col, "original_cat_list" ] = join_list    

        # typos - things with weird characters
        fixed_cat_string = fix_a_hat_chars( join_list )
        if fixed_cat_string is not None:
            var_type.loc[ col, "typos" ]   = join_list      
            BES_numeric[col].cat.rename_categories( fixed_cat_string , inplace=True )
            join_list = "|".join( BES_numeric[col].cat.categories )        
            
        # rename categories
        if join_list in rename_cat_dict.keys():
            var_type.loc[ col, "renamed_cat_list" ]   = join_list        
            BES_numeric[col].cat.rename_categories(  rename_cat_dict[join_list], inplace=True )
            join_list = "|".join( BES_numeric[col].cat.categories )        

        # reorder categories
        if join_list in change_cat_dict.keys():
            var_type.loc[ col, "reordered_cat_list" ] = join_list        
            BES_numeric[col].cat.reorder_categories( change_cat_dict[join_list], inplace=True )
            join_list = "|".join( BES_numeric[col].cat.categories )        

        # remove "Don't Know"s that are in weird numerical form (eg. [ "9999.0", "997.0", "222.0", "99.0", "0.0" ])
        # de_weasel numbers
        numerical_dont_knows = de_weasel_nums( BES_numeric[col].cat.categories )
        if len(numerical_dont_knows) != 0:
            BES_numeric[col].cat.remove_categories( numerical_dont_knows , inplace=True )
            var_type.loc[ col, "numerical_dont_knows" ] = "|".join( numerical_dont_knows )

        # set all digits to floating point format, one decimal place
        BES_numeric[col].cat.rename_categories( de_num( BES_numeric[col].cat.categories ), inplace=True )

        # de_weasel
        weasel_words = BES_numeric[col].cat.categories.intersection(Weasel_set)
        if len(weasel_words) != 0:    
            BES_numeric[col].cat.remove_categories( weasel_words, inplace=True )
            var_type.loc[ col, "weasel_words" ] = "|".join( weasel_words )

        # Laziness - I want an extra column with the destination category sets
        # (should be a smaller set than original category sets)
        var_type.loc[ col, "final_cat_list" ] = "|".join( BES_numeric[col].cat.categories )        
    ####################################################

    # save category data
    cat_dictionary = {}
    for col in BES_numeric.columns:
        if var_type["type"][col] in [1, 2, 3, 5]: # not just cat, but one not already numerical!
            cat_dictionary[col] = BES_numeric[col].cat.categories


    # turn categories into numbers
    for col in BES_numeric:

        if var_type["type"][col] in [1,2,3,5]: # category type variables (other than indicators)
            BES_numeric[col] = BES_numeric[col].cat.codes

        if var_type["type"][col] in [0,1,2,3,5,6,7]:
            BES_numeric[col] = BES_numeric[col].astype('float64')

    BES_numeric.replace(-1,np.nan, inplace=True) # replace -1 cat code for NaN with actual NaN - downside, requires dtype float
    ####################################################

    fname = data_subfolder + "cat_dictionary.pkl"
    with open(fname, "wb") as f:
        pickle.dump( cat_dictionary, f )

    BES_non_numeric = BES_Panel[non_num_cols].copy()
    BES_non_numeric.to_hdf( data_subfolder + "BESnon_numeric.hdf", "BESnon_numeric" )

    BES_numeric.to_hdf( data_subfolder + "BESnumeric.hdf", "BESnumeric" )

    var_type.to_csv( data_subfolder + "var_type.csv", encoding = encoding )
    # don't think the performance warning will be relevant on such a small dataframe

In [19]:
global BES_Panel
# BES_Panel = pd.read_stata( data_subfolder + filename)
####################################################

# use this dataframe to store *everything* we're doing to transform/ignore variables!
global var_type
var_type = pd.DataFrame(columns = ["dataset_name","dtype","cat_all_strings","type","pruned","original_cat_list",
                                   "renamed_cat_list","reordered_cat_list","final_cat_list",
                                   "dataset_specific_hardcoded_fix",
                                   "numerical_dont_knows",
                                   "weasel_words","typos" ] )
####################################################

hard_coded_fixes( dataset_name ) # side effects on BES_Panel and var_type
number_and_string_sequences() # side effects on BES_Panel

variable_categories = pd.read_csv( BES_small_data_files + "question_categories_correct.csv",
                                   encoding = encoding )
variable_categories.drop('Unnamed: 0', axis=1,inplace=True)

( var_cat_dict_pruned , var_cat_dict_pruned_2 ) = create_var_list( variable_categories )
####################################################

missing_col_names = []
try:
    for col in BES_Panel.columns:
        print(col)
        dt =  BES_Panel[col].dtype.name # data type
        not_found = False

        var_type.loc[col,"dataset_name"] = dataset_name
        # dtype is either nan because not set -> set
        if not isinstance(var_type.loc[col,"dtype"],str):
            var_type.loc[ col , "dtype"] = dt    
        # if dtype == category *and* cat_all_strings not already set, set
        if (var_type.loc[ col , "dtype" ] == 'category') and careful_isnan( var_type.loc[ col , "cat_all_strings" ] ):
            var_type.loc[ col , "cat_all_strings" ] = np.all([isinstance(x,str) for x in BES_Panel[ col ].cat.categories])

        not_found = False      

        if (col in ignore_list) or (var_type.loc[col,"type"] == -2): # exclude values from ignore_list *and manually coded errors*
            var_type.loc[col,"type"] = -2
            if var_type.loc[ col , "cat_all_strings" ]==True:
                var_type.loc[ col, "original_cat_list" ] = "|".join( BES_Panel[col].cat.categories )
            elif ('float' in dt) or ('int' in dt):
                var_type.loc[ col, "original_cat_list" ] = list(BES_Panel[col].unique())

        elif (col in ["id"] ): # id
            var_type.loc[col,"type"] = -5

        elif (dt == 'object'): # (probably) text
            var_type.loc[col,"type"] = -4

        elif ("datetime" in dt): # datetime
            var_type.loc[col,"type"] = -3

    # 0 - personality measures (in steps of .5?), personality_agreeableness ...etc, riskScaleW8        
        elif (col in ["personality_agreeableness",
                     "personality_conscientiousness",
                     "personality_extraversion",
                     "personality_neuroticism",
                     "personality_openness"]) or (re.match("riskScale(W[0-9]+)?|(cog|aff)empathyIRT(W[0-9]+)?|zeroSumIRT(W[0-9]+)?",col) is not None):
            var_type.loc[col,"type"] = 0

    # 7 - soc2010(W3-6_comb,W5_only), v1(W5_comb), RandomIDW1(W3-6_comb), mapNames(W3_only), mapNamesW3 (W3-10_comb,W13_comb)        
        elif re.match("soc2010|v1|RandomIDW1|mapNames(W[0-9]+)?" ,col) is not None:
            var_type.loc[col,"type"] = 7

    # 8 - pano, electoratepcon, <party>sh10pcon, turnout10pcon, winnersh10pcon, runnerupsh10pcon, marginsh10pcon
    # don't include 'runnerup10pcon', 'winner10pcon'- these are categorical!
    # all relate to parliamentary constituency (pano applies to different waves - rest are about 2010 general election)
        elif re.match( "pano(W[0-9]+)?|electoratepcon|[a-zA-Z]+sh10pcon|turnout10pcon" , col ) is not None:
            var_type.loc[col,"type"] = 8

        elif col in ['cciW1W2W3W4W5','ccinoITW1W2W3W4W5','justITW1W2W3W4W5',
                     'cciW6W7W8W9','ccinoITW6W7W8W9','justITW6W7W8W9',
                     'ptvexpgrpW14','ptvexpgrp']:
            var_type.loc[col,"type"] = 9

        # wave flags/weights (int and float)
        elif re.match("wave[0-9]+|"\
                      "w[0-9]+core|"\
                      "w[0-9]+full|"\
                      "wt_daily_W[0-9]+|"\
                      "wt_core_W[0-9]+|"\
                      "wt_full_[W0-9]+|"\
                      "wt_new_[W0-9]+|"\
                      "CampaignDay(W[0-9]+)?|"\
                      "miilabelcertainty(W[0-9]+)?|"\
                      "Dailyweight(W[0-9]+)?|"\
                      "new_full_weight|"\
                      "w8_wave6_and_wave7|w8_wave2_and_wave6|w8_wave2_and_wave6_and_wave7|w8_wave9_to_wave13|"\
                      "wt_new_|"\
                      "sample_fullW1|"\
                      "wt_fresh_W16|wt_fresh_"

                      , col) is not None: 

            var_type.loc[col,"type"] = -1

        # waveX - wave int wave 0/1 flag
        # wave 1-11: wt_full_W6, wt_core_W6, wt_full_W1W2W3W4W5W6W7W8W9), 
        # waves 10: wt_new_W10, wt_full_W1_W13
        # CampaignDayWX
        # miilabelcertaintyWX

        else:
            not_found = True
            type_range = set(variable_categories["type"].values)
            for typ in type_range:
                pruned_variable_name = prune2( prune(col) )
                if pruned_variable_name in var_cat_dict_pruned_2[typ]:
                    var_type.loc[col,"type"] = typ
                    var_type.loc[col,"pruned"] = pruned_variable_name
                    not_found = False

        if not_found == True:
            var_type.loc[col,"type"] = -99
            pruned_variable_name = prune2( prune(col) )
            var_type.loc[col,"pruned"] = pruned_variable_name
            missing_col_names.append(col)
except Exception as e:
    print(col, e)            

var_type["type"] = var_type["type"].astype("int8")

# reset order of var_type rows to be same as BES_Panel
var_type = var_type.loc[BES_Panel.columns]

####################################################

missing_col_names_cat_only = []

for col in missing_col_names:
    if BES_Panel[col].dtypes.name == 'category':
        missing_col_names_cat_only.append(col)

####################################################

if missing_col_names:
    updated_variable_categories = variable_categories.copy()
    # question	frequency	question_length	question_options	column_name	type

    for i in missing_col_names_cat_only:
        str_list = [ str(cat) for cat in BES_Panel[i].cat.categories ]
        joined_list = "|".join(str_list)
        match  = (joined_list == updated_variable_categories["question"])

        if match.any(): # answer set already in records
            index = updated_variable_categories[match].index
            if len(index)>1: # answer set ("question") index should be unique!
                raise ValueError('answer set ("question") index should be unique!')

            # add column name and increase frequency
            updated_variable_categories.loc[index,"frequency"] = updated_variable_categories.loc[index,"frequency"]+1
            current_list_col_names = updated_variable_categories.loc[index,"column_name"].values[0].split("|")
            current_list_col_names.append(i)
            updated_variable_categories.loc[index,"column_name"] = "|".join( current_list_col_names )

        else: # answer set not already in records - add new line to dataframe
            df = pd.DataFrame([],  columns = updated_variable_categories.columns )

            # no need to add index
            # updated_variable_categories.shape[0], 
            df.loc[0] = [joined_list,
                         1,
                         len(joined_list),
                         len(str_list),
                         i,-99]
            updated_variable_categories = updated_variable_categories.append(df, ignore_index=True)

    variable_categories = updated_variable_categories
    updated_variable_categories.to_csv(BES_small_data_files + "question_categories_correct_updatesneeded!.csv",
                                       encoding = encoding )


    display([x for x in zip(missing_col_names, BES_Panel[missing_col_names].dtypes)])

    manual_fixing_advice_string = "Stop - new variables detected\n"\
                                  "Go look at question_categories_correct_updatesneeded!.csv\n"\
                                  "fill in types, save as question_categories_correct.csv and rerun this code"


    raise Exception(manual_fixing_advice_string)
####################################################

# [-5, -4, -3, -2, -1, 4, 7, 8, 9] -> meta list
# [0, 1, 2, 3, 5, 6] ->     
content_list = [0, 1, 2, 3, 5, 6]
meta_list = [-5, -4, -3, -2, -1, 7, 8, 9] # -99, 4 excluded because could be categorical
# 'numeric' columns (ones that can be transformed into numbers)
num_cols     = BES_Panel.columns[ (var_type["type"].apply( lambda x: x in [0,1,2,3,5,6] )).values ]
# can't be transformed into numbers / are numbers but are meta-data rather than raw content (e.g. weights)
non_num_cols = BES_Panel.columns[ (var_type["type"].apply( lambda x: x in [-99,-5,-4,-3,-1 ]  )).values ]

BES_numeric  = BES_Panel[num_cols].copy()
for col in BES_numeric:
    print(col)

    if col not in var_type["type"].index:
        raise Exception( "variable not registered - and somehow slipped past!" )

    if var_type.loc[ col, "type" ] in [0,7]:
        continue

    # force all category elements into strings
    # ARE THEY EVER NOT?
    BES_numeric[col].cat.rename_categories( BES_numeric[col].cat.categories.map(str), inplace=True )

    join_list = "|".join( BES_numeric[col].cat.categories ) # create category_list_string "strongly agree|agree|neither|..."
    var_type.loc[ col, "original_cat_list" ] = join_list    

    # typos - things with weird characters
    fixed_cat_string = fix_a_hat_chars( join_list )
    if fixed_cat_string is not None:
        var_type.loc[ col, "typos" ]   = join_list      
        BES_numeric[col].cat.rename_categories( fixed_cat_string , inplace=True )
        join_list = "|".join( BES_numeric[col].cat.categories )        

    # rename categories
    if join_list in rename_cat_dict.keys():
        var_type.loc[ col, "renamed_cat_list" ]   = join_list        
        BES_numeric[col].cat.rename_categories(  rename_cat_dict[join_list], inplace=True )
        join_list = "|".join( BES_numeric[col].cat.categories )        

    # reorder categories
    if join_list in change_cat_dict.keys():
        var_type.loc[ col, "reordered_cat_list" ] = join_list        
        BES_numeric[col].cat.reorder_categories( change_cat_dict[join_list], inplace=True )
        join_list = "|".join( BES_numeric[col].cat.categories )        

    # remove "Don't Know"s that are in weird numerical form (eg. [ "9999.0", "997.0", "222.0", "99.0", "0.0" ])
    # de_weasel numbers
    numerical_dont_knows = de_weasel_nums( BES_numeric[col].cat.categories )
    if len(numerical_dont_knows) != 0:
        BES_numeric[col].cat.remove_categories( numerical_dont_knows , inplace=True )
        var_type.loc[ col, "numerical_dont_knows" ] = "|".join( numerical_dont_knows )

    # set all digits to floating point format, one decimal place
    BES_numeric[col].cat.rename_categories( de_num( BES_numeric[col].cat.categories ), inplace=True )

    # de_weasel
    weasel_words = BES_numeric[col].cat.categories.intersection(Weasel_set)
    if len(weasel_words) != 0:    
        BES_numeric[col].cat.remove_categories( weasel_words, inplace=True )
        var_type.loc[ col, "weasel_words" ] = "|".join( weasel_words )

    # Laziness - I want an extra column with the destination category sets
    # (should be a smaller set than original category sets)
    var_type.loc[ col, "final_cat_list" ] = "|".join( BES_numeric[col].cat.categories )        
####################################################

# save category data
cat_dictionary = {}
for col in BES_numeric.columns:
    if var_type["type"][col] in [1, 2, 3, 5]: # not just cat, but one not already numerical!
        cat_dictionary[col] = BES_numeric[col].cat.categories


# turn categories into numbers
for col in BES_numeric:

    if var_type["type"][col] in [1,2,3,5]: # category type variables (other than indicators)
        BES_numeric[col] = BES_numeric[col].cat.codes

    if var_type["type"][col] in [0,1,2,3,5,6,7]:
        BES_numeric[col] = BES_numeric[col].astype('float64')

BES_numeric.replace(-1,np.nan, inplace=True) # replace -1 cat code for NaN with actual NaN - downside, requires dtype float
####################################################

fname = data_subfolder + "cat_dictionary.pkl"
with open(fname, "wb") as f:
    pickle.dump( cat_dictionary, f )

BES_non_numeric = BES_Panel[non_num_cols].copy()
BES_non_numeric.to_msgpack( data_subfolder + "BESnon_numeric.msgpack" )

BES_numeric = BES_numeric.apply(pd.to_numeric,downcast='float')
BES_numeric.to_msgpack( data_subfolder + "BESnumeric.msgpack" )

var_type.to_csv( data_subfolder + "var_type.csv", encoding = encoding )
# don't think the performance warning will be relevant on such a small dataframe



id
wt_core_W3
wt_core_W4
wt_core_W5
wt_core_W6
wt_core_W7
wt_core_W8
wt_core_W9
wt_core_W1
wt_core_W2
wt_full_W3
wt_full_W1W2W3
wt_full_W4
wt_full_W1W2W3W4
wt_full_W5
wt_full_W1W2W3W4W5
wt_full_W4W5
wt_full_W6
wt_full_W1W2W3W4W5W6
wt_full_W4W5W6
wt_full_W4W6
wt_full_W7
wt_full_W8
wt_full_W1W2W3W4W5W6W7W8
wt_full_W7W8
wt_full_W9
wt_full_W1W2W3W4W5W6W7W8W9
wt_full_W7W8W9
wt_full_W10
wt_full_W11
wt_full_W1_W11
wt_full_W1
wt_full_W2
wt_full_W1W2
wt_full_W1W2W3W4W5W6W7
wave2ptvW14
wave2ptvW15
wave2ptvW16
wave1
wave2
wave3
wave4
wave5
wave6
wave7
wave8
wave9
wave10
wave11
wave12
wave13
wave14
wave15
wave16
country
starttimeW1
endtimeW1
sample_fullW1
wt_daily_W5
wt_new_W11
wt_new_W14
wt_new_W15
wt_new_W16
wt_new_W1_W16
wt_fresh_W16
wt_new_W6W7
wt_new_W2W6
wt_new_W2W6W7
wt_new_W9_W13
wt_new_W12
wt_daily_W12
wt_new_W13
wt_new_W1_W11
wt_new_W1_W12
wt_new_W1_W13
wt_new_W6_W11
wt_new_W6_W12
wt_new_W6_W13
wt_new_W11_W13
wt_new_W13W16
wt_new_W13_result
wt_new_W6W16
wt_new_W2W16
wt_new_W9
wt_new_W8
w

proposalTuitionW1
conPriorities_econW1
conPriorities_costLiveW1
conPriorities_nhsW1
conPriorities_immigW1
conPriorities_crimeW1
conPriorities_schoolW1
conPriorities_noneW1
labPriorities_econW1
labPriorities_costLiveW1
labPriorities_nhsW1
labPriorities_immigW1
labPriorities_crimeW1
labPriorities_schoolW1
labPriorities_noneW1
ldPriorities_econW1
ldPriorities_costLiveW1
ldPriorities_nhsW1
ldPriorities_immigW1
ldPriorities_crimeW1
ldPriorities_schoolW1
ldPriorities_noneW1
ukipPriorities_econW1
ukipPriorities_costLiveW1
ukipPriorities_nhsW1
ukipPriorities_immigW1
ukipPriorities_crimeW1
ukipPriorities_schoolW1
ukipPriorities_noneW1
partyContact1W1
partyContactConW1
partyContactLabW1
partyContactLDW1
partyContactSNPW1
partyContactPCW1
partyContactUKIPW1
partyContactOtherPartyW1
partyContactNoneW1
partyContactCon_1W1
partyContactCon_2W1
partyContactCon_3W1
partyContactCon_4W1
partyContactCon_5W1
partyContactCon_6W1
partyContactCon_7W1
partyContactLab_1W1
partyContactLab_2W1
partyContactLab_3W1

partyContactLab_4W2
partyContactLab_5W2
partyContactLab_6W2
partyContactLab_7W2
partyContactLD_1W2
partyContactLD_2W2
partyContactLD_3W2
partyContactLD_4W2
partyContactLD_5W2
partyContactLD_6W2
partyContactLD_7W2
partyContactPC_1W2
partyContactPC_2W2
partyContactPC_3W2
partyContactPC_4W2
partyContactPC_5W2
partyContactPC_6W2
partyContactPC_7W2
partyContactUKIP_1W2
partyContactUKIP_2W2
partyContactUKIP_3W2
partyContactUKIP_4W2
partyContactUKIP_5W2
partyContactUKIP_6W2
partyContactUKIP_7W2
partyContactOther_1W2
partyContactOther_2W2
partyContactOther_3W2
partyContactOther_4W2
partyContactOther_5W2
partyContactOther_6W2
partyContactOther_7W2
conUnitedW2
labUnitedW2
ldUnitedW2
pcUnitedW2
ukipUnitedW2
knowMPW2
registeredW2
registeredUpToDateW2
reasonNotRegistered_1W2
reasonNotRegistered_2W2
reasonNotRegistered_3W2
reasonNotRegistered_4W2
reasonNotRegistered_5W2
reasonNotRegistered_6W2
reasonNotRegistered_7W2
reasonNotRegistered_8W2
reasonNotRegistered_noneW2
reasonNotRegistered_dkW2
regSati

partyContactPC_5W3
partyContactPC_6W3
partyContactPC_7W3
partyContactUKIP_1W3
partyContactUKIP_2W3
partyContactUKIP_3W3
partyContactUKIP_4W3
partyContactUKIP_5W3
partyContactUKIP_6W3
partyContactUKIP_7W3
partyContactOther_1W3
partyContactOther_2W3
partyContactOther_3W3
partyContactOther_4W3
partyContactOther_5W3
partyContactOther_6W3
partyContactOther_7W3
conUnitedW3
labUnitedW3
ldUnitedW3
pcUnitedW3
ukipUnitedW3
knowMPW3
registeredW3
registeredUpToDateW3
reasonNotRegistered_1W3
reasonNotRegistered_2W3
reasonNotRegistered_3W3
reasonNotRegistered_4W3
reasonNotRegistered_5W3
reasonNotRegistered_6W3
reasonNotRegistered_7W3
reasonNotRegistered_8W3
reasonNotRegistered_noneW3
reasonNotRegistered_dkW3
registerChangeW3
govtHandleCostLiveW3
govtHandleEconW3
govtHandleImmigW3
govtHandleNHSW3
govtHandleEducW3
govtHandleLevelCrimeW3
labHandleCostLiveW3
labHandleEconW3
labHandleImmigW3
labHandleNHSW3
labHandleEducW3
labHandleLevelCrimeW3
conPriorities_econW3
conPriorities_costLiveW3
conPriorities_n

inequalityChangeW4
inequalityChangeLargerW4
inequalityChangeSmallerW4
inequalityGoodBadW4
satDemUKW4
satDemScotW4
satDemWalesW4
satDemEngW4
satDemEUW4
electionInterestW4
partyContact1W4
partyContactConW4
partyContactLabW4
partyContactLDW4
partyContactSNPW4
partyContactPCW4
partyContactUKIPW4
partyContactGrnW4
partyContactOtherPartyW4
partyContactNoneW4
partyContactCon_1W4
partyContactCon_2W4
partyContactCon_3W4
partyContactCon_4W4
partyContactCon_5W4
partyContactCon_6W4
partyContactCon_7W4
partyContactLab_1W4
partyContactLab_2W4
partyContactLab_3W4
partyContactLab_4W4
partyContactLab_5W4
partyContactLab_6W4
partyContactLab_7W4
partyContactLD_1W4
partyContactLD_2W4
partyContactLD_3W4
partyContactLD_4W4
partyContactLD_5W4
partyContactLD_6W4
partyContactLD_7W4
partyContactSNP_1W4
partyContactSNP_2W4
partyContactSNP_3W4
partyContactSNP_4W4
partyContactSNP_5W4
partyContactSNP_6W4
partyContactSNP_7W4
partyContactPC_1W4
partyContactPC_2W4
partyContactPC_3W4
partyContactPC_4W4
partyContactPC_5

partyContactPC_4W5
partyContactPC_5W5
partyContactPC_6W5
partyContactPC_7W5
partyContactUKIP_1W5
partyContactUKIP_2W5
partyContactUKIP_3W5
partyContactUKIP_4W5
partyContactUKIP_5W5
partyContactUKIP_6W5
partyContactUKIP_7W5
partyContactGreen_1W5
partyContactGreen_2W5
partyContactGreen_3W5
partyContactGreen_4W5
partyContactGreen_5W5
partyContactGreen_6W5
partyContactGreen_7W5
partyContactOther_1W5
partyContactOther_2W5
partyContactOther_3W5
partyContactOther_4W5
partyContactOther_5W5
partyContactOther_6W5
partyContactOther_7W5
participation_1W5
participation_2W5
participation_3W5
participation_4W5
participation_5W5
participation_6W5
discussPolDaysW5
infoSourceTVW5
infoSourcePaperW5
infoSourceRadioW5
infoSourceInternetW5
infoSourcePeopleW5
tvWatchW5
radioListenW5
paperReadW5
internetReadW5
twitterUseW5
twitterInfo_1W5
twitterInfo_2W5
twitterInfo_3W5
fbUseW5
fbInfo_1W5
fbInfo_2W5
fbInfo_3W5
visitPartySiteW5
partySiteConW5
partySiteLabW5
partySiteLDW5
partySiteSNPW5
partySitePCW5
partySiteU

infoSourcePaperW6
infoSourceRadioW6
infoSourceInternetW6
infoSourcePeopleW6
twitterUseW6
twitterInfof2fW6
fbUseW6
fbInfof2fW6
educationW6
w6full
w6core
starttimeW7
endtimeW7
miiW7
bestOnMIIW7
euRefTurnoutW7
euRefVoteW7
euRefVoteSqueezeW7
euRefInterestW7
euRefExpectationW7
euRefFinalW7
miieuW7
pidWeTheyW7
pidInterestedOthersW7
pidCriticisePartyW7
pidCommonPartyW7
pidRuinDayW7
pidConnectedW7
pidMyPartyW7
pidPraiseGoodW7
britishnessW7
scottishnessW7
welshnessW7
englishnessW7
europeannessW7
ethno1W7
ethno2W7
ethno3W7
ethno4W7
ethno5W7
ethno6W7
radicalW7
harkBackW7
polAttentionW7
trustMPsW7
likeCameronW7
likeCorbynW7
likeFarronW7
likeSturgeonW7
likeWoodW7
likeFarageW7
likeBennettW7
likeBorisW7
likeGoveW7
likeOsborneW7
likeAlanJohnsonW7
bestPMW7
goodTimePurchaseW7
riskPovertyW7
riskUnemploymentW7
econPersonalRetroW7
econGenRetroW7
likeConW7
likeLabW7
likeLDW7
likeSNPW7
likePCW7
likeUKIPW7
likeGrnW7
cutsTooFarNationalW7
cutsTooFarNHSW7
cutsTooFarLocalW7
privatTooFarW7
enviroProtectionW7
crime

ukCitizenW8
euCitizenW8
commonwealthCitizenW8
otherCitizenW8
miiW8
bestOnMIIW8
postalapplyW8
postalTurnoutW8
euRefVotePostW8
euRefTurnoutW8
euRefVoteW8
euRefVoteSqueezeW8
euRefCertaintyW8
euRefInterestW8
euRefExpectationW8
euRefFinalW8
miieuW8
britishnessW8
scottishnessW8
welshnessW8
englishnessW8
europeannessW8
polAttentionW8
likeCameronW8
likeCorbynW8
likeFarronW8
likeSturgeonW8
likeWoodW8
likeFarageW8
likeBennettW8
likeBorisW8
likeGoveW8
likeOsborneW8
likeAlanJohnsonW8
bestPMW8
likeConW8
likeLabW8
likeLDW8
likeSNPW8
likePCW8
likeUKIPW8
likeGrnW8
changeEconomyW8
changeNHSW8
changeEducationW8
changeCostLiveW8
changeImmigW8
changeCrimeW8
localTurnoutRetroW8
localElectionVoteW8
welshTurnoutRetroW8
welshElectionVoteConstW8
welshElectionVoteListW8
scotTurnoutRetroW8
scotElectionVoteConstW8
scotElectionVoteListW8
pccTurnoutW8
pccVote1W8
pccVote2W8
londonTurnoutW8
londonFirstPostW8
londonSecondPostW8
londonAssemblyConstituencyPostW8
londonAssemblyWidePostW8
EUIntegrationSelfW8
selfEUCertain

passportOtherW9
passportExpiredW9
passportNeverW9
euIDW9
euID1W9
euID2W9
euID3W9
euID4W9
euID5W9
euID6W9
euID7W9
handleMIIConW9
handleMIILabW9
handleMIILDW9
handleMIISNPW9
handleMIIPCW9
handleMIIUKIPW9
handleMIIGrnW9
ethno1W9
ethno2W9
ethno3W9
ethno4W9
ethno5W9
ethno6W9
prPreferenceW9
locus1W9
locus2W9
conPriorities_econW9
conPriorities_costLiveW9
conPriorities_nhsW9
conPriorities_immigW9
conPriorities_crimeW9
conPriorities_schoolW9
conPriorities_noneW9
labPriorities_econW9
labPriorities_costLiveW9
labPriorities_nhsW9
labPriorities_immigW9
labPriorities_crimeW9
labPriorities_schoolW9
labPriorities_noneW9
ldPriorities_econW9
ldPriorities_costLiveW9
ldPriorities_nhsW9
ldPriorities_immigW9
ldPriorities_crimeW9
ldPriorities_schoolW9
ldPriorities_noneW9
ukipPriorities_econW9
ukipPriorities_costLiveW9
ukipPriorities_nhsW9
ukipPriorities_immigW9
ukipPriorities_crimeW9
ukipPriorities_schoolW9
ukipPriorities_noneW9
leftRightW9
lrConW9
lrLabW9
lrLDW9
lrUKIPW9
lrSNPW9
lrPCW9
lrgreensW9
likeMayW9


harkBackW11
renationaliseRailW11
overseasAidW11
EUIntegrationSelfW11
EUIntegrationConW11
EUIntegrationLabW11
EUIntegrationLDW11
EUIntegrationSNPW11
EUIntegrationPCW11
EUIntegrationUKIPW11
EUIntegrationGreenW11
satDemUKW11
satDemScotW11
satDemWalesW11
satDemEngW11
controlImmigW11
negotiationSpecifics_1W11
negotiationSpecifics_2W11
negotiationSpecifics_3W11
negotiationSpecifics_noneW11
expectAccessW11
handleEUNegotiateW11
euPriorityBalanceW11
effectsEUUnemploymentW11
effectsEUTradeW11
effectsEUImmigrationW11
effectsEUTerrorW11
euLeaveVoiceW11
effectsEUWorkersW11
effectsEUEconW11
effectsEUFinanceW11
effectsEUNHSW11
effectsEUEconScotW11
effectsEUEconWalesW11
euLeaveBigBusinessW11
euLeaveScotIndepW11
UKsovereigntyPostW11
euIDW11
euID1W11
euID2W11
euID3W11
euID4W11
euID6W11
euID7W11
immigEconW11
immigCulturalW11
immigSelfW11
immigConW11
immigLabW11
immigLDW11
immigSNPW11
immigPCW11
immigUKIPW11
immigGreenW11
achieveReduceImmigConW11
achieveReduceImmigLabW11
achieveReduceImmigLDW11
achieveRed

partyContactUKIP_7W12
partyContactOther_1W12
partyContactOther_2W12
partyContactOther_3W12
partyContactOther_4W12
partyContactOther_5W12
partyContactOther_6W12
partyContactOther_7W12
partyAgree2W12
localTurnoutRetroW12
localElectionVoteW12
manchesterFirstW12
manchesterSecondW12
liverpoolFirstW12
liverpoolSecondW12
teesFirstW12
teesSecondW12
wmidsFirstW12
wmidsSecondW12
westFirstW12
westSecondW12
doncasterFirstW12
doncasterSecondW12
tyneFirstW12
tyneSecondW12
conLookAfterMCW12
conLookAfterWCW12
labLookAfterMCW12
labLookAfterWCW12
ukipLookAfterMCW12
ukipLookAfterWCW12
snpLookAfterMCW12
snpLookAfterWCW12
discussPolDaysW12
debateOneWatchW12
debateThreeWatchW12
debateTwoWatchW12
conToneW12
labToneW12
ldToneW12
snpToneW12
pcToneW12
ukipToneW12
grnToneW12
participation_1W12
participation_2W12
participation_3W12
participation_4W12
participation_5W12
participation_6W12
infoSourceTVW12
infoSourcePaperW12
infoSourceRadioW12
infoSourceInternetW12
twitterUseW12
twitterInfo_1W12
twitterInfo_2W12
twi

euPriorityBalanceW14
effectsEUUnemploymentW14
effectsEUTradeW14
effectsEUImmigrationW14
effectsEUTerrorW14
euLeaveVoiceW14
effectsEUWorkersW14
effectsEUEconW14
effectsEUFinanceW14
effectsEUNHSW14
effectsEUEconScotW14
effectsEUEconWalesW14
euLeaveBigBusinessW14
euLeaveScotIndepW14
UKsovereigntyPostW14
localTurnoutRetroW14
localElectionVoteW14
leftRightW14
taxSpendSelfW14
taxSpendConW14
taxSpendLabW14
taxSpendLDW14
taxSpendUKIPW14
taxSpendSNPW14
taxSpendPCW14
taxSpendGreenW14
immigEconW14
immigCulturalW14
immigSelfW14
immigConW14
immigLabW14
immigLDW14
immigSNPW14
immigPCW14
immigUKIPW14
immigGreenW14
achieveReduceImmigConW14
achieveReduceImmigLabW14
achieveReduceImmigLDW14
achieveReduceImmigSNPW14
achieveReduceImmigPCW14
achieveReduceImmigUKIPW14
achieveReduceImmigGrnW14
achieveReduceImmigNoneW14
redistSelfW14
redistConW14
redistLabW14
redistLDW14
redistUKIPW14
redistSNPW14
redistPCW14
redistGreenW14
referendumSettledW14
scotReferendumIntentionW14
strongLeaderW14
welfarePreferenceW14
re

dutyToVote2W15
antiIntellectualW15
efficacyUnderstandW15
efficacyTooMuchEffortW15
efficacyNotUnderstandW15
efficacyPolCareW15
efficacyNoMatterW15
referendumSettledW15
scotReferendumIntentionW15
scotIndepEconomyW15
scotIndepJoinEUW15
scotIndepMeBetterOffW15
scotRefIDW15
scotRefID1W15
scotRefID2W15
scotRefID3W15
scotRefID4W15
scotRefID6W15
scotRefID7W15
scotConW15
scotLabW15
scotLDW15
scotSNPW15
scotGrnW15
scotElectionVoteConstW15
welshConW15
welshLabW15
welshLDW15
welshPCW15
welshGrnW15
welshElectionVoteConstW15
welshElectionVoteListW15
prPreferenceW15
refAreGoodW15
strongLeaderW15
normEUW15
willOfPeopleW15
cancelBrexitW15
echrW15
britishPrideW15
euIDW15
euID1W15
euID2W15
euID3W15
euID4W15
euID6W15
euID7W15
euIDStrengthW15
changeEconomyW15
changeNHSW15
changeEducationW15
changeCostLiveW15
changeImmigW15
changeCrimeW15
britishnessW15
scottishnessW15
welshnessW15
englishnessW15
europeannessW15
partyMemberW15
partyMemberPastW15
partyMemberNowW15
genTrustW15
newspaperReadW15
infoSourceTVW15

achieveReduceImmigGrnW16
achieveReduceImmigTIGW16
achieveReduceImmigBrexitW16
achieveReduceImmigNoneW16
redistSelfW16
redistConW16
redistLabW16
redistLDW16
redistUKIPW16
redistSNPW16
redistPCW16
redistGreenW16
redistTIGW16
redistBrexitW16
antiIntellectualW16
efficacyUnderstandW16
efficacyTooMuchEffortW16
efficacyNotUnderstandW16
efficacyPolCareW16
efficacyNoMatterW16
referendumSettledW16
scotReferendumIntentionW16
prPreferenceW16
conUnitedW16
labUnitedW16
ldUnitedW16
snpUnitedW16
pcUnitedW16
ukipUnitedW16
grnUnitedW16
tigUnitedW16
brexitUnitedW16
refAreGoodW16
strongLeaderW16
willOfPeopleW16
cancelBrexitW16
britishPrideW16
euIDW16
euID1W16
euID2W16
euID3W16
euID4W16
euID6W16
euID7W16
euIDStrengthW16
changeEconomyW16
changeNHSW16
changeEducationW16
changeCostLiveW16
changeImmigW16
changeCrimeW16
partyMemberOrSupporterW16
registeredSupporterNowW16
partyMemberNowW16
lr1W16
lr2W16
lr3W16
lr4W16
lr5W16
al1W16
al2W16
al3W16
al4W16
al5W16
mpLooksAfterConstInterestW16
blackEqualityW16
femaleEq

turnoutUKGeneralW8
turnoutUKGeneralW9
turnoutUKGeneralW10
turnoutUKGeneralW11
turnoutUKGeneralW12
turnoutUKGeneralW14
turnoutUKGeneralW15
turnoutUKGeneralW16
generalElectionVoteW1
generalElectionVoteW2
generalElectionVoteW3
generalElectionVoteW4
generalElectionVoteUnsqueezeW4
generalElectionVoteSqueezeW4
generalElectionVoteW5
generalElectionVotePostW5
generalElectionVoteUnsqueezeW5
generalElectionVoteSqueezeW5
generalElectionVoteW6
generalElectionVoteW7
generalElectionVoteW8
generalElectionVoteW9
generalElectionVoteW10
generalElectionVoteW11
generalElectionVoteUnsqueezeW11
generalElectionVoteSqueezeW11
generalElectionVoteW12
generalElectionVotePostW12
generalElectionVoteUnsqueezeW12
generalElectionVoteSqueezeW12
generalElectionVoteW13
generalElectionVoteW14
generalElectionVoteW15
generalElectionVoteTIGW15
generalElectionVoteW16
generalElectionCertaintyW1
generalElectionCertaintyW2
generalElectionCertaintyW3
generalElectionCertaintyW4
generalElectionCertaintyUnsqW5
generalElectionCertai

partyContactPC_6W1
partyContactPC_7W1
partyContactUKIP_1W1
partyContactUKIP_2W1
partyContactUKIP_3W1
partyContactUKIP_4W1
partyContactUKIP_5W1
partyContactUKIP_6W1
partyContactUKIP_7W1
partyContactOther_1W1
partyContactOther_2W1
partyContactOther_3W1
partyContactOther_4W1
partyContactOther_5W1
partyContactOther_6W1
partyContactOther_7W1
conUnitedW1
labUnitedW1
ldUnitedW1
pcUnitedW1
ukipUnitedW1
mpNameW1
knowMPW1
countryOfBirth
selfOccSuperviseW1
selfOccOrgSizeW1
selfOccEmployeesW1
selfOccSuperviseLastW1
selfOccOrgSizeLastW1
selfOccEmployeesLastW1
headHouseholdPastW1
fatherNumEmployeesW1
motherNumEmployeesW1
educationW1_W6
bestOnMIIW2
polAttentionW2
trustMPsW2
trustYourMPW2
likeCameronW2
likeMilibandW2
likeCleggW2
likeSalmondW2
likeWoodW2
likeFarageW2
goodTimePurchaseW2
riskPovertyW2
riskUnemploymentW2
econPersonalRetroW2
econGenRetroW2
likeConW2
likeLabW2
likeLDW2
likeSNPW2
likePCW2
likeUKIPW2
likeGrnW2
likeBNPW2
ptvConW2
ptvLabW2
ptvLDW2
ptvSNPW2
ptvPCW2
ptvUKIPW2
ptvGrnW2
ptvBNPW2
re

ldPriorities_nhsW2
ldPriorities_immigW2
ldPriorities_crimeW2
ldPriorities_schoolW2
ldPriorities_noneW2
ukipPriorities_econW2
ukipPriorities_costLiveW2
ukipPriorities_nhsW2
ukipPriorities_immigW2
ukipPriorities_crimeW2
ukipPriorities_schoolW2
ukipPriorities_noneW2
bes1aW2
bes2aW2
bes3aW2
bes4aW2
bes4bW2
govtHandleCostLiveW2
govtHandleEconW2
govtHandleImmigW2
govtHandleNHSW2
govtHandleEducW2
govtHandleLevelCrimeW2
labHandleCostLiveW2
labHandleEconW2
labHandleImmigW2
labHandleNHSW2
labHandleEducW2
labHandleLevelCrimeW2
econPersonalProspW2
econGenProspW2
RV1W2
RV2W2
RV3W2
RV4W2
RV5W2
RV6W2
RV7W2
RV8W2
dv1ClassExpW2
dv2ClassExpW2
dv3ClassExpW2
dv4ClassExpW2
immigExpDVW2
immigManipCheckW2
immigManipCheck2W2
finlit1W2
finlit2W2
finlit3W2
spendMost1W2
spendLeast1W2
spendMost2W2
spendLeast2W2
spendMost3W2
spendLeast3W2
spendMost4W2
spendLeast4W2
spendMost5W2
spendLeast5W2
spendMost6W2
spendLeast6W2
minIncomeWellOffW2
minIncomeGetByW2
incomeWelfareW2
welfarePreferenceExpW2
csplConductPublicOffic

ukipPriorities_schoolW3
ukipPriorities_noneW3
accessReliableTradesmanW3
accessFluentSpeakerW3
accessFixComputerW3
accessGovtRegulatorW3
accessMediaW3
accessHealthW3
accessCouncilWorkerW3
accessCouncillorW3
accessEmployerW3
accessProfessionalW3
accessWorkAdviceW3
accessSmallJobsW3
accessShoppingHelpW3
accessSmallLoanW3
accessDiscussPoliticsW3
accessLegalAdviceW3
accessJobReferenceW3
accessNewHomeW3
pidWeTheyW3
pidInterestedOthersW3
pidCriticisePartyW3
pidCommonPartyW3
pidRuinDayW3
pidConnectedW3
pidMyPartyW3
pidPraiseGoodW3
minIncomeWellOffW3
minIncomeGetByW3
incomeWelfareW3
welfarePreferenceExpW3
educationW3
selfOccSuperviseW3
selfOccOrgSizeW3
selfOccSuperviseLastW3
selfOccOrgSizeLastW3
headHouseholdPastW3
mapdefined_1W3
mapdefined_2W3
mapdefined_3W3
mapdefined_4W3
mapdefined_5W3
mapdefined_6W3
mapdefined_7W3
mapdefined_8W3
mapdefined_9W3
mapBelongW3
mapRepresentW3
mapLiveWorkW3
mapLiveTimeW3
mapEconW3
mapInequalityW3
localUnemployment_a_1W3
nationalUnemployment_a_1W3
mapTurnoutW3
mapC

reasonNotRegistered_2W4
reasonNotRegistered_3W4
reasonNotRegistered_4W4
reasonNotRegistered_5W4
reasonNotRegistered_6W4
reasonNotRegistered_7W4
reasonNotRegistered_8W4
reasonNotRegistered_dkW4
regSatisfactionW4
howDidYouFillRegFormW4
referendumSettledW4
scotReferendumIntentionW4
devoResponsibleScotWelfareW4
devoResponsibleScotNHSW4
devoResponsibleScotSchoolsW4
devoResponsibleScotDefenceW4
devoResponsibleScotTaxW4
devoResponsibleScotPoliceW4
scotDevoMaxW4
expectationDevoScotW4
expectationDevoWalesW4
engFairShareW4
scotFairShareW4
walesFairShareW4
approveUKGovtW4
approveScotGovtW4
approveWelshGovtW4
scotElectionVoteConstW4
englandGovernW4
devoPrefWalesW4
devoResponsibleWalesWelfareW4
devoResponsibleWalesNHSW4
devoResponsibleWalesSchoolsW4
devoResponsibleWalesDefenceW4
devoResponsibleWalesTaxW4
devoResponsibleWalesPoliceW4
devoResponsibleWalesJusticeW4
welshElectionVoteConstW4
ashcroftW4
embesExpectationsGapW4
embesDiscriminationW4
mpLikelyToHelpW4
embeshelpW4
govtHandleCostLiveW4
govtHan

participation_3W6
participation_4W6
likeConW6
likeLabW6
likeLDW6
likeSNPW6
likePCW6
likeUKIPW6
likeGrnW6
likeBNPW6
ptvConW6
ptvLabW6
ptvLDW6
ptvSNPW6
ptvPCW6
ptvUKIPW6
ptvGrnW6
ptvBNPW6
cutsTooFarNationalW6
cutsTooFarLocalW6
cutsTooFarNHSW6
privatTooFarW6
enviroProtectionW6
tuitionFeesTooFarW6
immigrationLevelW6
handleMIIConW6
handleMIILabW6
handleMIILDW6
handleMIIUKIPW6
handleMIIGrnW6
deficitReduceW6
howToReduceDeficitW6
zeroHourContractW6
efficacyUnderstandW6
efficacyTooMuchEffortW6
efficacyNotUnderstandW6
efficacyPolCareW6
efficacyVoteEffortW6
efficacyEnjoyVoteW6
smallPartyWastedVoteW6
smallVoterPrefW6
efficacyNoMatterW6
leftRightW6
lrConW6
lrLabW6
lrLDW6
lrUKIPW6
lrSNPW6
lrPCW6
lrgreensW6
euRefVoteW6
eesEUIntegrationSelfW6
eesEUIntegrationConW6
eesEUIntegrationLabW6
eesEUIntegrationLDW6
eesEUIntegrationUKIPW6
eesEUIntegrationSNPW6
eesEUIntegrationPCW6
eesEUIntegrationGreenW6
EUIntegrationSelfW6
EUIntegrationConW6
EUIntegrationLabW6
EUIntegrationLDW6
EUIntegrationUKIPW6
EUIntegratio

ukCoopMovementW7
ukCoopEuroW7
effectsEUUnemploymentW7
effectsEUTradeW7
effectsEUImmigrationW7
effectsEUTerrorW7
euLeaveVoiceW7
effectsEUWorkersW7
effectsEUEconW7
effectsEUFinanceW7
euLeaveBigBusinessW7
euLeaveScotIndepW7
certaintyUKLeaveW7
certaintyUKRemainW7
businessSupportW7
tuSupportW7
labSupportW7
conSupportW7
ldSupportW7
snpSupportW7
plaidSupportW7
parliamentSupportW7
cabinetSupportW7
franceFairShareW7
britainFairShareW7
germanyFairShareW7
euPreventWarW7
euUKRichW7
euCloserW7
UKsovereigntyW7
euRedTapeW7
euMemberJoinW7
turkeyW7
euParlOverRideW7
euLawsLevelW7
echrW7
normEUW7
euUndermineIdentityW7
socialIdentityGlobalLeaveW7
socialIdentityGlobalRemainW7
euScepticismW7
infoSourceTVW7
infoSourcePaperW7
infoSourceRadioW7
infoSourceInternetW7
infoSourcePeopleW7
euSources_1W7
euSources_2W7
euSources_3W7
euSources_4W7
euSources_5W7
euSources_111W7
expectGoodConductEURefW7
euGovtLeafletW7
partyContact1W7
partyContactConW7
partyContactLabW7
partyContactLDW7
partyContactSNPW7
partyContactPCW7

infoSourceRadioW8
infoSourceInternetW8
infoSourcePeopleW8
euSources_1W8
euSources_2W8
euSources_3W8
euSources_4W8
euSources_5W8
euSources_111W8
discussPolDaysW8
euGovtLeafletW8
partyContact1W8
partyContactConW8
partyContactLabW8
partyContactLDW8
partyContactSNPW8
partyContactPCW8
partyContactUKIPW8
partyContactGrnW8
partyContactOtherPartyW8
partyContactStrongerInW8
partyContactVoteLeaveW8
partyContactOthLeaveW8
partyContactOthRemainW8
partyContactNoneW8
partyContactCon_1W8
partyContactCon_2W8
partyContactCon_3W8
partyContactCon_4W8
partyContactCon_5W8
partyContactCon_6W8
partyContactCon_7W8
partyContactLab_1W8
partyContactLab_2W8
partyContactLab_3W8
partyContactLab_4W8
partyContactLab_5W8
partyContactLab_6W8
partyContactLab_7W8
partyContactLD_1W8
partyContactLD_2W8
partyContactLD_3W8
partyContactLD_4W8
partyContactLD_5W8
partyContactLD_6W8
partyContactLD_7W8
partyContactSNP_1W8
partyContactSNP_2W8
partyContactSNP_3W8
partyContactSNP_4W8
partyContactSNP_5W8
partyContactSNP_6W8
partyCont

redistUKIPW10
redistSNPW10
redistPCW10
redistGreenW10
antiIntellectualW10
efficacyUnderstandW10
efficacyTooMuchEffortW10
efficacyNotUnderstandW10
efficacyPolCareW10
efficacyNoMatterW10
radicalW10
harkBackW10
gayMarriageW10
womenJobsW10
genderRolesW10
renationaliseRailW10
overseasAidW10
EUIntegrationSelfW10
EUIntegrationConW10
EUIntegrationLabW10
EUIntegrationLDW10
EUIntegrationSNPW10
EUIntegrationPCW10
EUIntegrationUKIPW10
EUIntegrationGreenW10
satDemUKW10
satDemScotW10
satDemWalesW10
satDemEngW10
satDemEUW10
controlImmigW10
negotiationSpecifics_1W10
negotiationSpecifics_2W10
negotiationSpecifics_3W10
negotiationSpecifics_noneW10
expectAccessW10
handleEUNegotiateW10
euPriorityBalanceW10
ukCoopTradeW10
ukCoopMovementW10
socialIdentityGlobalLeaveW10
socialIdentityGlobalRemainW10
effectsEUUnemploymentW10
effectsEUTradeW10
effectsEUImmigrationW10
effectsEUTerrorW10
euLeaveVoiceW10
effectsEUWorkersW10
effectsEUEconW10
effectsEUFinanceW10
effectsEUNHSW10
euLeaveBigBusinessW10
euLeaveScotInde

partyContactLDW11
partyContactSNPW11
partyContactPCW11
partyContactUKIPW11
partyContactOtherPartyW11
partyContactNoneW11
partyContactCon_1W11
partyContactCon_2W11
partyContactCon_3W11
partyContactCon_4W11
partyContactCon_5W11
partyContactCon_6W11
partyContactCon_7W11
partyContactLab_1W11
partyContactLab_2W11
partyContactLab_3W11
partyContactLab_4W11
partyContactLab_5W11
partyContactLab_6W11
partyContactLab_7W11
partyContactLD_1W11
partyContactLD_2W11
partyContactLD_3W11
partyContactLD_4W11
partyContactLD_5W11
partyContactLD_6W11
partyContactLD_7W11
partyContactSNP_1W11
partyContactSNP_2W11
partyContactSNP_3W11
partyContactSNP_4W11
partyContactSNP_5W11
partyContactSNP_6W11
partyContactSNP_7W11
partyContactPC_1W11
partyContactPC_2W11
partyContactPC_3W11
partyContactPC_4W11
partyContactPC_5W11
partyContactPC_6W11
partyContactPC_7W11
partyContactUKIP_1W11
partyContactUKIP_2W11
partyContactUKIP_3W11
partyContactUKIP_4W11
partyContactUKIP_5W11
partyContactUKIP_6W11
partyContactUKIP_7W11
part

participation_4W13
participation_5W13
euRefDoOverW13
euRefVoteW13
goodConductGeneralW13
polAttentionW13
pidWeTheyW13
pidInterestedOthersW13
pidCriticisePartyW13
pidCommonPartyW13
pidRuinDayW13
pidConnectedW13
pidMyPartyW13
pidPraiseGoodW13
electionInterestW13
likeMayW13
likeCorbynW13
likeFarronW13
likeSturgeonW13
likeWoodW13
likeNuttallW13
likeLucasW13
likeConW13
likeLabW13
likeLDW13
likeSNPW13
likePCW13
likeUKIPW13
likeGrnW13
conUnitedW13
labUnitedW13
ldUnitedW13
snpUnitedW13
pcUnitedW13
ukipUnitedW13
grnUnitedW13
econPersonalRetroW13
econGenRetroW13
EUIntegrationSelfW13
EUIntegrationConW13
EUIntegrationLabW13
EUIntegrationLDW13
EUIntegrationSNPW13
EUIntegrationPCW13
EUIntegrationUKIPW13
EUIntegrationGreenW13
leftRightW13
satDemUKW13
satDemScotW13
satDemWalesW13
satDemEngW13
controlImmigW13
negotiationSpecifics_1W13
negotiationSpecifics_2W13
negotiationSpecifics_3W13
negotiationSpecifics_noneW13
expectAccessW13
handleEUNegotiateW13
euPriorityBalanceW13
effectsEUUnemploymentW13
effects

europeannessW14
partyMemberW14
partyMemberPastW14
partyMemberNowW14
discrimMenW14
discrimWomenW14
discrimChristiansW14
discrimMuslimsW14
discrimBMEW14
discrimWhiteW14
anyUniW14
homeOwnW14
homeAmtbW14
mortgagePaymentW14
secondHomeW14
secondHomeAmtbW14
rentPaymentW14
buyHomeFutureW14
savingsW14
savingsAmtbW14
debtW14
debtAmtbW14
studentloanW14
borrowEssentialsW14
smallEmergency_1W14
smallEmergency_2W14
smallEmergency_3W14
smallEmergency_4W14
smallEmergency_5W14
smallEmergency_99W14
numChildrenW14
worryEconSecurityW14
moreParl_1W14
moreParl_2W14
moreParl_3W14
moreParl_4W14
moreParl_5W14
moreParl_6W14
moreParl_7W14
moreParl_8W14
moreParl_9W14
moreParl_10W14
moreParl_111W14
propMPLocalW14
propMPWCW14
propMPFemaleW14
propMPDisabW14
propMPYoungW14
propMPRaceW14
propMPChristW14
propMPLGBTW14
propMPMuslimW14
propMPDegreeW14
impOccW14
impRaceW14
ImpReligW14
impLocalW14
impGenderW14
impAgeW14
impEdW14
impGayW14
sexualityW14
disabilityW14
groupempathy1W14
groupempathy2W14
groupempathy3W14
groupemp

sdoantiegal3W15
sdoantiegal4W15
regionEconW15
londonEconW15
emEconW15
wbEconW15
mcEconW15
wcEconW15
localEconW15
richEconW15
poorEconW15
selfEconW15
anyUniW15
knowf2f1W15
knowf2f2W15
knowf2f3W15
knowf2f4W15
knowf2f5W15
knowf2f6W15
workingStatusW15
educationW15
subjectHEW15
subjectHECurrentW15
subjClassW15
prevJobW15
selfOccStatusW15
selfOccSuperviseW15
selfOccOrgSizeW15
selfOccEmployeesW15
selfNumEmployeesW15
selfOccStatusLastW15
selfOccSuperviseLastW15
selfOccOrgSizeLastW15
selfOccEmployeesLastW15
selfNumEmployeesLastW15
blackEqualityW15
femaleEqualityW15
gayEqualityW15
scotReferendumRetroW15
scotReferendumVoteW15
euRefTurnoutRetroW15
euRefpastVoteW15
fresh_sampleW16
gorW16
ukCitizenW16
euCitizenW16
commonwealthCitizenW16
otherCitizenW16
bestOnMIIW16
polAttentionW16
pidWeTheyW16
pidInterestedOthersW16
pidCriticisePartyW16
pidCommonPartyW16
pidRuinDayW16
pidConnectedW16
pidMyPartyW16
pidPraiseGoodW16
likeMayW16
likeCorbynW16
likeCableW16
likeSturgeonW16
likePriceW16
likeBattenW16
likeF

profile_marital_statW11
profile_work_statW11
profile_work_typeW11
profile_house_tenureW11
profile_newspaperW11
profile_religionW11
profile_newspaper2W12
profile_gross_householdW12
profile_socgradeW12
profile_work_statW12
profile_house_tenureW12
profile_religionW12
profile_education_ageW13
profile_gross_householdW13
profile_marital_statW13
profile_newspaperW13
profile_socgradeW13
profile_work_statW13
profile_house_tenureW13
profile_religionW13
profile_gross_householdW14
profile_marital_statW14
profile_socgradeW14
profile_work_industryW14
profile_work_organisationW14
profile_work_responsibilityW14
profile_work_statW14
profile_work_typeW14
profile_religionW14
profile_gross_householdW15
profile_religionW15
profiles_newspaper2W16
profile_education_age
profile_ethnicity
profile_lea
profile_gross_household
profile_gross_personal
profile_household_size
profile_household_children
profile_newspaper
profile_past_vote_2005
profile_past_vote_2010
profile_religion
profile_religion_denom
profile_work

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
It is recommended to use pyarrow for on-the-wire transmission of pandas objects.


In [20]:
col

'small_mii_catW10'

In [21]:
BES_Panel[col]

0        Europe
1           NaN
2        Europe
3           NaN
4           NaN
          ...  
92582       NaN
92583       NaN
92584       NaN
92585       NaN
92586       NaN
Name: small_mii_catW10, Length: 92587, dtype: category
Categories (12, object): [Austerity/spending, Economy, Environment, Europe, ..., Other, Other Left-right, Other lib-auth, Terrorism]

In [36]:
BES_numeric.dtypes.value_counts()

float32    6253
dtype: int64

In [37]:
BES_numeric.to_msgpack( data_subfolder + "BESnumeric.msgpack" )

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  """Entry point for launching an IPython kernel.


In [38]:
BES_numeric2 = pd.read_msgpack( data_subfolder + "BESnumeric.msgpack" )

It is recommended to use pyarrow for on-the-wire transmission of pandas objects.
  exec(code_obj, self.user_global_ns, self.user_ns)


In [43]:
BES_numeric.memory_usage().sum()

2316526740

In [51]:
BES_numeric['country'].astype('Int64').memory_usage()

1573979

In [52]:
BES_numeric['country'].memory_usage()

1111044

In [62]:
BES_numeric.astype(pd.SparseDtype("float32", np.nan)).memory_usage().sum()

777903592

In [64]:
BES_numeric.astype(pd.SparseDtype("float32", np.nan)).to_pickle( data_subfolder + "BESnumeric.pkl" )

In [44]:

BES_numeric.astype('Int64').memory_usage().sum()

TypeError: cannot safely cast non-equivalent float32 to int64

In [22]:
# "|".join( BES_Panel[col].cat.categories )

In [23]:
# BES_Panel["identityStrengthAllW11"].value_counts()

In [24]:
# socLibW15|socLib2W15|traditionalismW15|respectAuthW15
# BES_Panel["nuclearGridW4"].value_counts()

In [25]:
# search(BES_Panel,"warm.*W7")
# discussantturnoutName3W2

In [26]:
# BES_Panel["warmUKW7"]

In [27]:
# BES_Panel["warmChristianW15"]

In [28]:
# rows = updated_variable_categories[updated_variable_categories["question_length"]>10000]
# rows
# import difflib
# # rows.loc[320,"question"]
# # rows.loc[397,"question"]
# cases=[(rows.loc[320,"question"], rows.loc[397,"question"])] 

# for a,b in cases:     
#     print('{} => {}'.format(a,b))  
#     for i,s in enumerate(difflib.ndiff(a, b)):
#         if s[0]==' ': continue
#         elif s[0]=='-':
#             print(u'Delete "{}" from position {}'.format(s[-1],i))
#         elif s[0]=='+':
#             print(u'Add "{}" to position {}'.format(s[-1],i))    
#     print()      

In [29]:
# updated_variable_categories[match]

In [30]:
# [x for x in BES_Panel.columns if 'selfNumEmployeesW6' in x]

In [31]:
# selfNumEmployeesW6_W12, selfNumEmployeesLastW6_W12

In [32]:
# dataset_name

In [33]:
# BES_Panel[[x for x in BES_Panel.columns  if "selfNumEmployees" in x][3]].value_counts()

In [34]:
# BES_numeric[col]

In [35]:
gc.collect()

0