In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import gc
import re

from itertools import cycle
from IPython.display import display
import pickle, os

import seaborn as sns

from IPython.core.debugger import set_trace


In [2]:
# you should clone this git to this subdirectory (in some directory - I call it BES_analysis - doesn't matter though)

if os.getcwd().split(os.sep)[-1] != 'BES_analysis_code':
    raise Exception("Stop! You're in the wrong directory - should be in 'BES_analysis_code'")

BES_code_folder   = "../BES_analysis_code/" # we should be here!
BES_small_data_files = BES_code_folder + "small data files" + os.sep
if not os.path.exists( BES_small_data_files ):
    os.makedirs( BES_small_data_files )

# we should create these if they don't already exist
BES_data_folder   = "../BES_analysis_data/"
if not os.path.exists( BES_data_folder ):
    os.makedirs( BES_data_folder )

BES_output_folder = "../BES_analysis_output/"
if not os.path.exists( BES_output_folder ):
    os.makedirs( BES_output_folder )

In [3]:
# import Jupyter_module_loader

In [4]:
encoding = "ISO-8859-1"

In [5]:
# function to load

In [6]:
## HELPER FUNCTIONS / REPLACEMENT VALUE DICTIONARIES

# Rename -> Reorder

# changing the order of some sets of categories
change_cat_dict = {"Bad time to buy|Good time to buy|Neither good nor bad time to buy|Don't know": ["Bad time to buy",
                                                                                                    "Neither good nor bad time to buy",
                                                                                                    "Good time to buy",
                                                                                                    "Don't know"],
                   "Larger|Smaller|About the same|Don't know": ["Larger", "About the same", "Smaller","Don't know"],
                   "Yes|No|99.0":       ['No', 'Yes', '99.0'],
                   "Yes|No|Don't know": ['No', 'Yes', "Don't know"],
                   "Yes|No" :           ['No', 'Yes'],                   
                   "Yes|No|Did not vote|Don't know" : ["No","Yes","Did not vote","Don't know"],
                   "Yes, voted|No, did not vote|Don't know" : ["No, did not vote", "Yes, voted", "Don't know"],
                   "I would/will not vote|Leave the EU|Stay in the EU|Don't know":
                       ['Stay in the EU', 'Leave the EU', 'I would/will not vote', "Don't know"],
                   "Mainly leave|Mainly remain|Fairly evenly divided|Don't know": ["Mainly remain",
                                                                                   "Fairly evenly divided", "Mainly leave", "Don't know"],
                   "An individual share in a company|A portfolio of different company shares|The risk is the same|Don't know|Prefer not to say":
                       ['An individual share in a company', 'The risk is the same', 'A portfolio of different company shares',"Prefer not to say","Don't know"],
                   "No, I have never been a member|Yes, I am a member of a party|I am not a member now but I used to be|Don't know":
                       ['No, I have never been a member', 'I am not a member now but I used to be', 'Yes, I am a member of a party', "Don't know"],
                   "Never or practically never|Less often than once a year|Less often but at least once a year|Less often but at least twice a year|Less often but at least once a month|Less often but at least once in two weeks|Once a week or more|Varies too much to say|I am not religious|Don't know":
                       ['I am not religious', 'Never or practically never', 'Less often than once a year',
                        'Less often but at least once a year', 'Less often but at least twice a year',
                        'Less often but at least once a month', 'Less often but at least once in two weeks',
                        'Once a week or more', "Varies too much to say","Don't know"],
                   "under £5,000 per year|£5,000 to £9,999 per year|£10,000 to £14,999 per year|£15,000 to £19,999 per year|£20,000 to £24,999 per year|£25,000 to £29,999 per year|£30,000 to £34,999 per year|£35,000 to £39,999 per year|£40,000 to £44,999 per year|£45,000 to £49,999 per year|£50,000 to £59,999 per year|£60,000 to £69,999 per year|£70,000 to £99,999 per year|£100,000 to £149,999 per year|£150,000 and over|Don't know|Prefer not to answer":
                       [ 'under £5,000 per year',
                         '£5,000 to £9,999 per year',
                         '£10,000 to £14,999 per year',
                         '£15,000 to £19,999 per year',
                         '£20,000 to £24,999 per year',
                         '£25,000 to £29,999 per year',
                         '£30,000 to £34,999 per year',
                         '£35,000 to £39,999 per year',
                         '£40,000 to £44,999 per year',
                         '£45,000 to £49,999 per year',
                         '£50,000 to £59,999 per year',
                         '£60,000 to £69,999 per year',
                         '£70,000 to £99,999 per year',
                         '£100,000 to £149,999 per year',
                         '£150,000 and over',                         
                         'Prefer not to answer',
                         "Don't know",], # change order of "don't know" and "prefer not to answer" to keep don't knows last
                   "1|2|3|4|5|6|7|8 or more|Don't know|Prefer not to say":
                       ["1","2","3","4","5","6","7","8 or more","Prefer not to say","Don't know"],
                   "The Yes side|The No side|Neither|Don't know":
                       ["The Yes side","Neither","The No side","Don't know"], # is this ordinal - meh?
                   "1|2|3|4|5|6|7|8|9|Right  10|Don't know|Left  0":
                       ["Left  0","1","2","3","4","5","6","7","8","9","Right  10","Don't know"], # lrMayW12
                   "No|Yes, received a dose|Yes, booked an appointment|Don't know":
                       ["No","Yes, booked an appointment","Yes, received a dose","Don't know"],#

                   
                  }

reorder_variable_dict = pd.DataFrame.from_dict({k : "|".join(v) for k, v in change_cat_dict.items()},orient='index').reset_index()
reorder_variable_dict.columns = ["original_cat_list","reordered_cat_list"]
reorder_variable_dict.to_csv( BES_small_data_files + "reorder_variable_dict.csv" )

# reorder categories
def re_order(ques):
    if ques in change_cat_dict.keys():
        return "|".join( change_cat_dict[ques] )
    else:
        return ques


In [7]:
## typos - more directly useful for the BES!
# typos = set(['Do\x92t know', 'Dont know', 'Donât know', 'Don??t know','DonaÂ€Â™t know'])# ,
#          "9999.0", "997.0", "222.0", "99.0", "0.0", "1.0", "2.0"   ]) # problem here, is this picks up numeric sequences ...



# Big set of actual answers **I interpet** as non-answers (and set to NaN)
# REALLY MERITS RECHECKING WHAT THE IMPACT OF THIS IS!
Weasel_answers = ["Don't know","Donâ€™t know",
                  "Prefer not to say", "Prefer not to answer", "Refused", "Unknown",
                  "Neither", "Other", "I would/will not vote", "Will not vote",
                  "I would not vote", "It depends", "Other",
                  "Don't follow politics on twitter",
                  "Yes, other", "Haven't thought about it",
                  "There wasn't a local election in my area", "No, haven't received it",
                  "I don't know what was negotiated", "I never received a response",
                  "There are not local elections in my area", "Can't remember",
                  "Varies too much to say", "Will not state a choice",
                  "All leaders equally good", "They are not eligible to vote",
                  "There are not local elections in my area", "Both/neither",
                  "Did not vote","Can't remember",
                  "Not sure","Did not choose a candidate","There wasn't a Mayoral Election in my area",
                  "NA","They did not vote","They were not eligible to vote",
]

# BES codes for NaN/other/misc/none of the above
Weasel_number_answers = [ "9999.0", "997.0", "222.0", "99.0", "0.0", "9999", "98.0" ]

# non-answer answers
Weasel_set = set(Weasel_answers) # gets rid of duplicates!


## define 'de_Weasel' function to remove Weasel Words from lists of options
## ie. "Yes|No|Don't know" -> "Yes|No"

# Weasel_answers = ["Don't know", 'Don?t know', 'Donâ??t know', 'Do\x92t know', 'Dont know', 'Donât know',
#                   "Prefer not to say", "Prefer not to answer", "Refused", "Unknown", "Neither", "Other",
#                   "I would/will not vote", "Will not vote", "No - not decided", "I would not vote", "It depends",
#                   "Other", "Don’t follow politics on Facebook", "Don't follow politics on twitter", "9999.0", "997.0",
#                   "222.0", "Yes, other", "Haven't thought about it", "There wasn't a local election in my area",
#                   "No, haven't received it", "I don't know what was negotiated", "I never received a response",
#                   "There are not local elections in my area", "Can't remember", "Varies too much to say" ]

# # non-answer answers
# Weasel_set = set(Weasel_answers) # gets rid of duplicates!

# remove weasel phrases
def de_weasel(ques): 
    return "|".join( [x for x in ques.split("|") if x not in Weasel_answers] )

def de_num_el(el):
    if el.isdigit():
        el = "%.1f" % int( el )
    return el

def de_number(ques):
    return "|".join( [de_num_el(x) for x in ques.split("|")] )

def de_num(ques):
    return [de_num_el(x) for x in ques]

def floatable(flt):
    try:
        float(flt)
        return True
    except:
        return False

# Weasel_number_answers
# Remove 'weasel' numbers
# but only if they are the last element
# or not the last element, but the next is not a number
# to avoid catching parts of sequential numerical categories
def de_weasel_numbers(ques):
    el_list = ques.split("|")
    el_list_len = len(el_list)
    remove_list = []
    for el_pos in range( 0, el_list_len ):
        if el_list[el_pos] in Weasel_number_answers:
            # last element, or not last element but next element is a not a number
            if el_pos==(el_list_len-1) or not floatable(el_list[el_pos+1]):
                remove_list.append(el_list[el_pos])

    return "|".join( [x for x in el_list if x not in remove_list] )


# version to act directly on cat.categories array
def de_weasel_nums(el_list):

    el_list_len = len(el_list)
    remove_list = []
    for el_pos in range( 0, el_list_len ):
        if el_list[el_pos] in Weasel_number_answers:
            # last element, or not last element but next element is a not a number
            if el_pos==(el_list_len-1) or not floatable(el_list[el_pos+1]):
                remove_list.append(el_list[el_pos])

    return remove_list

In [8]:
# s.cat.rename_categories([1,2,3])
# EUContactRemainConW8|EUContactRemainLabW8|EUContactRemainLDW8|
# EUContactRemainSNPW8|EUContactRemainPCW8|EUContactRemainUKIPW8|
# EUContactRemainGreenW8|EUContactRemainOthW8|EUContactRemainNoneW8|
# EUContactRemainDKW8|EUContactLeaveConW8|EUContactLeaveLabW8|
# EUContactLeaveLDW8|EUContactLeaveSNPW8|EUContactLeavePCW8|
# EUContactLeaveUKIPW8|EUContactLeaveGreenW8|EUContactLeaveOthW8|
# EUContactLeaveNoneW8|EUContactLeaveDKW8

# pattern match "EUContact*****W8"
# debateOneWatchW8|debateTwoWatchW8

# "1.0|2.0|99.0" -> 

# euRefVoteSqueezeW7 "Will not vote|Yes - Leave|Yes - Remain|No - not decided"
#    -> Stay/remain in the EU|Leave the EU|I would/will not vote|Don't know
#    HMM - RENAME AND REORDER!

# miieuW7
# "Issue stated|Nothing|Don't know" -> "Issue stated|None|Don't know"
# MIIEUW8
# "1.0|Nothing|Don't know" -> "Issue stated|None|Don't know"
# partyIdEUW7|partyIdEUW8
# "Mainly leave|Mainly remain|Fairly evenly split|Don't know" -> "Mainly remain|Fairly evenly divided|Mainly leave|Don't know"
#    HMM - RENAME AND REORDER!

# 1. campaignVisionYesW3|campaignVisionNoW3, govtNatSecuritySuccessW4
# Very unsuccessful|Fairly unsuccessful|Neither successful nor unsuccessful|Fairly successful|Very successful|Don't know
# Very unsuccessful|Somewhat unsuccessful|Neither successful or unsuccessful|Somewhat successful|Very successful|Don't know

# Fairly <-> Somewhat

# 2. euroTurnoutW1, scotReferendumTurnoutW1|scotReferendumTurnoutW2|welshTurnoutW7|scotTurnoutW7, turnoutUKGeneralW1|turnoutUKGeneralW2|turnoutUKGeneralW3|turnoutUKGeneralW4|turnoutUKGeneralW5|euRefTurnoutW7|euRefTurnoutW8
# Very unlikely that I vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I vote|Don't know
# Very unlikely that I would vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I would vote|Don't know
# There are not local elections in my area
    #|Very unlikely that I will vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I will vote|Don't know
# Very unlikely that I will vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I will vote|Don't know

# "Very unlikely that I vote", "Very unlikely that I would vote" ->  "Very unlikely that I will vote" 

rename_cat_dict = {"North East|North West": [ "No", "Yes" ],
                   "1.0|2.0|99.0": ["No", "Yes", "99.0"],
                   "Will not vote|Yes - Leave|Yes - Remain|No - not decided":
                       ['I would/will not vote', 'Leave the EU','Stay in the EU', "Don't know"], 
                   "Stay/remain in the EU|Leave the EU|I would/will not vote|Don't know":
                       ['Stay in the EU','Leave the EU',  'I would/will not vote', "Don't know"],   # euRefVote    
                   "Stay/remain in the EU|Leave the EU|Don't know":
                       ['Stay in the EU','Leave the EU', "Don't know"],   # profile_eurefvote                    
                   "Issue stated|Nothing|Don't know":  ['Issue stated', 'None', "Don't know"],
                   "1.0|Nothing|Don't know":           ['Issue stated', 'None', "Don't know"],
                   "a|b|C1|C2|d|e|Refused|Unknown" : ['A', 'B', 'C1', 'C2', 'D', 'E', 'Refused', 'Unknown'],
                   "Mainly leave|Mainly remain|Fairly evenly split|Don't know":
                       ['Mainly leave','Mainly remain', 'Fairly evenly divided', "Don't know"],
                   "Very unsuccessful|Somewhat unsuccessful|Neither successful or unsuccessful|Somewhat successful|Very successful|Don't know":
                       ['Very unsuccessful', 'Fairly unsuccessful', 'Neither successful nor unsuccessful',
                        'Fairly successful', 'Very successful', "Don't know"],
                   "Very unlikely that I vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I vote|Don't know":
                       ['Very unlikely that I will vote', 'Fairly unlikely', 'Neither likely nor unlikely',
                        'Fairly likely', 'Very likely that I will vote', "Don't know"],
                   "Very unlikely that I would vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I would vote|Don't know":
                       ['Very unlikely that I will vote', 'Fairly unlikely', 'Neither likely nor unlikely',
                        'Fairly likely', 'Very likely that I will vote', "Don't know"],
                   "No, did not vote|Yes, voted|3.0|4.0|5.0|Don't know":   
                       ["Very unlikely that I will vote", "Fairly unlikely", 'Neither likely nor unlikely',
                        "Fairly likely", "Very likely that I will vote", "Don't know"], #londonTurnoutW7
                   'No, I do not regard myself as belonging to any particular religion.|Yes - Church of England/Anglican/Episcopal|Yes - Roman Catholic|Yes - Presbyterian/Church of Scotland|Yes - Methodist|Yes - Baptist|Yes - United Reformed Church|Yes - Free Presbyterian|Yes - Brethren|Yes - Judaism|Yes - Hinduism|Yes - Islam|Yes - Sikhism|Yes - Buddhism|Yes - Other|Prefer not to say|Yes Orthodox Christian|Yes - Pentecostal (e.g. Assemblies of God, Elim Pentecostal Church, New Testament Church of God, Redeemed Christian Chur|Yes - Evangelical independent/non-denominational (e.g. FIEC, Pioneer, Vineyard, Newfrontiers)':
                       ["No, I do not regard myself as belonging to any particular religion.","Yes - Church of England/Anglican/Episcopal",
                        "Yes - Roman Catholic","Yes - Presbyterian/Church of Scotland","Yes - Methodist","Yes - Baptist",
                        "Yes - United Reformed Church","Yes - Free Presbyterian","Yes - Brethren","Yes - Judaism","Yes - Hinduism",
                        "Yes - Islam","Yes - Sikhism","Yes - Buddhism","Yes - Other","Prefer not to say","Yes - Orthodox Christian",
                        "Yes - Pentecostal","Yes - Evangelical /independent/non-denominational"], #xprofile_religionW10
                   'No, I do not regard myself as belonging to any particular religion.|Yes - Church of England/Anglican/Episcopal|Yes - Roman Catholic|Yes - Presbyterian/Church of Scotland|Yes - Methodist|Yes - Baptist|Yes - United Reformed Church|Yes - Free Presbyterian|Yes - Brethren|Yes - Judaism|Yes - Hinduism|Yes - Islam|Yes - Sikhism|Yes - Buddhism|Yes - Other|Prefer not to say|Yes - Orthodox Christian|Yes - Pentecostal (e.g. Assemblies of God, Elim Pentecostal Church, New Testament Church of God, Redeemed Christian Chur|Yes - Evangelical - independent/non-denominational (e.g. FIEC, Pioneer, Vineyard, Newfrontiers)':
                       ["No, I do not regard myself as belonging to any particular religion.","Yes - Church of England/Anglican/Episcopal",
                        "Yes - Roman Catholic","Yes - Presbyterian/Church of Scotland","Yes - Methodist","Yes - Baptist",
                        "Yes - United Reformed Church","Yes - Free Presbyterian","Yes - Brethren","Yes - Judaism","Yes - Hinduism",
                        "Yes - Islam","Yes - Sikhism","Yes - Buddhism","Yes - Other","Prefer not to say","Yes - Orthodox Christian",
                        "Yes - Pentecostal","Yes - Evangelical /independent/non-denominational"], #xprofile_religionW10                   
                   'Own - outright|Own - with a mortgage|Own (part-own) - through shared ownership scheme (i.e. pay part mortgage, part rent)|Rent - from a private landlord|Rent - from my local authority|Rent - from a housing association|Neither - I live with my parents, family or friends but pay some rent to them|Neither - I live rent-free with my parents, family or friends|Other|9999':
                       [ 'Own outright',
                         'Own with a mortgage',
                         'Own (part-own) through shared ownership scheme (i.e. pay part mortgage, part rent)',
                         'Rent from a private landlord',
                         'Rent from my local authority',
                         'Rent from a housing association',
                         'Neither I live with my parents, family or friends but pay some rent to them',
                         'Neither I live rent-free with my parents, family or friends',
                         'Other',
                         '9999'], #profile_house_tenureW11|profile_house_tenureW12|profile_house_tenureW13
                   "I voted 'No' (Scotland should not be an independent country)|I voted 'Yes' (Scotland should be an independent country)|111.0|Don't know":
                       ["No","Yes","Did not vote","Don't know"], # referendumrecall
                   "Voted Yes|Voted No|Did not vote|Can't remember":
                       ["Yes","No","Did not vote","Don't know"], # scotRefVoteW4_
                   "No|Yes|3.0|Don't know":
                       ["No","Yes","Did not vote","Don't know"], # regretsIHaveAFewEUW10|regretsIHaveAFewEUW11   
                   "No|Yes|3|Don't know":
                       ["No","Yes","Did not vote","Don't know"], # regretsIHaveAFewEU W11_only|regretsIHaveAFew W10_only 
                   "Professional or higher technical work - work that requires at least degree-level qualifications (e.g. doctor, accountant|Manager or Senior Administrator (e.g. company director, finance manager, personnel manager, senior sales manager, senior|Clerical (e.g. clerk, secretary)|Sales or Services (e.g. commercial traveller, shop assistant, nursery nurse, care assistant, paramedic)|Foreman or Supervisor of Other Workers (e.g building site foreman, supervisor of cleaning workers)|Skilled Manual Work (e.g. plumber, electrician, fitter)|Semi-Skilled or Unskilled Manual Work (e.g. machine operator, assembler, postman, waitress, cleaner, labourer, driver, b|Other|Have never worked":
                       ['Professional or higher technical work / higher managerial - work that requires at least degree-level qualifications (e.g',
                        'Manager or Senior Administrator / intermediate managerial / professional (e.g. company director, finance manager, person',
                        'Clerical/junior managerial/professional/administrator (e.g. office worker, student doctor, sales person, clerk, secretar',
                        'Sales or Services (e.g. commercial traveller, shop assistant, nursery nurse, care assistant, paramedic)',
                        'Foreman or Supervisor of Other Workers (e.g. building site foreman, supervisor of cleaning workers)',
                        'Skilled Manual Work (e.g. plumber, electrician, fitter)',
                        'Semi-Skilled or Unskilled Manual Work (e.g. machine operator, assembler, postman, waitress, cleaner, labourer, driver, b',
                        'Other',
                        'Have never worked'], # work_type -> profile_work_typeW7
                   "No formal qualifications|Youth training certificate/skillseekers|Recognised trade apprenticeship completed|Clerical and commercial|City & Guilds certificate|City & Guilds certificate - advanced|onc|CSE grades 2-5|CSE grade 1, GCE O level, GCSE, School Certificate|Scottish Ordinary/ Lower Certificate|GCE A level or Higher Certificate|Scottish Higher Certificate|Nursing qualification (eg SEN, SRN, SCM, RGN)|Teaching qualification (not degree)|University diploma|University or CNAA first degree (eg BA, B.Sc, B.Ed)|University or CNAA higher degree (eg M.Sc, Ph.D)|Other technical, professional or higher qualification|Don't know|Prefer not to say":
                       ['No formal qualifications','Youth training certificate/skillseekers','Recognised trade apprenticeship completed',
                        'Clerical and commercial','City and Guild certificate','City and Guild certificate - advanced','onc','CSE grades 2-5',
                        'CSE grade 1, GCE O level, GCSE, School Certificate','Scottish Ordinary/ Lower Certificate','GCE A level or Higher Certificate',
                        'Scottish Higher Certificate','Nursing qualification (eg SEN, SRN, SCM, RGN)','Teaching qualification (not degree)',
                        'University diploma','University or CNAA first degree (eg BA, B.Sc, B.Ed)','University or CNAA higher degree (eg M.Sc, Ph.D)',
                        'Other technical, professional or higher qualification',"Don't know",'Prefer not to say'], # W6_comb: qeducationW6
                   "Strongly disapprove|Disapprove|Don't know":
                       ["Approve","Disapprove","Don't know"], # approveEUW2 # W7_comb, W10_comb, W13_comb, W8_comb, W9_comb
                   '1 to 24 employees|25 to 499 employees|500 or more employees|':
                       ['1 to 24 employees','25 to 499 employees','500 or more employees',"Don't know"], #fatherNumEmployees,motherNumEmployees #W6_comb,W5_comb,W5_only,W3_comb
                   "Yes, voted|No, did not vote|Don't know":
                       ['Yes',"No","Don't know"],
                   "No, did not vote|Yes, voted|Don't know":
                       ['No','Yes',"Don't know"],
                   "No, did not vote|Yes, voted|2.0":
                       ['No','Yes',"Don't know"],
                   "Strongly disagree|Disagree|Neither nor disagree|Agree|Strongly agree|Don't know":
                       ["Strongly disagree","Disagree","Neither agree nor disagree","Agree","Strongly agree","Don't know"],# euFinancialHelpW2 W3-6_comb
                   "I am very unsure what will happen|I am quite unsure what will happen|I am quite sure what will happen|I am very sure what will happen|Don't know":
                       ["I am very unsure what would happen","I am quite unsure what would happen","I am quite sure what would happen","I am very sure what would happen","Don't know"], # certaintyScotUnionW3 W3-5_comb
                   "0.0|1.0|2.0|3.0|4.0|5.0|6.0|7.0|997.0":
                       ["0 days","1 day","2 days","3 days","4 days","5 days","6 days","7 days","Don't know"], # discussPolDaysW5	W5_comb
                   "A major transfer of powers from Westminster to the Scottish Parliament (\"devo-max\")|Some powers will be transferred but well short of \"devo-max\"|No change to the relationship between Westminster and the Scottish Parliament":
                       ["A major transfer of powers from Westminster to the Scottish Parliament (devo-max)","Some powers will be transferred but well short of devo-max","No change to the relationship between Westminster and the Scottish Parliament"], # expectationManipCheckW1 # W13,10,9,8,7 vs W6-3_comb
                   "No, I did not vote|Yes, I voted|There wasn't a local election in my area|Don't know":
                       ["No, did not vote","Yes, voted","There wasn't a local election in my area","Don't know"], # localTurnoutRetroW2 W3-6_comb
                   "Focuses mainly on criticising other parties|2.0|3.0|4.0|Focuses mainly on putting forward their own policies and personalities|Don't know":
                       ["1 - Focused mainly on criticising other parties","2.0","3.0","4.0","5 - Focused mainly on putting forward their own policies and personalities","Don't know"], # <party>ToneW5 # W5-6_comb, W5_only
                   "Environmental Policy|Defence|Education|Pensions":
                       ["No, I think they *will not* vote","Yes, I think they *will* vote","They are not eligible to vote","Don't know"], # discussantturnoutName1-3W4 # W4-5_comb
                   "Employers in large organisations and higher managerial|Higher professional occupations|Lower professional and managerial and higher supervisory|Intermediate occupations|Employers in small organisations and own account workers|Lower suprivsory and technical occupations|Semi-routine occupations|Routine occupations":
                       ['Employers in large organisations and higher managerial', 'Higher professional occupations',
                        'Lower professional and managerial and higher supervisory', 'Intermediate occupations',
                        'Employers in small organisations and own account workers', 'Lower supervisory and technical occupations',
                        'Semi-routine occupations', 'Routine occupations'], # ns_sec_analytic	 W5_only, W3-6_comb                   
                   "Employers in large organisations and higher managerial|Higher professional occupations|Lower professional and managerail and higher supervisory|Intermediate occupations|Employers in small organisations and own account workers|Lower suprivsory and technical occupations|Semi-routine occupations|Routine occupations":
                       ['Employers in large organisations and higher managerial', 'Higher professional occupations',
                        'Lower professional and managerial and higher supervisory', 'Intermediate occupations',
                        'Employers in small organisations and own account workers', 'Lower supervisory and technical occupations',
                        'Semi-routine occupations', 'Routine occupations'], # ns_sec_analytic	 W5_only, W3-6_comb    # v slight typo!
                   "A major transfer of powers from Westminster to the Scottish Parliament (\"devo-max\")|Some powers will be transferred but well short of \"devo-max\"|No change to the relationship between Westminster and the Scottish Parliament|Don't know":
                       ['A major transfer of powers from Westminster to the Scottish Parliament (devo-max)',
                        'Some powers will be transferred but well short of devo-max',
                        'No change to the relationship between Westminster and the Scottish Parliament',"Don't know"], # expectationManipCheckW1 W3-6_comb
                   "Employers in large establishments|Higher managerial and administrative occupations|L3.1 'Traditional' employees|L3.2 'New' employees|L3.3 'Traditional' self-employed|L3.4 'New' self-employed|L4.1 'Traditional' employees|L4.2 'New' employees|L4.3 'Traditional' self-employed|L4.4 'New' self-employed|Lower managerial and administrative occupations|Higher supervisory occupations|L7.1 Intermediate clerical and administrative occupations|L7.2 Intermediate sales and service occupations|L7.3 Intermediate technical and auxiliary occupations|L7.4 Intermediate engineering occupations|L8.1 Employers in small establishments in industry, commerce, services etc.|L8.2 Employers in small establishments in agriculture|L9.1 Own account workers (non-professional)|L9.2 Own account workers (agriculture)|Lower supervisory occupations|L11.1 Lower technical craft occupations|L11.2 Lower technical process operative occupations|L12.1 Semi-routine sales occupations|L12.2 Semi-routine service occupations|L12.3 Semi-routine technical occupations|L12.4 Semi-routine operative occupations|L12.5 Semi-routine agricultural occupations|L12.6 Semi-routine clerical occupations|L12.7 Semi routine childcare occupations|L13.1 Routine sales and service occupations|L13.2 Routine production occupations|L13.3 Routine technical occupations|L13.4 Routine operative occupations|L13.5 Routine agricultural occupations":
                       ['Employers in large establishments', 'Higher managerial and administrative occupations',
                        'L3.1 Traditional employees', 'L3.2 New employees', 'L3.3 Traditional self-employed',
                        'L3.4 New self-employed', 'L4.1 Traditional employees', 'L4.2 New employees',
                        'L4.3 Traditional self-employed', 'L4.4 New self-employed', 'Lower managerial and administrative occupations',
                        'Higher supervisory occupations', 'L7.1 Intermediate clerical and administrative occupations',
                        'L7.2 Intermediate sales and service occupations', 'L7.3 Intermediate technical and auxiliary occupations',
                        'L7.4 Intermediate engineering occupations', 'L8.1 Employers in small establishments in industry, commerce, services etc.',
                        'L8.2 Employers in small establishments in agriculture', 'L9.1 Own account workers (non-professional)',
                        'L9.2 Own account workers (agriculture)', 'Lower supervisory occupations', 'L11.1 Lower technical craft occupations',
                        'L11.2 Lower technical process operative occupations', 'L12.1 Semi-routine sales occupations',
                        'L12.2 Semi-routine service occupations', 'L12.3 Semi-routine technical occupations', 'L12.4 Semi-routine operative occupations',
                        'L12.5 Semi-routine agricultural occupations', 'L12.6 Semi-routine clerical occupations', 'L12.7 Semi routine childcare occupations',
                        'L13.1 Routine sales and service occupations', 'L13.2 Routine production occupations', 'L13.3 Routine technical occupations',
                        'L13.4 Routine operative occupations', 'L13.5 Routine agricultural occupations'],
                   "Employers in large establishments|Higher managerial and administrative occupations|L3.1 ?Traditional? employees|L3.2 ?New? employees|L3.3 ?Traditional? self-employed|L3.4 ?New? self-employed|L4.1 ?Traditional? employees|L4.2 ?New? employees|L4.3 ?Traditional? self-employed|L4.4 ?New? self-employed|Lower managerial and administrative occupations|Higher supervisory occupations|L7.1 Intermediate clerical and administrative occupations|L7.2 Intermediate sales and service occupations|L7.3 Intermediate technical and auxiliary occupations|L7.4 Intermediate engineering occupations|L8.1 Employers in small establishments in industry, commerce, services etc.|L8.2 Employers in small establishments in agriculture|L9.1 Own account workers (non-professional)|L9.2 Own account workers (agriculture)|Lower supervisory occupations|L11.1 Lower technical craft occupations|L11.2 Lower technical process operative occupations|L12.1 Semi-routine sales occupations|L12.2 Semi-routine service occupations|L12.3 Semi-routine technical occupations|L12.4 Semi-routine operative occupations|L12.5 Semi-routine agricultural occupations|L12.6 Semi-routine clerical occupations|L12.7 Semi routine childcare occupations|L13.1 Routine sales and service occupations|L13.2 Routine production occupations|L13.3 Routine technical occupations|L13.4 Routine operative occupations|L13.5 Routine agricultural occupations":
                       ['Employers in large establishments', 'Higher managerial and administrative occupations',
                        'L3.1 Traditional employees', 'L3.2 New employees', 'L3.3 Traditional self-employed',
                        'L3.4 New self-employed', 'L4.1 Traditional employees', 'L4.2 New employees',
                        'L4.3 Traditional self-employed', 'L4.4 New self-employed', 'Lower managerial and administrative occupations',
                        'Higher supervisory occupations', 'L7.1 Intermediate clerical and administrative occupations',
                        'L7.2 Intermediate sales and service occupations', 'L7.3 Intermediate technical and auxiliary occupations',
                        'L7.4 Intermediate engineering occupations', 'L8.1 Employers in small establishments in industry, commerce, services etc.',
                        'L8.2 Employers in small establishments in agriculture', 'L9.1 Own account workers (non-professional)',
                        'L9.2 Own account workers (agriculture)', 'Lower supervisory occupations', 'L11.1 Lower technical craft occupations',
                        'L11.2 Lower technical process operative occupations', 'L12.1 Semi-routine sales occupations',
                        'L12.2 Semi-routine service occupations', 'L12.3 Semi-routine technical occupations', 'L12.4 Semi-routine operative occupations',
                        'L12.5 Semi-routine agricultural occupations', 'L12.6 Semi-routine clerical occupations', 'L12.7 Semi routine childcare occupations',
                        'L13.1 Routine sales and service occupations', 'L13.2 Routine production occupations', 'L13.3 Routine technical occupations',
                        'L13.4 Routine operative occupations', 'L13.5 Routine agricultural occupations'],
                   "1|2":
                       ["No","Yes"], # tryReduceImmigDKW4, achieveReduceImmigUKIPW4, achieveReduceImmigGrnW4, achieveReduceImmigDKW4, tryReduceInequalityDKW4, successReduceInequalityDKW4 # W4-5_comb # sharedContentOnline_1-5W4 W5_comb # voteMethodEurope_dkW2, discussantsAskedYouToVote_DKW2 ,discussantsAccompaniedVote_dkW2, referendumContact_dkW2 # W3_comb
                   "1.0|2.0":
                       ["No","Yes"], # tryReduceImmigDKW4, achieveReduceImmigUKIPW4, achieveReduceImmigGrnW4, achieveReduceImmigDKW4, tryReduceInequalityDKW4, successReduceInequalityDKW4 # W4-5_comb # sharedContentOnline_1-5W4 W5_comb # voteMethodEurope_dkW2, discussantsAskedYouToVote_DKW2 ,discussantsAccompaniedVote_dkW2, referendumContact_dkW2 # W3_comb
                   "Should definitely be illegal|Should probably be illegal|Should probably be legal|Should definitely be legal|5.0":
                       ["Should definitely be illegal","Should probably be illegal","Should probably be legal","Should definitely be legal","Don't know"], # zeroHourContractW6
                  }


rename_variable_dict = pd.DataFrame.from_dict( {k : "|".join(v) for k, v in rename_cat_dict.items()} , orient='index' ).reset_index()
rename_variable_dict.columns = ["original_cat_list","renameed_cat_list"]
rename_variable_dict.to_csv( BES_small_data_files + "rename_variable_dict.csv" )

def re_name(ques):
    if ques in rename_cat_dict.keys():
        return "|".join( rename_cat_dict[ques] )
    else:
        return ques

In [9]:
## COLUMNS THAT EITHER LACK ALL DATA OR HAVE ACTUAL ERRORS
# check back on these periodically - one assumes they will get fixed!
# maybe tell them about them so that they can?

# {'changeIssue1W9', 'conLeaderLikeW9'}
# these variables appear to have disappeared! Fixed in an updated version?

ignore_list = ['whichPartiesHelped_99W6',
               'partyContactGrnW1',
               'partyContactGrnW2',
               'partyContactGrnW3',
               'reasonNotRegistered_noneW2',               
               'reasonNotRegistered_noneW3',
               'reasonNotRegistered_noneW4',
               'reasonNotRegistered_noneW6',
               'reasonNotRegistered_noneW7',
               'reasonNotRegistered_noneW8',
               'reasonNotRegistered_none',
               'partyContactSNPW1',
               'partyContactSNPW2',
               "locusControlW9",
               "generalElecCertaintyW1", # wave 10 forwards
               "generalElecCertaintyW2",
               "generalElecCertaintyW3",
               "londonMayorVoteW7",
               "fatherNumEmployeesW4",
               "motherNumEmployeesW4",
               "profile_pcon_2010_newW3", # W3_comb: this is parl. constit. ... but by number!
               "euroElectionVoteYoungW2", # W3_comb: all NaNs!
               "profile_GOR_pdlW4", # W4_comb: misnamed selection, probably fixable 
               "participation_111W5", ### -->
               "sharedContentOnline_111W5",
               "sharedContentOnline_99W5", ### <-- W5_comb "Got a lot worse|Got a little worse" doesn't look right (indicator vars?)
               "csplScotRefW3", ### W5_comb: "North East" - just broken!
              ]

#- approveEUW2 'Strongly disapprove|Disapprove|Don't know' - should be "approve|disapprove|don't know"??? NOT SURE (distribution weird)
#- whichPartiesHelped_99W6 - answer set = ["No"]
#- partyContactGrnW1 ... reasonNotRegistered_noneW8 answer set = ["No", "Don't know"]
# -partyContactSNPW1, partyContactSNPW2 - answer set = ["Don't know"]
# -changeIssue1W9|conLeaderLikeW9|locusControlW9 - answer set = ["No formal qualifications"]

In [10]:
## define 'prune' function to prune wave indicators and return question stubs
## ie. "ptvConW1|ptvLabW1" -> "ptvCon|ptvLab"

def prune(x):
    
    y= []
    for el in x.split("|"):
        match_attempt = re.match('(\w*?)_?(W[0-9]+)+' , el )   
        if match_attempt:
            el = match_attempt.groups()[0]
        y.append(el)
    # should we ditch identical repeats?
    # return "|".join(set(y)) NEEDS TO BE TESTED
    return "|".join(y)

               
def prune2(x):
    
    y= []
    for el in x.split("|"):
        # fgdfhfghg_5, fgdfhfghg_4, fgdfhfghg_3 -> fgdfhfghg
        # problem - indicator variables fgdfhfghg_99, fgdfhfghg_111 really are different!
        # solution - leave them distinct
        indicator_variable = re.match('(\w*?)_?(99|111)' , el )       
        match_attempt = re.match('(\w*?)_?[0-9]+' , el )   
        if (not indicator_variable) and (match_attempt):
            el = match_attempt.groups()[0]
        y.append(el)
    # should we ditch identical repeats?
    # return "|".join(set(y)) NEEDS TO BE TESTED
    return "|".join(y)


def hardcoded_fix(col,cat_list):
    
    var_type.loc[ col , "dtype" ]           = BES_Panel[col].dtype.name
    if (var_type.loc[ col , "dtype" ] == 'category'):
        var_type.loc[ col , "cat_all_strings" ] = np.all([isinstance(x,str) for x in BES_Panel[ col ].cat.categories])
        
    BES_Panel[col] = \
        BES_Panel[col].astype("category").cat.rename_categories( cat_list.split("|") )
        
    var_type.loc[ col , "dataset_specific_hardcoded_fix" ] = "|".join( BES_Panel[col].cat.categories.values )
    
# "Â–" -> "-"
# "Â£" -> "£"

# " â€“ " -> " "
# " Â‘" -> " "
# "Â’ " -> " "

# "Â‘" -> "'"
# "Â’" -> "'"
# "Â€Â™" -> "'"
# "â??" -> "'"
# "â€™" -> "'"    

# detect any matching pattern of weird Â stuff in cat1|cat2|cat3... string
# return the fixed version of string if present
# return None if not
def fix_a_hat_chars(cat_string):
    cat_array = cat_string.split("|")
    a_hat_present = False
    for el_no in range( 0, len(cat_array) ):
        el = cat_array[el_no]
        el = re.sub( "SiÃƒÂ¢n C. Jame|SiÃ¢n C. James|SiÃ¢n C. Jame|Siân C. James", "Sian C. James", el)
        el = re.sub( "ThÃ©rÃ¨se  Coff|Thérèse  Coffey", "Therese  Coffey", el)
        el = re.sub( "RA©union|RÃ©union|RAÂ©union|RÃƒÂ©union", "Reunion", el)
        el = re.sub( "\xa0Lower supervisory occupations", "Lower supervisory occupations", el)
        el = re.sub( "Don‘t know|Don?t know|Dona??t know|Dona€™t know|Donâ€™t know|Don’t know|Don‘t know|Don\x91t know|Don\x92t know|Dona\x80\x99t know|Do\x92t know","Don't know", el  )
        el = re.sub( "Â–|\x96|–", "-", el )
        el = re.sub( "Â£|\xc2£", "£", el )
        el = re.sub( "\xa0|\sâ€“\s|\s\xe2\x80\x93\s|\sÂ‘|Â’\s" , " ", el )
        el = re.sub( "Â‘|Â’|Â€Â™|â\?\?|\x80\x99|â€™|\xe2\x80\x99|â|â\x80\x99|\?\?|\x92|‘|\x91|’", "'", el )
        el = re.sub( "'\u2013'", "-", el )
        
        
        
        if el != cat_array[el_no]:
            a_hat_present = True
            cat_array[el_no] = el
            
    if a_hat_present:
        return cat_array
    else:
        return None
        
## typos - more directly useful for the BES!
# typos = set(['Do\x92t know', 'Dont know', 'Donât know', 'Don??t know','DonaÂ€Â™t know'])# ,

In [11]:
def create_var_list( variable_categories ):
    # load question_categories_correct (it could have been updated)
    # input: 
    # output:
    # var_cat_dict_pruned, var_cat_dict_pruned_2

    # flipping list
    var_cat_dict = dict()
    # range defined by types that exist in question_categories_correct.csv
    type_range = set(variable_categories["type"].values)

    for typ in type_range:

        e = variable_categories[variable_categories.type==typ]["column_name"].values
        var_cat_dict[typ] = [item for sublist in [i.split("|") for i in e] for item in sublist]
        var_cat_dict[typ] = [item for item in var_cat_dict[typ] if item not in ignore_list]

    # dictionary comprehension to prune column-names to wave non-specific stubs
    # list(set()) gets rid of repetitions
    var_cat_dict_pruned   = {k: list(set([prune(x)  for x in v])) for k, v in var_cat_dict.items()}
    var_cat_dict_pruned_2 = {k: list(set([prune2(x) for x in v])) for k, v in var_cat_dict_pruned.items()}
    
    return ( var_cat_dict_pruned , var_cat_dict_pruned_2 )

In [12]:
def careful_isnan(x):
    return ( (not isinstance(x,str)) and np.isnan(x) )

def careful_replace( col,replace_dict ):
    var_type.loc[col,"dtype"] = BES_Panel[col].dtype.name
    if (var_type.loc[ col , "dtype" ] == 'category'):
        var_type.loc[ col , "cat_all_strings" ] = np.all([isinstance(x,str) for x in BES_Panel[ col ].cat.categories])     
    
    BES_Panel[col] = BES_Panel[col]\
        .apply(lambda x: x if careful_isnan(x) else replace_dict[x] )\
        .astype('category').cat.set_categories( replace_dict.values() , ordered = True)
        
    var_type.loc[ col , "dataset_specific_hardcoded_fix" ] = "|".join( BES_Panel[col].cat.categories.values )

def careful_replace_and_set_cats( col, replace_dict, final_cats ):
    var_type.loc[col,"dtype"] = BES_Panel[col].dtype.name
    if (var_type.loc[ col , "dtype" ] == 'category'):
        var_type.loc[ col , "cat_all_strings" ] = np.all([isinstance(x,str) for x in BES_Panel[ col ].cat.categories])     
    
    BES_Panel[col] = BES_Panel[col]\
        .apply(lambda x: x if x not in replace_dict.keys() else replace_dict[x] )\
        .astype('category').cat.set_categories( final_cats , ordered = True)
        
    var_type.loc[ col , "dataset_specific_hardcoded_fix" ] = "|".join( BES_Panel[col].cat.categories.values )    
    

In [14]:
if ( dataset_name in ["W25_comb"] ):
    print("!!!!!!!!!!!!!!!!!!!!")
    col = 'riskScaleW8' 
    replace_dict = {'Most risk averse':0, '2':1,'3':2 ,'4':3,
                    '5':4,'6':5,'7':6,'8':7,'9':8,'10':9,'11':10,'12':11,
                    '13':12,'14':13,'15':14, 'Most risk inclined':15}

BES_Panel[col].replace(replace_dict)

NameError: name 'dataset_name' is not defined

In [16]:
def hard_coded_fixes( dataset_name ):

    ## dataset specific issues
    # (i.e. probably what I should have done all along!)
    
    col = "age"
    if dataset_name=="W22_only":
        BES_Panel[col]= BES_Panel[col].astype('category')
        
    if ( dataset_name in ["W25_comb"] ):
        print("!!!!!!!!!!!!!!!!!!!!")
        col = 'riskScaleW8' 
        replace_dict = {'Most risk averse':0, '2':1,'3':2 ,'4':3,
                        '5':4,'6':5,'7':6,'8':7,'9':8,'10':9,'11':10,'12':11,
                        '13':12,'14':13,'15':14, 'Most risk inclined':15}
    
        BES_Panel[col] = BES_Panel[col].replace(replace_dict)
        col = 'riskScaleW20' 
        replace_dict = {'Most risk averse':0, '2':1,'3':2 ,'4':3,
                        '5':4,'6':5,'7':6,'8':7,'9':8,'10':9,'11':10,'12':11,
                        '13':12,'14':13,'15':14, 'Most risk inclined':15}
    
        BES_Panel[col] = BES_Panel[col].replace(replace_dict)        

    if ( dataset_name in ["W23_only"] ):
        col = 'preschoolKidsInHouseW21_'
        replace = {0.0:"No",1.0:"Yes",9.0:"Don't know"}
        final_cats = ["No", "Yes", "Don't know"]
        careful_replace_and_set_cats( col, replace, final_cats )        
        BES_Panel[col] = BES_Panel[col].astype('category')

    if ( dataset_name in ["W23_only"] ):
        col = 'schoolKidsInHouseW21_'
        replace = {0.0:"No",1.0:"Yes",9.0:"Don't know"}
        final_cats = ["No", "Yes", "Don't know"]
        careful_replace_and_set_cats( col, replace, final_cats )        
        BES_Panel[col] = BES_Panel[col].astype('category')        
        
    if ( dataset_name in ["W23_only"] ):
        col = 'sickElderlyInHouseW21_'
        replace = {0.0:"No",1.0:"Yes",9.0:"Don't know"}
        final_cats = ["No", "Yes", "Don't know"]
        careful_replace_and_set_cats( col, replace, final_cats )        
        BES_Panel[col] = BES_Panel[col].astype('category')           
        
    if ( dataset_name in ["W23_only"] ):
        col = 'noDependentsInHouseW21_'
        replace = {0.0:"No",1.0:"Yes",9.0:"Don't know"}
        final_cats = ["No",  "Yes", "Don't know"]
        careful_replace_and_set_cats( col, replace, final_cats )        
        BES_Panel[col] = BES_Panel[col].astype('category')           
        
    if ( dataset_name in ["W23_only"] ):
        col = 'privPrimSchlW1_'
        replace = {0.0:"No",1.0:"Yes",9.0:"Don't know"}
        final_cats = ["No",  "Yes", "Don't know"]
        careful_replace_and_set_cats( col, replace, final_cats )        
        BES_Panel[col] = BES_Panel[col].astype('category')           
        
    if ( dataset_name in ["W23_only"] ):
        col = 'privScndSchlW1_'
        replace = {0.0:"No",1.0:"Yes",9.0:"Don't know"}
        final_cats = ["No",  "Yes", "Don't know"]
        careful_replace_and_set_cats( col, replace, final_cats )        
        BES_Panel[col] = BES_Panel[col].astype('category')         
        
    if ( dataset_name in ["W23_only"] ):
        col = 'privScndSchlW1_'
        replace = {0.0:"No",1.0:"Yes",9.0:"Don't know"}
        final_cats = ["No",  "Yes", "Don't know"]
        careful_replace_and_set_cats( col, replace, final_cats )        
        BES_Panel[col] = BES_Panel[col].astype('category')   
        
    if ( dataset_name in ["W23_only"] ):
        col = 'privScndSchlW1_'
        replace = {0.0:"No",1.0:"Yes",9.0:"Don't know"}
        final_cats = ["No",  "Yes", "Don't know"]
        careful_replace_and_set_cats( col, replace, final_cats )        
        BES_Panel[col] = BES_Panel[col].astype('category')   

    if ( dataset_name in ["W23_only"] ):
        col = 'neverPrivSchlW1_'
        replace = {0.0:"No",1.0:"Yes",9.0:"Don't know",9999.0:"Don't know"}
        final_cats = ["No", "Yes", "Don't know"]
        careful_replace_and_set_cats( col, replace, final_cats )        
        BES_Panel[col] = BES_Panel[col].astype('category')         
                
    if ( dataset_name in ["W23_only"] ):
        col = 'speakWelshW1_'
        replace = {0.0:"No",1.0:"Yes, but not fluently",2.0:"Yes, fluently",9999.0:"Don't know"}
        final_cats = ["No", "Yes", "Don't know"]
        careful_replace_and_set_cats( col, replace, final_cats )        
        BES_Panel[col] = BES_Panel[col].astype('category')         

        
        
    # "BES2017_W13_v1.0.dta"

    ## Should I make this *filename specific* or *wave specific*?
    ## Comes down to a question of whether it's safer to assume that things get fixed
    ## or that they probably won't get fixed


    # gor W3_only, W2_only (3->-4, category -> object)
    # # grr - some point BES switched from ONS codes to text names
    # # I feel like percolating the change backwards would have been a good idea
    # ONS codes available here:
    # http://webarchive.nationalarchives.gov.uk/20160128190831/http://www.ons.gov.uk/ons/guide-method/geography/beginner-s-guide/administrative/england/government-office-regions/index.html

    # variable name collision (BES 'disability' (wave 6 variable) and yougov profile 'disability)
    if ("disability" in BES_Panel.columns) and (dataset_name != "W6_only"):
        BES_Panel.rename(columns={"disability":"profile_disability"}, inplace=True)
    # similar collision 
#     if ("housing" in BES_Panel.columns) and (dataset_name == "W13_only"):
#         BES_Panel.rename(columns={"housing":"profile_house_tenure"}, inplace=True)  

    # whole column is NaN!
    col = "profile_socialgrade_cie"
    if (col in BES_Panel.columns) and (dataset_name in [ "W6_only", "W4_only", "W3_only", "W2_only", "W1_only" ]):
        var_type.loc[col,"type"] = -2 # set to ignore

    # whole column is NaN!
    col = 'discussPolDays'
    if (col in BES_Panel.columns) and (dataset_name in [ "W3_only" ]):
        var_type.loc[col,"type"] = -2 # set to ignore
        
    # whole column is NaN!
    col = 'partyContactSNP'
    if (col in BES_Panel.columns) and (dataset_name in [ "W2_only","W1_only" ]):
        var_type.loc[col,"type"] = -2 # set to ignore        

        
        
    # now we have actual categories that don't match different versions *of that exact same variable*
    # and can't even be attributed to weasel terms (e.g. 99 -> Don't know, 98 -> Other)
    # so, I'll try just replacing them with NaNs
    
    if ( dataset_name in ["W13_comb"] ):
        col = 'scotRefVoteW4_W13'
        replace = {99.0:"Don't know",111.0:"Don't know"}
        final_cats = ['Not at all certain', 'Somewhat certain', 'Very certain', "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )        
    

    if ( dataset_name in ["W13_comb","W10_comb"] ):
        col = "profile_turnout_2015"
        
        replace = {}
        final_cats = ['No, did not vote',
                      'Yes, voted',
                      "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )


    if ( dataset_name in ["W6_comb"] ):
        col = "zeroHourContractW6"
        
        replace = {}
        final_cats = ['Should definitely be illegal',
                     'Should probably be illegal',
                     'Should probably be legal',
                     'Should definitely be legal',
                     "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )

    if ( dataset_name in ["W6_comb","W5_comb","W4_comb","W3_comb"] ):
        col = "certaintyEUGreenW2"
        
        replace = {99.0:"Don't know"}
        final_cats = ['Not at all certain', 'Somewhat certain', 'Very certain', "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )
        
    if ( dataset_name in ["W6_comb"] ):
        col = "certaintyEUGreenW4"
        
        replace = {99.0:"Don't know"}
        final_cats = ['Not at all certain', 'Somewhat certain', 'Very certain', "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )
        
        col = "certaintyEUGreenW6"
        
        replace = {99.0:"Don't know"}
        final_cats = ['Not at all certain', 'Somewhat certain', 'Very certain', "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )        

        
    if ( dataset_name in ["W10_only"] ):
        col = "econPersonalProsp"
        
        replace = {}
        final_cats = ['Get a lot worse',
                     'Get a little worse',
                     'Stay the same',
                     'Get a little better',
                     'Get a lot better',
                     "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )        

    if ( dataset_name in ["W13_comb","W10_comb"] ):
        col = "econPersonalProspW10"
        
        replace = {}
        final_cats = ['Get a lot worse',
                     'Get a little worse',
                     'Stay the same',
                     'Get a little better',
                     'Get a lot better',
                     "Don't know"]
        careful_replace_and_set_cats( col,  replace, final_cats )

    if ( dataset_name in ["W6_comb","W5_only"] ):
        col = "noDependentsInHousehold"
        
        replace = {}
        final_cats = ['No',
                     'Yes']
        careful_replace_and_set_cats( col,  replace, final_cats )            
        
        
    if ( dataset_name in ["W2_only"] ):
        col = "gor"

        ons_gor_dict = {"E12000001":"North East",
                        "E12000002":"North West",
                        "E12000003":"Yorkshire and The Humber",
                        "E12000004":"East Midlands",
                        "E12000005":"West Midlands",
                        "E12000006":"East of England",
                        "E12000007":"London",
                        "E12000008":"South East",
                        "E12000009":"South West",
                        "N99999999":"Northern Ireland",
                        "S99999999":"Scotland",
                        "W99999999":"Wales",
                        "":"Non UK & Invalid"}

        careful_replace(  col , ons_gor_dict )
        
    if ( dataset_name in ["W21_only"] ):
        col = "gor"
        
        ons_gor_dict = {1:"North East",
                        2:"North West",
                        3:"Yorkshire and The Humber",
                        4:"East Midlands",
                        5:"West Midlands",
                        6:"East of England",
                        7:"London",
                        8:"South East",
                        9:"South West",
                        10:"Wales",
                        11:"Scotland",
                }

        careful_replace(  col , ons_gor_dict )   
        
    if ( dataset_name in ["W21_only"] ):
        col = "p_country_birth"
        
        ons_gor_dict = {1.0:"UK",
                        2.0:"Ireland",
                        3.0:"EU: pre-2004",
                        4.0:"EU: post-2004",
                        5.0:"European outside EU",
                        6.0:"Africa",
                        7.0:"East Asia",
                        8.0:"South-East/Central Asia",
                        9.0:"South Asia",
                        10.0:"North America",
                        11.0:"Caribbean/Central America",
                        12.0:"South America",
                        13.0:"Oceania & Antarctica",
                        14.0:"Middle East",
                        9999.0:"Not coded",
                }

        careful_replace(  col , ons_gor_dict )          

    if ( dataset_name in ["W3_comb","W4_comb","W5_comb"] ):
        col = "mapNamesW3"

        BES_Panel[col] = \
            BES_Panel[col].astype('float64')
        var_type.loc[col,"dtype"] = BES_Panel[col].dtype.name        
        var_type.loc[ col , "dataset_specific_hardcoded_fix" ] = list(BES_Panel[col].unique())
    

    if ( dataset_name in ["W12_only","W11_only","W3_only","W2_only","W1_only"] ):
        partyContact = {1.0:"No",
                        2.0:"Yes",
                        9999.0:"Don't know"}
        col = "partyContactGrn"
        careful_replace( col , {el:el for el in partyContact.values()} )     

#                    'Own - outright|Own - with a mortgage|Own (part-own) - through shared ownership scheme (i.e. pay part mortgage, part rent)|Rent - from a private landlord|Rent - from my local authority|Rent - from a housing association|Neither - I live with my parents, family or friends but pay some rent to them|Neither - I live rent-free with my parents, family or friends|Other|9999':
#                        [ 'Own outright',
#                          'Own with a mortgage',
#                          'Own (part-own) through shared ownership scheme (i.e. pay part mortgage, part rent)',
#                          'Rent from a private landlord',
#                          'Rent from my local authority',
#                          'Rent from a housing association',
#                          'Neither I live with my parents, family or friends but pay some rent to them',
#                          'Neither I live rent-free with my parents, family or friends',
#                          'Other',
#                          '9999'], #profile_house_tenureW11|profile_house_tenureW12|profile_house_tenureW13
        
# housing	W13_only	W6_comb	category	3	housing	Own the leasehold/freehold outright|Buying leasehold/freehold on a mortgage|Rented from local authority|Rented from private landlord|It belongs to a Housing Association	Own - outright|Own - with a mortgage|Own (part-own) - through shared ownership scheme (i.e. pay part mortgage, part rent)|Rent - from a private landlord|Rent - from my local authority|Neither - I live with my parents, family or friends but pay some rent to them|Neither - I live rent-free with my parents, family or friends

        
    if ( dataset_name in ["W6_comb"] ):
        housing_replace = {'Own \x96 outright': 'Own outright',
                         'Own \x96 with a mortgage': 'Own with a mortgage',
                         'Own (part-own) \x96 through shared ownership scheme (i.e. pay part mortgage, part rent)': 'Own (part-own) through shared ownership scheme (i.e. pay part mortgage, part rent)',
                         'Rent \x96 from a private landlord': 'Rent from a private landlord',
                         'Rent \x96 from my local authority': 'Rent from my local authority',
                         'Neither \x96 I live with my parents, family or friends but pay some rent to them': 'Neither I live with my parents, family or friends but pay some rent to them',
                         'Neither \x96 I live rent-free with my parents, family or friends': 'Neither I live rent-free with my parents, family or friends',
                         'Other':'Other',
                         '9999':'Rent from a housing association'}
        
        housing_final_cats = ['Own outright',
                         'Own with a mortgage',
                         'Own (part-own) through shared ownership scheme (i.e. pay part mortgage, part rent)',
                         'Rent from a private landlord',
                         'Rent from my local authority',
                         'Rent from a housing association',
                         'Neither I live with my parents, family or friends but pay some rent to them',
                         'Neither I live rent-free with my parents, family or friends',
                         'Other']
        
        col = "housing" 
        careful_replace_and_set_cats( col,  housing_replace, housing_final_cats )
        
# None/ No leader|David Cameron|Ed Miliband|Nick Clegg|Nicola Sturgeon|Leanne Wood|Nigel Farage|Natalie Bennett|222.0|Don't know
# None/ No leader|David Cameron|Ed Miliband|Nick Clegg|Nicola Sturgeon|Leanne Wood|Nigel Farage|Natalie Bennett|222|Don't know
# bestLeaderCampaign	W6_only
# worstLeaderCampaign	W6_only
        
        

    BestWorstLeader_replace = {"None/ No leader":"None/No leader",
                               10.0:"All leaders equally bad",
                               222.0:"All leaders equally bad",
                               222:"All leaders equally bad"}
    BestWorstLeader_final_cats = ["None/No leader","David Cameron","Ed Miliband","Nick Clegg","Nicola Sturgeon",
                                  "Leanne Wood","Nigel Farage","Natalie Bennett","All leaders equally bad"]
    # run on all datasets - wait - only ones in which it exists
    
#     if ( dataset_name in ["W6_comb","W5_comb"] ):
    col = "bestLeaderCampaignW5"
    if ( col in BES_Panel.columns ):
        careful_replace_and_set_cats( col,  BestWorstLeader_replace, BestWorstLeader_final_cats )
    col = "worstLeaderCampaignW5"        
    if ( col in BES_Panel.columns ):  
        careful_replace_and_set_cats( col, BestWorstLeader_replace, BestWorstLeader_final_cats )

#     if ( dataset_name in ["W5_only","W6_only"] ):
    col = "bestLeaderCampaign"
    if ( col in BES_Panel.columns ):
        careful_replace_and_set_cats( col,  BestWorstLeader_replace, BestWorstLeader_final_cats )
        
    col = "worstLeaderCampaign"
    if ( col in BES_Panel.columns ):
        careful_replace_and_set_cats( col,  BestWorstLeader_replace, BestWorstLeader_final_cats )        
      

    scotReferendumIntention_replace = {'Scotland should become an independent country':"Will vote 'Yes'",
                                       111.0:'Will vote no',
                                       99.0:"Don't know",
                                       2.0:"Will not vote",}
    scotReferendumIntention_final_cats = ['Will vote no', "Will vote 'Yes'", 'Will not vote', "Don't know"]
        
    if ( dataset_name in ["W6_comb"] ):
        careful_replace_and_set_cats( "scotReferendumIntentionW6",  scotReferendumIntention_replace, scotReferendumIntention_final_cats )

        
    
    Religion = {'No, I do not regard myself as belonging to any particular religion.': 'No, I do not regard myself as belonging to any particular religion.',
         'Yes - Church of England/Anglican/Episcopal': 'Yes - Church of England/Anglican/Episcopal',
         'Yes - Roman Catholic': 'Yes - Roman Catholic',
         'Yes - Presbyterian/Church of Scotland': 'Yes - Presbyterian/Church of Scotland',
         'Yes - Methodist': 'Yes - Methodist',
         'Yes - Baptist': 'Yes - Baptist',
         'Yes - United Reformed Church': 'Yes - United Reformed Church',
         'Yes - Free Presbyterian': 'Yes - Free Presbyterian',
         'Yes - Brethren': 'Yes - Brethren',
         'Yes - Judaism': 'Yes - Judaism',
         'Yes - Hinduism': 'Yes - Hinduism',
         'Yes - Islam': 'Yes - Islam',
         'Yes - Sikhism': 'Yes - Sikhism',
         'Yes - Buddhism': 'Yes - Buddhism',
         'Yes - Other': 'Yes - Other',
         16.0: 'Prefer not to say',
         17.0: 'Yes - Orthodox Christian',
         18.0: 'Yes - Pentecostal',
         19.0: 'Yes - Evangelical /independent/non-denominational'}

    
    if ( dataset_name in ["W6_comb","W5_comb","W5_only","W4_comb","W3_comb"] ):

        col = "profile_religion"
        careful_replace( col , Religion )        

    if ( dataset_name in ["W1_only"] ):

        col = "profile_religion"
        careful_replace( col , {el:el for el in Religion.values()} )            
        
    if ( dataset_name in ["W7_only"] ):
        col = "ns_sec"
        ns_sec = "Employers in large establishments|Higher managerial and administrative occupations|L3.1 Traditional employees|L3.2 New employees|L3.3 Traditional self-employed|L3.4 New self-employed|L4.1 Traditional employees|L4.2 New employees|L4.3 Traditional self-employed|L4.4 New self-employed|Lower managerial and administrative occupations|Higher supervisory occupations|L7.1 Intermediate clerical and administrative occupations|L7.2 Intermediate sales and service occupations|L7.3 Intermediate technical and auxiliary occupations|L7.4 Intermediate engineering occupations|L8.1 Employers in small establishments in industry, commerce, services etc.|L8.2 Employers in small establishments in agriculture|L9.1 Own account workers (non-professional)|L9.2 Own account workers (agriculture)|Lower supervisory occupations|L11.1 Lower technical craft occupations|L11.2 Lower technical process operative occupations|L12.1 Semi-routine sales occupations|L12.2 Semi-routine service occupations|L12.3 Semi-routine technical occupations|L12.4 Semi-routine operative occupations|L12.5 Semi-routine agricultural occupations|L12.6 Semi-routine clerical occupations|L12.7 Semi routine childcare occupations|L13.1 Routine sales and service occupations|L13.2 Routine production occupations|L13.3 Routine technical occupations|L13.4 Routine operative occupations|L13.5 Routine agricultural occupations"
        
        careful_replace( col , {el:el for el in ns_sec.split("|")} )
#         BES_Panel[col].cat.set_categories(ns_sec.split("|"),inplace=True)
        
        
    if ( dataset_name in ["W1_only"] ):
        ageGroup = {1.0:"Under 18",
                    2.0:"18-25",
                    3.0:"26-35",
                    4.0:"36-45",
                    5.0:"46-55",
                    6.0:"56-65",
                    7.0:"66+"}
        col = "ageGroup"
        careful_replace( col , {el:el for el in ageGroup.values()})      
        
        
    if ( dataset_name in [ "W13_comb" , "W11_only" ] ):
        
        # None|Church of England/Anglican/Episcopal|Roman Catholic|Presbyterian/Church of Scotland|Methodist|Baptist
        # A|B|C1|C2|D|E|Refused|Unknown
        # DOUBLE CHECK DISTRIBUTION
        SocialGrades = {"None":"A",
                        "Church of England/Anglican/Episcopal":"B",
                        "Roman Catholic":"C1",
                        "Presbyterian/Church of Scotland":"C2",
                        "Methodist":"D",
                        "Baptist":"E",
                        "<placeholder1>":"Refused",
                        "<placeholder2>":"Unknown"}
        col = "profile_socialgrade_cie"        
        careful_replace( col , SocialGrades )
        
    NumEmployees = {1.0:"1 to 24 employees",
                    2.0:"25 to 499 employees",
                    3.0:"500 or more employees",
                    9999.0:"Don't know"}

    if ( dataset_name in ["W1_only","W2_only","W3_only","W4_only","W11_only","W12_only","W13_only","W13_comb", "W10_only"] ):
        # necessary because motherNumEmployees lacks some categories!

        col = "fatherNumEmployees"
        careful_replace( col , NumEmployees )

        col = "motherNumEmployees"
        careful_replace( col , NumEmployees )
        
    if ( dataset_name in ["W9_only"] ):        
        
        col = "motherNumEmployees"
        careful_replace( col , {el:el for el in NumEmployees.values()} )        

    if ( dataset_name in ["W6_comb"] ):
        # not entirely necessary to implement it this way, it's just a bit clearer

        churchAttendance = {111.0:"Never or practically never",
                            "Less often than once a year":"Less often than once a year",
                            "Less often but at least once a year":"Less often but at least once a year",
                            "Less often but at least twice a year":"Less often but at least twice a year",
                            "Less often but at least once a month":"Less often but at least once a month",
                            "Less often but at least once in two weeks":"Less often but at least once in two weeks",
                            "Once a week or more":"Once a week or more",
                            222.0:"Varies too much to say",
                            98.0:"I am not religious",
                            99.0:"Don't know"}

        col = "churchAttendanceW6"
        careful_replace( col , churchAttendance )


        partyMember =      {0.0:"No, I have never been a member",
                            "I am not a member now but I used to be":"I am not a member now but I used to be",
                            "Yes, I am a member of a party":"Yes, I am a member of a party",
                            9999.0:"Don't know"}

        col = "partyMemberW6"
        careful_replace( col , partyMember )       


    headHouseholdPast_cat_list = "My father|My mother|Someone else|No one in my house worked|Don't know"
    if ( dataset_name in [ "W3_only","W4_only","W11_only","W12_only","W13_only", "W13_comb","W10_only" ] ):
        hardcoded_fix("headHouseholdPast",
                      headHouseholdPast_cat_list)

    generalElectionCertainty_cat_list = "Not at all certain|2|3|4|5|6|Completely certain|Don't know"
    if ( dataset_name in ["W4_comb","W5_comb"] ):
        # array of floats, should be a categorical
        hardcoded_fix("generalElectionCertaintyW1",
                      generalElectionCertainty_cat_list)
        hardcoded_fix("generalElectionCertaintyW2",
                      generalElectionCertainty_cat_list)

    if ( dataset_name in ["W5_comb"] ):
        # array of floats, should be a categorical
        hardcoded_fix("generalElectionCertaintyW3",
                      generalElectionCertainty_cat_list)        


    scotReferendumIntention_cat_list = "Will vote no|Will vote 'Yes'|Will not vote|Don't know"
    if ( dataset_name in ["W4_comb","W5_comb","W6_comb"] ):
        # array of floats, should be a categorical  
        hardcoded_fix("scotReferendumIntentionW4",
                      scotReferendumIntention_cat_list)  

    selfNumEmployees_cat_list = "1 to 24 employees|25 to 499 employees|500 or more employees|Don't know"
#     selfNumEmployeesW6_W12, selfNumEmployeesLastW6_W12
    if ( dataset_name in [ 'W13_comb' ] ):
        hardcoded_fix("selfNumEmployeesW6_W12",
                      selfNumEmployees_cat_list )
        hardcoded_fix("selfNumEmployeesLastW6_W12",
                      selfNumEmployees_cat_list )    

    if ( dataset_name in [ 'W12_only' ] ):
        hardcoded_fix("selfNumEmployeesW6_",
                      selfNumEmployees_cat_list )
        hardcoded_fix("selfNumEmployeesLastW6_",
                      selfNumEmployees_cat_list )          
    
    if ( dataset_name in [ "W7_comb" ] ):  
        hardcoded_fix("selfNumEmployeesW6W7",
                      selfNumEmployees_cat_list )           
        hardcoded_fix("selfNumEmployeesLastW6W7",
                      selfNumEmployees_cat_list )          

    if ( dataset_name in [ "W8_comb" ] ):
        hardcoded_fix("selfNumEmployeesW6W7W8",
                      selfNumEmployees_cat_list )           
        hardcoded_fix("selfNumEmployeesLastW6W7W8",
                      selfNumEmployees_cat_list )  

    if ( dataset_name in [ "W10_comb", "W9_comb", "W9_only" ] ): #"W13_comb", 
        hardcoded_fix("selfNumEmployeesW6W7W8W9",
                      selfNumEmployees_cat_list )
        
    if ( dataset_name in [ "W10_comb", "W9_comb", "W9_only" ] ): #"W13_comb",         
        hardcoded_fix("selfNumEmployeesLastW6W7W8W9",
                      selfNumEmployees_cat_list )
        
#     if ( dataset_name in [ "W12_only","W11_only","W10_only","W13_comb" ] ):
        
# #         careful_replace( "selfNumEmployees" , {el:el for el in NumEmployees.values()} )  
# #         careful_replace( "selfNumEmployeesLast" , {el:el for el in NumEmployees.values()} )
        
#         careful_replace_and_set_cats( "selfNumEmployees", {}, NumEmployees.values() )
#         careful_replace_and_set_cats( "selfNumEmployeesLast", {}, NumEmployees.values() )        


    #    "knowf2f2","knowf2f3", #  floats (0.0, 1.0, 99.0)  that should be categories True|False|Don't know
    knowf2_cat_list = "True|False|Don't know"
    if ( dataset_name in ["W12_only"]):
        hardcoded_fix("knowf2f2",
                      knowf2_cat_list )            
        hardcoded_fix("knowf2f3",
                      knowf2_cat_list )  

    if ( dataset_name in [ "W13_comb" ] ):  
        hardcoded_fix("knowf2f2W12",
                      knowf2_cat_list )             
        hardcoded_fix("knowf2f3W12",
                      knowf2_cat_list )

    likeSalmond_list = "Strongly dislike|1.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|Strongly like|Don't know"
    if ( dataset_name in [ "W4_comb","W4_comb","W5_comb" ] ):
        hardcoded_fix("likeSalmondW1",
                      likeSalmond_list )   
        hardcoded_fix("likeSalmondW2",
                      likeSalmond_list )
        hardcoded_fix("likeSalmondW3",
                      likeSalmond_list )

    eesEUIntegration_list = "Unification has already gone too far|1.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|Unification should be pushed further|Don't know"    
    if ( dataset_name in [ "W3_comb","W4_comb","W4_comb","W5_comb" ] ):
        hardcoded_fix("eesEUIntegrationGreenW2",
                      eesEUIntegration_list )    

    likeSturgeon_list = "Strongly dislike|1.0|2.0|3.0|4.0|5.0|6.0|7.0|8.0|9.0|Strongly like|Don't know"    
    if ( dataset_name in [ "W4_comb","W4_comb","W5_comb" ] ):
        hardcoded_fix("likeSturgeonW4",
                      likeSturgeon_list )

    # W5_comb
    # No|Yes	Got a lot worse|Got a little worse
    # partyContactDKW5, participation_1-6W5, sharedContentOnline_1-5W5, participation_99W5
    participation_list = "No|Yes"    
    if ( dataset_name in [ "W5_comb" ] ):
        hardcoded_fix("partyContactDKW5",
                      participation_list )    
        hardcoded_fix("participation_1W5",
                      participation_list )   
        hardcoded_fix("participation_2W5",
                      participation_list )   
        hardcoded_fix("participation_3W5",
                      participation_list )   
        hardcoded_fix("participation_4W5",
                      participation_list )   
        hardcoded_fix("participation_5W5",
                      participation_list )   
        hardcoded_fix("participation_6W5",
                      participation_list )   
        hardcoded_fix("sharedContentOnline_1W5",
                      participation_list )  
        hardcoded_fix("sharedContentOnline_2W5",
                      participation_list )      
        hardcoded_fix("sharedContentOnline_3W5",
                      participation_list )      
        hardcoded_fix("sharedContentOnline_4W5",
                      participation_list )  
        hardcoded_fix("sharedContentOnline_5W5",
                      participation_list )      
        hardcoded_fix("participation_99W5",
                      participation_list )       
        
    return BES_Panel

In [17]:
def fix_100_seq(col, start, finish, weasel, rng=100):
    lst = list([weasel,start, finish])
    lst_dict = {"0":start,str(rng):finish}

    fullseq = [start]
    [fullseq.append(str(x)) for x in range(1,rng)]
    fullseq.append(finish)
    fullseq.append(weasel)
    # make sure all numbers in same format (string integers)
    BES_Panel[col] = BES_Panel[col].cat.rename_categories( [str(int(x)) if x not in lst else x for x in BES_Panel[col].cat.categories ] )
    BES_Panel[col] = BES_Panel[col].cat.rename_categories( [lst_dict[x] if x in lst_dict.keys() else x for x in BES_Panel[col].cat.categories ] )
    
    # change categories to correct range
    BES_Panel[col] = BES_Panel[col].cat.set_categories(fullseq)
    if len( BES_Panel[col].cat.categories ) != rng+2:
        raise Exception("wrong number of categories!")




def number_and_string_sequences(  ):

# How to deal with large sequences of numbers (e.g. %)
# Which have some values missing (presumably because no entries)
# But also have strings at the ends

# Want to keep the string categories (because they're useful for clarification)
# But also want the numeric coding to be remain accurate
# e.g. "0% no support for X, 1% ... 45%, 83%, 100% complete support for X" -> would normally turn into [0,1...45,46,47]
# should turn into [0,1...45,83,100]

# It's *POSSIBLE* that question answerers don't think this way - might get cleaner results by just assuming positional placement
# Would be useful to have a switch to test that



# run on everything like this

#

# re.match( "(winConstituency[a-zA-Z0-9_]+)", "winConstituencyConW4").groups()[0]

# maybe simply run this on all variables marked 6?
# tweak the ends, drop the DKS, then turn to floats?


    str_float_0_100_cats = [str(float(x)) for x in range(0,101)] # ['0.0', '1.0', '2.0', '3.0' ... '98.0', '99.0', '100.0']

    ### this isn't an error so much as a matter of practicality
    # if I make all these values integers then we don't have to
    # worry about missing categories
    # (assuming they're only missing because of legit. lack of entries)
    col = "scotRefExpectationTurnout"

    if ( col in  BES_Panel.columns ):    

        start = "0% of people will vote"
        finish = "100% of people will vote"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel)         
        
#         scotRefExpectationTurnout_list = ["100.0" if x=="100% of people will vote" else x for x in BES_Panel[col].cat.categories]
#         BES_Panel[col].cat.rename_categories( scotRefExpectationTurnout_list, inplace=True )
#         add_categories()


    col = "winConstituencyPC"    
    if ( col in  BES_Panel.columns ):
        
        start = "0 - Very unlikely to win"
        finish = "100 - Very likely to win"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel)          
        
#         winConstituencyPC_list = ["100.0" if x=="100 - Very likely to win" else x for x in BES_Panel[col].cat.categories]
#         winConstituencyPC_list = ["0.0" if x=="0 - Very unlikely to win" else x for x in winConstituencyPC_list]
#         BES_Panel[col].cat.rename_categories( winConstituencyPC_list, inplace=True )

    col = "winConstituencySNP"
    if ( col in  BES_Panel.columns ):    
        start = "0 - Very unlikely to win"
        finish = "100 - Very likely to win"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel)   

    col = "winConstituencyGreen"
    if ( col in  BES_Panel.columns ):    
        start = "0 - Very unlikely to win"
        finish = "100 - Very likely to win"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel)    
        
        
# Allow many fewer|2|4|5|6|7|8|9|Allow many more|Don't know        

    col = "immigSNP"
    if ( col in  BES_Panel.columns ):    
        start = "Allow many fewer"
        finish = "Allow many more"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel,10)       


    col = "immigPC"
    if ( col in  BES_Panel.columns ):    
        start = "Allow many fewer"
        finish = "Allow many more"
        weasel = "Don't know"        
        fix_100_seq(col, start, finish, weasel,10)      
## NEED TO SET THESE AS TYPE 6!    
    

In [18]:
# dataset_name = "W1_only"
# BES_file_manifest = pd.read_csv( BES_small_data_files + "BES_file_manifest.csv" )
# manifest = BES_file_manifest[ BES_file_manifest["Name"] == dataset_name ]

# data_subfolder = BES_data_folder + dataset_name + os.sep

# filename = manifest["Stata_Filename"].values[0]

# BES_Panel = pd.read_stata( data_subfolder + filename)




In [19]:
# col = "fatherNumEmployees"
# careful_replace( col , NumEmployees )

In [20]:
# col = "motherNumEmployees"
# careful_replace( col , NumEmployees )

In [29]:
def process_dataset(dataset_name):

    BES_file_manifest = pd.read_csv( BES_small_data_files + "BES_file_manifest.csv" )
    manifest = BES_file_manifest[ BES_file_manifest["Name"] == dataset_name ]

    data_subfolder = BES_data_folder + dataset_name + os.sep

    filename = manifest["Stata_Filename"].values[0]

    global BES_Panel
    BES_Panel = pd.read_stata( data_subfolder + filename)
    ####################################################

    # use this dataframe to store *everything* we're doing to transform/ignore variables!
    global var_type
    var_type = pd.DataFrame(columns = ["dataset_name","dtype","cat_all_strings","type","pruned","original_cat_list",
                                       "renamed_cat_list","reordered_cat_list","final_cat_list",
                                       "dataset_specific_hardcoded_fix",
                                       "numerical_dont_knows",
                                       "weasel_words","typos" ] )
    ####################################################

    BES_Panel = hard_coded_fixes( dataset_name ) # side effects on BES_Panel and var_type
    number_and_string_sequences() # side effects on BES_Panel

    variable_categories = pd.read_csv( BES_small_data_files + "question_categories_correct.csv",
                                       encoding = encoding,index_col=False )
    variable_categories.drop('Unnamed: 0', axis=1,inplace=True)

    ( var_cat_dict_pruned , var_cat_dict_pruned_2 ) = create_var_list( variable_categories )
    ####################################################

    missing_col_names = []
    try:
        for col in BES_Panel.columns:
            print(col)
            dt =  BES_Panel[col].dtype.name # data type
    #         not_found = False

            var_type.loc[col,"dataset_name"] = dataset_name
            # dtype is either nan because not set -> set
            if not isinstance(var_type.loc[col,"dtype"],str):
                var_type.loc[ col , "dtype"] = dt    
            # if dtype == category *and* cat_all_strings not already set, set
            if (var_type.loc[ col , "dtype" ] == 'category') and careful_isnan( var_type.loc[ col , "cat_all_strings" ] ):
                var_type.loc[ col , "cat_all_strings" ] = np.all([isinstance(x,str) for x in BES_Panel[ col ].cat.categories])

            not_found = False      

            if (col in ignore_list) or (var_type.loc[col,"type"] == -2): # exclude values from ignore_list *and manually coded errors*
                var_type.loc[col,"type"] = -2
                if var_type.loc[ col , "cat_all_strings" ]==True:
                    var_type.loc[ col, "original_cat_list" ] = "|".join( BES_Panel[col].cat.categories )
                elif ('float' in dt) or ('int' in dt):
                    var_type.loc[ col, "original_cat_list" ] = list(BES_Panel[col].unique())

            elif (col in ["id"] ): # id
                var_type.loc[col,"type"] = -5

            elif (dt == 'object'): # (probably) text
                var_type.loc[col,"type"] = -4

            elif ("datetime" in dt): # datetime
                var_type.loc[col,"type"] = -3

        # 0 - personality measures (in steps of .5?), personality_agreeableness ...etc, riskScaleW8        
            elif (col in ["personality_agreeableness",
                         "personality_conscientiousness",
                         "personality_extraversion",
                         "personality_neuroticism",
                         "personality_openness"]) or (re.match("(cogempathy|affempathy|zeroSum)IRT",col) is not None) or (re.match("riskScale(W[0-9]+)?",col) is not None) :

                var_type.loc[col,"type"] = 0

        # 7 - soc2010(W3-6_comb,W5_only), v1(W5_comb), RandomIDW1(W3-6_comb), mapNames(W3_only), mapNamesW3 (W3-10_comb,W13_comb)        
            elif re.match("soc2010|v1|RandomIDW1|mapNames(W[0-9]+)?" ,col) is not None:
                var_type.loc[col,"type"] = 7

        # 8 - pano, electoratepcon, <party>sh10pcon, turnout10pcon, winnersh10pcon, runnerupsh10pcon, marginsh10pcon
        # don't include 'runnerup10pcon', 'winner10pcon'- these are categorical!
        # all relate to parliamentary constituency (pano applies to different waves - rest are about 2010 general election)
            elif re.match( "pano(W[0-9]+)?|electoratepcon|[a-zA-Z]+sh10pcon|turnout10pcon" , col ) is not None:
                var_type.loc[col,"type"] = 8

            elif col in ['cciW1W2W3W4W5','ccinoITW1W2W3W4W5','justITW1W2W3W4W5','cciW6W7W8W9','ccinoITW6W7W8W9','justITW6W7W8W9']:
                var_type.loc[col,"type"] = 9

            # wave flags/weights (int and float)
            elif re.match("wave[0-9]+|"\
                          "w[0-9]+core|"\
                          "w[0-9]+full|"\
                          "wt_daily_W[0-9]+|"\
                          "wt_core_W[0-9]+|"\
                          "wt_full_[W0-9]+|"\
                          "wt_new_[W0-9]+|"\
                          "CampaignDay(W[0-9]+)?|"\
                          "miilabelcertainty(W[0-9]+)?|"\
                          "Dailyweight(W[0-9]+)?|"\
                          "new_full_weight|"\
                          "w8_wave6_and_wave7|w8_wave2_and_wave6|w8_wave2_and_wave6_and_wave7|w8_wave9_to_wave13|"\
                          "wt_new_|"\
                          "wt|"\
                          "waves_taken" , col) is not None: 

                var_type.loc[col,"type"] = -1

            # waveX - wave int wave 0/1 flag
            # wave 1-11: wt_full_W6, wt_core_W6, wt_full_W1W2W3W4W5W6W7W8W9), 
            # waves 10: wt_new_W10, wt_full_W1_W13
            # CampaignDayWX
            # miilabelcertaintyWX

            else:
                not_found = True
                type_range = set(variable_categories["type"].values)
                for typ in type_range:
                    pruned_variable_name = prune2( prune(col) )
                    if pruned_variable_name in var_cat_dict_pruned_2[typ]:
                        var_type.loc[col,"type"] = typ
                        var_type.loc[col,"pruned"] = pruned_variable_name
                        not_found = False

            if not_found == True:
                var_type.loc[col,"type"] = -99
                pruned_variable_name = prune2( prune(col) )
                var_type.loc[col,"pruned"] = pruned_variable_name
                missing_col_names.append(col)
    except Exception as e:
        print(col, e)            

    var_type["type"] = var_type["type"].astype("int8")

    # reset order of var_type rows to be same as BES_Panel
    var_type = var_type.loc[BES_Panel.columns]

    ####################################################

    missing_col_names_cat_only = []

    for col in missing_col_names:
        if BES_Panel[col].dtypes.name == 'category':
            missing_col_names_cat_only.append(col)

    ####################################################

    if missing_col_names:
        updated_variable_categories = variable_categories.copy()
        # question	frequency	question_length	question_options	column_name	type

        for i in missing_col_names_cat_only:
            str_list = [ str(cat) for cat in BES_Panel[i].cat.categories ]
            joined_list = "|".join(str_list)
            match  = (joined_list == updated_variable_categories["question"])

            if match.any(): # answer set already in records
                index = updated_variable_categories[match].index
                if len(index)>1: # answer set ("question") index should be unique!
                    raise ValueError('answer set ("question") index should be unique!')

                # add column name and increase frequency
                updated_variable_categories.loc[index,"frequency"] = updated_variable_categories.loc[index,"frequency"]+1
                current_list_col_names = updated_variable_categories.loc[index,"column_name"].values[0].split("|")
                current_list_col_names.append(i)
                updated_variable_categories.loc[index,"column_name"] = "|".join( current_list_col_names )

            else: # answer set not already in records - add new line to dataframe
                df = pd.DataFrame([],  columns = updated_variable_categories.columns )

                # no need to add index
                # updated_variable_categories.shape[0], 
                df.loc[0] = [joined_list,
                             1,
                             len(joined_list),
                             len(str_list),
                             i,-99]
                
#                 df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
                updated_variable_categories = pd.concat( [updated_variable_categories,df], ignore_index = True  )
#                 updated_variable_categories = updated_variable_categories.append(df, ignore_index=True)

        variable_categories = updated_variable_categories
        updated_variable_categories.to_csv(BES_small_data_files + "question_categories_correct_updatesneeded!.csv",
                                           encoding = encoding )


        display([x for x in zip(missing_col_names, BES_Panel[missing_col_names].dtypes)])

        manual_fixing_advice_string = "Stop - new variables detected\n"\
                                      "Go look at question_categories_correct_updatesneeded!.csv\n"\
                                      "fill in types, save as question_categories_correct.csv and rerun this code"


        raise Exception(manual_fixing_advice_string)
    ####################################################

    # [-5, -4, -3, -2, -1, 4, 7, 8, 9] -> meta list
    # [0, 1, 2, 3, 5, 6] ->     
    content_list = [0, 1, 2, 3, 5, 6]
    meta_list = [-5, -4, -3, -2, -1, 7, 8, 9] # -99, 4 excluded because could be categorical
    # 'numeric' columns (ones that can be transformed into numbers)
    num_cols     = BES_Panel.columns[ (var_type["type"].apply( lambda x: x in [0,1,2,3,5,6] )).values ]
    # can't be transformed into numbers / are numbers but are meta-data rather than raw content (e.g. weights)
    non_num_cols = BES_Panel.columns[ (var_type["type"].apply( lambda x: x in [-99,-5,-4,-3,-1 ]  )).values ]

    BES_numeric  = BES_Panel[num_cols].copy()
    for col in BES_numeric:

        if col not in var_type["type"].index:
            raise Exception( "variable not registered - and somehow slipped past!" )

        if var_type.loc[ col, "type" ] in [0,7]:
            continue

        # force all category elements into strings
        # ARE THEY EVER NOT?
        BES_numeric[col].cat.rename_categories( BES_numeric[col].cat.categories.map(str), inplace=True )

        join_list = "|".join( BES_numeric[col].cat.categories ) # create category_list_string "strongly agree|agree|neither|..."
        var_type.loc[ col, "original_cat_list" ] = join_list    

        # typos - things with weird characters
        fixed_cat_string = fix_a_hat_chars( join_list )
        if fixed_cat_string is not None:
            var_type.loc[ col, "typos" ]   = join_list      
            BES_numeric[col].cat.rename_categories( fixed_cat_string , inplace=True )
            join_list = "|".join( BES_numeric[col].cat.categories )        

        # rename categories
        if join_list in rename_cat_dict.keys():
            var_type.loc[ col, "renamed_cat_list" ]   = join_list        
            BES_numeric[col].cat.rename_categories(  rename_cat_dict[join_list], inplace=True )
            join_list = "|".join( BES_numeric[col].cat.categories )        

        # reorder categories
        if join_list in change_cat_dict.keys():
            var_type.loc[ col, "reordered_cat_list" ] = join_list        
            BES_numeric[col].cat.reorder_categories( change_cat_dict[join_list], inplace=True )
            join_list = "|".join( BES_numeric[col].cat.categories )        

        # remove "Don't Know"s that are in weird numerical form (eg. [ "9999.0", "997.0", "222.0", "99.0", "0.0" ])
        # de_weasel numbers
        numerical_dont_knows = de_weasel_nums( BES_numeric[col].cat.categories )
        if len(numerical_dont_knows) != 0:
            BES_numeric[col].cat.remove_categories( numerical_dont_knows , inplace=True )
            var_type.loc[ col, "numerical_dont_knows" ] = "|".join( numerical_dont_knows )

        # set all digits to floating point format, one decimal place
        BES_numeric[col].cat.rename_categories( de_num( BES_numeric[col].cat.categories ), inplace=True )

        # de_weasel
        weasel_words = BES_numeric[col].cat.categories.intersection(Weasel_set)
        if len(weasel_words) != 0:    
            BES_numeric[col].cat.remove_categories( weasel_words, inplace=True )
            var_type.loc[ col, "weasel_words" ] = "|".join( weasel_words )

        # Laziness - I want an extra column with the destination category sets
        # (should be a smaller set than original category sets)
        var_type.loc[ col, "final_cat_list" ] = "|".join( BES_numeric[col].cat.categories )        
    ####################################################

    # save category data
    cat_dictionary = {}
    for col in BES_numeric.columns:
        if var_type["type"][col] in [1, 2, 3, 5]: # not just cat, but one not already numerical!
            cat_dictionary[col] = BES_numeric[col].cat.categories


    # turn categories into numbers
    for col in BES_numeric:

        if var_type["type"][col] in [1,2,3,5]: # category type variables (other than indicators)
            BES_numeric[col] = BES_numeric[col].cat.codes

        if var_type["type"][col] in [0,1,2,3,5,6,7]:
            BES_numeric[col] = BES_numeric[col].astype('float64')

    BES_numeric.replace(-1,np.nan, inplace=True) # replace -1 cat code for NaN with actual NaN - downside, requires dtype float
    ####################################################

    fname = data_subfolder + "cat_dictionary.pkl"
    with open(fname, "wb") as f:
        pickle.dump( cat_dictionary, f )

    BES_non_numeric = BES_Panel[non_num_cols].copy()

    BES_non_numeric.to_pickle( data_subfolder + "BESnon_numeric.zip", compression='zip' )

    BES_numeric.to_pickle( data_subfolder + "BESnumeric.zip",  compression='zip' )

    var_type.to_csv( data_subfolder + "var_type.csv", encoding = encoding )
    # don't think the performance warning will be relevant on such a small dataframe

In [22]:
# types
# -99 - Uncategorised!
# -5 - id
# -4 - text
# -3 - datetimes
# -2 - ignore_list
# -1 - weights/wave indicators/campaign day indicators/miilabeluncertainty
# 0 - personality measures (in steps of .5?), personality_agreeableness ...etc, riskScale
# 1 - linear category, just use  (some made linear by dropping "Weasel_answers")
# 2 - categories need to be modified - order changed
# 3 - set of non-ordered options
# 4 - indirect variables - did someone fill something in in the free text box or not?
# 5 - categories need to modified - things removed
    # not so clear when this one applies - is it supposed to be whenever weasel words are removed?
    # or when variables are *changed*
# 6 - categories are integers - should maybe be transformed directly into numbers (mostly "how much money do people need minimum/well off"?)
# 7 - soc2010(W3-6_comb,W5_only), v1(W5_comb), RandomIDW1(W3-6_comb), mapNames(W3_only), mapNamesW3 (W3-10_comb,W13_comb)        
# 8 - pano, electoratepcon, <party>sh10pcon, turnout10pcon, winnersh10pcon, runnerupsh10pcon, marginsh10pcon
#     all relate to parliamentary constituency (pano applies to different waves - rest are about 2010 general election)
# 9 - 'cciW1W2W3W4W5','ccinoITW1W2W3W4W5','justITW1W2W3W4W5','cciW6W7W8W9','ccinoITW6W7W8W9','justITW6W7W8W9'
#     floats - otherwise, no idea what these variables are!
#     they are 0/1 - look like wave related indicator variables


# [-5, -4, -3, -2, -1, 4, 7, 8, 9] -> meta list
# [0, 1, 2, 3, 5, 6] -> 

# ordinal: 0, 1, 2, 5, 6
# non-ordinal: 3, 7

# load question_categories_correct.csv
# sanity check by type!
# turn into list of variables by type
# 1, 5 handled the same way -> cat.codes
# 6 -> int()
# 4 ignored
# 3 ignored for now (-> vectorized?)
# 2 direct modification

In [27]:
 data_subfolder + filename

'../BES_analysis_data/W26_only\\BES2019_W26_v0.0.zip'

In [None]:
# BES_Panel = hard_coded_fixes( dataset_name )
# BES_Panel[col].value_counts()

In [59]:
dataset_name = 'W26_only'


BES_file_manifest = pd.read_csv( BES_small_data_files + "BES_file_manifest.csv",encoding = "ISO-8859-1" )
manifest = BES_file_manifest[ BES_file_manifest["Name"] == dataset_name ]

data_subfolder = BES_data_folder + dataset_name + os.sep

filename = manifest["Stata_Filename"].values[0]

global BES_Panel
if ".zip" in filename:
    BES_Panel = pd.read_pickle( data_subfolder + filename, compression='zip')
else:
    BES_Panel = pd.read_stata( data_subfolder + filename)
####################################################

# use this dataframe to store *everything* we're doing to transform/ignore variables!
global var_type
var_type = pd.DataFrame(columns = ["dataset_name","dtype","cat_all_strings","type","pruned","original_cat_list",
                                   "renamed_cat_list","reordered_cat_list","final_cat_list",
                                   "dataset_specific_hardcoded_fix",
                                   "numerical_dont_knows",
                                   "weasel_words","typos" ] )
####################################################

BES_Panel = hard_coded_fixes( dataset_name ) # side effects on BES_Panel and var_type
number_and_string_sequences() # side effects on BES_Panel

variable_categories = pd.read_csv( BES_small_data_files + "question_categories_correct.csv",
                                   encoding = encoding,index_col=False )
variable_categories.drop('Unnamed: 0', axis=1,inplace=True)

( var_cat_dict_pruned , var_cat_dict_pruned_2 ) = create_var_list( variable_categories )
####################################################

missing_col_names = []
try:
    for col in BES_Panel.columns:
        print(col)
        dt =  BES_Panel[col].dtype.name # data type
#         not_found = False

        var_type.loc[col,"dataset_name"] = dataset_name
        # dtype is either nan because not set -> set
        if not isinstance(var_type.loc[col,"dtype"],str):
            var_type.loc[ col , "dtype"] = dt    
        # if dtype == category *and* cat_all_strings not already set, set
        if (var_type.loc[ col , "dtype" ] == 'category') and careful_isnan( var_type.loc[ col , "cat_all_strings" ] ):
            var_type.loc[ col , "cat_all_strings" ] = np.all([isinstance(x,str) for x in BES_Panel[ col ].cat.categories])

        not_found = False      

        if (col in ignore_list) or (var_type.loc[col,"type"] == -2): # exclude values from ignore_list *and manually coded errors*
            var_type.loc[col,"type"] = -2
            if var_type.loc[ col , "cat_all_strings" ]==True:
                var_type.loc[ col, "original_cat_list" ] = "|".join( BES_Panel[col].cat.categories )
            elif ('float' in dt) or ('int' in dt):
                var_type.loc[ col, "original_cat_list" ] = list(BES_Panel[col].unique())

        elif (col in ["id"] ): # id
            var_type.loc[col,"type"] = -5

        elif (dt == 'object'): # (probably) text
            var_type.loc[col,"type"] = -4

        elif ("datetime" in dt): # datetime
            var_type.loc[col,"type"] = -3

    # 0 - personality measures (in steps of .5?), personality_agreeableness ...etc, riskScaleW8        
        elif (col in ["personality_agreeableness",
                     "personality_conscientiousness",
                     "personality_extraversion",
                     "personality_neuroticism",
                     "personality_openness"]) or (re.match("(cogempathy|affempathy|zeroSum)IRT",col) is not None) or (re.match("riskScale(W[0-9]+)?",col) is not None) :
            
            var_type.loc[col,"type"] = 0

    # 7 - soc2010(W3-6_comb,W5_only), v1(W5_comb), RandomIDW1(W3-6_comb), mapNames(W3_only), mapNamesW3 (W3-10_comb,W13_comb)        
        elif re.match("soc2010|v1|RandomIDW1|mapNames(W[0-9]+)?" ,col) is not None:
            var_type.loc[col,"type"] = 7

    # 8 - pano, electoratepcon, <party>sh10pcon, turnout10pcon, winnersh10pcon, runnerupsh10pcon, marginsh10pcon
    # don't include 'runnerup10pcon', 'winner10pcon'- these are categorical!
    # all relate to parliamentary constituency (pano applies to different waves - rest are about 2010 general election)
        elif re.match( "pano(W[0-9]+)?|electoratepcon|[a-zA-Z]+sh10pcon|turnout10pcon" , col ) is not None:
            var_type.loc[col,"type"] = 8

        elif col in ['cciW1W2W3W4W5','ccinoITW1W2W3W4W5','justITW1W2W3W4W5','cciW6W7W8W9','ccinoITW6W7W8W9','justITW6W7W8W9']:
            var_type.loc[col,"type"] = 9

        # wave flags/weights (int and float)
        elif re.match("wave[0-9]+|"\
                      "w[0-9]+core|"\
                      "w[0-9]+full|"\
                      "wt_daily_W[0-9]+|"\
                      "wt_core_W[0-9]+|"\
                      "wt_full_[W0-9]+|"\
                      "wt_new_[W0-9]+|"\
                      "CampaignDay(W[0-9]+)?|"\
                      "miilabelcertainty(W[0-9]+)?|"\
                      "Dailyweight(W[0-9]+)?|"\
                      "new_full_weight|"\
                      "w8_wave6_and_wave7|w8_wave2_and_wave6|w8_wave2_and_wave6_and_wave7|w8_wave9_to_wave13|"\
                      "wt_new_|"\
                      "wt|"\
                      "waves_taken|wave|weight" , col) is not None: 

            var_type.loc[col,"type"] = -1

        # waveX - wave int wave 0/1 flag
        # wave 1-11: wt_full_W6, wt_core_W6, wt_full_W1W2W3W4W5W6W7W8W9), 
        # waves 10: wt_new_W10, wt_full_W1_W13
        # CampaignDayWX
        # miilabelcertaintyWX

        else:
            not_found = True
            type_range = set(variable_categories["type"].values)
            for typ in type_range:
                pruned_variable_name = prune2( prune(col) )
                if pruned_variable_name in var_cat_dict_pruned_2[typ]:
                    var_type.loc[col,"type"] = typ
                    var_type.loc[col,"pruned"] = pruned_variable_name
                    not_found = False

        if not_found == True:
            var_type.loc[col,"type"] = -99
            pruned_variable_name = prune2( prune(col) )
            var_type.loc[col,"pruned"] = pruned_variable_name
            missing_col_names.append(col)
except Exception as e:
    print(col, e)            

var_type["type"] = var_type["type"].astype("int8")

# reset order of var_type rows to be same as BES_Panel
var_type = var_type.loc[BES_Panel.columns]

####################################################

missing_col_names_cat_only = []

for col in missing_col_names:
    if BES_Panel[col].dtypes.name == 'category':
        missing_col_names_cat_only.append(col)

####################################################

if missing_col_names:
    updated_variable_categories = variable_categories.copy()
    # question	frequency	question_length	question_options	column_name	type

    for i in missing_col_names_cat_only:
        str_list = [ str(cat) for cat in BES_Panel[i].cat.categories ]
        joined_list = "|".join(str_list)
        match  = (joined_list == updated_variable_categories["question"])

        if match.any(): # answer set already in records
            index = updated_variable_categories[match].index
            if len(index)>1: # answer set ("question") index should be unique!
                raise ValueError('answer set ("question") index should be unique!')

            # add column name and increase frequency
            updated_variable_categories.loc[index,"frequency"] = updated_variable_categories.loc[index,"frequency"]+1
            current_list_col_names = updated_variable_categories.loc[index,"column_name"].values[0].split("|")
            current_list_col_names.append(i)
            updated_variable_categories.loc[index,"column_name"] = "|".join( current_list_col_names )

        else: # answer set not already in records - add new line to dataframe
            df = pd.DataFrame([],  columns = updated_variable_categories.columns )

            # no need to add index
            # updated_variable_categories.shape[0], 
            df.loc[0] = [joined_list,
                         1,
                         len(joined_list),
                         len(str_list),
                         i,-99]
#             updated_variable_categories = updated_variable_categories.append(df, ignore_index=True)
            updated_variable_categories = pd.concat( [updated_variable_categories,df], ignore_index = True  )

    variable_categories = updated_variable_categories
    updated_variable_categories.to_csv(BES_small_data_files + "question_categories_correct_updatesneeded!.csv",
                                       encoding = encoding )


    display([x for x in zip(missing_col_names, BES_Panel[missing_col_names].dtypes)])

    manual_fixing_advice_string = "Stop - new variables detected\n"\
                                  "Go look at question_categories_correct_updatesneeded!.csv\n"\
                                  "fill in types, save as question_categories_correct.csv and rerun this code"


    raise Exception(manual_fixing_advice_string)
####################################################

# [-5, -4, -3, -2, -1, 4, 7, 8, 9] -> meta list
# [0, 1, 2, 3, 5, 6] ->     
content_list = [0, 1, 2, 3, 5, 6]
meta_list = [-5, -4, -3, -2, -1, 7, 8, 9] # -99, 4 excluded because could be categorical
# 'numeric' columns (ones that can be transformed into numbers)
num_cols     = BES_Panel.columns[ (var_type["type"].apply( lambda x: x in [0,1,2,3,5,6] )).values ]
# can't be transformed into numbers / are numbers but are meta-data rather than raw content (e.g. weights)
non_num_cols = BES_Panel.columns[ (var_type["type"].apply( lambda x: x in [-99,-5,-4,-3,-1 ]  )).values ]

BES_numeric  = BES_Panel[num_cols].copy()
for col in BES_numeric:

    if col not in var_type["type"].index:
        raise Exception( "variable not registered - and somehow slipped past!" )

    if var_type.loc[ col, "type" ] in [0,7]:
        continue

    # force all category elements into strings
    # ARE THEY EVER NOT?
    BES_numeric[col] = BES_numeric[col].cat.rename_categories( BES_numeric[col].cat.categories.map(str) )

    join_list = "|".join( BES_numeric[col].cat.categories ) # create category_list_string "strongly agree|agree|neither|..."
    var_type.loc[ col, "original_cat_list" ] = join_list    

    # typos - things with weird characters
    fixed_cat_string = fix_a_hat_chars( join_list )
    if fixed_cat_string is not None:
        var_type.loc[ col, "typos" ]   = join_list      
        BES_numeric[col] = BES_numeric[col].cat.rename_categories( fixed_cat_string )
        join_list = "|".join( BES_numeric[col].cat.categories )        

    # rename categories
    if join_list in rename_cat_dict.keys():
        var_type.loc[ col, "renamed_cat_list" ]   = join_list        
        BES_numeric[col] = BES_numeric[col].cat.rename_categories(  rename_cat_dict[join_list] )
        join_list = "|".join( BES_numeric[col].cat.categories )        

    # reorder categories
    if join_list in change_cat_dict.keys():
        var_type.loc[ col, "reordered_cat_list" ] = join_list        
        BES_numeric[col] = BES_numeric[col].cat.reorder_categories( change_cat_dict[join_list] )
        join_list = "|".join( BES_numeric[col].cat.categories )        

    # remove "Don't Know"s that are in weird numerical form (eg. [ "9999.0", "997.0", "222.0", "99.0", "0.0" ])
    # de_weasel numbers
    numerical_dont_knows = de_weasel_nums( BES_numeric[col].cat.categories )
    if len(numerical_dont_knows) != 0:
        BES_numeric[col] = BES_numeric[col].cat.remove_categories( numerical_dont_knows )
        var_type.loc[ col, "numerical_dont_knows" ] = "|".join( numerical_dont_knows )

    # set all digits to floating point format, one decimal place
    BES_numeric[col] = BES_numeric[col].cat.rename_categories( de_num( BES_numeric[col].cat.categories ) )

    # de_weasel
    weasel_words = BES_numeric[col].cat.categories.intersection(Weasel_set)
    if len(weasel_words) != 0:    
        BES_numeric[col] = BES_numeric[col].cat.remove_categories( weasel_words )
        var_type.loc[ col, "weasel_words" ] = "|".join( weasel_words )

    # Laziness - I want an extra column with the destination category sets
    # (should be a smaller set than original category sets)
    var_type.loc[ col, "final_cat_list" ] = "|".join( BES_numeric[col].cat.categories )        
####################################################

# save category data
cat_dictionary = {}
for col in BES_numeric.columns:
    if var_type["type"][col] in [1, 2, 3, 5]: # not just cat, but one not already numerical!
        cat_dictionary[col] = BES_numeric[col].cat.categories


# turn categories into numbers
for col in BES_numeric:

    if var_type["type"][col] in [1,2,3,5]: # category type variables (other than indicators)
        BES_numeric[col] = BES_numeric[col].cat.codes

    if var_type["type"][col] in [0,1,2,3,5,6,7]:
        BES_numeric[col] = BES_numeric[col].astype('float64')

BES_numeric.replace(-1,np.nan, inplace=True) # replace -1 cat code for NaN with actual NaN - downside, requires dtype float
####################################################

fname = data_subfolder + "cat_dictionary.pkl"
with open(fname, "wb") as f:
    pickle.dump( cat_dictionary, f )

BES_non_numeric = BES_Panel[non_num_cols].copy()

BES_non_numeric.to_pickle( data_subfolder + "BESnon_numeric.zip", compression='zip' )

BES_numeric.to_pickle( data_subfolder + "BESnumeric.zip",  compression='zip' )

var_type.to_csv( data_subfolder + "var_type.csv", encoding = encoding )
# don't think the performance warning will be relevant on such a small dataframe

id
wave
weight
turnoutUKGeneral
generalElectionVote
partyIdStrength
partyId
bestOnMII
polAttention
likeConLeader
likeLabLeader
likeLDLeader
likePCLeader
likeBrexitLeader
likeSNPLeader
likeCon
likeLab
likeLD
likeSNP
likePC
likeBrexitParty
econPersonalRetro
econGenRetro
riskPoverty
riskUnemployment
changeNHS
EUIntegrationSelf
EUIntegrationCon
EUIntegrationLab
EUIntegrationLD
EUIntegrationSNP
EUIntegrationPC
immigEcon
euRefVoteAfter
redistSelf
redistCon
redistLab
redistLD
redistSNP
redistPC
enviroGrowth
trustMPs
immigSelf
immigCon
immigLab
immigLD
immigSNP
immigPC
britishness
scottishness
welshness
englishness
europeanness
scotReferendumVote
welshReferendumIntention
approveUKGovt
approveScotGovt
approveWelshGovt
dutyToVote2
efficacyNotUnderstand
efficacyPolCare
subjClass
speakWelsh
ns_sec_analytic
new_pcon
small_mii_cat
age
gor
pcon
p_work_stat
p_gross_household
p_housing
p_job_sector
p_marital
p_disability
p_religion
p_sexuality
p_ethnicity
p_edlevel
lr_scale
al_scale
gender


UnicodeEncodeError: 'latin-1' codec can't encode character '\u2013' in position 50: ordinal not in range(256)

In [60]:
%debug

> [1;32me:\users\gamer\documents\github\bes_analysis\bes_analysis_code\writers.pyx[0m(76)[0;36mpandas._libs.writers.write_csv_rows[1;34m()[0m

ipdb> data
*** NameError: name 'data' is not defined
ipdb> up
> [1;32me:\users\gamer\anaconda3\envs\test_tensorflow_install\lib\site-packages\pandas\io\formats\csvs.py[0m(324)[0;36m_save_chunk[1;34m()[0m
[1;32m    322 [1;33m[1;33m[0m[0m
[0m[1;32m    323 [1;33m        [0mix[0m [1;33m=[0m [0mself[0m[1;33m.[0m[0mdata_index[0m[1;33m[[0m[0mslicer[0m[1;33m][0m[1;33m.[0m[0m_get_values_for_csv[0m[1;33m([0m[1;33m**[0m[0mself[0m[1;33m.[0m[0m_number_format[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m--> 324 [1;33m        libwriters.write_csv_rows(
[0m[1;32m    325 [1;33m            [0mdata[0m[1;33m,[0m[1;33m[0m[1;33m[0m[0m
[0m[1;32m    326 [1;33m            [0mix[0m[1;33m,[0m[1;33m[0m[1;33m[0m[0m
[0m
ipdb> data
[array(['W26_only', 'W26_only', 'W26_only', 'W26_only', 'W26_only

ipdb> [x for x in data if '\u2013' in x]
[]
ipdb> quit


In [55]:
var_type#.to_csv( data_subfolder + "var_type.csv", encoding = encoding )

Unnamed: 0,dataset_name,dtype,cat_all_strings,type,pruned,original_cat_list,renamed_cat_list,reordered_cat_list,final_cat_list,dataset_specific_hardcoded_fix,numerical_dont_knows,weasel_words,typos
id,W26_only,int32,,-5,,,,,,,,,
wave,W26_only,int8,,-1,,,,,,,,,
weight,W26_only,float64,,-1,,,,,,,,,
turnoutUKGeneral,W26_only,category,True,1,turnoutUKGeneral,Very unlikely that I would vote|Fairly unlikel...,Very unlikely that I would vote|Fairly unlikel...,,Very unlikely that I will vote|Fairly unlikely...,,,Don't know,
generalElectionVote,W26_only,category,True,3,generalElectionVote,I would/did not vote|Conservative|Labour|Liber...,,,I would/did not vote|Conservative|Labour|Liber...,,,Other|Don't know,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
p_ethnicity,W26_only,category,True,3,p_ethnicity,White British|Any other white background|White...,,,White British|Any other white background|White...,,,Prefer not to say,
p_edlevel,W26_only,category,True,1,p_edlevel,No qualifications|Below GCSE|GCSE|A-level|Unde...,,,No qualifications|Below GCSE|GCSE|A-level|Unde...,,,,
lr_scale,W26_only,category,False,1,lr_scale,Left|0.5|1.0|1.5|2.0|2.5|3.0|3.5|4.0|4.5|5.0|5...,,,Left|0.5|1.0|1.5|2.0|2.5|3.0|3.5|4.0|4.5|5.0|5...,,,,
al_scale,W26_only,category,False,1,al_scale,Libertarian|0.5|1.0|1.5|2.0|2.5|3.0|3.5|4.0|4....,,,Libertarian|0.5|1.0|1.5|2.0|2.5|3.0|3.5|4.0|4....,,,,


In [57]:
var_type.apply(lambda x: '\u2013' in x)

dataset_name                      False
dtype                             False
cat_all_strings                   False
type                              False
pruned                            False
original_cat_list                 False
renamed_cat_list                  False
reordered_cat_list                False
final_cat_list                    False
dataset_specific_hardcoded_fix    False
numerical_dont_knows              False
weasel_words                      False
typos                             False
dtype: bool

Index(['genElecTurnoutRetro', 'likeUKIP', 'likeUKIPLeader', 'cvEconSelf',
       'euRefVote', 'govtHandleVaccine', 'govtHandlelockdown', 'immigCultural',
       'welshgovtHandleVaccine', 'welshgovtHandlelockdown',
       'scotgovtHandleVaccine', 'scotgovtHandlelockdown'],
      dtype='object')

In [None]:
manual_fixing_advice_string

In [None]:
BES_Panel[col].value_counts().sort_index()

In [None]:
# BES_numeric[col].cat.rename_categories( BES_numeric[col].cat.categories.map(str), inplace=True )

In [None]:
var_type

In [None]:
not_found

In [None]:
missing_col_names

In [None]:
variable_categories[variable_categories["column_name"].apply(lambda x: "turnoutUKGeneral" in x if not pd.isna(x) else False)]

In [None]:
type_range = set(variable_categories["type"].values)
# for typ in type_range:
#     pruned_variable_name = prune2( prune(col) )
#     if pruned_variable_name in var_cat_dict_pruned_2[typ]:
#         var_type.loc[col,"type"] = typ
#         var_type.loc[col,"pruned"] = pruned_variable_name
#         not_found = False

In [None]:
variable_categories

In [None]:
variable_categories = pd.read_csv( BES_small_data_files + "question_categories_correct.csv",
                                   encoding = encoding,index_col=False,
#                                   usecols=["question","frequency","question_length",
#                                                                                "question_options","column_name","type"]
                                 )
variable_categories.drop('Unnamed: 0', axis=1,inplace=True)

In [None]:
variable_categories.reset_index()

In [None]:
var_type.loc['age']