In [331]:
# British Election Study dataset

# Very large (30,000+ respondents), with multiple waves (9 so far) that track some of the same people
#

In [332]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import gc
import re

from sklearn import datasets
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from itertools import cycle
from IPython.display import display
import pickle, os

import seaborn as sns

BES_data_folder = "../BES_analysis_data/"
BES_code_folder = "../BES_analysis_code/"

from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, MICE

In [333]:
%%time
# Load the Wave 8 BES panel from its Stata file (about 130 MB on disk).
# Shape observed when this was last run: (31409, 733)
wave8_file = BES_data_folder + "BES2015_W8_v1.3.dta"
BES_Panel = pd.read_stata(wave8_file)

# Alternative file, currently disabled:
# BES_Panel = pd.read_stata(BES_data_folder+"BES2015_W8_v1.6.dta")

print(BES_Panel.shape)

(31409, 733)
Wall time: 13.7 s


In [334]:
# England_inds = BES_Panel[BES_Panel["country"]=="England"].index
# print ( "respondents not from England: ", len(BES_Panel) - len(England_inds) )
# Remain_inds  = BES_Panel[ BES_Panel["euRefVote"].cat.codes==0 ].index
# Leave_inds   = BES_Panel[ BES_Panel["euRefVote"].cat.codes==1 ].index
# Notvoters_inds  = BES_Panel[ BES_Panel["euRefVote"].cat.codes==2 ].index
# Dontknow_inds   = BES_Panel[ BES_Panel["euRefVote"].cat.codes==3 ].index
# print ( "respondents not Remain/Leave: ", len(BES_Panel) - len(Remain_inds) - len(Leave_inds) )
# Leave_Remain_inds = pd.Index( np.concatenate( (Leave_inds, Remain_inds), axis=0 ) )
# EngRemLea_inds = pd.Index( np.intersect1d( England_inds, Leave_Remain_inds ) )


In [335]:
%%time
# changing the order of some sets of categories
#
# Re-ordering table for categorical answer sets.
#   key   : a column's categories joined with "|", in their current order
#   value : the answer set as a list, in the order the categories should be stored
# Looked up by re_order() and by the category clean-up loop further down.
#
# NOTE(review): three values omit options that are present in their key - the
# "Larger|Smaller|About the same|Don't know" entry, the "An individual share in a
# company|..." entry and the "Never or practically never|..." entry. Confirm that
# whatever applies this table is expected to drop those options rather than keep them.
change_cat_dict = {"Bad time to buy|Good time to buy|Neither good nor bad time to buy|Don't know": ["Bad time to buy",
                                                                                                    "Neither good nor bad time to buy",
                                                                                                    "Good time to buy",
                                                                                                    "Don't know"],
                   "Larger|Smaller|About the same|Don't know": ["Larger", "About the same", "Smaller"],
                   "Yes|No|99.0":       ['No', 'Yes', '99.0'],
                   "Yes|No|Don't know": ['No', 'Yes', "Don't know"],
                   "Yes|No" :           ['No', 'Yes'],
                   "Yes, voted|No, did not vote|Donâ??t know" : ['No, did not vote', 'Yes, voted', 'Donâ??t know'],
                   "Yes, voted|No, did not vote|Don?t know"   : ['No, did not vote', 'Yes, voted', 'Don?t know'],                   
                   "I would/will not vote|Leave the EU|Stay/remain in the EU|Don't know": ['Stay/remain in the EU',
                                                                                           'Leave the EU', 'I would/will not vote', "Don't know"],
                   "Mainly leave|Mainly remain|Fairly evenly divided|Don't know": ["Mainly remain",
                                                                                   "Fairly evenly divided", "Mainly leave", "Don't know"],
                   'An individual share in a company|A portfolio of different company shares|The risk is the same|Don\x92t know|Prefer not to say':
                       ['An individual share in a company', 'The risk is the same', 'A portfolio of different company shares'],
                   "No, I have never been a member|Yes, I am a member of a party|I am not a member now but I used to be|Don't know":
                       ['No, I have never been a member', 'I am not a member now but I used to be', 'Yes, I am a member of a party', "Don't know"],
                   "Never or practically never|Less often than once a year|Less often but at least once a year|Less often but at least twice a year|Less often but at least once a month|Less often but at least once in two weeks|Once a week or more|Varies too much to say|I am not religious|Don't know": ['I am not religious', 'Never or practically never', 'Less often than once a year', 'Less often but at least once a year', 'Less often but at least twice a year', 'Less often but at least once a month', 'Less often but at least once in two weeks', 'Once a week or more']
                  }

                   
# Non-answer ("weasel") options, i.e. responses that carry no position on the question.
# NOTE(review): this list and the Weasel_set built from it below are both rebound
# further down in this cell, so the values defined here never take effect. Entries
# that exist only in this first list: 'Don??t know', "Will not state a choice",
# "All leaders equally good", "They are not eligible to vote". Entries that exist only
# in the later list: 'Don?t know', 'Donâ??t know', "No - not decided", "9999.0",
# "997.0", "222.0". Confirm which list is the intended one.
Weasel_answers = ["Don't know", 'Do\x92t know', 'Dont know', 'Donât know', 'Don??t know',
                  "Prefer not to say", "Prefer not to answer", "Refused", "Unknown",
                  "Neither", "Other", "I would/will not vote", "Will not vote",
                  "I would not vote", "It depends", "Other",
                  "Don’t follow politics on Facebook", "Don't follow politics on twitter",
                  "Yes, other", "Haven't thought about it",
                  "There wasn't a local election in my area", "No, haven't received it",
                  "I don't know what was negotiated", "I never received a response",
                  "There are not local elections in my area", "Can't remember",
                  "Varies too much to say", "Will not state a choice",
                  "All leaders equally good", "They are not eligible to vote",
                  "There are not local elections in my area"]

# Numeric codes treated as non-answers; consulted only by de_weasel_numbers() and
# de_weasel_nums(), which apply a positional check before removing them.
Weasel_number_answers = [ "9999.0", "997.0", "222.0", "99.0", "0.0" ]

# ADD
# 9999.0
# 99.0 - unfortunately, it also appears in some numerical answers
# Probably need to have a separate check - e.g. is the preceding category "98.0"/"98"

# non-answer answers
Weasel_set = set(Weasel_answers) # gets rid of duplicates!


## define 'de_Weasel' function to remove Weasel Words from lists of options
## ie. "Yes|No|Don't know" -> "Yes|No"

# Second definition: this is the list that de_weasel() and the final Weasel_set actually use.
Weasel_answers = ["Don't know", 'Don?t know', 'Donâ??t know', 'Do\x92t know', 'Dont know', 'Donât know', "Prefer not to say", "Prefer not to answer", "Refused", "Unknown", "Neither", "Other", "I would/will not vote", "Will not vote", "No - not decided", "I would not vote", "It depends", "Other", "Don’t follow politics on Facebook", "Don't follow politics on twitter", "9999.0", "997.0", "222.0", "Yes, other", "Haven't thought about it", "There wasn't a local election in my area", "No, haven't received it", "I don't know what was negotiated", "I never received a response", "There are not local elections in my area", "Can't remember", "Varies too much to say" ]

# non-answer answers
Weasel_set = set(Weasel_answers) # gets rid of duplicates!

# remove weasel phrases
def de_weasel(ques):
    """Strip non-answer options from a "|"-delimited option string.

    Every option that appears in ``Weasel_answers`` is dropped; the remaining
    options keep their order, e.g. "Yes|No|Don't know" -> "Yes|No".
    """
    kept_options = []
    for option in ques.split("|"):
        if option in Weasel_answers:
            continue
        kept_options.append(option)
    return "|".join(kept_options)

# reorder categories
def re_order(ques):
    """Return the re-ordered form of an option string.

    ``ques`` is a "|"-delimited option string. When it is a key of
    ``change_cat_dict`` the registered ordering is returned (joined with "|");
    otherwise ``ques`` comes back unchanged.
    """
    if ques not in change_cat_dict:
        return ques
    return "|".join(change_cat_dict[ques])

def de_num_el(el):
    """Format an all-digit string as a float string with one decimal place.

    "5" -> "5.0" and "042" -> "42.0". Any other string (for example "5.0",
    "-5", "Yes" or "") is returned unchanged.

    Parameters
    ----------
    el : str
        A single category label.

    Returns
    -------
    str
    """
    # isdecimal() rather than isdigit(): isdigit() is also true for characters
    # such as superscript or circled digits, which int() cannot parse.
    if el.isdecimal():
        return "%.1f" % int(el)
    return el

def de_number(ques):
    """Apply ``de_num_el`` to each option of a "|"-delimited string and re-join the result."""
    return "|".join(map(de_num_el, ques.split("|")))

def de_num(ques):
    """Apply ``de_num_el`` to each element of an iterable of labels; returns a new list."""
    return list(map(de_num_el, ques))

def floatable(flt):
    """Report whether ``float(flt)`` succeeds.

    Parameters
    ----------
    flt : object
        Value to probe, typically a category label string.

    Returns
    -------
    bool
        True when the value converts to float, False when the conversion
        raises TypeError, ValueError or OverflowError.
    """
    try:
        float(flt)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a number"; anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being swallowed.
        return False
    return True

# Weasel_number_answers
# Remove 'weasel' numbers
# but only if they are the last element
# or not the last element, but the next is not a number
# to avoid catching parts of sequential numerical categories
def de_weasel_numbers(ques):
    """Drop non-answer number codes from a "|"-delimited option string.

    An option listed in ``Weasel_number_answers`` is flagged only when it is
    the final option, or when the option after it is not float-parseable.
    Removal is by value: every option equal to a flagged one is dropped.
    """
    options = ques.split("|")
    final_pos = len(options) - 1
    flagged = []
    for pos, option in enumerate(options):
        if option not in Weasel_number_answers:
            continue
        # a following numeric option suggests this one is part of a numeric scale
        if pos == final_pos or not floatable(options[pos + 1]):
            flagged.append(option)

    return "|".join(option for option in options if option not in flagged)


# version to act directly on cat.categories array
def de_weasel_nums(el_list):
    """Return the non-answer number codes found in an indexable sequence of labels.

    Same rule as ``de_weasel_numbers``: an element listed in
    ``Weasel_number_answers`` is reported only when it is the last element, or
    when the element after it is not float-parseable. The input is not modified;
    the caller decides what to do with the returned labels.
    """
    final_pos = len(el_list) - 1
    return [
        el_list[pos]
        for pos in range(final_pos + 1)
        if el_list[pos] in Weasel_number_answers
        and (pos == final_pos or not floatable(el_list[pos + 1]))
    ]

# s.cat.rename_categories([1,2,3])
# EUContactRemainConW8|EUContactRemainLabW8|EUContactRemainLDW8|
# EUContactRemainSNPW8|EUContactRemainPCW8|EUContactRemainUKIPW8|
# EUContactRemainGreenW8|EUContactRemainOthW8|EUContactRemainNoneW8|
# EUContactRemainDKW8|EUContactLeaveConW8|EUContactLeaveLabW8|
# EUContactLeaveLDW8|EUContactLeaveSNPW8|EUContactLeavePCW8|
# EUContactLeaveUKIPW8|EUContactLeaveGreenW8|EUContactLeaveOthW8|
# EUContactLeaveNoneW8|EUContactLeaveDKW8

# pattern match "EUContact*****W8"
# debateOneWatchW8|debateTwoWatchW8

# "1.0|2.0|99.0" -> 

# euRefVoteSqueezeW7 "Will not vote|Yes - Leave|Yes - Remain|No - not decided"
#    -> Stay/remain in the EU|Leave the EU|I would/will not vote|Don't know
#    HMM - RENAME AND REORDER!

# miieuW7
# "Issue stated|Nothing|Don't know" -> "Issue stated|None|Don't know"
# MIIEUW8
# "1.0|Nothing|Don't know" -> "Issue stated|None|Don't know"
# partyIdEUW7|partyIdEUW8
# "Mainly leave|Mainly remain|Fairly evenly split|Don't know" -> "Mainly remain|Fairly evenly divided|Mainly leave|Don't know"
#    HMM - RENAME AND REORDER!

# 1. campaignVisionYesW3|campaignVisionNoW3, govtNatSecuritySuccessW4
# Very unsuccessful|Fairly unsuccessful|Neither successful nor unsuccessful|Fairly successful|Very successful|Don't know
# Very unsuccessful|Somewhat unsuccessful|Neither successful or unsuccessful|Somewhat successful|Very successful|Don't know

# Fairly <-> Somewhat

# 2. euroTurnoutW1, scotReferendumTurnoutW1|scotReferendumTurnoutW2|welshTurnoutW7|scotTurnoutW7, turnoutUKGeneralW1|turnoutUKGeneralW2|turnoutUKGeneralW3|turnoutUKGeneralW4|turnoutUKGeneralW5|euRefTurnoutW7|euRefTurnoutW8
# Very unlikely that I vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I vote|Don't know
# Very unlikely that I would vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I would vote|Don't know
# There are not local elections in my area
    #|Very unlikely that I will vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I will vote|Don't know
# Very unlikely that I will vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I will vote|Don't know

# "Very unlikely that I vote", "Very unlikely that I would vote" ->  "Very unlikely that I will vote" 

# Renaming table for categorical answer sets.
#   key   : a column's categories joined with "|"
#   value : replacement labels, position for position
# Used by re_name() and by the category clean-up loop, which renames before it looks
# up change_cat_dict - so a renamed answer set can then match a re-ordering key
# (e.g. "Mainly leave|Mainly remain|Fairly evenly split|Don't know" is renamed to the
# "...Fairly evenly divided..." form that change_cat_dict re-orders).
# NOTE(review): the first entry turns two region names into "No"/"Yes" - presumably a
# mislabelled binary variable; confirm against the data.
rename_cat_dict = {"North East|North West": [ "No", "Yes" ],
                   "1.0|2.0|99.0": ["No", "Yes", "99.0"],
                   "Will not vote|Yes - Leave|Yes - Remain|No - not decided": ['I would/will not vote', 'Leave the EU',
                                                                               'Stay/remain in the EU', "Don't know"],
                   "Issue stated|Nothing|Don't know":  ['Issue stated', 'None', "Don't know"],
                   "1.0|Nothing|Don't know":           ['Issue stated', 'None', "Don't know"],
                   "a|b|C1|C2|d|e|Refused|Unknown" : ['A', 'B', 'C1', 'C2', 'D', 'E', 'Refused', 'Unknown'],
                   "Mainly leave|Mainly remain|Fairly evenly split|Don't know": ['Mainly leave',
                                                                                 'Mainly remain', 'Fairly evenly divided', "Don't know"],
                   "Very unsuccessful|Somewhat unsuccessful|Neither successful or unsuccessful|Somewhat successful|Very successful|Don't know": ['Very unsuccessful',
                        'Fairly unsuccessful', 'Neither successful nor unsuccessful', 'Fairly successful', 'Very successful', "Don't know"],
                   "Very unlikely that I vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I vote|Don't know": ['Very unlikely that I will vote',
                     'Fairly unlikely', 'Neither likely nor unlikely', 'Fairly likely', 'Very likely that I will vote', "Don't know"],
                   "Very unlikely that I would vote|Fairly unlikely|Neither likely nor unlikely|Fairly likely|Very likely that I would vote|Don't know": ['Very unlikely that I will vote',
                     'Fairly unlikely', 'Neither likely nor unlikely', 'Fairly likely', 'Very likely that I will vote', "Don't know"],
                   }


def re_name(ques):
    """Return the renamed form of an option string.

    ``ques`` is a "|"-delimited option string. When ``rename_cat_dict`` has an
    entry for it, the replacement labels are returned joined with "|";
    otherwise ``ques`` is returned as is.
    """
    try:
        replacement_labels = rename_cat_dict[ques]
    except KeyError:
        return ques
    return "|".join(replacement_labels)


# Columns excluded from processing: they are filtered out when var_cat_dict is built
# and are given type -2 when the panel's columns are classified.
# The reason for each exclusion is listed in the comments that follow this list.
ignore_list = ['approveEUW2',
               'whichPartiesHelped_99W6',
               'partyContactGrnW1',
               'partyContactGrnW2',
               'partyContactGrnW3',
               'reasonNotRegistered_noneW2',               
               'reasonNotRegistered_noneW3',
               'reasonNotRegistered_noneW4',
               'reasonNotRegistered_noneW6',
               'reasonNotRegistered_noneW7',
               'reasonNotRegistered_noneW8',
               'reasonNotRegistered_none',
               'partyContactSNPW1',
               'partyContactSNPW2',
               'changeIssue1W9',
               'conLeaderLikeW9',
               "locusControlW9"
              ]

#- approveEUW2 'Strongly disapprove|Disapprove|Don't know' - should be "approve|disapprove|don't know"??? NOT SURE (distribution weird)
#- whichPartiesHelped_99W6 - answer set = ["No"]
#- partyContactGrnW1 ... reasonNotRegistered_noneW8 answer set = ["No", "Don't know"]
# -partyContactSNPW1, partyContactSNPW2 - answer set = ["Don't know"]
# -changeIssue1W9|conLeaderLikeW9|locusControlW9 - answer set = ["No formal qualifications"]

## define 'prune' function to prune wave indicators and return question stubs
## ie. "ptvConW1|ptvLabW1" -> "ptvCon|ptvLab"

def prune(x):
    """Strip wave indicators from the column names in a "|"-delimited string.

    Each name is cut at its first wave marker ("W" followed by digits, with an
    optional "_" before it), e.g. "ptvConW1|ptvLabW1" -> "ptvCon|ptvLab".
    Anything after the marker is discarded as well. Names without a wave
    marker are kept unchanged.

    Parameters
    ----------
    x : str
        One or more column names joined with "|".

    Returns
    -------
    str
        The pruned names, joined with "|" in the original order (repeats kept).
    """
    y = []
    for el in x.split("|"):
        # raw string: "\w" inside a plain literal is an invalid escape sequence
        match_attempt = re.match(r'(\w*?)_?(W[0-9]+)+', el)
        if match_attempt:
            el = match_attempt.group(1)
        y.append(el)
    # should we ditch identical repeats?
    # return "|".join(set(y)) NEEDS TO BE TESTED
    return "|".join(y)

               
def prune2(x):
    """Strip trailing item numbers from the names in a "|"-delimited string.

    "fgdfhfghg_5", "fgdfhfghg_4", "fgdfhfghg_3" all become "fgdfhfghg": a name
    is cut at its first run of digits (with an optional "_" before it).
    Indicator variables such as "fgdfhfghg_99" and "fgdfhfghg_111" really are
    different questions, so any name in which "99" or "111" can be matched is
    left untouched. Names without digits are kept unchanged.

    Parameters
    ----------
    x : str
        One or more (already wave-pruned) column names joined with "|".

    Returns
    -------
    str
        The pruned names, joined with "|" in the original order (repeats kept).
    """
    y = []
    for el in x.split("|"):
        # raw strings: "\w" inside a plain literal is an invalid escape sequence
        indicator_variable = re.match(r'(\w*?)_?(99|111)', el)
        match_attempt = re.match(r'(\w*?)_?[0-9]+', el)
        if (not indicator_variable) and match_attempt:
            el = match_attempt.group(1)
        y.append(el)
    # should we ditch identical repeats?
    # return "|".join(set(y)) NEEDS TO BE TESTED
    return "|".join(y)
#variable_categories

Wall time: 0 ns


In [336]:
# Table that assigns a type code to each answer set and lists the columns using it
variable_categories = pd.read_csv("question_categories_correct.csv", encoding = "ISO-8859-1")

# Invert the table: type code -> flat list of column names, minus ignore_list entries.
# Each "column_name" cell holds several names joined with "|".
var_cat_dict = {}
for typ in range(8):
    packed_names = variable_categories.loc[variable_categories.type == typ, "column_name"].values
    var_cat_dict[typ] = [
        name
        for packed in packed_names
        for name in packed.split("|")
        if name not in ignore_list
    ]

# Reduce the column names to wave-independent stubs (prune), then to
# item-number-independent stubs (prune2); set() removes the repetitions.
var_cat_dict_pruned = {}
var_cat_dict_pruned_2 = {}
for typ, names in var_cat_dict.items():
    var_cat_dict_pruned[typ] = list(set(prune(name) for name in names))
    var_cat_dict_pruned_2[typ] = list(set(prune2(stub) for stub in var_cat_dict_pruned[typ]))

In [337]:
# Classify every column of BES_Panel with a type code:
#   -2 column is in ignore_list        -5 the "id" column
#   -4 object (text) dtype             -3 datetime dtype
#   0..7 type whose stub list contains the column's pruned name
#   -1 none of the above
# The codes are collected in plain lists and the frame is built once, instead of
# enlarging a DataFrame row by row with .loc.

missing_col_names = []

# stub -> type lookup. Types are visited in ascending order and later entries
# overwrite earlier ones, so a stub listed under several types resolves to the
# highest type - the same outcome as testing every type in turn for each column.
stub_to_type = {}
for typ in [0,1,2,3,4,5,6,7]:
    for stub in var_cat_dict_pruned_2[typ]:
        stub_to_type[stub] = typ

classified_cols = []
type_codes = []
for col in BES_Panel.columns:
    dt = BES_Panel[col].dtype.name # data type

    if col in ignore_list: # exclude values from ignore_list
        code = -2
    elif col == "id": # id
        code = -5
    elif dt == 'object': # text
        code = -4
    elif "datetime" in dt: # datetime
        code = -3
    else:
        # pruned name is computed once per column; -1 when no type lists it
        code = stub_to_type.get( prune2( prune(col) ), -1 )
        # if code == -1: missing_col_names.append(col)   # disabled, as before

    classified_cols.append(col)
    type_codes.append(code)

var_type = pd.DataFrame( {"type": type_codes}, index=classified_cols )
var_type["type"] = var_type["type"].astype("int8")
# missing_col_names

In [338]:
if missing_col_names:
    # Some columns could not be classified. Record their answer sets in a copy of
    # the category table, write that copy out so the types can be filled in by hand,
    # then rebuild the lookups and classify the columns again.

    updated_variable_categories = variable_categories.copy()

    # question	frequency	question_length	question_options	column_name	type

    for i in missing_col_names:
        str_list = [ str(cat) for cat in BES_Panel[i].cat.categories ]
        joined_list = "|".join(str_list)
        match  = (joined_list == updated_variable_categories["question"])
        if match.any(): # answer set already in records
            index = updated_variable_categories[match].index
            if len(index)>1: # answer set ("question") index should be unique!
                raise ValueError('answer set ("question") index should be unique!')

            # add column name and increase frequency
            updated_variable_categories.loc[index,"frequency"] = updated_variable_categories.loc[index,"frequency"]+1
            current_list_col_names = updated_variable_categories.loc[index,"column_name"].values[0].split("|")
            current_list_col_names.append(i)
            updated_variable_categories.loc[index,"column_name"] = "|".join( current_list_col_names )
        else: # answer set not already in records - add new line to dataframe
            new_row = pd.DataFrame( [[updated_variable_categories.shape[0],
                                      joined_list,
                                      1,
                                      len(joined_list),
                                      len(str_list),
                                      i, -1]],
                                    columns = updated_variable_categories.columns )
            # pd.concat rather than DataFrame.append, which pandas 2.0 removed
            updated_variable_categories = pd.concat( [updated_variable_categories, new_row], ignore_index=True )

    variable_categories = updated_variable_categories
    updated_variable_categories.to_csv("question_categories_correct_updatesneeded!.csv", encoding = "ISO-8859-1")

    # rerun after updating list!

    #variable_categories = pd.read_csv("question_categories_correct.csv", encoding = "ISO-8859-1")

    # flipping list
    # Same type range and ignore_list filter as the first pass: the classification
    # below looks up types 0-7, so every one of them needs an entry here.
    var_cat_dict = dict()
    for typ in [0,1,2,3,4,5,6,7]:
        e = variable_categories[variable_categories.type==typ]["column_name"].values
        var_cat_dict[typ] = [item for sublist in [i.split("|") for i in e] for item in sublist]
        var_cat_dict[typ] = [item for item in var_cat_dict[typ] if item not in ignore_list]

    # dictionary comprehension to prune column-names to wave non-specific stubs
    # list(set()) gets rid of repetitions
    var_cat_dict_pruned   = {k: list(set([prune(x)  for x in v])) for k, v in var_cat_dict.items()}
    var_cat_dict_pruned_2 = {k: list(set([prune2(x) for x in v])) for k, v in var_cat_dict_pruned.items()}

    var_type = pd.DataFrame(columns = ['type'] )
    missing_col_names = []

    for col in BES_Panel.columns:
        dt =  BES_Panel[col].dtype.name # data type

        if col in ignore_list: # exclude values from ignore_list
            var_type.loc[col] = -2

        elif (col == "id"): # id
            var_type.loc[col] = -5

        elif (dt == 'object'): # text
            var_type.loc[col] = -4

        elif ("datetime" in dt): # datetime
            var_type.loc[col] = -3

        else:
            # Only columns not classified above go through the stub lookup, so the
            # metadata codes are not overwritten. -1 unless a type lists the stub;
            # when several types list it, the last (highest) one wins.
            var_type.loc[col] = -1
            stub = prune2( prune(col) )
            for typ in [0,1,2,3,4,5,6,7]:
                if stub in var_cat_dict_pruned_2[typ]:
                    var_type.loc[col] = typ
#             raise ValueError('Values still missing second time around! ', col)
    var_type["type"] = var_type["type"].astype("int8")

In [344]:
%%time

# ditch ignore_list values
# ditch indicator values

# Split the panel by type code. Types 0-3 and 5-7 go to BES_numeric; the metadata
# codes -5, -4, -3 and -1 go to BES_non_numeric. Types -2 and 4 are in neither.
num_cols     = BES_Panel.columns[ var_type["type"].isin( [0,1,2,3,5,6,7] ).values ]
non_num_cols = BES_Panel.columns[ var_type["type"].isin( [-5,-4,-3,-1 ] ).values ]

BES_numeric     = BES_Panel[num_cols].copy()
BES_non_numeric = BES_Panel[non_num_cols].copy()

# Clean the categories of every categorical column in BES_numeric.
# Each step returns a new Series and the result is assigned back to the frame:
# the `inplace=True` form of the .cat methods was removed in pandas 2.0.
n_numeric_cols = len(BES_numeric.columns)
for pos, col in enumerate(BES_numeric):
    print(100*pos/n_numeric_cols)  # progress, in percent

    if col not in var_type["type"].index:
        print(col, " not in var_type")
        continue
    typ = var_type["type"][col]

    if typ in (0, 7):  # these types are left as they are
        continue

    answers = BES_numeric[col]

    # force all category elements into strings
    answers = answers.cat.rename_categories( answers.cat.categories.map(str) )

    # rename categories
    join_list = "|".join( answers.cat.categories )
    if join_list in rename_cat_dict:
        answers = answers.cat.rename_categories( rename_cat_dict[join_list] )

    # reorder categories (join_list is recomputed because the rename may have changed it)
    join_list = "|".join( answers.cat.categories )
    if join_list in change_cat_dict:
        new_order = change_cat_dict[join_list]
        # Some orderings leave options out; reorder_categories only accepts the exact
        # existing set, so the omitted options are removed first (their rows become NaN).
        omitted = [cat for cat in answers.cat.categories if cat not in new_order]
        if omitted:
            answers = answers.cat.remove_categories( omitted )
        answers = answers.cat.reorder_categories( new_order )

    # de_weasel numbers
    answers = answers.cat.remove_categories( de_weasel_nums( answers.cat.categories ) )

    # set all digits to floating point format, one decimal place
    answers = answers.cat.rename_categories( de_num( answers.cat.categories ) )

    # de_weasel
    answers = answers.cat.remove_categories( [cat for cat in answers.cat.categories if cat in Weasel_set] )

    BES_numeric[col] = answers

      
        
#     elif (typ == 2):
#         new_cat_order = change_cat_dict["|".join(BES_numeric[col].cat.categories)]
#         BES_numeric[col].cat.remove_categories( BES_numeric[col].cat.categories.intersection(Weasel_set), inplace=True )
#         BES_numeric[col].cat.reorder_categories( new_cat_order, inplace=True )

#         BES_numeric[col] = BES_numeric[col].cat.codes
# #        BES_numeric.loc[ BES_numeric[col]==-1, col ] = np.nan  
    
#     elif (typ == 5) | (typ == 1): # (typ == 1) | 
#         # remove weasel categories
#         BES_numeric[col].cat.remove_categories( BES_numeric[col].cat.categories.intersection(Weasel_set), inplace=True )
#         # change to numbers!
#         BES_numeric[col] = BES_numeric[col].cat.codes
# #        BES_numeric.loc[ BES_numeric[col]==-1, col ] = np.nan
        

#     elif (typ == 6):

#         BES_numeric[col].cat.remove_categories( BES_numeric[col].cat.categories.intersection(Weasel_set), inplace=True )
#         BES_numeric[col] = BES_numeric[col].astype('float64')

# types
# -5    id
# -4    text
# -3    datetimes
# -2 - ignore_list
# -1 - anything not a category or one of the below floats -> should all be weights!
# 0 - personality measures (in steps of .5?), personality_agreeableness ...etc, riskScale
# 1 - linear category, just use  (some made linear by dropping "Weasel_answers")
# 2 - categories need to be modified - order changed
# 3 - set of non-ordered options
# 4 - indirect variables - did someone fill something in in the free text box or not?
# 5 - categories need to modified - things removed
# 6 - categories are integers - should maybe be transformed directly into numbers (mostly "how much money do people need minimum/well off"?)
# 7 - pano, mapNames


# [-5, -4, -3, -2, -1] -> meta list
# [0, 1, 2, 3, 4, 5, 6, 7] -> 


# load question_categories_correct.csv
# sanity check by type!
# turn into list of variables by type
# 1, 5 handled the same way -> cat.codes
# 6 -> int()
# 4 ignored
# 3 ignored for now (-> vectorized?)
# 2 direct modification




0.0
0.15174506828528073
0.30349013657056145
0.4552352048558422
0.6069802731411229
0.7587253414264037
0.9104704097116844
1.062215477996965
1.2139605462822458
1.3657056145675266
1.5174506828528074
1.669195751138088
1.8209408194233687
1.9726858877086495
2.12443095599393
2.276176024279211
2.4279210925644916
2.579666160849772
2.731411229135053
2.8831562974203337
3.0349013657056148
3.1866464339908953
3.338391502276176
3.490136570561457
3.6418816388467374
3.793626707132018
3.945371775417299
4.09711684370258
4.24886191198786
4.400606980273142
4.552352048558422
4.704097116843703
4.855842185128983
5.007587253414264
5.159332321699544
5.311077389984826
5.462822458270106
5.614567526555387
5.7663125948406675
5.918057663125948
6.0698027314112295
6.22154779969651
6.373292867981791
6.525037936267071
6.676783004552352
6.828528072837633
6.980273141122914
7.132018209408194
7.283763277693475
7.435508345978755
7.587253414264036
7.738998482549317
7.890743550834598
8.04248861911988
8.19423368740516
8.34597875

In [345]:
%%time
# Save the category labels of the non-numeric categorical columns, then turn
# every column of BES_numeric into float64.
# The labels are read from BES_numeric itself, before it is converted below
# (the separate BES_num_and_cat copy this used to read from is never created).
# NOTE: this cell is not re-runnable on its own - after one run the columns are
# no longer categorical.

# save category data
cat_dictionary = {}

for col in BES_numeric.columns:
    if var_type["type"][col] in [1, 2, 3, 5]: # not just cat, but one not already numerical!
        cat_dictionary[col] = BES_numeric[col].cat.categories

fname = BES_data_folder+"cat_dictionary"+".pkl"
with open(fname, "wb") as f:
    pickle.dump( cat_dictionary, f )

# turn categories into numbers
n_numeric_cols = len(BES_numeric.columns)
for pos, col in enumerate(BES_numeric):
    print(100*pos/n_numeric_cols)  # progress, in percent

    typ = var_type["type"][col]

    if typ in (1, 2, 3, 5):
        # ordinal (1, 2, 5) and non-ordinal (3) categories: replace labels by their codes
        BES_numeric[col] = BES_numeric[col].cat.codes.astype('float64')

    elif typ in (0, 6, 7):
        # values are already numbers (or numeric labels): translate directly
        BES_numeric[col] = BES_numeric[col].astype('float64')

# Missing values have category code -1; turn them into NaN.
# NOTE(review): this replaces -1 in every column, not only in the code columns -
# confirm that no type 0/6/7 column holds a genuine -1.
BES_numeric.replace(-1,np.nan, inplace=True)

0.0
0.15174506828528073
0.30349013657056145
0.4552352048558422
0.6069802731411229
0.7587253414264037
0.9104704097116844
1.062215477996965
1.2139605462822458
1.3657056145675266
1.5174506828528074
1.669195751138088
1.8209408194233687
1.9726858877086495
2.12443095599393
2.276176024279211
2.4279210925644916
2.579666160849772
2.731411229135053
2.8831562974203337
3.0349013657056148
3.1866464339908953
3.338391502276176
3.490136570561457
3.6418816388467374
3.793626707132018
3.945371775417299
4.09711684370258
4.24886191198786
4.400606980273142
4.552352048558422
4.704097116843703
4.855842185128983
5.007587253414264
5.159332321699544
5.311077389984826
5.462822458270106
5.614567526555387
5.7663125948406675
5.918057663125948
6.0698027314112295
6.22154779969651
6.373292867981791
6.525037936267071
6.676783004552352
6.828528072837633
6.980273141122914
7.132018209408194
7.283763277693475
7.435508345978755
7.587253414264036
7.738998482549317
7.890743550834598
8.04248861911988
8.19423368740516
8.34597875

In [None]:
# BES_numerics_only = BES_numeric.drop( BES_numeric.columns[~( (var_type["type"]==0) |
#                                                              (var_type["type"]==1) |
#                                                              (var_type["type"]==2) |
#                                                              (var_type["type"]==5) |
#                                                              (var_type["type"]==6) ) ], axis=1 )

# BES_numerics_only.replace(-1,np.nan, inplace=True)
# # gender only column that has no nan -> still an int

In [None]:
# BES_all_categories = BES_numeric.drop( BES_numeric.columns[~( (var_type["type"]==0) |
#                                                              (var_type["type"]==1) |
#                                                              (var_type["type"]==2) |
#                                                              (var_type["type"]==3) |
#                                                              (var_type["type"]==5) |
#                                                              (var_type["type"]==6) ) ], axis=1 )

# BES_all_categories.replace(-1,np.nan, inplace=True)
# # gender only column that has no nan -> still an int

In [None]:
# BES_numerics_only = BES_all_categories

In [None]:
# BES_num_and_cat.to_stata( BES_data_folder+"BESW8num_and_cat.hdf" )

In [346]:
# Save the metadata columns (id, text, datetimes, unclassified) to HDF5.
# `key` is passed by keyword: positional use is deprecated in recent pandas.
BES_non_numeric.to_hdf( BES_data_folder+"BESW8non_numeric.hdf", key="BESW8non_numeric" )

In [347]:
# Save the numeric frame to HDF5.
# `key` is passed by keyword: positional use is deprecated in recent pandas.
BES_numeric.to_hdf( BES_data_folder+"BESW8numeric.hdf", key="BESW8numeric" )

In [348]:
# Save the per-column type codes to HDF5.
# `key` is passed by keyword: positional use is deprecated in recent pandas.
var_type.to_hdf( BES_data_folder+"var_type.hdf", key="var_type" )

In [None]:
# TEST CODE

In [349]:
# Test to see if pruning "_wave" leads to differing variable categories
# Report every wave-pruned stub that is listed under more than one type code.
for key, stubs in var_cat_dict_pruned.items():
    for val in stubs:
        for other_key, other_stubs in var_cat_dict_pruned.items():
            # a stub is of course listed under its own type; only other types count
            if other_key != key and val in other_stubs:
                print("problem: {0} {1} {2}".format(key, val, other_key) )
            
# mixing 1 and 5 is fine, because 5 (mis-ordered ordinal) is turned into 1 (correctly ordered ordinal)

problem: 1 euRefVoteSqueeze 5
problem: 5 euRefVoteSqueeze 1


In [350]:
# Test to see if further pruning "_num" leads to differing variable categories
# Report every fully pruned stub that is listed under more than one type code.
for key, stubs in var_cat_dict_pruned_2.items():
    for val in stubs:
        for other_key, other_stubs in var_cat_dict_pruned_2.items():
            # a stub is of course listed under its own type; only other types count
            if other_key != key and val in other_stubs:
                print("problem: {0} {1} {2}".format(key, val, other_key) )
            
                

problem: 1 euRefVoteSqueeze 5
problem: 1 euID 5
problem: 1 finlit 2
problem: 2 finlit 1
problem: 5 euID 1
problem: 5 euRefVoteSqueeze 1


In [None]:
# pano, mapNames
# weights
# personality
# datetimes -> remove on type
# text -> remove on type

# weights, datetimes, text -> saved separately