# Transforming Codes to Labels.

In [2]:
import pandas as pd
from striprtf.striprtf import rtf_to_text
import re

# Define your variables of interest
variables_of_interest = [
    'pidp', 'jkl_hidp', 'jkl_gor_dv', 'jkl_sex_dv'  # ... add all your variables
]

# Parse the RTF file
with open('jkl_indresp_ukda_data_dictionary.rtf', 'r') as file:
    rtf_text = file.read()
    text = rtf_to_text(rtf_text)

# Extract mappings for each variable (you'll need to adjust the regex to match your file's structure)
mappings = {}
for var in variables_of_interest:
    # You'll need a regex pattern that accurately captures the value-label pairs for each variable
    pattern = rf"{{var}}\s+.*?\s+Value\s+=\s+(\d+)\s+Label\s+=\s+(.*?)\s+"
    matches = re.findall(pattern, text, re.DOTALL)
    mappings[var] = {int(value): label for value, label in matches}

In [3]:
# Read the dataset
df = pd.read_csv('variables.csv')

# Apply the mappings to the DataFrame
for column, mapping in mappings.items():
    if column in df.columns:
        df[column] = df[column].map(mapping).astype('category')


In [4]:
df

Unnamed: 0,pidp,jkl_hidp,j_hidp,k_hidp,l_hidp,jkl_pno,jkl_mnpno,jkl_fnpno,jkl_mnpid,jkl_fnpid,...,jkl_sf12pcs_dv,jkl_health,jkl_scghq1_dv,jkl_scghq2_dv,jkl_hhsize,jkl_nkids_dv,jkl_hhtype_dv,jkl_tenure_dv,jkl_fihhmnnet1_dv,jkl_ieqmoecd_dv
0,,,-14,-14,68013622,1,0,0,-8,-8,...,37.58,1,12,0,1,0,2,2,2288.000000,1.0
1,,,-14,-14,68020422,1,0,0,-8,-8,...,56.37,1,11,0,4,2,11,1,2534.000000,2.1
2,,,-14,-14,68020422,2,0,0,-8,-8,...,61.73,1,15,4,4,2,11,1,2534.000000,2.1
3,,,-14,-14,68027222,1,0,0,-8,-8,...,51.64,1,11,0,2,0,6,2,4475.910156,1.5
4,,,-14,-14,68027222,2,0,0,-8,-8,...,56.15,2,10,0,2,0,6,2,4475.910156,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31116,,,1638079218,-14,-14,3,2,1,1652984890,1652984850,...,56.22,2,12,0,3,0,19,1,5818.000000,2.0
31117,,,1638126818,-14,-14,2,0,0,-8,-8,...,30.54,2,12,0,3,0,19,1,12875.400390,2.0
31118,,,1638126818,-14,-14,3,2,1,1639697692,1653127650,...,54.53,2,10,0,3,0,19,1,12875.400390,2.0
31119,,,1638126818,-14,-14,1,0,0,-8,-8,...,33.67,2,10,0,3,0,19,1,12875.400390,2.0


In [5]:
import pandas as pd

# Define mappings for each column
# This is an example structure of `mappings`
mappings = {
    'jkl_childpno': {
        2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy',
        -1.0: 'don\'t know'
    }
    'jkl_jbstat':{
        1.0: 'Self employed',
        2.0: 'Paid employment(ft/pt)',
        'Unemployed',
        'Retired',
        'On maternity leave',
        'Family care or home',
        'Full-time student',
        'LT sick or disabled',
        'Govt training scheme',
        'Unpaid, family business',
        'On apprenticeship',
        'On furlough',
        'Temporarily laid off/short term working',
        'Doing something else',
        'refusal',
        'missing',
        'inapplicable',
        'don\'t know'}
}

# Read the dataset
df = pd.read_csv('variables.csv')

# Get the list of columns that have mappings defined
mappings_columns = list(mappings.keys())

# Apply the mappings
for column in mappings_columns:
    if column in df.columns:
        # Here, `mappings[column]` accesses the specific mapping dictionary for the column
        df[column] = df[column].map(mappings[column]).astype('category')


jkl_jbstat
	Value = 1.0	Label = Self employed
	Value = 2.0	Label = Paid employment(ft/pt)
	Value = 3.0	Label = Unemployed
	Value = 4.0	Label = Retired
	Value = 5.0	Label = On maternity leave
	Value = 6.0	Label = Family care or home
	Value = 7.0	Label = Full-time student
	Value = 8.0	Label = LT sick or disabled
	Value = 9.0	Label = Govt training scheme
	Value = 10.0	Label = Unpaid, family business
	Value = 11.0	Label = On apprenticeship
	Value = 12.0	Label = On furlough
	Value = 13.0	Label = Temporarily laid off/short term working
	Value = 97.0	Label = Doing something else
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know
    
jkl_health
	Value = 1.0	Label = Yes
	Value = 2.0	Label = No
	Value = -1.0	Label = don't know
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -2.0	Label = refusal

jkl_fnpid
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_mnpid
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know
    
jkl_hhsize
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know
    
bornuk_dv
	Value = 1.0	Label = born in uk
	Value = 2.0	Label = not born in uk
	Value = -1.0	Label = don't know
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -2.0	Label = refusal
    
jkl_fimnlabnet_dv
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know
    
jkl_sex_dv
	Value = 0.0	Label = inconsistent
	Value = 1.0	Label = Male
	Value = 2.0	Label = Female
	Value = -1.0	Label = don't know
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -2.0	Label = refusal

jkl_age_dv
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_doby_dv
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know
    
jkl_ethn_dv
	Value = 1.0	Label = british/english/scottish/welsh/northern irish
	Value = 2.0	Label = irish
	Value = 3.0	Label = gypsy or irish traveller
	Value = 4.0	Label = any other white background
	Value = 5.0	Label = white and black caribbean
	Value = 6.0	Label = white and black african
	Value = 7.0	Label = white and asian
	Value = 8.0	Label = any other mixed background
	Value = 9.0	Label = indian
	Value = 10.0	Label = pakistani
	Value = 11.0	Label = bangladeshi
	Value = 12.0	Label = chinese
	Value = 13.0	Label = any other asian background
	Value = 14.0	Label = caribbean
	Value = 15.0	Label = african
	Value = 16.0	Label = any other black background
	Value = 17.0	Label = arab
	Value = 97.0	Label = any other ethnic group
	Value = -1.0	Label = don't know
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -2.0	Label = refusal

kl_fimnnet_dv
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_country
	Value = 1.0	Label = England
	Value = 2.0	Label = Wales
	Value = 3.0	Label = Scotland
	Value = 4.0	Label = Northern Ireland
	Value = -1.0	Label = don't know
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -2.0	Label = refusal


jkl_gor_dv
	Value = 1.0	Label = North East
	Value = 2.0	Label = North West
	Value = 3.0	Label = Yorkshire and the Humber
	Value = 4.0	Label = East Midlands
	Value = 5.0	Label = West Midlands
	Value = 6.0	Label = East of England
	Value = 7.0	Label = London
	Value = 8.0	Label = South East
	Value = 9.0	Label = South West
	Value = 10.0	Label = Wales
	Value = 11.0	Label = Scotland
	Value = 12.0	Label = Northern Ireland
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_urban_dv
	Value = 1.0	Label = urban area
	Value = 2.0	Label = rural area
	Value = -1.0	Label = don't know
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -2.0	Label = refusal

jkl_mastat_dv
	Value = 0.0	Label = Child under 16
	Value = 1.0	Label = Single and never married/in civil partnership
	Value = 2.0	Label = Married
	Value = 3.0	Label = In a registered same-sex civil partnership
	Value = 4.0	Label = Separated but legally married
	Value = 5.0	Label = Divorced
	Value = 6.0	Label = Widowed
	Value = 7.0	Label = Separated from civil partner
	Value = 8.0	Label = A former civil partner
	Value = 9.0	Label = A surviving civil partner
	Value = 10.0	Label = Living as couple
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -7.0	Label = proxy respondent
	Value = -1.0	Label = don't know

jkl_hhtype_dv
	Value = 1.0	Label = 1 male, aged 65+, no children
	Value = 2.0	Label = 1 female, age 60+, no children
	Value = 3.0	Label = 1 adult under pensionable age, no children
	Value = 4.0	Label = 1 adult, 1 child
	Value = 5.0	Label = 1 adult, 2 or more children
	Value = 6.0	Label = Couple both under pensionable age, no children
	Value = 8.0	Label = Couple 1 or more over pensionable age,no children
	Value = 10.0	Label = Couple with 1 child
	Value = 11.0	Label = Couple with 2 children
	Value = 12.0	Label = Couple with 3 or more children
	Value = 16.0	Label = 2 adults, not a couple, both under pensionable age, no children
	Value = 17.0	Label = 2 adults, not a couple, one or more over pensionable age, no children
	Value = 18.0	Label = 2 adults, not a couple, 1 or more children
	Value = 19.0	Label = 3 or more adults, no children, incl. at least one couple
	Value = 20.0	Label = 3 or more adults, 1-2 children, incl. at least one couple
	Value = 21.0	Label = 3 or more adults, >2 children, incl. at least one couple
	Value = 22.0	Label = 3 or more adults, no children, excl. any couples
	Value = 23.0	Label = 3 or more adults, 1 or more children, excl. any couples
	Value = -1.0	Label = don't know
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -2.0	Label = refusal

jkl_nchild_dv
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know
    
jkl_ppid
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_ppno
	Value = 0.0	Label = partner not in hh
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_sppid
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_sppno
	Value = 0.0	Label = spouse not in hh
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_fnpno
	Value = 0.0	Label = not in hh
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_fnspid
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_fnspno
	Value = 0.0	Label = not in hh
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_mnpno
	Value = 0.0	Label = not in hh
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_mnspid
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_mnspno
	Value = 0.0	Label = not in hh
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_grfpno
	Value = 0.0	Label = not in hh
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know

jkl_grmpno
	Value = 0.0	Label = not in hh
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know    
    
jkl_nmpsp_dv
	Value = 0.0	Label = none
	Value = -1.0	Label = don't know
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -7.0	Label = proxy
	Value = -2.0	Label = refusal    
  
jkl_jbsoc00_cc
	Value = 521.0	Label = Metal forming, welding and related trades
	Value = 522.0	Label = Metal machining, fitting and instrument making trades
	Value = 523.0	Label = Vehicle trades
	Value = 524.0	Label = Electrical trades
	Value = 531.0	Label = Construction trades
	Value = 532.0	Label = Building trades
	Value = 541.0	Label = Textiles and garments trades
	Value = 542.0	Label = Printing trades
	Value = 543.0	Label = Food preparation trades
	Value = 549.0	Label = Skilled trades nec
	Value = 811.0	Label = Process operatives
	Value = 812.0	Label = Plant and machine operatives
	Value = 813.0	Label = Assemblers and routine operatives
	Value = 814.0	Label = Construction operatives
	Value = 821.0	Label = Transport drivers and operatives
	Value = 822.0	Label = Mobile machine drivers and operatives
	Value = 311.0	Label = Science and engineering technicians
	Value = 312.0	Label = Draughtspersons and building inspectors
	Value = 313.0	Label = It service delivery occupations
	Value = 321.0	Label = Health associate professionals
	Value = 322.0	Label = Therapists
	Value = 323.0	Label = Social welfare associate professionals
	Value = 331.0	Label = Protective service occupations
	Value = 923.0	Label = Elementary cleaning occupations
	Value = 611.0	Label = Healthcare and related personal services
	Value = 341.0	Label = Artistic and literary occupations
	Value = 342.0	Label = Design associate professionals
	Value = 343.0	Label = Media associate professionals
	Value = 344.0	Label = Sports and fitness occupations
	Value = 612.0	Label = Childcare and related personal services
	Value = 351.0	Label = Transport associate professionals
	Value = 352.0	Label = Legal associate professionals
	Value = 353.0	Label = Business and finance associate professionals
	Value = 354.0	Label = Sales and related associate professionals
	Value = 355.0	Label = Conservation associate professionals
	Value = 356.0	Label = Public service and other associate professionals
	Value = 613.0	Label = Animal care services
	Value = 621.0	Label = Leisure and travel service occupations
	Value = 622.0	Label = Hairdressers and related occupations
	Value = 111.0	Label = Corporate managers and senior officials
	Value = 112.0	Label = Production managers
	Value = 113.0	Label = Functional managers
	Value = 114.0	Label = Quality and customer care managers
	Value = 115.0	Label = Financial institution and office managers
	Value = 116.0	Label = Managers in distribution, storage and retailing
	Value = 117.0	Label = Protective service officers
	Value = 118.0	Label = Health and social services managers
	Value = 121.0	Label = Managers in farming, horticulture, forestry and services
	Value = 122.0	Label = Managers and proprietors in hospitality and leisure services
	Value = 123.0	Label = Managers and proprietors in other service industries
	Value = 911.0	Label = Elementary agricultural occupations
	Value = 912.0	Label = Elementary construction occupations
	Value = 913.0	Label = Elementary process plant occupations
	Value = 914.0	Label = Elementary goods storage occupations
	Value = 921.0	Label = Elementary administration occupations
	Value = 922.0	Label = Elementary personal services occupations
	Value = 623.0	Label = Housekeeping occupations
	Value = 412.0	Label = Administrative occupations: finance
	Value = 413.0	Label = Administrative occupations: records
	Value = 414.0	Label = Administrative occupations: communications
	Value = 415.0	Label = Administrative occupations: general
	Value = 411.0	Label = Administrative occupations: government and related organisations
	Value = 421.0	Label = Secretarial and related occupations
	Value = 924.0	Label = Elementary security occupations
	Value = 925.0	Label = Elementary sales occupations
	Value = 629.0	Label = Personal services occupations nec
	Value = 711.0	Label = Sales assistants and retail cashiers
	Value = 712.0	Label = Sales related occupations
	Value = -1.0	Label = don't know
	Value = 721.0	Label = Customer service occupations
	Value = 211.0	Label = Science professionals
	Value = 212.0	Label = Engineering professionals
	Value = 213.0	Label = Information and communication technology professionals
	Value = 221.0	Label = Health professionals
	Value = 231.0	Label = Teaching professionals
	Value = 232.0	Label = Research professionals
	Value = 241.0	Label = Legal professionals
	Value = 242.0	Label = Business and statistical professionals
	Value = 243.0	Label = Architects, town planners, surveyors
	Value = 244.0	Label = Public service professionals
	Value = 245.0	Label = Librarians and related professionals
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -7.0	Label = proxy respondent
	Value = -2.0	Label = refusal
	Value = 511.0	Label = Agricultural trades  
  
jkl_jbnssec8_dv
	Value = 1.0	Label = Large employers & higher management
	Value = 2.0	Label = Higher professional
	Value = 3.0	Label = Lower management & professional
	Value = 4.0	Label = Intermediate
	Value = 5.0	Label = Small employers & own account
	Value = 6.0	Label = Lower supervisory & technical
	Value = 7.0	Label = Semi-routine
	Value = 8.0	Label = Routine
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -7.0	Label = proxy respondent
	Value = -1.0	Label = don't know

jkl_jbnssec5_dv
	Value = 1.0	Label = Management & professional
	Value = 2.0	Label = Intermediate
	Value = 3.0	Label = Small employers & own account
	Value = 4.0	Label = Lower supervisory & technical
	Value = 5.0	Label = Semi-routine & routine
	Value = -1.0	Label = don't know
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -7.0	Label = proxy respondent
	Value = -2.0	Label = refusal

jkl_jbnssec3_dv
	Value = 1.0	Label = Management & professional
	Value = 2.0	Label = Intermediate
	Value = 3.0	Label = Routine
	Value = -1.0	Label = don't know
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -7.0	Label = proxy respondent
	Value = -2.0	Label = refusal  
  
jkl_scghq1_dv
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -7.0	Label = proxy
	Value = -1.0	Label = don't know

jkl_scghq2_dv
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -7.0	Label = proxy
	Value = -1.0	Label = don't know

jkl_sf12pcs_dv
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -7.0	Label = proxy
	Value = -1.0	Label = don't know

jkl_sf12mcs_dv
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -7.0	Label = proxy
	Value = -1.0	Label = don't know 
  
jkl_nkids_dv
	Value = 0.0	Label = none
	Value = -2.0	Label = refusal
	Value = -9.0	Label = missing
	Value = -8.0	Label = inapplicable
	Value = -1.0	Label = don't know  

In [10]:
import pandas as pd

# Complete mappings for all provided variables
mappings = {
    'jkl_childpno': {
        2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy',
        -1.0: 'don\'t know'
    },
    'jkl_jbstat': {
        1.0: 'Self employed',
        2.0: 'Paid employment(ft/pt)',
        3.0: 'Unemployed',
        4.0: 'Retired',
        5.0: 'On maternity leave',
        6.0: 'Family care or home',
        7.0: 'Full-time student',
        8.0: 'LT sick or disabled',
        9.0: 'Govt training scheme',
        10.0: 'Unpaid, family business',
        11.0: 'On apprenticeship',
        12.0: 'On furlough',
        13.0: 'Temporarily laid off/short term working',
        97.0: 'Doing something else',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_health': {
        1.0: 'Yes',
        2.0: 'No',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -2.0: 'refusal'
    },
    'jkl_fnpid': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_mnpid': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_hhsize': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'bornuk_dv': {
        1.0: 'born in uk',
        2.0: 'not born in uk',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -2.0: 'refusal'
    },
    'jkl_fimnlabnet_dv': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_sex_dv': {
        0.0: 'inconsistent',
        1.0: 'Male',
        2.0: 'Female',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -2.0: 'refusal'
    },
    'jkl_age_dv': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_doby_dv': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_ethn_dv': {
        1.0: 'british/english/scottish/welsh/northern irish',
        2.0: 'irish',
        3.0: 'gypsy or irish traveller',
        4.0: 'any other white background',
        5.0: 'white and black caribbean',
        6.0: 'white and black african',
        7.0: 'white and asian',
        8.0: 'any other mixed background',
        9.0: 'indian',
        10.0: 'pakistani',
        11.0: 'bangladeshi',
        12.0: 'chinese',
        13.0: 'any other asian background',
        14.0: 'caribbean',
        15.0: 'african',
        16.0: 'any other black background',
        17.0: 'arab',
        97.0: 'any other ethnic group',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -2.0: 'refusal'
    },
    'kl_fimnnet_dv': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_country': {
        1.0: 'England',
        2.0: 'Wales',
        3.0: 'Scotland',
        4.0: 'Northern Ireland',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -2.0: 'refusal'
    },
    'jkl_gor_dv': {
        1.0: 'North East',
        2.0: 'North West',
        3.0: 'Yorkshire and the Humber',
        4.0: 'East Midlands',
        5.0: 'West Midlands',
        6.0: 'East of England',
        7.0: 'London',
        8.0: 'South East',
        9.0: 'South West',
        10.0: 'Wales',
        11.0: 'Scotland',
        12.0: 'Northern Ireland',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_urban_dv': {
        1.0: 'urban area',
        2.0: 'rural area',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -2.0: 'refusal'
    },
    'jkl_mastat_dv': {
        0.0: 'Child under 16',
        1.0: 'Single and never married/in civil partnership',
        2.0: 'Married',
        3.0: 'In a registered same-sex civil partnership',
        4.0: 'Separated but legally married',
        5.0: 'Divorced',
        6.0: 'Widowed',
        7.0: 'Separated from civil partner',
        8.0: 'A former civil partner',
        9.0: 'A surviving civil partner',
        10.0: 'Living as couple',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy respondent',
        -1.0: 'don\'t know'
    },
    'jkl_hhtype_dv': {
        1.0: '1 male, aged 65+, no children',
        2.0: '1 female, age 60+, no children',
        3.0: '1 adult under pensionable age, no children',
        4.0: '1 adult, 1 child',
        5.0: '1 adult, 2 or more children',
        6.0: 'Couple both under pensionable age, no children',
        8.0: 'Couple 1 or more over pensionable age,no children',
        10.0: 'Couple with 1 child',
        11.0: 'Couple with 2 children',
        12.0: 'Couple with 3 or more children',
        16.0: '2 adults, not a couple, both under pensionable age, no children',
        17.0: '2 adults, not a couple, one or more over pensionable age, no children',
        18.0: '2 adults, not a couple, 1 or more children',
        19.0: '3 or more adults, no children, incl. at least one couple',
               20.0: '3 or more adults, 1-2 children, incl. at least one couple',
        21.0: '3 or more adults, >2 children, incl. at least one couple',
        22.0: '3 or more adults, no children, excl. any couples',
        23.0: '3 or more adults, 1 or more children, excl. any couples',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -2.0: 'refusal'
    },
    'jkl_nchild_dv': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_ppid': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_ppno': {
        0.0: 'partner not in hh',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_sppid': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_sppno': {
        0.0: 'spouse not in hh',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_fnpno': {
        0.0: 'not in hh',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_fnspid': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_fnspno': {
        0.0: 'not in hh',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_mnpno': {
        0.0: 'not in hh',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_mnspid': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_mnspno': {
        0.0: 'not in hh',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_grfpno': {
        0.0: 'not in hh',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_grmpno': {
        0.0: 'not in hh',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    },
    'jkl_nmpsp_dv': {
        0.0: 'none',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy',
        -2.0: 'refusal'
    },
    'jkl_jbsoc00_cc': {
        521.0: 'Metal forming, welding and related trades',
        522.0: 'Metal machining, fitting and instrument making trades',
        523.0: 'Vehicle trades',
        524.0: 'Electrical trades',
        531.0: 'Construction trades',
        532.0: 'Building trades',
        541.0: 'Textiles and garments trades',
        542.0: 'Printing trades',
        543.0: 'Food preparation trades',
        549.0: 'Skilled trades nec',
        811.0: 'Process operatives',
        812.0: 'Plant and machine operatives',
        813.0: 'Assemblers and routine operatives',
        814.0: 'Construction operatives',
        821.0: 'Transport drivers and operatives',
        822.0: 'Mobile machine drivers and operatives',
        311.0: 'Science and engineering technicians',
        312.0: 'Draughtspersons and building inspectors',
        313.0: 'IT service delivery occupations',
        321.0: 'Health associate professionals',
        322.0: 'Therapists',
        323.0: 'Social welfare associate professionals',
        331.0: 'Protective service occupations',
        923.0: 'Elementary cleaning occupations',
        611.0: 'Healthcare and related personal services',
        341.0: 'Artistic and literary occupations',
        342.0: 'Design associate professionals',
        343.0: 'Media associate professionals',
        344.0: 'Sports and fitness occupations',
        612.0: 'Childcare and related personal services',
        351.0: 'Transport associate professionals',
        352.0: 'Legal associate professionals',
        353.0: 'Business and finance associate professionals',
        354.0: 'Sales and related associate professionals',
        355.0: 'Conservation associate professionals',
        356.0: 'Public service and other associate professionals',
        613.0: 'Animal care services',
        621.0: 'Leisure and travel service occupations',
        622.0: 'Hairdressers and related occupations',
        111.0: 'Corporate managers and senior officials',
        112.0: 'Production managers',
        113.0: 'Functional managers',
        114.0: 'Quality and customer care managers',
        115.0: 'Financial institution and office managers',
        116.0: 'Managers in distribution, storage and retailing',
        117.0: 'Protective service officers',
        118.0: 'Health and social services managers',
        121.0: 'Managers in farming, horticulture, forestry and services',
        122.0: 'Managers and proprietors in hospitality and leisure services',
        123.0: 'Managers and proprietors in other service industries',
        911.0: 'Elementary agricultural occupations',
        912.0: 'Elementary construction occupations',
        913.0: 'Elementary process plant occupations',
        914.0: 'Elementary goods storage occupations',
        921.0: 'Elementary administration occupations',
        922.0: 'Elementary personal services occupations',
        623.0: 'Housekeeping occupations',
        412.0: 'Administrative occupations: finance',
        413.0: 'Administrative occupations: records',
        414.0: 'Administrative occupations: communications',
        415.0: 'Administrative occupations: general',
        411.0: 'Administrative occupations: government and related organisations',
        421.0: 'Secretarial and related occupations',
        924.0: 'Elementary security occupations',
        925.0: 'Elementary sales occupations',
        629.0: 'Personal services occupations nec',
        711.0: 'Sales assistants and retail cashiers',
        712.0: 'Sales related occupations',
        721.0: 'Customer service occupations',
        211.0: 'Science professionals',
        212.0: 'Engineering professionals',
        213.0: 'Information and communication technology professionals',
        221.0: 'Health professionals',
        231.0: 'Teaching professionals',
        232.0: 'Research professionals',
        241.0: 'Legal professionals',
        242.0: 'Business and statistical professionals',
        243.0: 'Architects, town planners, surveyors',
        244.0: 'Public service professionals',
                245.0: 'Librarians and related professionals',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy respondent',
        -2.0: 'refusal',
        511.0: 'Agricultural trades'
    },
    'jkl_jbnssec8_dv': {
        1.0: 'Large employers & higher management',
        2.0: 'Higher professional',
        3.0: 'Lower management & professional',
        4.0: 'Intermediate',
        5.0: 'Small employers & own account',
        6.0: 'Lower supervisory & technical',
        7.0: 'Semi-routine',
        8.0: 'Routine',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy respondent',
        -1.0: 'don\'t know'
    },
    'jkl_jbnssec5_dv': {
        1.0: 'Management & professional',
        2.0: 'Intermediate',
        3.0: 'Small employers & own account',
        4.0: 'Lower supervisory & technical',
        5.0: 'Semi-routine & routine',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy respondent',
        -2.0: 'refusal'
    },
    'jkl_jbnssec3_dv': {
        1.0: 'Management & professional',
        2.0: 'Intermediate',
        3.0: 'Routine',
        -1.0: 'don\'t know',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy respondent',
        -2.0: 'refusal'
    },
    'jkl_scghq1_dv': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy',
        -1.0: 'don\'t know'
    },
    'jkl_scghq2_dv': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy',
        -1.0: 'don\'t know'
    },
    'jkl_sf12pcs_dv': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy',
        -1.0: 'don\'t know'
    },
    'jkl_sf12mcs_dv': {
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -7.0: 'proxy',
        -1.0: 'don\'t know'
    },
    'jkl_nkids_dv': {
        0.0: 'none',
        -2.0: 'refusal',
        -9.0: 'missing',
        -8.0: 'inapplicable',
        -1.0: 'don\'t know'
    }
    # This concludes the mappings based on the provided list. 
}

# Example usage:
# Read the dataset
df = pd.read_csv('variables.csv')

# Apply the mappings
for column in mappings.keys():
    if column in df.columns:
        df[column] = df[column].map(mappings[column]).astype('category')


    



In [11]:
df

Unnamed: 0,pidp,jkl_hidp,j_hidp,k_hidp,l_hidp,jkl_pno,jkl_mnpno,jkl_fnpno,jkl_mnpid,jkl_fnpid,...,jkl_sf12pcs_dv,jkl_health,jkl_scghq1_dv,jkl_scghq2_dv,jkl_hhsize,jkl_nkids_dv,jkl_hhtype_dv,jkl_tenure_dv,jkl_fihhmnnet1_dv,jkl_ieqmoecd_dv
0,68008847,68013622,-14,-14,68013622,1,not in hh,not in hh,inapplicable,inapplicable,...,,Yes,,,,none,"1 female, age 60+, no children",2,2288.000000,1.0
1,68009527,68020422,-14,-14,68020422,1,not in hh,not in hh,inapplicable,inapplicable,...,,Yes,,,,,Couple with 2 children,1,2534.000000,2.1
2,68061288,68020422,-14,-14,68020422,2,not in hh,not in hh,inapplicable,inapplicable,...,,Yes,,,,,Couple with 2 children,1,2534.000000,2.1
3,68010887,68027222,-14,-14,68027222,1,not in hh,not in hh,inapplicable,inapplicable,...,,Yes,,,,none,"Couple both under pensionable age, no children",2,4475.910156,1.5
4,68068082,68027222,-14,-14,68027222,2,not in hh,not in hh,inapplicable,inapplicable,...,,No,,,,none,"Couple both under pensionable age, no children",2,4475.910156,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31116,1652984930,1638079218,1638079218,-14,-14,3,,,,,...,,No,,,,none,"3 or more adults, no children, incl. at least ...",1,5818.000000,2.0
31117,1639697692,1638126818,1638126818,-14,-14,2,not in hh,not in hh,inapplicable,inapplicable,...,,No,,,,none,"3 or more adults, no children, incl. at least ...",1,12875.400390,2.0
31118,1639697732,1638126818,1638126818,-14,-14,3,,,,,...,,No,,,,none,"3 or more adults, no children, incl. at least ...",1,12875.400390,2.0
31119,1653127650,1638126818,1638126818,-14,-14,1,not in hh,not in hh,inapplicable,inapplicable,...,,No,,,,none,"3 or more adults, no children, incl. at least ...",1,12875.400390,2.0


In [12]:
# Define the file path where you want to save the CSV (change 'your_path_here' to your desired path)
csv_file_path = 'sample_variables.csv'

# Save the DataFrame to a CSV file
df.to_csv(csv_file_path, index=False)

print(f"DataFrame saved to '{csv_file_path}'.")

DataFrame saved to 'sample_variables.csv'.


In [8]:
df["jkl_childpno"]

0        inapplicable
1        inapplicable
2        inapplicable
3        inapplicable
4        inapplicable
             ...     
31116    inapplicable
31117    inapplicable
31118    inapplicable
31119    inapplicable
31120             NaN
Name: jkl_childpno, Length: 31121, dtype: category
Categories (2, object): ['inapplicable', 'proxy']

In [None]:
childpno_column = df['jkl_childpno']

In [9]:
# Assuming 'df' is your pandas DataFrame and 'jkl_childpno' is the column of interest
category_counts = df['jkl_childpno'].value_counts()

# Print the counts
print(category_counts)

inapplicable    30099
proxy             294
Name: jkl_childpno, dtype: int64


In [None]:
mport pandas as pd

# Define mappings for each column
# This is an example structure of `mappings`
mappings = {
    'column1': {
        1: 'Label1',
        2: 'Label2',
        -1: 'Unknown'
    },
    'column2': {
        1: 'CategoryA',
        2: 'CategoryB',
        -1: 'Not Applicable'
    }
    # Add more mappings for other columns as needed
}

# Read the dataset
df = pd.read_csv('variables.csv')

# Get the list of columns that have mappings defined
mappings_columns = list(mappings.keys())

# Apply the mappings
for column in mappings_columns:
    if column in df.columns:
        # Here, `mappings[column]` accesses the specific mapping dictionary for the column
        df[column] = df[column].map(mappings[column]).astype('category')


In [None]:


Python might be more efficient in this case because:
- It has robust file handling capabilities, especially for text and binary files.
- Libraries like `pandas` are designed for efficient data manipulation on large datasets.
- The scripting process can be easier due to Python's syntax and extensive community support.

You'll need to fine-tune the regex and mapping logic based on the actual content of the RTF file, which may require a thorough inspection of the text you get after parsing the RTF content.