In [7]:
import pandas as pd
import numpy as np

In [8]:
# We will need a value map for each kind of categorical response

# Three-level frequency responses. Code 0 is labelled 'Inap' and code 9 is
# turned into NaN (presumably "inapplicable" and "missing" -- confirm against
# the survey codebook).
value_map_1 = dict([
    (1, 'Always'),
    (2, 'Sometimes'),
    (3, 'Never'),
    (0, 'Inap'),
    (9, np.nan),
])

# Yes/No responses, where "No" is coded 5 rather than 2. Codes 0 and 9 are
# handled the same way as in value_map_1.
value_map_2 = dict([
    (1, 'Yes'),
    (5, 'No'),
    (0, 'Inap'),
    (9, np.nan),
])

def numeric_map(x, missing_code=99):
    """Replace the numeric missing-value sentinel with NaN.

    Intended to be passed to ``Series.map`` for numeric columns, which calls
    it with a single value, so ``missing_code`` keeps its default there.

    Parameters
    ----------
    x : scalar
        One raw value from a numeric column.
    missing_code : scalar, default 99
        The value that stands for "missing" in the raw data.

    Returns
    -------
    ``np.nan`` if ``x`` equals ``missing_code``; otherwise ``x`` unchanged.
    """
    return np.nan if x == missing_code else x
    
# Which columns belong to each response type.
variable_dict = dict(
    type1=['var1', 'var2', 'var3'],
    type2=['var4', 'var5', 'var6'],
    numeric=['var7', 'var8'],
)

# Which mapper (dict or function) recodes each response type. The keys must
# match those of variable_dict.
map_dict = dict(
    type1=value_map_1,
    type2=value_map_2,
    numeric=numeric_map,
)

In [12]:
# Small hand-made frame used to demonstrate the mapping dicts above.
# Written one row per record so it reads like the rendered table.
sample_df = pd.DataFrame(
    [
        [1, 3, 2, 1, 0, 1, 12, 55],
        [2, 2, 2, 1, 0, 1, 13, 99],
        [3, 1, 1, 1, 0, 1, 14, 99],
        [0, 9, 9, 5, 5, 5, 99, 2],
        [9, 9, 0, 5, 9, 5, 99, 1],
        [2, 0, 1, 9, 5, 9, 2, 3],
    ],
    columns=['var1', 'var2', 'var3', 'var4', 'var5', 'var6', 'var7', 'var8'],
)

sample_df

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8
0,1,3,2,1,0,1,12,55
1,2,2,2,1,0,1,13,99
2,3,1,1,1,0,1,14,99
3,0,9,9,5,5,5,99,2
4,9,9,0,5,9,5,99,1
5,2,0,1,9,5,9,2,3


In [13]:
def recode_columns(df, variable_dict, map_dict):
    """Return a new frame with each classified column recoded by its mapper.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; it is not modified.
    variable_dict : dict
        Maps a response-type key to the list of column names of that type.
    map_dict : dict
        Maps the same response-type keys to a mapper accepted by
        ``Series.map`` (a dict or a one-argument function).

    Returns
    -------
    pandas.DataFrame
        One recoded column per name listed in ``variable_dict``, in the order
        the names are listed, on the same index as ``df``. Columns of ``df``
        that are not listed are left out.

    Raises
    ------
    KeyError
        If a response type has no entry in ``map_dict`` or a listed column is
        not in ``df``.
    """
    recoded = {}
    for kind, columns in variable_dict.items():
        mapper = map_dict[kind]
        for column in columns:
            # With a dict mapper, Series.map turns any code that is not a key
            # into NaN, so unexpected codes become missing values silently.
            recoded[column] = df[column].map(mapper)
    # Build the frame once instead of growing it one column at a time.
    return pd.DataFrame(recoded, index=df.index)


new_df = recode_columns(sample_df, variable_dict, map_dict)


In [14]:
# Display the recoded frame as this cell's output.
new_df

Unnamed: 0,var1,var2,var3,var4,var5,var6,var7,var8
0,Always,Never,Sometimes,Yes,Inap,Yes,12.0,55.0
1,Sometimes,Sometimes,Sometimes,Yes,Inap,Yes,13.0,
2,Never,Always,Always,Yes,Inap,Yes,14.0,
3,Inap,,,No,No,No,,2.0
4,,,Inap,No,,No,,1.0
5,Sometimes,Inap,Always,,No,,2.0,3.0


In [15]:
def classify_columns_by_value_counts(df):
    """Group the columns of ``df`` by how many distinct values each one holds.

    The count for a column is the length of ``df[column].value_counts()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be grouped.

    Returns
    -------
    dict
        Maps a distinct-value count to the list of column names having that
        count, each list in the frame's column order.
    """
    groups = {}
    for column in df.columns.tolist():
        distinct = len(df[column].value_counts())
        # Start a new list the first time a count is seen, then append.
        groups.setdefault(distinct, []).append(column)
    return groups


In [16]:
# Group the raw sample columns by their number of distinct values and show the result.
classify_columns_by_value_counts(sample_df)

{5: ['var1', 'var2', 'var7', 'var8'], 4: ['var3'], 3: ['var4', 'var5', 'var6']}