<a href="https://colab.research.google.com/github/HassanDataSci/caricon-personality-career-matching/blob/main/CariCon_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Read the MBTI posts workbook and the O*NET occupation workbook into separate DataFrames
df_mbti = pd.read_excel("data/myer-briggs-data.xlsx")
df_onet = pd.read_excel("data/complete_onet_data_with_human_characteristics.xlsx")

# Report (rows, columns) for each frame as a quick sanity check on the load
for label, frame in (("MBTI shape:", df_mbti),
                     ("Human characteristics shape:", df_onet)):
    print(label, frame.shape)

MBTI shape: (8675, 2)
Human characteristics shape: (1016, 30)


In [2]:
# Show the column dtypes of each frame under its own banner, followed by a blank line
for banner, frame in (("=== MBTI Data ===", df_mbti),
                      ("=== O*NET Human Characteristics ===", df_onet)):
    print(banner)
    print(frame.dtypes)
    print()

=== MBTI Data ===
type     object
posts    object
dtype: object

=== O*NET Human Characteristics ===
Occupation Code              object
Title                        object
Human Characteristics        object
Description                  object
Sample Job Titles            object
Tasks                        object
Knowledge                    object
Skills                       object
Abilities                    object
Work Activities              object
Detailed Work Activities     object
Tools Used                   object
Technology Used              object
Job Zone                     object
Education Level              object
Experience Required          object
Job Training                 object
SVP Range                    object
Job Zone Examples            object
Interests                    object
Work Styles                  object
Work Values                  object
Work Context                 object
Additional Sources           object
Related Occupations          object

# Handling Missing Data

In [3]:
# Count missing values per column. Use the DataFrame's own .sum(): np.sum(DataFrame)
# dispatches to DataFrame.sum(axis=None), which pandas has deprecated (FutureWarning)
# and which will reduce to a single scalar in a future version.
nan_count = df_onet.isnull().sum()
print(nan_count)

Occupation Code               0
Title                         0
Human Characteristics         0
Description                   0
Sample Job Titles           115
Tasks                        93
Knowledge                   143
Skills                      147
Abilities                   138
Work Activities             137
Detailed Work Activities     93
Tools Used                  119
Technology Used              93
Job Zone                     93
Education Level              93
Experience Required          93
Job Training                 93
SVP Range                    93
Job Zone Examples            93
Interests                    93
Work Styles                 137
Work Values                 142
Work Context                137
Additional Sources           67
Related Occupations          93
Annual 10th Percentile       26
Annual 25th Percentile       26
Annual Median Wage           26
Annual 75th Percentile       26
Annual 90th Percentile       26
dtype: int64


  return reduction(axis=axis, out=out, **passkwargs)


In [4]:
# Many of the missing values come from rows whose title contains "All Other" (case-insensitive)
is_all_other = df_onet['Title'].str.contains("All Other", case=False)
is_all_other.value_counts()

Title
False    939
True      77
Name: count, dtype: int64

In [5]:
# Keep only the occupations whose title does not mention "All Other"
keep_rows = ~df_onet['Title'].str.contains("All Other", case=False)
df_onet = df_onet[keep_rows]
df_onet.shape

(939, 30)

In [6]:
# Recount missing values per column. DataFrame.sum() is used instead of np.sum(DataFrame),
# which goes through the deprecated DataFrame.sum(axis=None) path and emits a FutureWarning.
nan_count = df_onet.isnull().sum()
print(nan_count)

Occupation Code              0
Title                        0
Human Characteristics        0
Description                  0
Sample Job Titles           38
Tasks                       16
Knowledge                   66
Skills                      70
Abilities                   61
Work Activities             60
Detailed Work Activities    16
Tools Used                  42
Technology Used             16
Job Zone                    16
Education Level             16
Experience Required         16
Job Training                16
SVP Range                   16
Job Zone Examples           16
Interests                   16
Work Styles                 60
Work Values                 65
Work Context                60
Additional Sources          17
Related Occupations         16
Annual 10th Percentile      22
Annual 25th Percentile      22
Annual Median Wage          22
Annual 75th Percentile      22
Annual 90th Percentile      22
dtype: int64


In [7]:
# The same 16 rows are missing values across multiple columns; they are primarily military roles
tech_missing = df_onet['Technology Used'].isnull()
df_onet.loc[tech_missing, 'Title']


22                                      Air Crew Members
23                                     Air Crew Officers
26                 Aircraft Launch and Recovery Officers
27              Aircraft Launch and Recovery Specialists
54                  Armored Assault Vehicle Crew Members
55                      Armored Assault Vehicle Officers
59                    Artillery and Missile Crew Members
60                        Artillery and Missile Officers
167                  Command and Control Center Officers
168               Command and Control Center Specialists
363           First-Line Supervisors of Air Crew Members
387    First-Line Supervisors of Weapons Specialists/...
505                                             Infantry
506                                    Infantry Officers
906                                       Special Forces
907                              Special Forces Officers
Name: Title, dtype: object

In [8]:
# Drop the rows that have no 'Technology Used' value (the military roles)
df_onet = df_onet[df_onet['Technology Used'].notna()]

# Recount missing values per column. DataFrame.sum() avoids the FutureWarning raised when
# np.sum(DataFrame) falls through to the deprecated DataFrame.sum(axis=None).
nan_count = df_onet.isnull().sum()
print(nan_count)

Occupation Code              0
Title                        0
Human Characteristics        0
Description                  0
Sample Job Titles           22
Tasks                        0
Knowledge                   50
Skills                      54
Abilities                   45
Work Activities             44
Detailed Work Activities     0
Tools Used                  26
Technology Used              0
Job Zone                     0
Education Level              0
Experience Required          0
Job Training                 0
SVP Range                    0
Job Zone Examples            0
Interests                    0
Work Styles                 44
Work Values                 49
Work Context                44
Additional Sources           1
Related Occupations          0
Annual 10th Percentile       6
Annual 25th Percentile       6
Annual Median Wage           6
Annual 75th Percentile       6
Annual 90th Percentile       6
dtype: int64


In [9]:
# Drop columns that we won't use as predictors.
# The five wage columns are named explicitly instead of being sliced by position
# (columns[-5:]): a positional drop removes five *more* columns every time the cell is
# re-run. errors='ignore' plus reassignment (no inplace) makes a re-run a harmless no-op.
unused_cols = ['Tools Used', 'Sample Job Titles', 'Additional Sources']
wage_cols = [
    'Annual 10th Percentile',
    'Annual 25th Percentile',
    'Annual Median Wage',
    'Annual 75th Percentile',
    'Annual 90th Percentile',
]
df_onet = df_onet.drop(columns=unused_cols + wage_cols, errors='ignore')

In [10]:
# Recount missing values per column after the column drop. DataFrame.sum() replaces
# np.sum(DataFrame), which triggers a pandas FutureWarning via DataFrame.sum(axis=None).
nan_count = df_onet.isnull().sum()
print(nan_count)

Occupation Code              0
Title                        0
Human Characteristics        0
Description                  0
Tasks                        0
Knowledge                   50
Skills                      54
Abilities                   45
Work Activities             44
Detailed Work Activities     0
Technology Used              0
Job Zone                     0
Education Level              0
Experience Required          0
Job Training                 0
SVP Range                    0
Job Zone Examples            0
Interests                    0
Work Styles                 44
Work Values                 49
Work Context                44
Related Occupations          0
dtype: int64


  return reduction(axis=axis, out=out, **passkwargs)


The roles listed below still have missing data in columns that we care about: 65 roles in total. To fill in these missing values, we would have to go online and manually search for each role's skills, abilities, work values, etc.

It could be worth it to spend time doing so in the future. But for now, I think it's more efficient to just drop these roles. 

In [11]:
# Titles of every role that still has at least one missing value in any column
has_missing = df_onet.isnull().any(axis=1)
roles_missing_vals = df_onet.loc[has_missing, 'Title'].to_list()
roles_missing_vals

['Aircraft Service Attendants',
 'Appraisers of Personal and Business Property',
 'Blockchain Engineers',
 'Bus Drivers, School',
 'Calibration Technologists and Technicians',
 'Cardiologists',
 'Clinical Neuropsychologists',
 'Crematory Operators',
 'Cutters and Trimmers, Hand',
 'Data Scientists',
 'Digital Forensics Analysts',
 'Disc Jockeys, Except Radio',
 'Dishwashers',
 'Electrical and Electronic Equipment Assemblers',
 'Emergency Medical Technicians',
 'Emergency Medicine Physicians',
 'Entertainment and Recreation Managers, Except Gambling',
 'Facilities Managers',
 'Fallers',
 'Farmworkers and Laborers, Crop, Nursery, and Greenhouse',
 'Financial and Investment Analysts',
 'Financial Risk Specialists',
 'First-Line Supervisors of Entertainment and Recreation Workers, Except Gambling Services',
 'First-Line Supervisors of Passenger Attendants',
 'First-Line Supervisors of Security Workers',
 'Fundraising Managers',
 'Graders and Sorters, Agricultural Products',
 'Health Inform

In [12]:
# Remove every role whose title is in roles_missing_vals
df_onet = df_onet[~df_onet['Title'].isin(roles_missing_vals)]

# Total number of remaining NaNs across the whole frame (expected: 0).
# .isnull().sum().sum() avoids np.sum(DataFrame), which emits a pandas FutureWarning.
df_onet.isnull().sum().sum()

  return reduction(axis=axis, out=out, **passkwargs)


0

## Encoding

In [13]:
# List the columns that remain after the missing-data cleanup
df_onet.columns

Index(['Occupation Code', 'Title', 'Human Characteristics', 'Description',
       'Tasks', 'Knowledge', 'Skills', 'Abilities', 'Work Activities',
       'Detailed Work Activities', 'Technology Used', 'Job Zone',
       'Education Level', 'Experience Required', 'Job Training', 'SVP Range',
       'Job Zone Examples', 'Interests', 'Work Styles', 'Work Values',
       'Work Context', 'Related Occupations'],
      dtype='object')

In [14]:
# Columns holding delimiter-separated strings that get parsed into lists and one-hot encoded
features = ['Human Characteristics', 'Interests', 'Work Styles', 'Work Values', 'Skills']

In [15]:
# Peek at the raw values: each cell is a single comma-separated string
df_onet['Human Characteristics'].head(3)

0    Attention to Detail, Analytical Thinking, Inte...
1    Communication, Adaptability, Problem Solving, ...
2    Communication, Adaptability, Problem Solving, ...
Name: Human Characteristics, dtype: object

In [16]:
import re
import pandas as pd

def encode_comma_separated_cols(df, cols,
                                new_suffix='_list',
                                sep_pattern=r'[,\;\|\n]+',
                                to_lower=True,
                                unique=True,
                                replace=False):
    """
    Parse delimiter-separated string columns into columns of Python lists.

    Parameters
    - df: pandas DataFrame (not modified; a copy is returned)
    - cols: iterable of column names to parse; names not present in df are skipped silently
    - new_suffix: suffix for the created columns (used only when replace=False)
    - sep_pattern: regex matching one or more delimiters
      (default: comma, semicolon, pipe or newline)
    - to_lower: lowercase every token
    - unique: remove duplicate tokens while preserving first-seen order
    - replace: if True, overwrite the original columns with the lists;
      otherwise add new columns named col + new_suffix

    Cell handling
    - list/tuple cells are taken as already split (so the function can be re-applied to
      its own output); to_lower/unique are still applied to them
    - missing cells (NaN/None) become []
    - anything else is converted with str(), split on sep_pattern, stripped, and
      empty tokens are discarded

    Returns a copy of df with the new/updated columns.
    """
    splitter = re.compile(sep_pattern)

    def _parse_cell(x):
        # Sequences must be checked BEFORE pd.isna(): pd.isna(list) returns an array,
        # and using that array in an `if` raises "truth value ... is ambiguous".
        if isinstance(x, (list, tuple)):
            items = list(x)
        elif pd.isna(x):
            return []
        else:
            items = [t.strip() for t in splitter.split(str(x)) if t.strip()]
        if to_lower:
            items = [t.lower() for t in items]
        if unique:
            # dict keys keep insertion order, so this dedupes while preserving order
            items = list(dict.fromkeys(items))
        return items

    result = df.copy()
    for col in cols:
        if col not in result.columns:
            continue
        parsed = result[col].apply(_parse_cell)
        target = col if replace else col + new_suffix
        result[target] = parsed
    return result

# Parse every feature column into a list-valued "<name>_list" column,
# keeping the original string columns alongside (replace=False).
cols_to_encode = features
df_onet = encode_comma_separated_cols(
    df_onet,
    cols_to_encode,
    new_suffix='_list',
    replace=False,
)
# Pass replace=True instead to overwrite the original columns with the parsed lists.

In [17]:
# Confirm the parsed "_list" columns were appended alongside the originals
df_onet.columns

Index(['Occupation Code', 'Title', 'Human Characteristics', 'Description',
       'Tasks', 'Knowledge', 'Skills', 'Abilities', 'Work Activities',
       'Detailed Work Activities', 'Technology Used', 'Job Zone',
       'Education Level', 'Experience Required', 'Job Training', 'SVP Range',
       'Job Zone Examples', 'Interests', 'Work Styles', 'Work Values',
       'Work Context', 'Related Occupations', 'Human Characteristics_list',
       'Interests_list', 'Work Styles_list', 'Work Values_list',
       'Skills_list'],
      dtype='object')

In [18]:
# Peek at the parsed values: each cell is now a list of lowercase tokens
df_onet['Human Characteristics_list'].head(3)

0    [attention to detail, analytical thinking, int...
1    [communication, adaptability, problem solving,...
2    [communication, adaptability, problem solving,...
Name: Human Characteristics_list, dtype: object

In [19]:
# Print the first row of every parsed "_list" column
for name in filter(lambda c: c.endswith('_list'), df_onet.columns):
    print(df_onet[name].head(1))

0    [attention to detail, analytical thinking, int...
Name: Human Characteristics_list, dtype: object
0    [conventional, enterprising, investigative]
Name: Interests_list, dtype: object
0    [attention to detail, integrity, dependability...
Name: Work Styles_list, dtype: object
0    [achievement, independence, recognition]
Name: Work Values_list, dtype: object
0    [reading comprehension, active listening, crit...
Name: Skills_list, dtype: object


In [20]:
# Reduce df_onet to the occupation title plus the parsed "_list" columns
list_cols = [name for name in df_onet.columns if name.endswith('_list')]
df_onet = df_onet[['Title', *list_cols]]
df_onet.head(3)

Unnamed: 0,Title,Human Characteristics_list,Interests_list,Work Styles_list,Work Values_list,Skills_list
0,Accountants and Auditors,"[attention to detail, analytical thinking, int...","[conventional, enterprising, investigative]","[attention to detail, integrity, dependability...","[achievement, independence, recognition]","[reading comprehension, active listening, crit..."
1,Actors,"[communication, adaptability, problem solving,...","[artistic, social, enterprising]","[cooperation, persistence, adaptability/flexib...","[relationships, achievement, independence]","[reading comprehension, speaking, active liste..."
2,Actuaries,"[communication, adaptability, problem solving,...","[conventional, investigative, enterprising]","[analytical thinking, attention to detail, int...","[working conditions, achievement, independence]","[critical thinking, judgment and decision maki..."


## One Hot Encoding the features

In [21]:
# One-hot encode each list column into indicator columns named "<col>__<token>"
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
one_hot_frames = []
for col in list_cols:
    # fit_transform refits the binarizer on every call, so mlb.classes_ always
    # belongs to the column currently being encoded
    one_hot = pd.DataFrame(mlb.fit_transform(df_onet[col]),
                           columns=[f"{col}__{cls}" for cls in mlb.classes_],
                           index=df_onet.index)
    one_hot_frames.append(one_hot)

# Remove indicator columns left over from an earlier run of this cell so that
# re-running it does not append duplicate columns
stale_cols = [name for name in df_onet.columns
              if any(name.startswith(f"{src}__") for src in list_cols)]

# Concatenate once instead of growing the frame inside the loop
df_onet = pd.concat([df_onet.drop(columns=stale_cols), *one_hot_frames], axis=1)

In [22]:
# Shape after appending the one-hot indicator columns
df_onet.shape

(858, 118)