## UCI Adult Income dataset- data cleaning and processing

This notebook is focused in the data preparation, cleaning, and processing for the UCI Adulit iIncome Data set


In [1]:
# import libraries
import pandas as pd 
import numpy as np
import os

Define and Create Paths

In [2]:
# Get working directory
current_dir = os.getcwd()

# Go one directory up to the root directory
project_root_dir = os.path.dirname(current_dir)

# Define paths to the data folder
data_dir = os.path.join(project_root_dir, 'data')
raw_dir = os.path.join(data_dir, 'raw')                        
processed_dir = os.path.join(data_dir, 'processed')

# Define paths to results folder
results_dir = os.path.join(project_root_dir, 'results')

# Define paths to docs folder
docs_dir = os.path.join(project_root_dir, 'docs')

# Create directories if they do not exist
os.makedirs(raw_dir, exist_ok = True)
os.makedirs(processed_dir, exist_ok = True)
os.makedirs(results_dir, exist_ok = True)
os.makedirs(docs_dir, exist_ok = True)

## Read in the data

In [3]:
adult_data_filename = os.path.join(raw_dir, "adult.csv")
adult_df = pd.read_csv(adult_data_filename, header= None, na_values= '?', skipinitialspace= True)
adult_df.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [4]:
adult_df.shape

(32561, 15)

In [5]:
adult_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       32561 non-null  int64 
 1   1       30725 non-null  object
 2   2       32561 non-null  int64 
 3   3       32561 non-null  object
 4   4       32561 non-null  int64 
 5   5       32561 non-null  object
 6   6       30718 non-null  object
 7   7       32561 non-null  object
 8   8       32561 non-null  object
 9   9       32561 non-null  object
 10  10      32561 non-null  int64 
 11  11      32561 non-null  int64 
 12  12      32561 non-null  int64 
 13  13      31978 non-null  object
 14  14      32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


## Data Cleaning
   ## Assign proper column names to the colums

In [6]:
adult_df.columns = ["age","workclass","fnlwgt", "education", "education_num","marital_status", "occupation","relationship", "race","sex", "capital_gain", "capital_loss", "hours_per_week", "native_country", "income"]
adult_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [7]:
adult_df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


## Understanding Dataset
 - Age: Represents the age of the person, ranging from 17 to 90
 -Workclass: Represents the employement status of the person. Takes the values: 'Federal-gov', 'Local-gov', 'Never-worked', 'Private', ''Self-emp-in',,''Self-emp-not-in', ''State-go',,''Without-pa', ' 'n'
 -fnlwgt: the weight the individual represents in the population 2'

In [8]:
np.unique(adult_df.age.to_list())

array([17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 90])

In [9]:
np.unique(adult_df.workclass.to_list())

array(['Federal-gov', 'Local-gov', 'Never-worked', 'Private',
       'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay',
       'nan'], dtype='<U32')

In [10]:
np.unique(adult_df.fnlwgt.to_list())

array([  12285,   13769,   14878, ..., 1366120, 1455435, 1484705])

In [11]:
np.unique(adult_df.education.to_list())

array(['10th', '11th', '12th', '1st-4th', '5th-6th', '7th-8th', '9th',
       'Assoc-acdm', 'Assoc-voc', 'Bachelors', 'Doctorate', 'HS-grad',
       'Masters', 'Preschool', 'Prof-school', 'Some-college'],
      dtype='<U12')

In [12]:
np.unique(adult_df.education_num.to_list())

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16])

In [13]:
np.unique(adult_df.marital_status.to_list())

array(['Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
       'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'],
      dtype='<U21')

In [14]:
np.unique(adult_df.occupation.to_list())

array(['Adm-clerical', 'Armed-Forces', 'Craft-repair', 'Exec-managerial',
       'Farming-fishing', 'Handlers-cleaners', 'Machine-op-inspct',
       'Other-service', 'Priv-house-serv', 'Prof-specialty',
       'Protective-serv', 'Sales', 'Tech-support', 'Transport-moving',
       'nan'], dtype='<U32')

In [15]:
np.unique(adult_df.relationship.to_list())

array(['Husband', 'Not-in-family', 'Other-relative', 'Own-child',
       'Unmarried', 'Wife'], dtype='<U14')

In [16]:
np.unique(adult_df.race.to_list())

array(['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other',
       'White'], dtype='<U18')

In [17]:
np.unique(adult_df.sex.to_list())

array(['Female', 'Male'], dtype='<U6')

In [18]:
np.unique(adult_df.capital_gain.to_list())

array([    0,   114,   401,   594,   914,   991,  1055,  1086,  1111,
        1151,  1173,  1409,  1424,  1455,  1471,  1506,  1639,  1797,
        1831,  1848,  2009,  2036,  2050,  2062,  2105,  2174,  2176,
        2202,  2228,  2290,  2329,  2346,  2354,  2387,  2407,  2414,
        2463,  2538,  2580,  2597,  2635,  2653,  2829,  2885,  2907,
        2936,  2961,  2964,  2977,  2993,  3103,  3137,  3273,  3325,
        3411,  3418,  3432,  3456,  3464,  3471,  3674,  3781,  3818,
        3887,  3908,  3942,  4064,  4101,  4386,  4416,  4508,  4650,
        4687,  4787,  4865,  4931,  4934,  5013,  5060,  5178,  5455,
        5556,  5721,  6097,  6360,  6418,  6497,  6514,  6723,  6767,
        6849,  7298,  7430,  7443,  7688,  7896,  7978,  8614,  9386,
        9562, 10520, 10566, 10605, 11678, 13550, 14084, 14344, 15020,
       15024, 15831, 18481, 20051, 22040, 25124, 25236, 27828, 34095,
       41310, 99999])

In [19]:
np.unique(adult_df.capital_loss.to_list())

array([   0,  155,  213,  323,  419,  625,  653,  810,  880,  974, 1092,
       1138, 1258, 1340, 1380, 1408, 1411, 1485, 1504, 1539, 1564, 1573,
       1579, 1590, 1594, 1602, 1617, 1628, 1648, 1651, 1668, 1669, 1672,
       1719, 1721, 1726, 1735, 1740, 1741, 1755, 1762, 1816, 1825, 1844,
       1848, 1876, 1887, 1902, 1944, 1974, 1977, 1980, 2001, 2002, 2042,
       2051, 2057, 2080, 2129, 2149, 2163, 2174, 2179, 2201, 2205, 2206,
       2231, 2238, 2246, 2258, 2267, 2282, 2339, 2352, 2377, 2392, 2415,
       2444, 2457, 2467, 2472, 2489, 2547, 2559, 2603, 2754, 2824, 3004,
       3683, 3770, 3900, 4356])

In [20]:
np.unique(adult_df.hours_per_week.to_list())

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
       70, 72, 73, 74, 75, 76, 77, 78, 80, 81, 82, 84, 85, 86, 87, 88, 89,
       90, 91, 92, 94, 95, 96, 97, 98, 99])

In [21]:
np.unique(adult_df.native_country.to_list())

array(['Cambodia', 'Canada', 'China', 'Columbia', 'Cuba',
       'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England',
       'France', 'Germany', 'Greece', 'Guatemala', 'Haiti',
       'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India',
       'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico',
       'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines',
       'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan',
       'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam',
       'Yugoslavia', 'nan'], dtype='<U32')

In [22]:
np.unique(adult_df.income.to_list())

array(['<=50K', '>50K'], dtype='<U5')

In [23]:
adult_df.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
income               0
dtype: int64

In [24]:
adult_df['workclass']=adult_df['workclass'].fillna('unknown')
adult_df['native_country']=adult_df['native_country'].fillna('other')
adult_df['occupation']=adult_df['occupation'].fillna('unknown')

### Deal with duplicates

In [25]:
adult_df.duplicated().sum()

24

In [26]:
adult_df[adult_df.duplicated(keep=False)]

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
2303,90,Private,52386,Some-college,10,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0,0,35,United-States,<=50K
3917,19,Private,251579,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,14,United-States,<=50K
4325,25,Private,308144,Bachelors,13,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,Mexico,<=50K
4767,21,Private,250051,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,10,United-States,<=50K
4881,25,Private,308144,Bachelors,13,Never-married,Craft-repair,Not-in-family,White,Male,0,0,40,Mexico,<=50K
4940,38,Private,207202,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,48,United-States,>50K
5104,90,Private,52386,Some-college,10,Never-married,Other-service,Not-in-family,Asian-Pac-Islander,Male,0,0,35,United-States,<=50K
5579,27,Private,255582,HS-grad,9,Never-married,Machine-op-inspct,Not-in-family,White,Female,0,0,40,United-States,<=50K
5805,20,Private,107658,Some-college,10,Never-married,Tech-support,Not-in-family,White,Female,0,0,10,United-States,<=50K
5842,25,Private,195994,1st-4th,2,Never-married,Priv-house-serv,Not-in-family,White,Female,0,0,40,Guatemala,<=50K


In [27]:
adult_df.shape

(32561, 15)

### Standardized Categorical Variables

In [28]:
categorical_cols= adult_df.columns[adult_df.dtypes == object]
for col in categorical_cols:
    adult_df[col]= adult_df[col]. str.strip().str.lower()
categorical_cols

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')

In [29]:
adult_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,state-gov,77516,bachelors,13,never-married,adm-clerical,not-in-family,white,male,2174,0,40,united-states,<=50k
1,50,self-emp-not-inc,83311,bachelors,13,married-civ-spouse,exec-managerial,husband,white,male,0,0,13,united-states,<=50k
2,38,private,215646,hs-grad,9,divorced,handlers-cleaners,not-in-family,white,male,0,0,40,united-states,<=50k
3,53,private,234721,11th,7,married-civ-spouse,handlers-cleaners,husband,black,male,0,0,40,united-states,<=50k
4,28,private,338409,bachelors,13,married-civ-spouse,prof-specialty,wife,black,female,0,0,40,cuba,<=50k
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,private,257302,assoc-acdm,12,married-civ-spouse,tech-support,wife,white,female,0,0,38,united-states,<=50k
32557,40,private,154374,hs-grad,9,married-civ-spouse,machine-op-inspct,husband,white,male,0,0,40,united-states,>50k
32558,58,private,151910,hs-grad,9,widowed,adm-clerical,unmarried,white,female,0,0,40,united-states,<=50k
32559,22,private,201490,hs-grad,9,never-married,adm-clerical,own-child,white,male,0,0,20,united-states,<=50k


In [30]:
adult_df['workclass'].unique()

array(['state-gov', 'self-emp-not-inc', 'private', 'federal-gov',
       'local-gov', 'unknown', 'self-emp-inc', 'without-pay',
       'never-worked'], dtype=object)

In [31]:
adult_df['workclass'] = adult_df['workclass'].replace({
    'state-gov':'government',
    'local-gov':'government',
    'federal-gov': 'government',
    'self-emp-not-inc': 'self-employed',
    'self-emp-inc': 'self-employed',
    'never-worked': 'unemployed',
    'without-pay': 'voluntary',
  
})
adult_df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,government,77516,bachelors,13,never-married,adm-clerical,not-in-family,white,male,2174,0,40,united-states,<=50k
1,50,self-employed,83311,bachelors,13,married-civ-spouse,exec-managerial,husband,white,male,0,0,13,united-states,<=50k
2,38,private,215646,hs-grad,9,divorced,handlers-cleaners,not-in-family,white,male,0,0,40,united-states,<=50k
3,53,private,234721,11th,7,married-civ-spouse,handlers-cleaners,husband,black,male,0,0,40,united-states,<=50k
4,28,private,338409,bachelors,13,married-civ-spouse,prof-specialty,wife,black,female,0,0,40,cuba,<=50k
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,private,257302,assoc-acdm,12,married-civ-spouse,tech-support,wife,white,female,0,0,38,united-states,<=50k
32557,40,private,154374,hs-grad,9,married-civ-spouse,machine-op-inspct,husband,white,male,0,0,40,united-states,>50k
32558,58,private,151910,hs-grad,9,widowed,adm-clerical,unmarried,white,female,0,0,40,united-states,<=50k
32559,22,private,201490,hs-grad,9,never-married,adm-clerical,own-child,white,male,0,0,20,united-states,<=50k


In [32]:
adult_df['education'].unique()

array(['bachelors', 'hs-grad', '11th', 'masters', '9th', 'some-college',
       'assoc-acdm', 'assoc-voc', '7th-8th', 'doctorate', 'prof-school',
       '5th-6th', '10th', '1st-4th', 'preschool', '12th'], dtype=object)

In [None]:
adult_df['education_level'] = adult_df['education'].map({
    'bachelors':'tertiary',
    'masters':'tertiary',
    'doctorate': 'tertiary',
    'prof-school': 'tertiary',
    'some-college': 'unemployed',
    'assoc-acdm': 'associate',
    'assoc-voc': 'associate',
    'hs-grad': 'secondary graduate',
    '12th': 'secondary',
    '11th': 'secondary',
    '10th': 'secondary',
    '9th': 'secondary',
    '7th-8th':'primary',
    '5th-6th': 'primary',
    '1st-4th': 'primary',
    'preschool':'preschool',
  
})

In [None]:
adult_df.columns

In [None]:
adult_df['education'] = adult_df['education'].replace({
    'bachelors':'tertiary',
    'masters':'tertiary',
    'doctorate': 'tertiary',
    'prof-school': 'tertiary',
    'some-college': 'unemployed',
    'assoc-acdm': 'associate',
    'assoc-voc': 'associate',
    'hs-grad': 'secondary graduate',
    '12th': 'secondary',
    '11th': 'secondary',
    '10th': 'secondary',
    '9th': 'secondary',
    '7th-8th':'primary',
    '5th-6th': 'primary',
    '1st-4th': 'primary',
  
})
adult_df 

In [None]:
adult_df['marital_status'].unique()

In [None]:
adult_df['marital_status'] = adult_df['marital_status'].replace({
    'never-married': 'single', 
    'married-civ-spouse': 'married',
    'married-spouse-absent': 'divorced or separated',
    'divorced': 'divorced or separated',
    'separated': 'divorced or separated',
    'married-af-spouse':'married',
})
adult_df

In [None]:
adult_df['marital_status'].unique()

In [None]:
adult_df['occupation'].unique()

In [None]:
adult_df['occupation-grouped'] = adult_df['occupation'].map({
    'adm-clerical': 'white collor', 
    'exec-managerial': 'white collor', 
    'handlers-cleaners': 'blue collor',
    'prof-specialty' : 'white color', 
    'other-service': 'service', 
    'sales': 'white collor', 
    'craft-repair': 'blue collar',
    'transport-moving':'blue collor',
    'farming-fishing': 'blue collar',
    'machine-op-inspct':'whitecollar',
    'tech-support': 'white collar',
    'unknown':'unknown',
    'protective-serv':'service',
    'armed-forces': 'military',
    'priv-house-serv':'service',
})


In [None]:
adult_df['occupation-grouped'].unique()

In [None]:
adult_df['occupation-grouped'].unique()

In [None]:
adult_df['relationship'].unique()

In [None]:
adult_df['relationship'] = adult_df['relationship'].replace({
    'not-in-family': 'single',
    'husband': 'male spouse',
    'wife': 'female spouse',
    'own-child': 'child',
    'unmarried': 'single',
    'other-relative': 'extended relative',
})

In [None]:
adult_df['relationship'].unique()

In [None]:
adult_df['race'].unique()

In [None]:
adult_df ['race'] = adult_df['race'].replace({
    'white': 'white',
    'black': 'black',
    'asian-pac-islander': 'asian or pacific islander',
    'amer-indian-eskimo': 'american indian or eskimo',
    'other': 'other',
})

In [None]:
adult_df['race'].unique()

In [None]:
adult_df['native_country'].unique()

In [None]:
adult_df['native_region'] = adult_df['native_country'].map({
    'united-states': 'north america',
    'cuba': 'central america',
    'jamaica': 'central america',
    'india': 'asia',
    'other': 'other',
    'mexico': 'north america',
    'south': 'south america',
    'puerto-rico': 'north america',
    'honduras': 'central america',
    'england': 'europe',
    'canada': 'north america',
    'germany': 'europe',
    'iran': 'asia',
    'philippines': 'asia',
    'italy': 'europe',
    'poland': 'europe',
    'columbia': 'south america',
    'cambodia': 'asia',
    'thailand': 'asia',
    'ecuador': 'south america',
    'laos': 'asia',
    'taiwan': 'asia',
    'haiti': 'central america',
    'portugal': 'europe',
    'dominican-republic': 'central america',
    'el-salvador': 'central america',
    'france': 'europe',
    'guatemala': 'central america',
    'china': 'asia',
    'japan': 'asia',
    'yugoslavia': 'europe',
    'peru': 'south america',
    'outlying-us(guam-usvi-etc)': 'north america',
    'scotland': 'europe',
    'trinadad&tobago': 'central america',
    'greece': 'europe',
    'nicaragua': 'central america',
    'vietnam': 'asia',
    'hong': 'asia',
    'ireland': 'europe',
    'hungary': 'europe',
    'holand-netherlands': 'europe',
})

In [None]:
adult_df['native_region'].unique()

In [None]:
adult_df.columns

In [None]:
adult_df

In [None]:
clean_filename = os.path.join(processed_dir, "adult_cleaned.csv")
adult_df.to_csv(29.csv, index=False)
print(f"\nCleaned data saved to: {clean_filename}")