# Importing Libraries and dataset

In [3]:
#Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# Loading the dataset
df=pd.read_csv('adult_with_headers.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
df.shape

(32561, 15)

In [6]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income'],
      dtype='object')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [8]:
#checking Missing Values
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [9]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object

In [10]:
# Identify numerical features
numerical_features=df.select_dtypes(include=['int64','float']).columns
numerical_features

Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')

In [11]:
# Identify categorical features
categorical_features=df.select_dtypes(include=['object']).columns
categorical_features

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')

## Applying scaling techniques to numerical features

In [12]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler

In [13]:
scaler=StandardScaler()

In [14]:
scaled_data_df=df.copy()

In [15]:
scaled_data_df=scaler.fit_transform(df[numerical_features])
scaled_data_df

array([[ 0.03067056, -1.06361075,  1.13473876,  0.1484529 , -0.21665953,
        -0.03542945],
       [ 0.83710898, -1.008707  ,  1.13473876, -0.14592048, -0.21665953,
        -2.22215312],
       [-0.04264203,  0.2450785 , -0.42005962, -0.14592048, -0.21665953,
        -0.03542945],
       ...,
       [ 1.42360965, -0.35877741, -0.42005962, -0.14592048, -0.21665953,
        -0.03542945],
       [-1.21564337,  0.11095988, -0.42005962, -0.14592048, -0.21665953,
        -1.65522476],
       [ 0.98373415,  0.92989258, -0.42005962,  1.88842434, -0.21665953,
        -0.03542945]])

In [16]:
min_sca=MinMaxScaler()

In [17]:
min_data_df=df.copy()

In [18]:
min_data_df=min_sca.fit_transform(df[numerical_features])
min_data_df

array([[0.30136986, 0.0443019 , 0.8       , 0.02174022, 0.        ,
        0.39795918],
       [0.45205479, 0.0482376 , 0.8       , 0.        , 0.        ,
        0.12244898],
       [0.28767123, 0.13811345, 0.53333333, 0.        , 0.        ,
        0.39795918],
       ...,
       [0.56164384, 0.09482688, 0.53333333, 0.        , 0.        ,
        0.39795918],
       [0.06849315, 0.12849934, 0.53333333, 0.        , 0.        ,
        0.19387755],
       [0.47945205, 0.18720338, 0.53333333, 0.1502415 , 0.        ,
        0.39795918]])

## Encoding Technique

### One Hot Encoder

In [19]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [20]:
categorical_features

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')

In [21]:
df['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

In [22]:
df['education'].unique()

array([' Bachelors', ' HS-grad', ' 11th', ' Masters', ' 9th',
       ' Some-college', ' Assoc-acdm', ' Assoc-voc', ' 7th-8th',
       ' Doctorate', ' Prof-school', ' 5th-6th', ' 10th', ' 1st-4th',
       ' Preschool', ' 12th'], dtype=object)

In [23]:
df['marital_status'].unique()

array([' Never-married', ' Married-civ-spouse', ' Divorced',
       ' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
       ' Widowed'], dtype=object)

In [24]:
df['occupation'].unique()

array([' Adm-clerical', ' Exec-managerial', ' Handlers-cleaners',
       ' Prof-specialty', ' Other-service', ' Sales', ' Craft-repair',
       ' Transport-moving', ' Farming-fishing', ' Machine-op-inspct',
       ' Tech-support', ' ?', ' Protective-serv', ' Armed-Forces',
       ' Priv-house-serv'], dtype=object)

In [25]:
df['relationship'].unique()

array([' Not-in-family', ' Husband', ' Wife', ' Own-child', ' Unmarried',
       ' Other-relative'], dtype=object)

In [26]:
df['race'].unique()

array([' White', ' Black', ' Asian-Pac-Islander', ' Amer-Indian-Eskimo',
       ' Other'], dtype=object)

In [27]:
df['sex'].unique()

array([' Male', ' Female'], dtype=object)

In [28]:
df['native_country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [29]:
df['income'].unique()

array([' <=50K', ' >50K'], dtype=object)

In [30]:
# Apply One-Hot Encoding to categorical variables with less than 5 categories
OHE= OneHotEncoder(sparse_output=False, drop='first')
df_one_hot_encoded = df.copy()

for col in categorical_features:
    if df[col].nunique() < 5:
        one_hot_encoded_data = OHE.fit_transform(df[[col]])
        one_hot_encoded_df = pd.DataFrame(one_hot_encoded_data, columns=[f"{col}_{i}" for i in range(one_hot_encoded_data.shape[1])])
        df_one_hot_encoded = pd.concat([df_one_hot_encoded, one_hot_encoded_df], axis=1).drop(columns=[col])   

In [31]:
df_one_hot_encoded.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_0,income_0
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,2174,0,40,United-States,1.0,0.0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,13,United-States,1.0,0.0


In [32]:
# Apply Label Encoding to categorical variables with more than 5 categories
le = LabelEncoder()
df_label_encoded = df_one_hot_encoded.copy()

for col in categorical_features:
    if df[col].nunique() >= 5:
        df_label_encoded[col] = le.fit_transform(df[col])

df_label_encoded.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_0,income_0
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,1.0,0.0
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,1.0,0.0
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,1.0,0.0
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,1.0,0.0
4,28,4,338409,9,13,2,10,5,2,0,0,40,5,0.0,0.0


## Feature engineering

In [33]:
# Creating new features
df_label_encoded['age_bin']=pd.cut(df_label_encoded['age'],bins=[0,25,45,65,100],labels=['young','middle_aged','senior','elderly'])
df_label_encoded['hours_per_week_bin']=pd.cut(df_label_encoded['hours_per_week'],bins=[0,20,40,60,80],labels=['part_time','full_time','over_time','extreme'])
# Encoding new features
df_label_encoded['age_bin']=le.fit_transform(df_label_encoded['age_bin'])
df_label_encoded['hours_per_week_bin']=le.fit_transform(df_label_encoded['hours_per_week_bin'])
df_label_encoded.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_0,income_0,age_bin,hours_per_week_bin
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,1.0,0.0,1,1
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,1.0,0.0,2,3


In [34]:
# Check skewness of numerical features
skewness = df[numerical_features].skew()
skewness

age                0.558743
fnlwgt             1.446980
education_num     -0.311676
capital_gain      11.953848
capital_loss       4.594629
hours_per_week     0.227643
dtype: float64

In [35]:
# Apply log transformation to a skewed feature
skewed_feature='capital_gain'
df_label_encoded[skewed_feature+'_log']=np.log1p(df_label_encoded[skewed_feature]) 

In [36]:
# Verify transformation
df_label_encoded[[skewed_feature, skewed_feature + '_log']].head()

Unnamed: 0,capital_gain,capital_gain_log
0,2174,7.684784
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


## IsolationForest

In [37]:
from sklearn.ensemble import IsolationForest

In [38]:
# Apply Isolation Forest
iso_forest = IsolationForest(contamination=0.05)
outliers = iso_forest.fit_predict(df_label_encoded[numerical_features])

In [39]:
outliers 

array([1, 1, 1, ..., 1, 1, 1])

In [40]:
# Remove outliers
df_no_outliers = df_label_encoded[outliers == 1]
df_no_outliers.shape

(30933, 18)

In [41]:
df_no_outliers

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,capital_gain,capital_loss,hours_per_week,native_country,sex_0,income_0,age_bin,hours_per_week_bin,capital_gain_log
0,39,7,77516,9,13,4,1,1,4,2174,0,40,39,1.0,0.0,1,1,7.684784
1,50,6,83311,9,13,2,4,0,4,0,0,13,39,1.0,0.0,2,3,0.000000
2,38,4,215646,11,9,0,6,1,4,0,0,40,39,1.0,0.0,1,1,0.000000
3,53,4,234721,1,7,2,6,0,2,0,0,40,39,1.0,0.0,2,1,0.000000
4,28,4,338409,9,13,2,10,5,2,0,0,40,5,0.0,0.0,1,1,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,4,257302,7,12,2,13,5,4,0,0,38,39,0.0,0.0,1,1,0.000000
32557,40,4,154374,11,9,2,7,0,4,0,0,40,39,1.0,1.0,1,1,0.000000
32558,58,4,151910,11,9,6,1,4,4,0,0,40,39,0.0,0.0,2,1,0.000000
32559,22,4,201490,11,9,4,1,3,4,0,0,20,39,1.0,0.0,3,3,0.000000


In [42]:
df_no_outliers.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'capital_gain',
       'capital_loss', 'hours_per_week', 'native_country', 'sex_0', 'income_0',
       'age_bin', 'hours_per_week_bin', 'capital_gain_log'],
      dtype='object')

## PPS score

In [43]:
#install the package
#!pip install ppscore

In [44]:
import ppscore as pps

In [45]:
# Calculate PPS matrix
pps_matrix = pps.matrix(df_no_outliers)[['x', 'y', 'ppscore']].pivot(columns='x', index='y', values='ppscore')

In [46]:
# Calculate correlation matrix
correlation_matrix = df_no_outliers.corr()

pps_matrix, correlation_matrix

(x                        age   age_bin  capital_gain  capital_gain_log  \
 y                                                                        
 age                 1.000000  0.606979      0.002211          0.000888   
 age_bin             1.000000  1.000000      0.000000          0.000000   
 capital_gain        0.000000  0.000000      1.000000          0.991154   
 capital_gain_log    0.000000  0.000000      0.996589          1.000000   
 capital_loss        0.000000  0.000000      0.000000          0.000000   
 education           0.000000  0.000000      0.000000          0.000000   
 education_num       0.000000  0.000000      0.000000          0.000000   
 fnlwgt              0.000000  0.000000      0.000000          0.000000   
 hours_per_week      0.000000  0.000000      0.000000          0.000000   
 hours_per_week_bin  0.000000  0.000000      0.000000          0.000000   
 income_0            0.000000  0.000000      0.000000          0.000000   
 marital_status      0.15