In [1]:
import pandas as pd

# Load the Adult dataset (assuming you have a file or URL)
df = pd.read_csv('/content/adult_with_headers.csv')

# Display the first few rows to get an overview of the data
print(df.head())

# Summary statistics for numerical features
print(df.describe())

# Check data types and missing values
print(df.info())

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [3]:
# Checking for missing values
missing_values = df.isnull().sum()
print(missing_values)

# Imputation for numerical columns with missing values
df['age'].fillna(df['age'].mean(), inplace=True)

# Removal for rows with missing target variable
df.dropna(subset=['income'], inplace=True)

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].mean(), inplace=True)


In [4]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Select numerical columns for scaling
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Standard Scaling
scaler_standard = StandardScaler()
df[numerical_cols] = scaler_standard.fit_transform(df[numerical_cols])

# Min-Max Scaling
scaler_minmax = MinMaxScaler()
df[numerical_cols] = scaler_minmax.fit_transform(df[numerical_cols])

In [6]:
# Applying One-Hot Encoding for categorical columns with less than 5 categories
categorical_cols_less_than_5 = [col for col in df.select_dtypes(include=['object']).columns if df[col].nunique() < 5]
df = pd.get_dummies(df, columns=categorical_cols_less_than_5, drop_first=True)

In [7]:
from sklearn.preprocessing import LabelEncoder

# Apply Label Encoding for categorical columns with more than 5 categories
categorical_cols_more_than_5 = [col for col in df.select_dtypes(include=['object']).columns if df[col].nunique() > 5]

label_encoder = LabelEncoder()
for col in categorical_cols_more_than_5:
    df[col] = label_encoder.fit_transform(df[col])

In [10]:
import numpy as np

# Check the skewness of numerical features
skewed_features = df[numerical_cols].skew()
print(skewed_features)

# Apply log transformation to skewed features
df['log_age'] = np.log1p(df['age'])  # Log transformation

age                0.558743
fnlwgt             1.446980
education_num     -0.311676
capital_gain      11.953848
capital_loss       4.594629
hours_per_week     0.227643
dtype: float64


In [11]:
from sklearn.ensemble import IsolationForest

# Fit the model to the data
isolation_forest = IsolationForest(contamination=0.05)  # 5% of data is assumed to be outliers
outliers = isolation_forest.fit_predict(df[numerical_cols])

# Mark the outliers as 1 and inliers as -1, then remove outliers
df = df[outliers == 1]

In [17]:
!pip install ppscore
import ppscore as pps

# Compute the PPS matrix for specific columns
pps_matrix = pps.matrix(df)
print(pps_matrix)

for col in df.select_dtypes(include=['object', 'category']).columns:
    # exploring One-Hot Encoding for better performance
    df[col] = label_encoder.fit_transform(df[col])

# Now calculating the correlation matrix
correlation_matrix = df.corr()
print(correlation_matrix)

           x               y  ppscore            case  is_valid_score  \
0        age             age      1.0  predict_itself            True   
1        age       workclass      0.0      regression            True   
2        age          fnlwgt      0.0      regression            True   
3        age       education      0.0      regression            True   
4        age   education_num      0.0      regression            True   
..       ...             ...      ...             ...             ...   
284  log_age  native_country      0.0      regression            True   
285  log_age       sex_ Male      0.0  classification            True   
286  log_age    income_ >50K      0.0  classification            True   
287  log_age       age_group      1.0      regression            True   
288  log_age         log_age      1.0  predict_itself            True   

                  metric  baseline_score  model_score  \
0                   None        0.000000     1.000000   
1    mean

In [16]:
# Comparing with correlation matrix
correlation_matrix = df.corr()
print(correlation_matrix)

                     age  workclass    fnlwgt  education  education_num  \
age             1.000000   0.014502 -0.082803   0.003951       0.044444   
workclass       0.014502   1.000000 -0.017433   0.017235       0.044253   
fnlwgt         -0.082803  -0.017433  1.000000  -0.020963      -0.044524   
education       0.003951   0.017235 -0.020963   1.000000       0.342278   
education_num   0.044444   0.044253 -0.044524   0.342278       1.000000   
marital_status -0.293158  -0.059609  0.028820  -0.033112      -0.055839   
occupation     -0.013750   0.244524 -0.002705  -0.027488       0.104450   
relationship   -0.262807  -0.091288  0.008587  -0.010864      -0.088866   
race            0.027365   0.048764 -0.022403   0.012901       0.028212   
capital_gain    0.092701   0.023195 -0.008861   0.019562       0.132556   
capital_loss    0.025868  -0.005008 -0.025926   0.031701       0.044884   
hours_per_week  0.105737   0.128202 -0.012236   0.046144       0.130512   
native_country  0.001333 