In [None]:
!C:\ProgramData\anaconda3\python.exe -m pip install --upgrade pip setuptools wheel

In [None]:
!pip install ppscore

In [None]:
!pip install --upgrade pandas scikit-learn ppscore

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import IsolationForest
import ppscore as pps
# Load dataset
file_path = r'C:\Users\admin\Downloads\EDA2\EDA2\adult_with_headers.csv'
df = pd.read_csv(file_path)

# Select only numerical columns for scaling
num_cols = df.select_dtypes(include=['number']).columns  

# Standard Scaling
scaler_standard = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[num_cols] = scaler_standard.fit_transform(df[num_cols])

# Min-Max Scaling
scaler_minmax = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[num_cols] = scaler_minmax.fit_transform(df[num_cols])

print("Scaling applied successfully.")



# Step 1: Data Exploration and Preprocessing
print("Initial Data Info:")
print(df.info())
print("\nMissing Values:")
print(df.isnull().sum())

# Handle missing values
for col in df.select_dtypes(include=['number']).columns:
    df[col].fillna(df[col].median(), inplace=True)
for col in df.select_dtypes(include=['object']).columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Scaling numerical features
num_cols = df.select_dtypes(include=['number']).columns
scaler_standard = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[num_cols] = scaler_standard.fit_transform(df[num_cols])
scaler_minmax = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[num_cols] = scaler_minmax.fit_transform(df[num_cols])

# Step 2: Encoding Techniques
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    if df[col].nunique() <= 5:
        encoder = OneHotEncoder(drop='first', sparse_output=False)

        encoded_data = encoder.fit_transform(df[[col]])
        encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out([col]))
        df = pd.concat([df.drop(columns=[col]), encoded_df], axis=1)
    else:
        df[col] = LabelEncoder().fit_transform(df[col])

# Step 3: Feature Engineering
# Creating new features (Example: age groups & work_hours_per_week_category)
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 45, 65, 100], labels=['Young', 'Adult', 'Middle_Aged', 'Senior'])
df['work_hours_category'] = pd.cut(df['hours_per_week'], bins=[0, 20, 40, 60, 100], labels=['Part-time', 'Full-time', 'Overtime', 'Extreme'])

# Apply log transformation to skewed numerical data
df['capital_gain'] = np.log1p(df['capital_gain'])
df['capital_loss'] = np.log1p(df['capital_loss'])

# Step 4: Feature Selection
# Detect outliers using Isolation Forest
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(df[num_cols])
df = df[outliers == 1]  # Keep only inliers
# PPS Score Analysis
pps_matrix = pps.matrix(df)
print(pps_matrix)

  


# Compute correlation matrix instead
correlation_matrix = df.corr()
print("Correlation Matrix:")
print(correlation_matrix)


# Final preprocessed dataset
print("Final Data Preview:")
print(df.head())

Scaling applied successfully.
Initial Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None

Missing Values:
age               0
wo

  correlation_matrix = df.corr()
