In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest

# Load the dataset
# NOTE(review): this is an absolute, machine-specific Windows path; the cell
# only runs on a machine where this exact file exists.
df = pd.read_csv(r'c:\Users\HP PROBOOK\Downloads\EDA2\EDA2\adult_with_headers.csv')

# Data Exploration and Preprocessing

# Basic data exploration: each report is printed on the line(s) right after
# its heading (sep="\n" puts the heading and the report on separate lines).
print("Summary Statistics:", df.describe(), sep="\n")       # describe() of the frame
print("\nData Types:", df.dtypes, sep="\n")                 # dtype of every column
print("\nMissing Values:", df.isna().sum(), sep="\n")       # null count per column

# Handle missing values (if there are any)
# For simplicity, every column that contains nulls is filled with its mode
# (most frequent value). Only values pandas treats as null are detected here;
# NOTE(review): placeholder strings used to mark "unknown" would not be caught.
for column in df.columns:
    if not df[column].isnull().any():
        continue

    column_modes = df[column].mode()
    if column_modes.empty:
        # The column is entirely null, so there is no most-frequent value to
        # fill with; leave it as-is instead of failing on an empty lookup.
        continue

    # Assign the filled Series back explicitly. Calling fillna(inplace=True)
    # on `df[column]` operates on an intermediate object, which is deprecated
    # and does not update `df` when pandas copy-on-write is active.
    df[column] = df[column].fillna(column_modes.iloc[0])

# Encoding Techniques

# Applying One-Hot Encoding and Label Encoding to every object-dtype column:
#   * fewer than 5 distinct values -> one-hot encoded (first level dropped),
#     the original column is replaced by the generated indicator columns;
#   * otherwise                    -> replaced in place by integer labels.
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)  # dense output so it can be wrapped in a DataFrame
label_encoder = LabelEncoder()

# Fitted LabelEncoder per label-encoded column, so the integer codes can be
# mapped back to the original categories later (a single shared encoder would
# only remember the last column it was fitted on).
fitted_label_encoders = {}

# The column list is computed once, before `df` is rebound inside the loop.
for column in df.select_dtypes(include='object').columns:
    if df[column].nunique() < 5:
        # One-hot encode
        encoded_df = pd.DataFrame(
            onehot_encoder.fit_transform(df[[column]]),
            columns=onehot_encoder.get_feature_names_out(),
            # Reuse df's index: join() aligns on index labels, so a default
            # RangeIndex here would yield NaNs whenever df's index is not 0..n-1.
            index=df.index,
        )
        df = df.join(encoded_df).drop(column, axis=1)
    else:
        # Label encode with a fresh encoder per column; `label_encoder` still
        # ends up bound to the encoder fitted on the last such column.
        label_encoder = LabelEncoder()
        df[column] = label_encoder.fit_transform(df[column])
        fitted_label_encoders[column] = label_encoder

# Apply scaling techniques to numerical features
#
# Both scalers are fitted on the same unscaled values and their results are
# kept side by side. Previously min-max scaling was applied on top of the
# standard-scaled frame, which overwrote the standardised values immediately;
# since min-max scaling is unaffected by a prior shift/positive rescale of a
# column, the standard-scaling step had no observable effect.
df_numeric_columns = df.select_dtypes(include=np.number).columns
numeric_unscaled = df[df_numeric_columns]

# Standard Scaling
scaler_standard = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[df_numeric_columns] = scaler_standard.fit_transform(numeric_unscaled)

# Min-Max Scaling
scaler_minmax = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[df_numeric_columns] = scaler_minmax.fit_transform(numeric_unscaled)

# `df` continues to hold the min-max scaled data, as it did before, so code
# that runs after this step sees the same value range in its numeric columns.
df = df_minmax_scaled

# Feature Selection

# Use Isolation Forest algorithm to identify and remove outliers.
# fit_predict labels each row; rows labelled -1 are dropped below.
numeric_features = df[df_numeric_columns]
isolation_forest = IsolationForest(random_state=42)  # fixed seed for repeatable labels
outliers = isolation_forest.fit_predict(numeric_features)
df_no_outliers = df.loc[outliers != -1]

# Compare its findings with the correlation matrix (computed on the rows that
# were kept), printed under its heading.
correlation_matrix = df_no_outliers.corr()
print("\nCorrelation Matrix:", correlation_matrix, sep="\n")


Summary Statistics:
                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  

Data Types:
age                int64
workc