# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import IsolationForest
import ppscore as pps

# Read the source CSV into the working DataFrame used by the rest of the script.
DATASET_PATH = 'C:/Users/HP PROBOOK/Downloads/EDA2/EDA2/adult_with_headers.csv'
df = pd.read_csv(DATASET_PATH)

# Data Exploration and Preprocessing

# First look at the loaded data: descriptive statistics, the dtype of each
# column, and the number of missing values per column.
summary_statistics = df.describe()
print("Summary Statistics:", summary_statistics, sep="\n")

column_dtypes = df.dtypes
print("\nData Types:", column_dtypes, sep="\n")

missing_per_column = df.isnull().sum()
print("\nMissing Values:", missing_per_column, sep="\n")

# Handle missing values (e.g., imputation or removal)
# Example:
# df.dropna(inplace=True)

# Encoding Techniques
#
# Every object-typed column is encoded exactly once: columns with fewer than
# 5 distinct values are one-hot encoded, all others (5 or more distinct
# values) are label encoded, so no string column is left behind.

# One-Hot Encoding for categorical variables with fewer than 5 categories.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4; try the current spelling first and fall back for old releases.
try:
    onehot_encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(drop='first', sparse=False)

# The column list is computed once up front; `df` is rebound inside the loop.
for column in df.select_dtypes(include='object').columns:
    if df[column].nunique() < 5:
        encoded_features = onehot_encoder.fit_transform(df[[column]])
        encoded_frame = pd.DataFrame(
            encoded_features,
            columns=onehot_encoder.get_feature_names_out([column]),
            # Reuse df's index so concat aligns row-for-row even when the
            # index is not a plain 0..n-1 range (e.g. after dropping rows).
            index=df.index,
        )
        df_encoded = pd.concat([df, encoded_frame], axis=1)
        # The original string column is replaced by its indicator columns.
        df = df_encoded.drop(columns=[column])

# Label Encoding for categorical variables with 5 or more categories.
# `>= 5` (rather than `> 5`) closes the gap for columns with exactly 5 values.
label_encoder = LabelEncoder()
for column in df.select_dtypes(include='object').columns:
    if df[column].nunique() >= 5:
        # The encoder is refitted per column, so codes are column-specific.
        df[column] = label_encoder.fit_transform(df[column])

# Apply scaling techniques to numerical features
#
# Both scalers are fitted on the same, not-yet-scaled numeric columns.
# Previously the min-max step overwrote the standardised values in `df`, so
# the standard scaling had no lasting effect. The standardised data now lives
# in its own frame, while `df` still ends up min-max scaled as before
# (min-max scaling is unaffected by a prior standardisation, so the values in
# `df` are the same up to floating-point rounding).
numeric_columns = df.select_dtypes(include=np.number).columns

# Standard Scaling: result kept separately in `df_standard_scaled`.
scaler_standard = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[numeric_columns] = scaler_standard.fit_transform(df[numeric_columns])

# Min-Max Scaling: applied in place; this is the version later steps read from `df`.
scaler_minmax = MinMaxScaler()
df[numeric_columns] = scaler_minmax.fit_transform(df[numeric_columns])

# Feature Engineering

# Create new features
# Example:
# df['new_feature_1'] = df['feature_1'] + df['feature_2']
# df['new_feature_2'] = df['feature_3'] * df['feature_4']

# Apply transformation to skewed numerical feature
# Example:
# df['skewed_feature'] = np.log(df['skewed_feature'])

# Feature Selection

# Use Isolation Forest algorithm to identify and remove outliers.
# A fixed random_state makes the set of flagged rows reproducible between
# runs; without it the forest is built from a fresh random seed every time.
isolation_forest = IsolationForest(random_state=42)

# fit_predict labels each row: -1 for outliers, 1 for inliers.
outliers = isolation_forest.fit_predict(df.select_dtypes(include=np.number))

# Keep only the inlier rows. `df` itself is left unchanged; the filtered data
# is available as `df_no_outliers`.
df_no_outliers = df[outliers != -1]

# Discuss how outliers can affect model performance.
# Outliers can skew the distribution and affect the mean and standard deviation, leading to biased model predictions.

# Apply PPS to find and discuss relationships between features
# NOTE(review): this analyses `df`, not the outlier-filtered `df_no_outliers`
# — confirm which of the two is intended here.
pps_matrix = pps.matrix(df)
print("\nPPS Matrix:")
print(pps_matrix)

# Compare its findings with the correlation matrix.
# Correlation is only defined for numeric data. Selecting the numeric columns
# explicitly avoids the ValueError that pandas >= 2.0 raises from df.corr()
# when a non-numeric column is present, and matches what older pandas did
# implicitly by dropping such columns.
correlation_matrix = df.select_dtypes(include=np.number).corr()
print("\nCorrelation Matrix:")
print(correlation_matrix)