## Data Cleaning and Processing Plan

### Data Collection and Import

In [1]:
# Load dataset
df = pd.read_csv("data.csv", encoding="utf-8")  # Adjust encoding if needed

# Display basic information
df.info()
df.head()

NameError: name 'pd' is not defined

### Handling Missing Values

In [None]:
# Check missing values
print(df.isnull().sum())

# Drop columns with more than 50% missing values
df = df.dropna(thresh=len(df) * 0.5, axis=1)

# Fill missing numerical values with median
df.fillna(df.median(numeric_only=True), inplace=True)

# Fill categorical missing values with mode
df.fillna(df.mode().iloc[0], inplace=True)

# Forward and backward fill for time series data
df.fillna(method='ffill', inplace=True)
df.fillna(method='bfill', inplace=True)

### Handling Duplicates

In [None]:
# Check for duplicate rows
print(f"Duplicate Rows: {df.duplicated().sum()}")

# Remove duplicates
df = df.drop_duplicates()

### Data Type Conversion

In [None]:
# Convert categorical columns to category type
categorical_columns = ['gender', 'region']
df[categorical_columns] = df[categorical_columns].astype('category')

# Convert boolean columns (Yes/No to 1/0)
df['is_employed'] = df['is_employed'].map({'Yes': 1, 'No': 0})

# Convert date columns
df['date'] = pd.to_datetime(df['date'], errors='coerce')

### Standardization and Normalization

In [None]:
# Normalize numerical features using Min-Max Scaling
scaler = MinMaxScaler()
df[['income', 'age']] = scaler.fit_transform(df[['income', 'age']])

# Standardize features using Z-score
scaler = StandardScaler()
df[['income', 'age']] = scaler.fit_transform(df[['income', 'age']])

# One-hot encode categorical variables
df = pd.get_dummies(df, columns=['region'], drop_first=True)

# Label encode categorical features
label_encoder = LabelEncoder()
df['gender'] = label_encoder.fit_transform(df['gender'])

### Outlier Detection and Handling

In [None]:
# Using Z-score to identify outliers
from scipy.stats import zscore

df['income_zscore'] = zscore(df['income'])
df = df[(df['income_zscore'].abs() < 3)]  # Remove extreme outliers

# Using IQR to cap outliers
Q1 = df['age'].quantile(0.25)
Q3 = df['age'].quantile(0.75)
IQR = Q3 - Q1
df = df[(df['age'] >= (Q1 - 1.5 * IQR)) & (df['age'] <= (Q3 + 1.5 * IQR))]

### Feature Engineering

In [None]:
# Extract date-based features
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day
df['weekday'] = df['date'].dt.weekday

# Create interaction terms
df['income_age_interaction'] = df['income'] * df['age']

# Binning numerical values
df['age_group'] = pd.cut(df['age'], bins=[0, 18, 35, 50, 65, 100], labels=['<18', '18-35', '36-50', '51-65', '65+'])

# Apply log transformation for skewed features
df['log_income'] = np.log1p(df['income'])

### Handling Inconsistent Data

In [None]:
# Standardize text categories
df['country'] = df['country'].replace({'USA': 'US', 'United States': 'US', 'U.S.': 'US'})

# Validate ranges
df = df[df['age'] >= 0]  # Ensure age is non-negative
df = df[df['income'] >= 0]  # Ensure income is non-negative

### Data Splitting for Model Training

In [None]:
from sklearn.model_selection import train_test_split

# Define target variable and features
target = 'is_employed'
features = df.drop(columns=[target])

# Split into train and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(features, df[target], test_size=0.2, random_state=42, stratify=df[target])

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

### Final Data Export

In [None]:
# Save cleaned dataset
df.to_csv("cleaned_data.csv", index=False)
df.to_parquet("cleaned_data.parquet", index=False)