<a href="https://colab.research.google.com/github/Maruf346/AI-ML-with-python/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Use the dataset and apply all necessary data preprocessing techniques to prepare the dataset for machine learning models. This includes handling missing values, scaling, encoding categorical variables (if any), and any other relevant preprocessing steps.**

In [1]:
# ==============================
# Step 1: Import Libraries
# ==============================
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# ------------------------------
# Step 2: Load Dataset
# ------------------------------
# The CSV is expected next to the notebook (upload it in Colab first, or
# point the path at wherever the file lives).
df = pd.read_csv("synthetic_employee_data.csv")

# Quick diagnostics: size, first rows, per-column missing counts and dtypes.
for _label, _value in (
    ("Dataset Shape:", df.shape),
    ("\nPreview of dataset:\n", df.head()),
    ("\nMissing values count:\n", df.isna().sum()),
    ("\nData types:\n", df.dtypes),
):
    print(_label, _value)

# ------------------------------
# Step 3: Handle Missing Values
# ------------------------------
# Split the columns by dtype: 64-bit ints/floats are treated as numerical,
# object columns as categorical.
numeric_frame = df.select_dtypes(include=["int64", "float64"])
categorical_frame = df.select_dtypes(include=["object"])
num_cols = numeric_frame.columns
cat_cols = categorical_frame.columns

print("\nNumerical columns:", num_cols.tolist())
print("Categorical columns:", cat_cols.tolist())

# Imputation strategies (fitted later, inside the preprocessing pipeline):
#   - numerical   -> column mean
#   - categorical -> most frequent value
num_imputer = SimpleImputer(strategy="mean")
cat_imputer = SimpleImputer(strategy="most_frequent")

# ==============================
# Step 4: Encoding Categorical Variables
# ==============================
# Categorical branch: impute with the most frequent value, then one-hot encode.
# handle_unknown="ignore" makes transform() encode categories that were not
# seen during fit as all zeros instead of raising.
# NOTE(review): every object column in cat_cols is one-hot encoded. The dataset
# preview shows identifier-like columns (Employee_ID, Employee_Name); if they
# are in cat_cols they produce roughly one dummy column per row -- consider
# dropping them before encoding.
categorical_pipeline = Pipeline(steps=[
    ("imputer", cat_imputer),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# ==============================
# Step 5: Scaling Numerical Features
# ==============================
# Numerical branch: impute with the mean, then standardise. The scaler lives
# inside this branch so that ONLY the numerical features are centred and
# scaled; the one-hot indicator columns keep their 0/1 values. (Scaling after
# the ColumnTransformer would rescale the dummy columns as well and would
# force with_mean=False, leaving the numerical features uncentred.)
numeric_pipeline = Pipeline(steps=[
    ("imputer", num_imputer),
    ("scaler", StandardScaler()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, num_cols),
        ("cat", categorical_pipeline, cat_cols),
    ]
)

# Kept as a Pipeline with a "preprocessor" step so it can be extended with an
# estimator later and so named_steps["preprocessor"] keeps working.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
])

# Apply transformations
processed_array = pipeline.fit_transform(df)

# Convert back to DataFrame with proper feature names.
# When there are no categorical columns the "cat" transformer is never fitted,
# so asking its encoder for feature names would raise; use an empty list then.
if len(cat_cols) > 0:
    fitted_encoder = (
        pipeline.named_steps["preprocessor"]
        .named_transformers_["cat"]
        .named_steps["encoder"]
    )
    ohe_features = list(fitted_encoder.get_feature_names_out(cat_cols))
else:
    ohe_features = []

# The ColumnTransformer may return a scipy sparse matrix (one-hot output is
# sparse); densify it before building the DataFrame.
dense_values = (
    processed_array.toarray()
    if hasattr(processed_array, "toarray")
    else processed_array
)

processed_df = pd.DataFrame(
    dense_values,
    columns=list(num_cols) + ohe_features,
    index=df.index,  # keep rows aligned with the source DataFrame
)

print("\nProcessed Dataset Shape:", processed_df.shape)
print("\nPreview of processed dataset:\n", processed_df.head())


Dataset Shape: (1000, 13)

Preview of dataset:
   Employee_ID Employee_Name    Age      Salary  Years_at_Company  \
0      E00001   Alex Miller  150.0  2000000.00              29.0   
1      E00002    Alex Smith  200.0  1500000.00              37.0   
2      E00003   Jane Garcia    5.0  1200000.00               3.0   
3      E00004   Emily Brown  200.0    23118.19              42.0   
4      E00005    Mike Jones  150.0    20852.42              33.0   

   Number_of_Projects  Performance_Score  Is_Manager  Works_Remotely  \
0                 2.0                3.0           0             0.0   
1                 7.0                3.0           0             0.0   
2                 3.0                3.0           0             0.0   
3                 7.0                3.0           1             0.0   
4                 2.0                5.0           0             0.0   

    Department Education_Level Location                   Hire_Date  
0    marketing             PHD   AUSTIN 