<a href="https://colab.research.google.com/github/Kamal0628/Kamal_january_training/blob/main/Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Q1. DATASET SELECTION

In [11]:
import pandas as pd

df = pd.read_csv("/content/sample_dataset_2000.csv")

print("Dataset Shape:", df.shape)
print(df.head())


Dataset Shape: (2000, 6)
   ID  Age  Salary  Experience_Years  Department  Performance_Score
0   1   56  116520                 4          IT                  5
1   2   46   76272                28  Operations                 10
2   3   32  107152                27          IT                 10
3   4   60   82649                10  Operations                 10
4   5   25   79674                 0          IT                  8


Q2. DATA PREPROCESSING

In [12]:
# Handling missing values
df.fillna({
    "Age": df["Age"].median(),
    "Salary": df["Salary"].mean(),
    "Experience_Years": df["Experience_Years"].median(),
    "Department": df["Department"].mode()[0],
    "Performance_Score": df["Performance_Score"].median()
}, inplace=True)

# Fix data types
df["Age"] = df["Age"].astype(int)
df["Experience_Years"] = df["Experience_Years"].astype(int)

# Remove duplicates
df.drop_duplicates(inplace=True)

# Drop irrelevant column
# Check if 'ID' column exists before dropping to prevent KeyError on re-execution
if 'ID' in df.columns:
    df.drop(columns=["ID"], inplace=True)

print("Missing values after cleaning:")
print(df.isnull().sum())

Missing values after cleaning:
Age                  0
Salary               0
Experience_Years     0
Department           0
Performance_Score    0
dtype: int64


Q3. CATEGORICAL VARIABLE HANDLING

In [13]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Label Encoding
le = LabelEncoder()
df["Department_Label"] = le.fit_transform(df["Department"])

# Ordinal Encoding
oe = OrdinalEncoder()
df["Department_Ordinal"] = oe.fit_transform(df[["Department"]])

# One-Hot Encoding
df_onehot = pd.get_dummies(df, columns=["Department"], drop_first=True)

# Frequency Encoding
freq = df["Department"].value_counts(normalize=True)
df["Department_Frequency"] = df["Department"].map(freq)

# Target Encoding
target_mean = df.groupby("Department")["Performance_Score"].mean()
df["Department_Target"] = df["Department"].map(target_mean)

print(df[[
    "Department",
    "Department_Label",
    "Department_Ordinal",
    "Department_Frequency",
    "Department_Target"
]].head())


   Department  Department_Label  Department_Ordinal  Department_Frequency  \
0          IT                 2                 2.0                0.1965   
1  Operations                 4                 4.0                0.2040   
2          IT                 2                 2.0                0.1965   
3  Operations                 4                 4.0                0.2040   
4          IT                 2                 2.0                0.1965   

   Department_Target  
0           5.646310  
1           5.669118  
2           5.646310  
3           5.669118  
4           5.646310  


Q4. FEATURE SCALING

In [14]:
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler, Normalizer

num_cols = ["Age", "Salary", "Experience_Years"]

minmax = MinMaxScaler()
standard = StandardScaler()

df["Age_MinMax"] = minmax.fit_transform(df[["Age"]])
df["Salary_Zscore"] = standard.fit_transform(df[["Salary"]])

print(df[["Age", "Age_MinMax", "Salary", "Salary_Zscore"]].head())


   Age  Age_MinMax  Salary  Salary_Zscore
0   56    0.826087  116520       1.628652
1   46    0.608696   76272       0.308016
2   32    0.304348  107152       1.321265
3   60    0.913043   82649       0.517261
4   25    0.152174   79674       0.419644


Q5. ADDITIONAL STEPS (TRAIN-TEST & SKEWNESS)


In [15]:
from sklearn.model_selection import train_test_split
import numpy as np

X = df_onehot.drop("Performance_Score", axis=1)
y = df_onehot["Performance_Score"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

df["Salary_Log"] = np.log1p(df["Salary"])

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)
print(df[["Salary", "Salary_Log"]].head())


Train size: (1600, 9)
Test size: (400, 9)
   Salary  Salary_Log
0  116520   11.665827
1   76272   11.242074
2  107152   11.582013
3   82649   11.322370
4   79674   11.285711
