In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [3]:
!pip install scikit-learn



In [4]:
# Absolute Windows path of the raw employee CSV; edit this single constant to
# point at the file on another machine.
DATA_PATH = r"D:\documents\Data Analysis With Python\datasets\employee_dataset_200_with_missing.csv"

# Load the raw data and show it as the cell output.
df = pd.read_csv(DATA_PATH)
df

Unnamed: 0,EmployeeID,Name,Age,Department,Salary,Experience,JoiningYear,Attrition
0,1,Employee_1,50.0,IT,98385.0,16.0,2011.0,Yes
1,2,Employee_2,36.0,HR,70158.0,11.0,2018.0,No
2,3,Employee_3,29.0,Finance,95417.0,12.0,2009.0,No
3,4,Employee_4,42.0,Marketing,53289.0,,2008.0,Yes
4,5,Employee_5,40.0,HR,39823.0,16.0,2004.0,No
...,...,...,...,...,...,...,...,...
195,196,Employee_196,40.0,Sales,82528.0,2.0,2005.0,Yes
196,197,Employee_197,41.0,IT,68413.0,15.0,2007.0,No
197,198,Employee_198,53.0,IT,82638.0,8.0,2022.0,No
198,199,Employee_199,28.0,Sales,82736.0,8.0,,Yes


In [5]:
# Quick size check: (number of rows, number of columns) of the loaded frame.
df.shape


(200, 8)

In [6]:
# Count missing values per column before any imputation.
df.isna().sum()

EmployeeID      0
Name            0
Age            30
Department     30
Salary         30
Experience     30
JoiningYear    30
Attrition       0
dtype: int64

In [7]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
df.dtypes

EmployeeID       int64
Name            object
Age            float64
Department      object
Salary         float64
Experience     float64
JoiningYear    float64
Attrition       object
dtype: object

In [8]:
# Numeric columns: replace missing entries with the column median
# (the middle value of the sorted column).
for numeric_col in ("Age", "Salary", "Experience"):
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

In [9]:
# Re-count missing values per column after the median imputation.
df.isna().sum()

EmployeeID      0
Name            0
Age             0
Department     30
Salary          0
Experience      0
JoiningYear    30
Attrition       0
dtype: int64

In [10]:
# Categorical column: fill gaps with the most frequent department.
# mode() returns a Series (there can be ties); label 0 is its first entry.
most_common_department = df["Department"].mode()[0]
df["Department"] = df["Department"].fillna(most_common_department)


In [11]:
# Fill missing joining years with the earliest year present in the column.
# NOTE(review): the other numeric columns are imputed with the median; using the
# minimum here puts every imputed row at the earliest year - confirm this is intended.
earliest_joining_year = df["JoiningYear"].min()
df["JoiningYear"] = df["JoiningYear"].fillna(earliest_joining_year)

In [12]:
# Final check: every column should now report zero missing values.
df.isna().sum()

EmployeeID     0
Name           0
Age            0
Department     0
Salary         0
Experience     0
JoiningYear    0
Attrition      0
dtype: int64

In [13]:
# Convert the imputed float columns to integers.
#
# The dtype is spelled "int64" rather than the Python builtin `int` because the
# width of `int` is platform-dependent (32-bit on Windows with NumPy 1.x), while
# the later select_dtypes(include=["int64", "float64"]) call only matches
# 64-bit columns.
# Casting float -> int truncates any fractional part (e.g. a median ending in .5).
df = df.astype(
    {
        "Age": "int64",
        "Salary": "int64",
        "Experience": "int64",
        "JoiningYear": "int64",
    }
)

In [14]:
# Re-check the column dtypes after the integer casts.
df.dtypes

EmployeeID      int64
Name           object
Age             int64
Department     object
Salary          int64
Experience      int64
JoiningYear     int64
Attrition      object
dtype: object

# 1. Label Encoding

In [15]:
# Label encoding maps each distinct category to an integer code (0, 1, 2, ...).
# LabelEncoder numbers the classes in sorted order, so here: "No" -> 0, "Yes" -> 1.
encoder = LabelEncoder().fit(df["Attrition"])
df["Attrition"] = encoder.transform(df["Attrition"])

In [16]:
# Preview the first 10 rows to inspect the encoded Attrition column.
df.head(10)

Unnamed: 0,EmployeeID,Name,Age,Department,Salary,Experience,JoiningYear,Attrition
0,1,Employee_1,50,IT,98385,16,2011,1
1,2,Employee_2,36,HR,70158,11,2018,0
2,3,Employee_3,29,Finance,95417,12,2009,0
3,4,Employee_4,42,Marketing,53289,11,2008,1
4,5,Employee_5,40,HR,39823,16,2004,0
5,6,Employee_6,44,HR,90160,8,2005,0
6,7,Employee_7,32,IT,71975,6,2019,0
7,8,Employee_8,41,IT,39540,12,2001,1
8,9,Employee_9,41,HR,73102,8,2012,1
9,10,Employee_10,57,Marketing,34611,4,2000,1


# 2. One-Hot Encoding

In [None]:
# One-hot encoding creates one indicator column per category holding True/False,
# so no artificial ordering is implied between the categories.
#
# "Attrition" has already been label-encoded to 0/1 at this point, so the
# generated columns are named Attrition_0 and Attrition_1.
#
# get_dummies drops the source column, so an unguarded second run of this cell
# would raise KeyError. The membership check keeps the cell safe to re-run.
if "Attrition" in df.columns:
    df = pd.get_dummies(df, columns=["Attrition"])

# Show a short preview using the notebook's rich table display instead of
# printing the entire frame as plain text.
df.head()

In [18]:
# Display the frame after one-hot encoding (Jupyter elides the middle rows).
df

Unnamed: 0,EmployeeID,Name,Age,Department,Salary,Experience,JoiningYear,Attrition_0,Attrition_1
0,1,Employee_1,50,IT,98385,16,2011,False,True
1,2,Employee_2,36,HR,70158,11,2018,True,False
2,3,Employee_3,29,Finance,95417,12,2009,True,False
3,4,Employee_4,42,Marketing,53289,11,2008,False,True
4,5,Employee_5,40,HR,39823,16,2004,True,False
...,...,...,...,...,...,...,...,...,...
195,196,Employee_196,40,Sales,82528,2,2005,False,True
196,197,Employee_197,41,IT,68413,15,2007,True,False
197,198,Employee_198,53,IT,82638,8,2022,True,False
198,199,Employee_199,28,Sales,82736,8,2000,False,True


In [26]:
# Collect the names of all numeric columns.
# The "number" selector matches every integer/float width, so the result does
# not depend on the columns being exactly int64/float64 (astype(int) can yield
# int32 on some platforms). Boolean one-hot columns are not matched by it.
numeric_cols = df.select_dtypes(include="number").columns.tolist()

# EmployeeID is numeric but only an identifier, so exclude it.
numeric_cols.remove('EmployeeID')
numeric_cols

['Age', 'Salary', 'Experience', 'JoiningYear']

In [27]:
# Feature matrix: the list selector keeps "Age" as a one-column DataFrame (2-D).
X = df.loc[:, ["Age"]]
# Target vector: "Salary" as a Series.
Y = df.loc[:, "Salary"]

In [28]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=42,
)

In [30]:
# Fit a linear regression of Salary on Age using the training split
# (fit() returns the fitted estimator itself).
model = LinearRegression().fit(X_train, Y_train)

# Predicted salaries for the held-out test rows.
Y_pred = model.predict(X_test)

In [33]:
# Report the fitted line: salary = intercept + slope * age.
intercept = model.intercept_
slope = model.coef_[0]  # only one feature (Age), so a single coefficient

print("Model Coefficients: ")
print(f"Intercept: {intercept}")
print(f"Slope: {slope}")

Model Coefficients: 
Intercept: 74399.76086524942
Slope: -19.638096472853878


In [34]:
# Ages to score, wrapped in a DataFrame whose column name matches the
# training feature ("Age").
new_ages = pd.DataFrame({"Age": [25, 30, 35, 40, 45]})
new_salaries = model.predict(new_ages)

# Print one line per age with the prediction rounded to two decimals.
print("\nPredicted Salaries for given Ages:")
for position, age in enumerate(new_ages["Age"]):
    print(f"Age: {age}, Predicted Salary: {new_salaries[position]:.2f}")


Predicted Salaries for given Ages:
Age: 25, Predicted Salary: 73908.81
Age: 30, Predicted Salary: 73810.62
Age: 35, Predicted Salary: 73712.43
Age: 40, Predicted Salary: 73614.24
Age: 45, Predicted Salary: 73516.05
