In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score



In [9]:
np.random.seed(42)

data = pd.DataFrame({
    "Experience": np.random.randint(0, 15, 500),
    "Salary": np.random.randint(20000, 150000, 500),
    "Education": np.random.choice(["Graduate", "PostGraduate", "PhD"], 500),
    "Department": np.random.choice(["IT", "HR", "Finance", "Sales"], 500),
    "Performance_Score": np.random.randint(40, 100, 500)
})


In [10]:
for col in data.columns:
    data.loc[data.sample(frac=0.1).index, col] = np.nan


In [11]:
data

Unnamed: 0,Experience,Salary,Education,Department,Performance_Score
0,6.0,70993.0,PhD,HR,47.0
1,3.0,,PhD,Sales,59.0
2,12.0,30647.0,Graduate,HR,68.0
3,,28716.0,Graduate,IT,93.0
4,10.0,108891.0,,IT,92.0
...,...,...,...,...,...
495,12.0,82292.0,Graduate,IT,80.0
496,9.0,,Graduate,Finance,75.0
497,6.0,24158.0,PhD,Sales,97.0
498,9.0,82680.0,PhD,Sales,92.0


In [12]:
data.isnull()

Unnamed: 0,Experience,Salary,Education,Department,Performance_Score
0,False,False,False,False,False
1,False,True,False,False,False
2,False,False,False,False,False
3,True,False,False,False,False
4,False,False,True,False,False
...,...,...,...,...,...
495,False,False,False,False,False
496,False,True,False,False,False
497,False,False,False,False,False
498,False,False,False,False,False


In [13]:
data.isnull().sum()

Experience           50
Salary               50
Education            50
Department           50
Performance_Score    50
dtype: int64

In [14]:
data["Salary"].median()


85996.5

In [16]:
data["Experience"].fillna(data["Experience"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Experience"].fillna(data["Experience"].median(), inplace=True)


In [17]:
data.isnull().sum()

Experience            0
Salary               50
Education            50
Department           50
Performance_Score    50
dtype: int64

In [18]:
data["Salary"].median()

85996.5

In [19]:
data["Salary"].fillna(data["Salary"].median(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Salary"].fillna(data["Salary"].median(),inplace=True)


In [20]:
data.isnull().sum()

Experience            0
Salary                0
Education            50
Department           50
Performance_Score    50
dtype: int64

In [21]:
data["Performance_Score"].fillna(data["Performance_Score"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Performance_Score"].fillna(data["Performance_Score"].median(), inplace=True)


In [22]:
data["Education"].fillna(data["Education"].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Education"].fillna(data["Education"].mode()[0],inplace=True)


In [23]:
data["Department"].fillna(data["Department"].mode()[0],inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Department"].fillna(data["Department"].mode()[0],inplace=True)


In [24]:
data.isnull().sum()

Experience           0
Salary               0
Education            0
Department           0
Performance_Score    0
dtype: int64

In [25]:
data.head(10)

Unnamed: 0,Experience,Salary,Education,Department,Performance_Score
0,6.0,70993.0,PhD,HR,47.0
1,3.0,85996.5,PhD,Sales,59.0
2,12.0,30647.0,Graduate,HR,68.0
3,7.0,28716.0,Graduate,IT,93.0
4,10.0,108891.0,PhD,IT,92.0
5,7.0,107545.0,PostGraduate,IT,90.0
6,12.0,137638.0,PhD,IT,84.0
7,4.0,109912.0,PhD,Sales,85.0
8,6.0,85996.5,PhD,IT,49.0
9,9.0,90316.0,Graduate,IT,82.0


In [26]:
data

Unnamed: 0,Experience,Salary,Education,Department,Performance_Score
0,6.0,70993.0,PhD,HR,47.0
1,3.0,85996.5,PhD,Sales,59.0
2,12.0,30647.0,Graduate,HR,68.0
3,7.0,28716.0,Graduate,IT,93.0
4,10.0,108891.0,PhD,IT,92.0
...,...,...,...,...,...
495,12.0,82292.0,Graduate,IT,80.0
496,9.0,85996.5,Graduate,Finance,75.0
497,6.0,24158.0,PhD,Sales,97.0
498,9.0,82680.0,PhD,Sales,92.0


In [27]:
data.head()

Unnamed: 0,Experience,Salary,Education,Department,Performance_Score
0,6.0,70993.0,PhD,HR,47.0
1,3.0,85996.5,PhD,Sales,59.0
2,12.0,30647.0,Graduate,HR,68.0
3,7.0,28716.0,Graduate,IT,93.0
4,10.0,108891.0,PhD,IT,92.0


In [28]:
pd.get_dummies(data)

Unnamed: 0,Experience,Salary,Performance_Score,Education_Graduate,Education_PhD,Education_PostGraduate,Department_Finance,Department_HR,Department_IT,Department_Sales
0,6.0,70993.0,47.0,False,True,False,False,True,False,False
1,3.0,85996.5,59.0,False,True,False,False,False,False,True
2,12.0,30647.0,68.0,True,False,False,False,True,False,False
3,7.0,28716.0,93.0,True,False,False,False,False,True,False
4,10.0,108891.0,92.0,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
495,12.0,82292.0,80.0,True,False,False,False,False,True,False
496,9.0,85996.5,75.0,True,False,False,True,False,False,False
497,6.0,24158.0,97.0,False,True,False,False,False,False,True
498,9.0,82680.0,92.0,False,True,False,False,False,False,True


In [29]:
pd.get_dummies(data)


Unnamed: 0,Experience,Salary,Performance_Score,Education_Graduate,Education_PhD,Education_PostGraduate,Department_Finance,Department_HR,Department_IT,Department_Sales
0,6.0,70993.0,47.0,False,True,False,False,True,False,False
1,3.0,85996.5,59.0,False,True,False,False,False,False,True
2,12.0,30647.0,68.0,True,False,False,False,True,False,False
3,7.0,28716.0,93.0,True,False,False,False,False,True,False
4,10.0,108891.0,92.0,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
495,12.0,82292.0,80.0,True,False,False,False,False,True,False
496,9.0,85996.5,75.0,True,False,False,True,False,False,False
497,6.0,24158.0,97.0,False,True,False,False,False,False,True
498,9.0,82680.0,92.0,False,True,False,False,False,False,True


In [30]:
data

Unnamed: 0,Experience,Salary,Education,Department,Performance_Score
0,6.0,70993.0,PhD,HR,47.0
1,3.0,85996.5,PhD,Sales,59.0
2,12.0,30647.0,Graduate,HR,68.0
3,7.0,28716.0,Graduate,IT,93.0
4,10.0,108891.0,PhD,IT,92.0
...,...,...,...,...,...
495,12.0,82292.0,Graduate,IT,80.0
496,9.0,85996.5,Graduate,Finance,75.0
497,6.0,24158.0,PhD,Sales,97.0
498,9.0,82680.0,PhD,Sales,92.0


In [31]:
pd.get_dummies(data)


Unnamed: 0,Experience,Salary,Performance_Score,Education_Graduate,Education_PhD,Education_PostGraduate,Department_Finance,Department_HR,Department_IT,Department_Sales
0,6.0,70993.0,47.0,False,True,False,False,True,False,False
1,3.0,85996.5,59.0,False,True,False,False,False,False,True
2,12.0,30647.0,68.0,True,False,False,False,True,False,False
3,7.0,28716.0,93.0,True,False,False,False,False,True,False
4,10.0,108891.0,92.0,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...
495,12.0,82292.0,80.0,True,False,False,False,False,True,False
496,9.0,85996.5,75.0,True,False,False,True,False,False,False
497,6.0,24158.0,97.0,False,True,False,False,False,False,True
498,9.0,82680.0,92.0,False,True,False,False,False,False,True


In [32]:
pd.get_dummies(data, drop_first=True)


Unnamed: 0,Experience,Salary,Performance_Score,Education_PhD,Education_PostGraduate,Department_HR,Department_IT,Department_Sales
0,6.0,70993.0,47.0,True,False,True,False,False
1,3.0,85996.5,59.0,True,False,False,False,True
2,12.0,30647.0,68.0,False,False,True,False,False
3,7.0,28716.0,93.0,False,False,False,True,False
4,10.0,108891.0,92.0,True,False,False,True,False
...,...,...,...,...,...,...,...,...
495,12.0,82292.0,80.0,False,False,False,True,False
496,9.0,85996.5,75.0,False,False,False,False,False
497,6.0,24158.0,97.0,True,False,False,False,True
498,9.0,82680.0,92.0,True,False,False,False,True


In [33]:
data_encoded.head()


NameError: name 'data_encoded' is not defined

In [34]:
data.head()

Unnamed: 0,Experience,Salary,Education,Department,Performance_Score
0,6.0,70993.0,PhD,HR,47.0
1,3.0,85996.5,PhD,Sales,59.0
2,12.0,30647.0,Graduate,HR,68.0
3,7.0,28716.0,Graduate,IT,93.0
4,10.0,108891.0,PhD,IT,92.0
