In [55]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [56]:
dataset = pd.read_csv('Final_Employees_Data.csv')

In [57]:
len(dataset)

298

In [58]:
dataset.head()

Unnamed: 0,EducationLevel,Age,Salary,YearsOfExperience,Gender,JobTitle
0,Bachelor's,32.0,90000.0,5.0,Male,Accountant
1,Master's,28.0,65000.0,3.0,Female,Business Analyst
2,PhD,45.0,,15.0,Male,Data Scientist
3,Bachelor's,36.0,60000.0,,Female,Software Engineer
4,Master's,52.0,200000.0,20.0,Male,Software Engineer


In [59]:
dataset.describe()

Unnamed: 0,Age,Salary,YearsOfExperience
count,295.0,290.0,294.0
mean,37.281356,98587.413793,10.127551
std,7.283297,48168.944918,6.670184
min,23.0,350.0,0.0
25%,31.0,55000.0,4.0
50%,36.0,95000.0,9.0
75%,43.5,130000.0,15.0
max,53.0,250000.0,25.0


In [60]:
dataset.dtypes

EducationLevel        object
Age                  float64
Salary               float64
YearsOfExperience    float64
Gender                object
JobTitle              object
dtype: object

In [61]:
dataset.isnull()

Unnamed: 0,EducationLevel,Age,Salary,YearsOfExperience,Gender,JobTitle
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,True,False,False,False
3,False,False,False,True,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
293,False,False,False,False,False,False
294,False,False,False,False,False,False
295,False,False,False,False,False,False
296,False,False,False,False,False,False


In [62]:
dataset.isnull().any()

EducationLevel       False
Age                   True
Salary                True
YearsOfExperience     True
Gender               False
JobTitle             False
dtype: bool

In [63]:
dataset.isnull().sum()

EducationLevel       0
Age                  3
Salary               8
YearsOfExperience    4
Gender               0
JobTitle             0
dtype: int64

In [64]:
dataset.isnull().sum().sum()

15

In [65]:
dataEncoding = dataset.replace({
    'Gender':{'Male':0,'Female':1},
    'EducationLevel':{'Bachelor\'s':0,'Master\'s':1,'PhD':2}}
)
dataEncoding

Unnamed: 0,EducationLevel,Age,Salary,YearsOfExperience,Gender,JobTitle
0,0,32.0,90000.0,5.0,0,Accountant
1,1,28.0,65000.0,3.0,1,Business Analyst
2,2,45.0,,15.0,0,Data Scientist
3,0,36.0,60000.0,,1,Software Engineer
4,1,52.0,200000.0,20.0,0,Software Engineer
...,...,...,...,...,...,...
293,0,33.0,60000.0,4.0,0,Software Engineer
294,0,40.0,130000.0,12.0,1,Data Scientist
295,2,44.0,160000.0,16.0,0,Software Engineer
296,0,30.0,40000.0,2.0,1,Marketing Manager


In [66]:
dataEncoding2 = pd.get_dummies(dataEncoding)
dataEncoding2.head()

Unnamed: 0,EducationLevel,Age,Salary,YearsOfExperience,Gender,JobTitle_Accountant,JobTitle_Business Analyst,JobTitle_Data Scientist,JobTitle_Marketing Manager,JobTitle_Software Engineer
0,0,32.0,90000.0,5.0,0,True,False,False,False,False
1,1,28.0,65000.0,3.0,1,False,True,False,False,False
2,2,45.0,,15.0,0,False,False,True,False,False
3,0,36.0,60000.0,,1,False,False,False,False,True
4,1,52.0,200000.0,20.0,0,False,False,False,False,True


In [67]:
from sklearn.impute import SimpleImputer
X = dataset.iloc[:,:-1].values
X[:, 1:4]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:4])
X[:, 1:4] = imputer.transform(X[:, 1:4])
X

array([["Bachelor's", 32.0, 90000.0, 5.0, 'Male'],
       ["Master's", 28.0, 65000.0, 3.0, 'Female'],
       ['PhD', 45.0, 98587.41379310345, 15.0, 'Male'],
       ...,
       ['PhD', 44.0, 160000.0, 16.0, 'Male'],
       ["Bachelor's", 30.0, 40000.0, 2.0, 'Female'],
       ["Bachelor's", 37.0, 100000.0, 9.0, 'Male']], dtype=object)

In [68]:
# Splitting the dataset into the training set and test set
from sklearn.model_selection import train_test_split
Y=dataset.iloc[:,-1].values
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
print(X_train)
print(X_test)
print(y_train)
# print(y_test) 
print(y_test)

[['PhD' 44.0 145000.0 15.0 'Female']
 ["Bachelor's" 36.0 60000.0 7.0 'Male']
 ["Bachelor's" 29.0 35000.0 1.5 'Male']
 ...
 ['PhD' 45.0 98587.41379310345 16.0 'Male']
 ["Bachelor's" 37.0 100000.0 9.0 'Female']
 ["Master's" 42.0 115000.0 14.0 'Male']]
[["Bachelor's" 36.0 60000.0 7.0 'Male']
 ["Bachelor's" 32.0 45000.0 3.0 'Male']
 ["Master's" 33.0 85000.0 7.0 'Male']
 ["Bachelor's" 35.0 80000.0 7.0 'Male']
 ["Bachelor's" 34.0 70000.0 6.0 'Female']
 ["Bachelor's" 35.0 95000.0 9.0 'Male']
 ["Bachelor's" 38.0 120000.0 9.0 'Female']
 ["Bachelor's" 40.0 80000.0 12.0 'Female']
 ["Bachelor's" 24.0 40000.0 1.0 'Male']
 ["Bachelor's" 33.0 50000.0 5.0 'Female']
 ["Master's" 42.0 110000.0 15.0 'Female']
 ['PhD' 48.0 170000.0 20.0 'Male']
 ["Bachelor's" 48.0 140000.0 18.0 'Female']
 ['PhD' 50.0 180000.0 22.0 'Female']
 ["Master's" 41.0 100000.0 14.0 'Female']
 ["Bachelor's" 34.0 100000.0 9.0 'Male']
 ['PhD' 44.0 160000.0 16.0 'Male']
 ["Bachelor's" 45.0 110000.0 18.0 'Male']
 ["Master's" 47.0 180000