In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
dataset = pd.read_csv('Data.csv')
dataset

Unnamed: 0,employee_id,name,job_title,department,email,phone_number,date_of_hiring,salary
0,396941,Tammy Valdez,"Administrator, charities/voluntary organisations",supply-chains,oanderson@gibson.com,512-507-0524x1231,2019-05-12,77744
1,289507,Debbie Castaneda,"Designer, blown glass/stained glass",deliverables,davidbrown@krueger-harper.com,(590)110-8719x53241,2013-12-18,85059
2,500857,James Rodriguez,"Psychologist, counselling",users,laurenwilliams@knapp.com,(983)416-3026x6694,2014-05-31,86053
3,501196,Hunter Brown,Data scientist,action-items,kimberly31@anderson.com,300-921-0488,2022-02-03,59217
4,325944,Jamie Williams,Production manager,communities,david13@smith.com,741.564.4209x04454,2021-11-23,62276
...,...,...,...,...,...,...,...,...
1995,994876,Michael Beck,Human resources officer,functionalities,nicholsonjoseph@hood-spencer.com,+1-690-355-9016x164,2021-06-03,97006
1996,996490,Alexandra Fuller,"Merchandiser, retail",applications,caroline74@rush-blankenship.com,(150)181-0844,2018-01-30,58635
1997,409079,Juan Campbell,Training and development officer,functionalities,anthony31@alvarez.biz,001-117-571-6559x6177,2014-12-04,136633
1998,329310,Jessica Howard,Press photographer,e-business,joshua51@mitchell.net,273.080.8744,2019-03-02,124931


In [3]:
# Memilih kolom yang akan digunakan sebagai DataFrame

data = dataset[["job_title", "department", "salary"]]
data

Unnamed: 0,job_title,department,salary
0,"Administrator, charities/voluntary organisations",supply-chains,77744
1,"Designer, blown glass/stained glass",deliverables,85059
2,"Psychologist, counselling",users,86053
3,Data scientist,action-items,59217
4,Production manager,communities,62276
...,...,...,...
1995,Human resources officer,functionalities,97006
1996,"Merchandiser, retail",applications,58635
1997,Training and development officer,functionalities,136633
1998,Press photographer,e-business,124931


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   job_title   2000 non-null   object
 1   department  2000 non-null   object
 2   salary      2000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 47.0+ KB


In [5]:
# Mengecek apakah ada NaN atau missing values

data.isna().sum().sum()

0

In [6]:
# Memisahkan data categorical dan int

x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [7]:
print(x)

[['Administrator, charities/voluntary organisations' 'supply-chains']
 ['Designer, blown glass/stained glass' 'deliverables']
 ['Psychologist, counselling' 'users']
 ...
 ['Training and development officer' 'functionalities']
 ['Press photographer' 'e-business']
 ['Mining engineer' 'users']]


In [8]:
print(y)

[ 77744  85059  86053 ... 136633 124931 104141]


In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0, 1])], remainder='passthrough', sparse_threshold=0)
x = np.array(ct.fit_transform(x))

In [10]:
print(x)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 1. 0. 0.]]


In [11]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [12]:
print(x_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [13]:
print(x_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [14]:
print(y_train)

[ 77744  92026 122972 ...  63122  57529 145238]


In [15]:
print(y_test)

[148074 146471  68483 138854  76756  93986 137478 121938  48827  86301
 127750 129307 100187  83339 128708 108980 103392  84429  70974  61096
 148188 100839 101171 123865  54960  46790  93211 146803 112334  54342
 127349  67798 104352  53998  38305  80295  37653 139544  49953 124978
  57977  43717  58463  96379  43020  93791  90181  50949 116261 126091
  96687  86830  73557 139210 128717  87725  30428  34268 121596  77622
  50793 100957  88551  33648 103980  38985 141338 101613 136819  31567
  88974  48764 135749  74937 141706  66511  48153  49601  64125 134757
  98184  63038  98958  99347  87248  36858  33168 120863 133844 121144
  40330 133854  59419  33148  37764 143118  98866 111570  37721 123350
  97129  52260  44571  60928  94128 146692  34757 134204  37400  91175
 128396  38254 123255  78253  65502 144882  85118 135131 121371 133443
 104141 139899 128676 129216  66315  52376 107504  93998  61445 135924
  55337 105332  44017  47577 143819  48258 143838 144852 106885 132801
 13961

In [16]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train[:, 3:] = sc.fit_transform(x_train[:, 3:])
x_test[:, 3:] = sc.transform(x_test[:, 3:])

In [17]:
print(x_train)

[[ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]
 [ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]
 [ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]
 ...
 [ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]
 [ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]
 [ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]]


In [18]:
print(x_test)

[[ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]
 [ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]
 [ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]
 ...
 [ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]
 [ 0.          1.          0.         ... -0.16012815 -0.14285714
  -0.14056247]
 [ 0.          0.          0.         ... -0.16012815 -0.14285714
  -0.14056247]]
