# Read Data

In [26]:
import pandas as pd
import numpy as np

# Load the salary table. header=None makes pandas treat every line of the CSV
# as data, so the column labels are assigned by hand right after the read.
# NOTE(review): this assumes ds_salaries.csv has no header row -- confirm,
# otherwise the header line would be loaded as the first record.
data = pd.read_csv('ds_salaries.csv', header=None)
data.columns = [
    'work_year',
    'experience_level',
    'employment_type',
    'job_title',
    'salary',
    'salary_currency',
    'salary_in_usd',
    'employee_residence',
    'remote_ratio',
    'company_location',
    'company_size',
]

# Discard the work_year column before reporting the table dimensions.
data = data.drop(columns=['work_year'])

print('Number of instances = %d' % len(data))
print('Number of attributes = %d' % len(data.columns))

# Last expression of the cell: preview of the first rows.
data.head()

Number of instances = 3755
Number of attributes = 10


Unnamed: 0,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


# Sampling

In [27]:
from sklearn.model_selection import train_test_split
import random
from sklearn.impute import SimpleImputer

# Treat the '?' placeholder as a missing value. np.nan is used instead of the
# np.NaN alias, which was removed in NumPy 2.0 and raises AttributeError there.
data = data.replace('?', np.nan)
imputer = SimpleImputer(strategy='mean')

print('Number of instances = %d' % (data.shape[0]))
print('Number of attributes = %d' % (data.shape[1]))

# Report how many missing entries each column holds.
print('Number of missing values:')
for col in data.columns:
    print('\t%s: %d' % (col, data[col].isna().sum()))

# Feature columns; missing entries are replaced by the column mean.
# NOTE(review): the imputer is fitted on the full table before the split, so
# the fill values also reflect rows that end up in the test set.
features = ['salary', 'salary_in_usd', 'remote_ratio']
data[features] = imputer.fit_transform(data[features])

# NOTE(review): 'salary_in_usd' is both a column of X and the target y, so any
# model fitted on X sees the answer. Left unchanged here because later cells
# may rely on the current column layout of X -- confirm and drop it from X.
X = data[features]
y = data["salary_in_usd"]

# Hold out 30% of the rows as the test set (test_size is the test fraction).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

Number of instances = 3755
Number of attributes = 10
Number of missing values:
	experience_level: 0
	employment_type: 0
	job_title: 0
	salary: 0
	salary_currency: 0
	salary_in_usd: 0
	employee_residence: 0
	remote_ratio: 0
	company_location: 0
	company_size: 0


In [28]:
# Draw up to 100 row labels from the training split without replacement
# (random.sample never returns the same element twice). A dedicated generator
# with a fixed seed makes the subsample reproducible across kernel restarts,
# and min() keeps the call valid when the split holds fewer than 100 rows.
sampled_data = random.Random(42).sample(list(X_train.index), k=min(100, len(X_train)))

# Select the sampled rows from both the features and the target.
sampled_X_train = X_train.loc[sampled_data]
sampled_y_train = y_train.loc[sampled_data]

sampled_X_train.head()

Unnamed: 0,salary,salary_in_usd,remote_ratio
1583,145000.0,145000.0,100.0
3121,160000.0,160000.0,0.0
77,168100.0,168100.0,100.0
3529,150000.0,150000.0,100.0
647,180180.0,180180.0,0.0


# Binarize

In [29]:
from sklearn.preprocessing import Binarizer

# Values strictly above the threshold become 1.0, everything else 0.0.
binarizer = Binarizer(threshold=50)

# The transformer expects a 2-D input, hence the single-column reshape.
remote_ratio_2d = data['remote_ratio'].to_numpy().reshape(-1, 1)
binary_data = binarizer.transform(remote_ratio_2d)

# Overwrite the original column with its binarized version.
# NOTE(review): this replaces the raw values in place, so running the cell a
# second time would binarize the 0/1 values again and yield all zeros.
data['remote_ratio'] = binary_data.ravel()

print(binary_data)

[[1.]
 [1.]
 [1.]
 ...
 [1.]
 [1.]
 [0.]]


In [30]:
from sklearn.preprocessing import OneHotEncoder

# Columns to expand into one indicator column per category.
categorical_columns = ['company_size']

# Learn the categories first, then encode the same column.
encoder = OneHotEncoder()
encoder.fit(data[categorical_columns])
encoded_data = encoder.transform(data[categorical_columns])

# Names of the generated indicator columns, in output order.
encoded_columns = encoder.get_feature_names_out(categorical_columns)

print(encoded_data)

  (0, 0)	1.0
  (1, 2)	1.0
  (2, 2)	1.0
  (3, 1)	1.0
  (4, 1)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 1)	1.0
  (8, 1)	1.0
  (9, 1)	1.0
  (10, 1)	1.0
  (11, 1)	1.0
  (12, 1)	1.0
  (13, 0)	1.0
  (14, 0)	1.0
  (15, 1)	1.0
  (16, 1)	1.0
  (17, 1)	1.0
  (18, 1)	1.0
  (19, 1)	1.0
  (20, 1)	1.0
  (21, 1)	1.0
  (22, 1)	1.0
  (23, 1)	1.0
  (24, 1)	1.0
  :	:
  (3730, 2)	1.0
  (3731, 2)	1.0
  (3732, 0)	1.0
  (3733, 0)	1.0
  (3734, 0)	1.0
  (3735, 2)	1.0
  (3736, 0)	1.0
  (3737, 0)	1.0
  (3738, 0)	1.0
  (3739, 0)	1.0
  (3740, 1)	1.0
  (3741, 1)	1.0
  (3742, 1)	1.0
  (3743, 1)	1.0
  (3744, 2)	1.0
  (3745, 2)	1.0
  (3746, 1)	1.0
  (3747, 0)	1.0
  (3748, 0)	1.0
  (3749, 0)	1.0
  (3750, 0)	1.0
  (3751, 0)	1.0
  (3752, 2)	1.0
  (3753, 0)	1.0
  (3754, 0)	1.0


# Discretization

In [31]:
from sklearn.preprocessing import KBinsDiscretizer

# Column that gets split into bins.
feature_to_discretize = ['salary_in_usd']

n_bins = 10  # number of bins
strategy = 'uniform'  # binning strategy: uniform, quantile or kmeans

# Each row is encoded as a one-hot vector marking the bin it falls into.
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='onehot', strategy=strategy)
discretizer.fit(data[feature_to_discretize])
data_binned = discretizer.transform(data[feature_to_discretize])

# Names of the generated bin columns, in output order.
feature_names = discretizer.get_feature_names_out(feature_to_discretize)

print(data_binned)

  (0, 1)	1.0
  (1, 0)	1.0
  (2, 0)	1.0
  (3, 3)	1.0
  (4, 2)	1.0
  (5, 4)	1.0
  (6, 2)	1.0
  (7, 4)	1.0
  (8, 3)	1.0
  (9, 3)	1.0
  (10, 1)	1.0
  (11, 2)	1.0
  (12, 2)	1.0
  (13, 4)	1.0
  (14, 2)	1.0
  (15, 3)	1.0
  (16, 1)	1.0
  (17, 3)	1.0
  (18, 3)	1.0
  (19, 3)	1.0
  (20, 2)	1.0
  (21, 6)	1.0
  (22, 3)	1.0
  (23, 5)	1.0
  (24, 3)	1.0
  :	:
  (3730, 1)	1.0
  (3731, 1)	1.0
  (3732, 2)	1.0
  (3733, 5)	1.0
  (3734, 0)	1.0
  (3735, 0)	1.0
  (3736, 1)	1.0
  (3737, 1)	1.0
  (3738, 1)	1.0
  (3739, 0)	1.0
  (3740, 4)	1.0
  (3741, 2)	1.0
  (3742, 2)	1.0
  (3743, 2)	1.0
  (3744, 0)	1.0
  (3745, 3)	1.0
  (3746, 2)	1.0
  (3747, 9)	1.0
  (3748, 0)	1.0
  (3749, 3)	1.0
  (3750, 9)	1.0
  (3751, 3)	1.0
  (3752, 2)	1.0
  (3753, 2)	1.0
  (3754, 2)	1.0


# Standardize

In [7]:
from sklearn.preprocessing import StandardScaler

# Column to standardize; the result is written back over the same column.
feature_to_scale = ['salary_in_usd']

# Fit the scaler on the column, then replace the column with the scaled values.
# NOTE(review): the Normalize section further down rescales this same column
# again, so the standardized values do not survive to the end of the notebook.
scaler = StandardScaler()
scaler.fit(data[feature_to_scale])
data[feature_to_scale] = scaler.transform(data[feature_to_scale])

print(data[feature_to_scale])

      salary_in_usd
0         -1.563635
1         -1.563635
2         -1.563635
3          0.639535
4          0.639535
...             ...
3750       0.639535
3751       0.639535
3752       0.639535
3753      -1.563635
3754      -1.563635

[3755 rows x 1 columns]


# Normalize

In [229]:
from sklearn.preprocessing import MinMaxScaler

# Column to rescale; the result is written back over the same column.
feature_to_scale = ['salary_in_usd']

# Fit the min-max scaler on the column, then replace the column with the
# rescaled values. This rebinds the `scaler` name used by the Standardize cell.
scaler = MinMaxScaler()
scaler.fit(data[feature_to_scale])
data[feature_to_scale] = scaler.transform(data[feature_to_scale])

print(data[feature_to_scale])

      salary_in_usd
0          0.181436
1          0.055900
2          0.045784
3          0.381839
4          0.258207
...             ...
3750       0.914581
3751       0.327891
3752       0.224489
3753       0.213250
3754       0.201257

[3755 rows x 1 columns]


# Dimension reduction

In [230]:
from sklearn.decomposition import PCA

# Columns that are projected onto the principal components.
# NOTE(review): within this notebook 'salary' is never rescaled while the other
# two columns are (min-max scaled / binarized), so the components are likely
# dominated by 'salary' -- consider standardizing all three columns first.
features_to_pca = ['salary', 'salary_in_usd', 'remote_ratio']

n_components = 2  # number of dimensions kept after the reduction
pca = PCA(n_components=n_components)

data_pca = pca.fit_transform(data[features_to_pca])

# Derive the output column names from the width of the projection so the cell
# keeps working when n_components is changed (PCA_1, PCA_2, ...).
pca_columns = ['PCA_%d' % (i + 1) for i in range(data_pca.shape[1])]
data[pca_columns] = data_pca

# Replace the source columns by their projection. Reassignment is used instead
# of inplace=True, matching how work_year is dropped when the data is loaded.
# Re-running this cell raises KeyError because the source columns are gone.
data = data.drop(columns=features_to_pca)

print(data_pca)

[[-1.10695572e+05 -5.39584513e+01]
 [-1.60695572e+05 -5.40623941e+01]
 [-1.65195572e+05 -5.40717487e+01]
 ...
 [-8.56955717e+04 -5.39064836e+01]
 [-9.06955717e+04 -5.39168776e+01]
 [ 6.80930443e+06  1.04240061e+01]]


# Feature selection